## GPU info

In [28]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed May 15 08:28:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              34W /  70W |   1191MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Check if GPU is available
from tensorflow.python.client import device_lib

def get_gpu_details():
    """Print name, memory limit and description of every local device of type 'GPU'.

    Devices are taken from `device_lib.list_local_devices()`; nothing is
    printed when no entry has `device_type == 'GPU'`.
    """
    gpu_devices = (
        candidate
        for candidate in device_lib.list_local_devices()
        if candidate.device_type == 'GPU'
    )
    for device in gpu_devices:
        print(f"Device Name: {device.name}")
        print(f"Memory Limit: {device.memory_limit} bytes")
        print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 14626652160 bytes
Description: device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5


In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [None]:
# !pip install pyarrow

In [None]:
# !pip install fastparquet

In [None]:
import os

import re
import string
import unicodedata
import nltk
# import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.models import load_model

import pickle

In [None]:
# Fetch the NLTK resources used by this notebook:
#   punkt     - tokenizer
#   wordnet   - lemmatizer data
#   stopwords - stopword lists
#   omw-1.4   - multilingual WordNet data
for resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(resource)

# English stopwords, kept as a set
stop_words = set(stopwords.words('english'))

# Shared WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# When True, preprocess_text() additionally runs remove_names() on the text
initial_preprocessing = True

# # Load spaCy's English NLP model
# nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## Create preprocessing functions for the initial text cleaning and for later response-generation input

In [None]:
def normalize_text(text: str) -> str:
    """Normalize raw text to lowercase ASCII with spaced-out sentence punctuation.

    Steps, in order:
      1. NFKD-normalize and drop every character that cannot be encoded as
         ASCII (an accented letter keeps its base letter).
      2. Lowercase.
      3. Insert a space before each '.', '!' and '?'.
      4. Replace every character other than a-z, '.', ',', "'", '!', '?' and
         space with a space. Digits and other symbols are dropped; apostrophes
         survive, so contractions such as "don't" stay in one piece.
      5. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string (empty when nothing survives the filtering).
    """
    # Decompose accented characters, then discard everything outside ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    text = text.lower()
    # Detach sentence punctuation from the preceding word
    text = re.sub(r"([.!?])", r" \1", text)
    # The former "contraction" substitution rewrote every match with itself
    # (a no-op) and was removed; apostrophes are preserved by the filter below.
    text = re.sub(r"[^a-z.,'!? ]", ' ', text)
    # Squeeze whitespace and trim
    text = re.sub(r"\s+", r" ", text).strip()
    return text

def remove_names(text: str) -> str:
    """Return `text` without the tokens spaCy tags as part of a PERSON entity.

    The remaining tokens are re-joined with single spaces. Uses a module-level
    spaCy pipeline called `nlp`.
    NOTE(review): the `spacy.load(...)` line in the setup cell is commented
    out, so `nlp` has to be created before this function is called.
    This step is slow, which is why it is excluded from chatbot input
    preprocessing.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.ent_type_ != 'PERSON':
            kept_tokens.append(token.text)
    return ' '.join(kept_tokens)

def preprocess_text(text: str) -> str:
    """Normalize `text` and wrap it in the 'sofs' / 'eofs' boundary words.

    Person names are stripped with remove_names() only while the module-level
    flag `initial_preprocessing` is truthy. Punctuation removal, stopword
    filtering and lemmatization are intentionally not applied here.

    The boundary words 'sofs' and 'eofs' are used because, per the original
    author's note, the tokenizer strips markers written with angle brackets or
    pipes, and these two words do not occur in the dataset vocabulary.
    """
    normalized = normalize_text(text)
    body = remove_names(normalized) if initial_preprocessing else normalized
    return 'sofs ' + body + ' eofs'

## Load the Tokenizer

In [None]:
# Load the pickled tokenizer object from the `data` folder under the current working directory
data_dir = os.path.join(os.getcwd(), 'data')
tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
# NOTE: unpickling can execute arbitrary code - only load tokenizer files from a trusted source
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
print(tokenizer.word_index['sofs'], tokenizer.word_index['eofs']) # Sanity check: the 'sofs' / 'eofs' boundary words must be present in the tokenizer vocabulary (a KeyError here means they are missing)

1 2


In [None]:
# Most and least frequent words in the tokenizer vocabulary
from collections import OrderedDict

# word -> count mapping, ordered from most to least frequent
sorted_word_counts = OrderedDict(
    sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True)
)

# Materialize the ranking once, then show the top 10 and the bottom 100 entries
ranked_pairs = list(sorted_word_counts.items())
print(ranked_pairs[:10])
print(ranked_pairs[-100:])

[('sofs', 304713), ('eofs', 304713), ('know', 22895), ('like', 15314), ('get', 15014), ('got', 13322), ('u', 13080), ('want', 12128), ('think', 11251), ('one', 11186)]
[('ese', 1), ('whatchu', 1), ('mafiya', 1), ('chechnya', 1), ('toady', 1), ('betterment', 1), ('ivans', 1), ('nihilistic', 1), ('freelancing', 1), ('gatherer', 1), ('overview', 1), ('retardant', 1), ('deploys', 1), ('beastie', 1), ('ozzfest', 1), ('russkie', 1), ('shaver', 1), ('polynesia', 1), ('mersh', 1), ('slovo', 1), ('dawning', 1), ('tshirt', 1), ('dishonorably', 1), ('vandal', 1), ('grozny', 1), ('lamborghini', 1), ('genoa', 1), ('pizda', 1), ('filament', 1), ('replicate', 1), ('solider', 1), ('secaucus', 1), ('athletics', 1), ('herded', 1), ('wolverine', 1), ('absorbs', 1), ('definitively', 1), ('poppycock', 1), ('rumous', 1), ('celery', 1), ('cerebrum', 1), ('unashamedly', 1), ('dien', 1), ('gerhart', 1), ('mending', 1), ('galvanism', 1), ('equalize', 1), ('cerebrospinal', 1), ('madein', 1), ('froderick', 1), ('

## Load the Data

In [None]:
# Load the training DataFrame from `training_df_s2s.parquet` inside `data_dir`.
# The preview below shows the columns ID_Input, Padded_Input_Sequences,
# ID_Response and Padded_Target_Sequences.
file_path_parquet = os.path.join(data_dir, 'training_df_s2s.parquet')
training_data_final = pd.read_parquet(file_path_parquet)

# Display the first 10 rows as the cell output
training_data_final.head(10)

Unnamed: 0,ID_Input,Padded_Input_Sequences,ID_Response,Padded_Target_Sequences
0,L1044,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]",L1045,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
1,L984,"[1, 38, 2, 0, 0, 0, 0, 0, 0, 0]",L985,"[1, 235, 2, 0, 0, 0, 0, 0, 0, 0]"
2,L924,"[1, 791, 2, 0, 0, 0, 0, 0, 0, 0]",L925,"[1, 28, 11, 2, 0, 0, 0, 0, 0, 0]"
3,L871,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]",L872,"[1, 38, 45, 36, 42, 514, 421, 2, 0, 0]"
4,L870,"[1, 541, 3, 349, 590, 3, 663, 2, 0, 0]",L871,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
5,L868,"[1, 131, 2, 0, 0, 0, 0, 0, 0, 0]",L869,"[1, 4, 725, 813, 2, 0, 0, 0, 0, 0]"
6,L867,"[1, 19, 205, 2, 0, 0, 0, 0, 0, 0]",L868,"[1, 131, 2, 0, 0, 0, 0, 0, 0, 0]"
7,L866,"[1, 664, 5, 19, 205, 1913, 2, 0, 0, 0]",L867,"[1, 19, 205, 2, 0, 0, 0, 0, 0, 0]"
8,L864,"[1, 6220, 2142, 4, 1551, 2, 0, 0, 0, 0]",L865,"[1, 117, 97, 132, 10, 213, 2, 0, 0, 0]"
9,L863,"[1, 913, 2, 0, 0, 0, 0, 0, 0, 0]",L864,"[1, 6220, 2142, 4, 1551, 2, 0, 0, 0, 0]"


# Encoder-decoder architecture with Attention Layer

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf

## Attention Layer

In [None]:
class Attention(Layer):
    """Additive (Bahdanau-style) attention: score = V(tanh(W1(values) + W2(query))).

    Args:
        units: Width of the two Dense projections applied to `values` and `query`.
        **kwargs: Standard Keras `Layer` arguments (name, dtype, trainable, ...).
            Accepting them is required so the layer can be rebuilt from its
            config when a saved model is loaded.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor argument
        self.units = units
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        """Compute a context vector for `query` over `values`.

        An axis is inserted into `query` at position 1 so it broadcasts against
        `values`; the softmax and the weighted sum both run over axis 1 of
        `values`.
        NOTE(review): presumably query is (n, hidden) and values is
        (n, steps, hidden) - confirm against the callers.

        Returns:
            (context_vector, attention_weights): the weighted sum of `values`
            over axis 1, and the softmax weights used for it.
        """
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

    def get_config(self):
        """Return the layer config including `units`, so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({'units': self.units})
        return config

class AttentionLayer(Layer):
    """Applies `Attention` for every decoder time step against the encoder outputs.

    Args:
        units: Width of the Dense projections inside the wrapped `Attention`.
        **kwargs: Standard Keras `Layer` arguments (name, dtype, trainable, ...).
            Accepting them is required so the layer can be rebuilt from its
            config when a saved model is loaded.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor argument
        self.units = units
        self.attention = Attention(units)

    def call(self, inputs):
        """Return one context vector per decoder step.

        Args:
            inputs: A pair `(decoder_outputs, encoder_outputs)`.
                NOTE(review): presumably (batch, dec_steps, hidden) and
                (batch, enc_steps, hidden) - confirm against the model cell.

        Returns:
            The context vectors only; the attention weights produced by
            `Attention` are discarded.
        """
        decoder_outputs, encoder_outputs = inputs
        # Repeat the encoder outputs along a new axis 1 (once per decoder step),
        # then let map_fn walk the leading axis and attend for all decoder
        # steps of one example at a time.
        context_vectors, _ = tf.map_fn(lambda x: self.attention(x[0], x[1]),
                                       (decoder_outputs, tf.tile(tf.expand_dims(encoder_outputs, axis=1),
                                                                 [1, tf.shape(decoder_outputs)[1], 1, 1])),
                                       fn_output_signature=(tf.TensorSpec(shape=(None, encoder_outputs.shape[-1]), dtype=tf.float32),
                                                            tf.TensorSpec(shape=(None, None, 1), dtype=tf.float32)))
        return context_vectors

    def get_config(self):
        """Return the layer config including `units`, so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({'units': self.units})
        return config

## Define the Model

In [None]:
# Stack the per-row padded id sequences into arrays
# (presumably shape (n_samples, 10) given the preview of the DataFrame - TODO confirm)
input_sequences = np.array(training_data_final['Padded_Input_Sequences'].tolist())
target_sequences = np.array(training_data_final['Padded_Target_Sequences'].tolist())

# Splitting the data into training and validation sets (10% validation, fixed seed)
input_train, input_val, target_train, target_val = train_test_split(input_sequences, target_sequences, test_size=0.1, random_state=22)

# Building the model
# One extra slot on top of the tokenizer vocabulary; id 0 is treated as padding by the masked embeddings below
vocab_size = len(tokenizer.word_index) + 1

# Encoder: 50-dim embeddings (id 0 masked) -> bidirectional LSTM with 256 units per direction.
# With return_state=True the wrapper returns the output sequence plus h/c of both directions.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, 50, mask_zero=True)(encoder_inputs)
encoder_lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(256, return_state=True, return_sequences=True))(encoder_embedding)
# Forward and backward states are concatenated (256 + 256) to match the 512-unit decoder LSTM
encoder_states = [Concatenate()([forward_h, backward_h]), Concatenate()([forward_c, backward_c])]
encoder_outputs = encoder_lstm

# Attention Mechanism
# attention_units is the width of the Dense projections inside the attention layer
attention_units = 10
attention_layer = AttentionLayer(attention_units)

# Decoder: 50-dim embeddings (id 0 masked) -> 512-unit LSTM initialised with the encoder states
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, 50, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Apply attention to each time step in the decoder
context_vectors = attention_layer([decoder_lstm_outputs, encoder_outputs])

# Context vectors come first in the concatenation; a softmax over the vocabulary follows
decoder_concat_input = Concatenate(axis=-1)([context_vectors, decoder_lstm_outputs])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Main Model
# Inputs: [encoder ids, decoder ids]; output: per-step distribution over the vocabulary.
# sparse_categorical_crossentropy is used, so targets are integer ids rather than one-hot vectors.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

## Load the Model

In [None]:
data_dir = os.path.join(os.getcwd(), 'data')
file_path_h5 = os.path.join(data_dir, 's2s_model.h5')

# Load the model. The custom attention layers must be passed through
# `custom_objects`, otherwise Keras cannot resolve them while rebuilding the
# graph from the saved config.
# NOTE(review): this replaces the `model` built in the "Define the Model" cell;
# the layer objects created there (e.g. `attention_layer`, `decoder_dense`) are
# not the ones inside the loaded model - confirm which set inference should use.
model = load_model(
    file_path_h5,
    custom_objects={'Attention': Attention, 'AttentionLayer': AttentionLayer},
)

## Train the Model

In [None]:
batch_size = 64
epochs = 3

# Teacher forcing: the decoder input is the target sequence shifted one step to
# the right - a 0 (the id masked by the embeddings) is prepended and the last
# position is dropped - so the decoder sees the previous target id at every step.
decoder_input_train = np.hstack([np.zeros((target_train.shape[0], 1)), target_train[:, :-1]])
decoder_input_val = np.hstack([np.zeros((target_val.shape[0], 1)), target_val[:, :-1]])

# Ensure targets are expanded in dimension to match the output shape expected by sparse_categorical_crossentropy
target_train_exp = np.expand_dims(target_train, -1)
target_val_exp = np.expand_dims(target_val, -1)

# Checkpoint callback: save the full model at the end of every epoch.
# The deprecated `period=5` argument was dropped: with only 3 epochs it
# prevented any checkpoint from being written, and newer Keras versions no
# longer accept it. `save_freq='epoch'` alone gives one file per epoch.
checkpoint_filepath = 'model_checkpoint_epoch_{epoch:02d}.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    save_freq='epoch',
)

# Fit the model using the original integer labels
model.fit(
    [input_train, decoder_input_train], target_train_exp,
    validation_data=([input_val, decoder_input_val], target_val_exp),
    epochs=epochs, batch_size=batch_size, verbose=1,
    callbacks=[model_checkpoint_callback]
)

## Save the model

In [None]:
# Save the full model in HDF5 format to `s2s_model.h5` in the `data` folder under the current working directory
# NOTE(review): saving a model that contains custom layers to HDF5 requires those
# layers to implement get_config() - confirm before relying on this file.
data_dir = os.path.join(os.getcwd(), 'data')
file_path_h5 = os.path.join(data_dir, 's2s_model.h5')
model.save(file_path_h5)

## Generate responses

In [None]:
initial_preprocessing = False # Skip the spaCy name removal (remove_names) when preprocessing chatbot input

def generate_response(input_text: str, max_tokens: int = 11) -> str:
    """Greedily decode a reply for `input_text`.

    The text is preprocessed and tokenized, run through the encoder, and then
    decoded one token at a time: decoder LSTM step -> attention over the
    encoder outputs -> concatenation (context first, as in the training graph)
    -> `decoder_dense` softmax -> argmax.

    NOTE(review): `max_length`, `encoder_model` and `decoder_model` are not
    defined in the visible cells. This function assumes `encoder_model` returns
    (encoder outputs, forward_h, forward_c, backward_h, backward_c) and
    `decoder_model` maps [token, h, c] to (decoder LSTM outputs, h, c) -
    confirm where they are built.

    Args:
        input_text: Raw user message.
        max_tokens: Upper bound on generated tokens; the default of 11 matches
            the previous hard-coded cutoff.

    Returns:
        The generated words joined by single spaces (ids without a vocabulary
        entry contribute an empty string).
    """
    processed_text = preprocess_text(input_text)
    input_seq = tokenizer.texts_to_sequences([processed_text])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

    # Get the encoder states and encoder outputs
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_model.predict(input_seq, verbose=0)
    state_h = np.concatenate([forward_h, backward_h], axis=-1)
    state_c = np.concatenate([forward_c, backward_c], axis=-1)
    states_value = [state_h, state_c]

    end_token_index = tokenizer.word_index['eofs']

    # Prepare the target sequence with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['sofs']  # Start token index

    decoded_words = []
    for _ in range(max_tokens):
        decoder_output, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Attention mechanism: AttentionLayer takes ONE list argument and
        # returns only the context vectors (no attention weights).
        context_vector = attention_layer([decoder_output, encoder_outputs])
        decoder_output_with_context = tf.concat([context_vector, decoder_output], axis=-1)

        # Project onto the vocabulary before picking a token; taking the argmax
        # of the raw concatenated features would index hidden units, not words.
        token_probabilities = decoder_dense(decoder_output_with_context).numpy()
        sampled_token_index = int(np.argmax(token_probabilities[0, -1, :]))

        if sampled_token_index == end_token_index:  # Stop condition
            break

        decoded_words.append(tokenizer.index_word.get(sampled_token_index, ''))

        # Feed the chosen token and the updated LSTM states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words).strip()


## Testing

In [None]:
# Testing: send each prompt through generate_response() and print the exchange.
# Each prompt is both displayed and passed to the model from one list, so the
# transcript always shows exactly what the model received (the first case
# previously displayed "Is she okay?" while sending 'she okay?').
test_prompts = [
    'Is she okay?',
    'How are you feeling today?',
    'Hi there!',
    'Can you tell me the weather forecast for today?',
    'I think artificial intelligence is changing the world.',
    'Any good movie recommendations?',
    'What do you mean by that?',
    "I'm feeling really sad today.",
    'What are the implications of quantum computing on cybersecurity?',
    'Why did the chicken cross the road?',
    'Can you explain the plot of The Matrix?',
]

for position, prompt in enumerate(test_prompts):
    # Separator between exchanges (none before the first, none after the last)
    if position:
        print("-----------------------------")
    print(f"\nUser:     {prompt}")
    print("Bot:          ", generate_response(prompt))