## Loading the DataFrame

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow

# The training set lives in ./data, resolved against the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
file_path_parquet = os.path.join(data_dir, 'training_data.parquet')

# Read the parquet file into a DataFrame and preview its first 20 rows.
data = pd.read_parquet(path=file_path_parquet)
data.head(n=20)

Unnamed: 0,ID_Input,Input,Tokens_Input,ID_Response,Response,Tokens_Response
0,L1044,,"[<start>, <end>]",L1045,,"[<start>, <end>]"
1,L984,okay,"[<start>, okay, <end>]",L985,hope,"[<start>, hope, <end>]"
2,L924,wow,"[<start>, wow, <end>]",L925,lets go,"[<start>, let, go, <end>]"
3,L871,,"[<start>, <end>]",L872,okay youre gonna need learn lie,"[<start>, okay, youre, gon, na, need, learn, l..."
4,L870,im kidding know sometimes become persona dont ...,"[<start>, im, kidding, know, sometimes, become...",L871,,"[<start>, <end>]"
5,L868,real,"[<start>, real, <end>]",L869,like fear wearing pastels,"[<start>, like, fear, wearing, pastel, <end>]"
6,L867,good stuff,"[<start>, good, stuff, <end>]",L868,real,"[<start>, real, <end>]"
7,L866,figured youd get good stuff eventually,"[<start>, figured, youd, get, good, stuff, eve...",L867,good stuff,"[<start>, good, stuff, <end>]"
8,L864,endless blonde babble im like boring,"[<start>, endless, blonde, babble, im, like, b...",L865,thank god hear one story coiffure,"[<start>, thank, god, hear, one, story, coiffu..."
9,L863,crap,"[<start>, crap, <end>]",L864,endless blonde babble im like boring,"[<start>, endless, blonde, babble, im, like, b..."


In [2]:
# Check if GPU is available
import tensorflow

from tensorflow.python.client import device_lib

def get_gpu_details():
    """Print name, memory limit and description of every local GPU device.

    Devices are taken from ``device_lib.list_local_devices()`` and filtered on
    ``device_type == 'GPU'``. When that filter matches nothing, a short notice
    is printed instead of staying silent, so an empty cell output can no longer
    be mistaken for "the check did not run".
    """
    gpu_devices = [
        device for device in device_lib.list_local_devices()
        if device.device_type == 'GPU'
    ]

    if not gpu_devices:
        print("No GPU devices detected.")
        return

    for device in gpu_devices:
        print(f"Device Name: {device.name}")
        print(f"Memory Limit: {device.memory_limit} bytes")
        print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


## Encoder-decoder architecture

In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

### Work on sentences

In [None]:
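# NOTE: inactive alternative pipeline that tokenizes the raw sentences; kept for
# reference only. The active pipeline is the lemmatized-token cell below.
# # Initialize tokenizer and fit on texts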
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(['<start> <end>'] + data['Input'] + data['Response'])

# # Adding start and end tokens to each input and response
# data['Input'] = ['<start> ' + text + ' <end>' for text in data['Input']]
# data['Response'] = ['<start> ' + text + ' <end>' for text in data['Response']]

# # Convert text to sequences
# input_sequences = tokenizer.texts_to_sequences(data['Input'])
# target_sequences = tokenizer.texts_to_sequences(data['Response'])

### Work with lemmatized tokens

In [None]:
# 'data' is the DataFrame loaded above; this cell uses its pre-tokenized
# 'Tokens_Input' and 'Tokens_Response' columns (each row already carries the
# <start>/<end> markers, see the preview output).
# Rows are copied into plain Python lists so the Keras Tokenizer treats each row
# as an already-tokenized sample and keeps tokens such as '<start>' intact.
# NOTE(review): list columns read from parquet may come back as NumPy arrays
# rather than lists — the list() copy covers both cases; confirm on your data.
input_tokens = [list(tokens) for tokens in data['Tokens_Input']]
response_tokens = [list(tokens) for tokens in data['Tokens_Response']]

# Build one shared vocabulary over inputs and responses
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_tokens + response_tokens)

# Convert tokens to integer id sequences (pad_sequences needs integers, not strings)
input_sequences = tokenizer.texts_to_sequences(input_tokens)
target_sequences = tokenizer.texts_to_sequences(response_tokens)

max_length = 30

# Pad both input and target sequences to max_length (zeros appended at the end;
# longer sequences are truncated by pad_sequences' default truncation rule)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_length, padding='post')

In [None]:
# Spot-check one padded input/target pair (row 88).
for padded_batch in (input_sequences, target_sequences):
    print(padded_batch[88])

In [None]:
# Splitting the data into training and validation sets
input_train, input_val, target_train, target_val = train_test_split(
    input_sequences, target_sequences, test_size=0.1, random_state=22)

# Building the model
vocab_size = len(tokenizer.word_index) + 1  # +1 because id 0 is the padding value
embedding_dim = 128
latent_dim = 256

# Encoder: only the final LSTM states are kept; they seed the decoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: the embedding, LSTM and dense layers are kept as named objects so the
# inference models below reuse the very same (trained) weights
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
# return_state=True is required for step-by-step decoding; the states are
# simply discarded while training
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
output = decoder_dense(decoder_outputs)

# Define the training model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Inference models used by generate_response(); they share layers with `model`,
# so they pick up the trained weights without any extra training step.
# Encoder inference model: padded input ids -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model: (previous token id, state_h, state_c)
#                          -> (next-token distribution, new state_h, new state_c)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_step_outputs, decoder_step_h, decoder_step_c = decoder_lstm(
    decoder_embedding_layer(decoder_inputs), initial_state=decoder_states_inputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_dense(decoder_step_outputs), decoder_step_h, decoder_step_c])

## Training the model

In [None]:
# Teacher forcing: the decoder input is the target shifted one step to the right.
# A column of zeros (the padding id) is prepended and the last target column is
# dropped, so both arrays keep the same width.
train_pad_column = np.zeros((target_train.shape[0], 1))
val_pad_column = np.zeros((target_val.shape[0], 1))
decoder_input_train = np.concatenate([train_pad_column, target_train[:, :-1]], axis=1)
decoder_input_val = np.concatenate([val_pad_column, target_val[:, :-1]], axis=1)

# Targets get a trailing axis of size 1 before being handed to fit()
decoder_target_train = target_train[..., np.newaxis]
decoder_target_val = target_val[..., np.newaxis]

# Fit model
model.fit(
    [input_train, decoder_input_train],
    decoder_target_train,
    validation_data=([input_val, decoder_input_val], decoder_target_val),
    epochs=3,
    batch_size=64,
)

## Generate responses

In [None]:
import re
import unicodedata
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used below: tokenizer models, WordNet (lemmatizer)
# and the stopword lists.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# English stopwords, held in a set for fast membership tests
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def normalize_text(text: str) -> str:
    """Lower-case ``text``, fold it to plain ASCII and mask digit runs.

    Steps, in order:
      1. NFKD-decompose the text and drop combining marks, so accented letters
         keep their base letter ("é" -> "e"). This has to happen *before* the
         character filter; otherwise the filter turns the accented letter into
         a space and the base letter is lost.
      2. Lower-case and strip leading/trailing whitespace.
      3. Replace every character outside ``a-z 0-9 . ' , ! ?`` and space with a
         single space.
      4. Replace each run of digits with the literal token ``<num>``.

    Args:
        text: Raw input string.

    Returns:
        The normalized string. Pure-ASCII input yields the same result as
        steps 2-4 alone.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9.',!? ]", ' ', text)
    text = re.sub(r'\d+', '<num>', text)
    return text

def preprocess_text(text: str) -> str:
    """Normalize ``text``, strip punctuation and drop English stopwords.

    The text goes through ``normalize_text``, then every character listed in
    ``string.punctuation`` is deleted, the result is split with
    ``word_tokenize`` and tokens found in ``stop_words`` are discarded.

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = normalize_text(text).translate(punctuation_remover)
    kept_tokens = (token for token in word_tokenize(cleaned) if token not in stop_words)
    return ' '.join(kept_tokens)

# Function to generate responses
def generate_response(input_text: str) -> str:
    """Greedy-decode a reply to ``input_text`` with the trained seq2seq model.

    Expects these notebook-level names to exist: ``tokenizer``, ``max_length``,
    ``preprocess_text``, and the inference models ``encoder_model`` (padded
    input ids -> ``[state_h, state_c]``) and ``decoder_model`` (``[token id,
    state_h, state_c]`` -> ``(token distribution, state_h, state_c)``).

    Args:
        input_text: Raw user utterance.

    Returns:
        The decoded reply as space-separated tokens. Decoding stops at the
        ``<end>`` token, at a padding/unknown prediction, or after
        ``max_length`` tokens, whichever comes first.
    """
    # Tokenize and lemmatize the cleaned text
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word)
             for word in word_tokenize(preprocess_text(input_text))]

    # Add the markers only after tokenizing: word_tokenize splits '<start>'
    # into '<', 'start', '>', which would feed stray words to the encoder
    # instead of the markers.
    words = ['<start>'] + words + ['<end>']

    # Convert to a padded id sequence (tokens outside the vocabulary are skipped)
    input_seq = tokenizer.texts_to_sequences([words])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

    # Initialize the state of the decoder using the encoder's output
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Start the sequence with the `<start>` token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    decoded_words = []
    # The cap counts generated tokens, matching the unit of max_length
    while len(decoded_words) < max_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable next token
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        # Stop on the end marker, or on an id with no word (e.g. padding id 0)
        if sampled_word in ('<end>', ''):
            break
        decoded_words.append(sampled_word)

        # Feed the predicted token and the new states back into the decoder
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)


## Testing

In [None]:
# Testing
# Each prompt is sent to the model exactly as it is displayed.
test_prompts = [
    "Is she okay?",
    "How are you feeling today?",
    "Hi there!",
    "Can you tell me the weather forecast for today?",
    "I think artificial intelligence is changing the world.",
    "Any good movie recommendations?",
    "What do you mean by that?",
    "I'm feeling really sad today.",
    "What are the implications of quantum computing on cybersecurity?",
    "Why did the chicken cross the road?",
    "Can you explain the plot of The Matrix?",
]

for prompt in test_prompts:
    print("User:", prompt)
    print("Bot:", generate_response(prompt))