In [10]:
# Check if GPU is available
from tensorflow.python.client import device_lib

def get_gpu_details():
    devices = device_lib.list_local_devices()
    for device in devices:
        if device.device_type == 'GPU':
            print(f"Device Name: {device.name}")
            print(f"Memory Limit: {device.memory_limit} bytes")
            print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


In [11]:
import os
import pandas as pd
import numpy as np

## Loading the DataFrame

In [12]:
# Loading the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')

file_path_parquet = os.path.join(data_dir, 'utterances.parquet')
df_loaded_parquet = pd.read_parquet(file_path_parquet)

df_loaded_parquet.head(5)

Unnamed: 0,id,conversation_id,text,speaker,reply_to,timestamp,movie_id,tok_0_token,tok_0_tag,tok_0_dep,...,tok_121_dep,tok_122_token,tok_122_tag,tok_122_dep,tok_123_token,tok_123_tag,tok_123_dep,tok_124_token,tok_124_tag,tok_124_dep
0,L1045,L1044,They do not!,u0,L1044,,m0,They,PRP,nsubj,...,,,,,,,,,,
1,L1044,L1044,They do to!,u2,,,m0,They,PRP,nsubj,...,,,,,,,,,,
2,L985,L984,I hope so.,u0,L984,,m0,I,PRP,nsubj,...,,,,,,,,,,
3,L984,L984,She okay?,u2,,,m0,She,PRP,nsubj,...,,,,,,,,,,
4,L925,L924,Let's go.,u0,L924,,m0,Let,VB,ROOT,...,,,,,,,,,,


## Data cleaning

### Leaving only necessary data for simple Sec2Seq model
Id, conversation_id for tracking the flow of conversations and reply_to for understanding the sequence within the dialogue, and conversation text ofcourse.

In [13]:
conversations = df_loaded_parquet[['text', 'id', 'conversation_id', 'reply_to']]
conversations.head(5)

Unnamed: 0,text,id,conversation_id,reply_to
0,They do not!,L1045,L1044,L1044
1,They do to!,L1044,L1044,
2,I hope so.,L985,L984,L984
3,She okay?,L984,L984,
4,Let's go.,L925,L924,L924


## Create prepocessing functions for initial text and later response generation preprocessing

In [15]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [16]:
import re
import string
import unicodedata
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer
nltk.download('wordnet')  # Lemmatizer
nltk.download('stopwords')  # Stopwords
nltk.download('omw-1.4') # Ensures multilingual contexts

# Stopwords list
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

initial_preprocessing = True

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_sm')

def normalize_text(text: str) -> str:
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9.,!? ]", ' ', text)  # Keeps basic punctuation if remove punctuation is not applied
    text = re.sub(r'\d+', '<num>', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text

def remove_names(text: str) -> str:
    # Use spaCy to detect and remove names from the text
    doc = nlp(text)
    filtered_text = ' '.join([token.text for token in doc if token.ent_type_ != 'PERSON']) # Takes really long time, exlude from chatbot input preprocessing
    return filtered_text

def preprocess_text(text: str) -> str:
    # Normalize text
    text = normalize_text(text)
    # Remove names using spaCy's NER
    if initial_preprocessing:
        text = remove_names(text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords and tokenize
    words = word_tokenize(text) # More intelligent splitting
    filtered_words = [word for word in words if word not in stop_words]
    # Lemmatize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    # Add <SOS> and <EOS> tokens, and join the list into a single string
    return ' '.join(['sofs'] + lemmatized_words + ['eofs']) # Chosen ['sofs', 'eofs'] because tokenizer removes everthing what is in <> or || and are not in dataset vocabulary

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Data preprocessing

In [None]:
# Apply the preprocessing function to each row in the 'text' column
conversations['preprocessed_text'] = conversations['text'].apply(preprocess_text)

In [None]:
conversations[1140: 1150]

## Saving the DataFrame

In [16]:
# !pip install pyarrow

In [17]:
# !pip install fastparquet

In [24]:
# Saving the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')
file_path_parquet = os.path.join(data_dir, 'preprocessed_s2s.parquet')
conversations.to_parquet(file_path_parquet)

## Loading the DataFrame

In [223]:
# Loading the DataFrame
file_path_parquet = os.path.join(data_dir, 'preprocessed_s2s.parquet')
conversations = pd.read_parquet(file_path_parquet)

In [224]:
conversations[1140: 1150]

Unnamed: 0,text,id,conversation_id,reply_to,preprocessed_text
1140,Why'd you help me back there with the Chief? ...,L3229,L3229,,sofs help back chief stand like eofs
1141,How you go out on a limb for somebody is by gi...,L3228,L3223,L3227,sofs go limb somebody giving number immigratio...
1142,"But, I mean, didn't you ever go out on a limb ...",L3227,L3223,L3226,sofs mean ever go limb somebody mean shoulda h...
1143,"Well, it's not up to you to decide whether she...",L3226,L3223,L3225,sofs well decide whether innocent understand p...
1144,"I told you, you know, I thought I was doing th...",L3225,L3223,L3224,sofs told know thought right thing know think ...
1145,"No, I don't think you were a fool, I just thin...",L3224,L3223,L3223,sofs think fool think stupid mean say least ou...
1146,"Yeah, just her in the shower. Nothing happene...",L3223,L3223,,sofs yeah shower nothing happened look sure pr...
1147,Just a shower?,L3222,L3219,L3221,sofs shower eofs
1148,I took her there for a shower and that's it.,L3221,L3219,L3220,sofs took shower eofs
1149,"Well, you shoulda because nobody's gonna belie...",L3220,L3219,L3219,sofs well shoulda nobody gon na believe includ...


## Initialize and save the tokenizer

In [225]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

tokenizer = Tokenizer(num_words=10000)

# Fit the tokenizer on the modified text data that includes <start> and <end> tokens
tokenizer.fit_on_texts(conversations['preprocessed_text']) # <SOS> and <EOS> == sofs an eofs  == <start> and <end>

# Determine the directory where the tokenizer will be saved
data_dir = os.path.join(os.getcwd(), 'data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Save the tokenizer using pickle
tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Tokenizer saved to {tokenizer_path}")

Tokenizer saved to C:\Users\tomui\Desktop\capstone_project\data\tokenizer.pickle


## Load the Tokenizer

In [226]:
# # Load the tokenizer from file
# data_dir = os.path.join(os.getcwd(), 'data')
# tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
# with open(tokenizer_path, 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [227]:
print(tokenizer.word_index['sofs'], tokenizer.word_index['eofs']) # Checking if <start> and <end> tokens are in index (vocabulary)

1 2


In [228]:
# Top words in dictionary
from collections import OrderedDict

# Sort the word_counts dictionary by frequency in descending order
sorted_word_counts = OrderedDict(sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True))

# Display the sorted word counts
print(list(sorted_word_counts.items())[:10])
print(list(sorted_word_counts.items())[-100:])

[('sofs', 304713), ('eofs', 304713), ('know', 22895), ('like', 15314), ('get', 15014), ('got', 13322), ('u', 13080), ('want', 12128), ('think', 11251), ('one', 11186)]
[('ese', 1), ('whatchu', 1), ('mafiya', 1), ('chechnya', 1), ('toady', 1), ('betterment', 1), ('ivans', 1), ('nihilistic', 1), ('freelancing', 1), ('gatherer', 1), ('overview', 1), ('retardant', 1), ('deploys', 1), ('beastie', 1), ('ozzfest', 1), ('russkie', 1), ('shaver', 1), ('polynesia', 1), ('mersh', 1), ('slovo', 1), ('dawning', 1), ('tshirt', 1), ('dishonorably', 1), ('vandal', 1), ('grozny', 1), ('lamborghini', 1), ('genoa', 1), ('pizda', 1), ('filament', 1), ('replicate', 1), ('solider', 1), ('secaucus', 1), ('athletics', 1), ('herded', 1), ('wolverine', 1), ('absorbs', 1), ('definitively', 1), ('poppycock', 1), ('rumous', 1), ('celery', 1), ('cerebrum', 1), ('unashamedly', 1), ('dien', 1), ('gerhart', 1), ('mending', 1), ('galvanism', 1), ('equalize', 1), ('cerebrospinal', 1), ('madein', 1), ('froderick', 1), ('

## Pairing messages - input with responses

In [229]:
# Merging the DataFrame with itself to form pairs
pairs = pd.merge(
    conversations, conversations,
    left_on='id',
    right_on='reply_to',
    suffixes=('_input', '_response')
)

In [230]:
pairs.head()

Unnamed: 0,text_input,id_input,conversation_id_input,reply_to_input,preprocessed_text_input,text_response,id_response,conversation_id_response,reply_to_response,preprocessed_text_response
0,They do to!,L1044,L1044,,sofs eofs,They do not!,L1045,L1044,L1044,sofs eofs
1,She okay?,L984,L984,,sofs okay eofs,I hope so.,L985,L984,L984,sofs hope eofs
2,Wow,L924,L924,,sofs wow eofs,Let's go.,L925,L924,L924,sofs let go eofs
3,No,L871,L870,L870,sofs eofs,Okay -- you're gonna need to learn how to lie.,L872,L870,L871,sofs okay gon na need learn lie eofs
4,I'm kidding. You know how sometimes you just ...,L870,L870,,sofs kidding know sometimes become persona kno...,No,L871,L870,L870,sofs eofs


In [231]:
# Selecting the needed columns including IDs
training_data = pairs[['id_input', 'text_input', 'preprocessed_text_input', 'id_response', 'text_response', 'preprocessed_text_response']]

# Renaming columns for clarity
training_data.columns = ['ID_Input', 'Original_Text_Input', 'Text_Input', 'ID_Response', 'Original_Text_Response', 'Text_Response']

In [232]:
training_data

Unnamed: 0,ID_Input,Original_Text_Input,Text_Input,ID_Response,Original_Text_Response,Text_Response
0,L1044,They do to!,sofs eofs,L1045,They do not!,sofs eofs
1,L984,She okay?,sofs okay eofs,L985,I hope so.,sofs hope eofs
2,L924,Wow,sofs wow eofs,L925,Let's go.,sofs let go eofs
3,L871,No,sofs eofs,L872,Okay -- you're gonna need to learn how to lie.,sofs okay gon na need learn lie eofs
4,L870,I'm kidding. You know how sometimes you just ...,sofs kidding know sometimes become persona kno...,L871,No,sofs eofs
...,...,...,...,...,...,...
221611,L666520,"Well I assure you, Sir, I have no desire to cr...",sofs well assure sir desire create difficulty ...,L666521,"And I assure you, you do not In fact I'd be ob...",sofs assure fact obliged best advice scout see...
221612,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems want stay back basu...,L666372,I think Chelmsford wants a good man on the bor...,sofs think chelmsford want good man border fea...
221613,L666370,I'm to take the Sikali with the main column to...,sofs take sikali main column river eofs,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems want stay back basu...
221614,L666369,"Your orders, Mr Vereker?",sofs order mr vereker eofs,L666370,I'm to take the Sikali with the main column to...,sofs take sikali main column river eofs


In [233]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221616 entries, 0 to 221615
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   ID_Input                221616 non-null  object
 1   Original_Text_Input     221616 non-null  object
 2   Text_Input              221616 non-null  object
 3   ID_Response             221616 non-null  object
 4   Original_Text_Response  221616 non-null  object
 5   Text_Response           221616 non-null  object
dtypes: object(6)
memory usage: 10.1+ MB


## Variables for configuration

In [234]:
max_length = 10 # Variable for padding

## Converting to indices and input-output sequences

In [235]:
# Convert texts to sequences
input_sequences = tokenizer.texts_to_sequences(training_data['Text_Input'])
target_sequences = tokenizer.texts_to_sequences(training_data['Text_Response'])

# Pad sequences
input_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')
target_padded = pad_sequences(target_sequences, maxlen=max_length, padding='post')

# Store numpy arrays directly in the DataFrame
training_data['Padded_Input_Sequences'] = list(map(np.array, input_padded))
training_data['Padded_Target_Sequences'] = list(map(np.array, target_padded))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['Padded_Input_Sequences'] = list(map(np.array, input_padded))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['Padded_Target_Sequences'] = list(map(np.array, target_padded))


In [236]:
training_data

Unnamed: 0,ID_Input,Original_Text_Input,Text_Input,ID_Response,Original_Text_Response,Text_Response,Padded_Input_Sequences,Padded_Target_Sequences
0,L1044,They do to!,sofs eofs,L1045,They do not!,sofs eofs,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
1,L984,She okay?,sofs okay eofs,L985,I hope so.,sofs hope eofs,"[1, 38, 2, 0, 0, 0, 0, 0, 0, 0]","[1, 235, 2, 0, 0, 0, 0, 0, 0, 0]"
2,L924,Wow,sofs wow eofs,L925,Let's go.,sofs let go eofs,"[1, 791, 2, 0, 0, 0, 0, 0, 0, 0]","[1, 28, 11, 2, 0, 0, 0, 0, 0, 0]"
3,L871,No,sofs eofs,L872,Okay -- you're gonna need to learn how to lie.,sofs okay gon na need learn lie eofs,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 38, 45, 36, 42, 514, 421, 2, 0, 0]"
4,L870,I'm kidding. You know how sometimes you just ...,sofs kidding know sometimes become persona kno...,L871,No,sofs eofs,"[1, 541, 3, 349, 590, 3, 663, 2, 0, 0]","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...
221611,L666520,"Well I assure you, Sir, I have no desire to cr...",sofs well assure sir desire create difficulty ...,L666521,"And I assure you, you do not In fact I'd be ob...",sofs assure fact obliged best advice scout see...,"[1, 13, 2210, 68, 1655, 1801, 4099, 44, 2, 0]","[1, 2210, 358, 5036, 174, 1053, 2707, 171, 2, 0]"
221612,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems want stay back basu...,L666372,I think Chelmsford wants a good man on the bor...,sofs think chelmsford want good man border fea...,"[1, 602, 416, 8, 146, 27, 2, 0, 0, 0]","[19, 31, 2047, 725, 741, 3415, 2135, 1207, 422..."
221613,L666370,I'm to take the Sikali with the main column to...,sofs take sikali main column river eofs,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems want stay back basu...,"[1, 29, 1285, 2892, 971, 2, 0, 0, 0, 0]","[1, 602, 416, 8, 146, 27, 2, 0, 0, 0]"
221614,L666369,"Your orders, Mr Vereker?",sofs order mr vereker eofs,L666370,I'm to take the Sikali with the main column to...,sofs take sikali main column river eofs,"[1, 383, 39, 2, 0, 0, 0, 0, 0, 0]","[1, 29, 1285, 2892, 971, 2, 0, 0, 0, 0]"


## Checking if conversion was successfull

In [237]:
# Print original and reverse-tokenized text for entries
for index, row in training_data[1130:1140].iterrows():
    print("Original Text:", row['Original_Text_Input'], 
          "\nReconstructed Text:", sequences_to_text(row['Padded_Input_Sequences']))
    print("Original Text:", row['Original_Text_Response'], 
          "\nReconstructed Text:", sequences_to_text(row['Padded_Target_Sequences']))


Original Text: But doesn't the Son of Sam Law prevent criminals from profiting from their crimes? 
Reconstructed Text: sofs son law prevent criminal crime eofs
Original Text: That doesn't apply to me because I'm not a criminal.  I'm not a criminal!  I wasn't convicted. 
Reconstructed Text: sofs apply criminal criminal convicted eofs
Original Text: We're in negotiations, that's correct. 
Reconstructed Text: sofs negotiation correct eofs
Original Text: But doesn't the Son of Sam Law prevent criminals from profiting from their crimes? 
Reconstructed Text: sofs son law prevent criminal crime eofs
Original Text: And isn't there a movie in the works about you? 
Reconstructed Text: sofs movie work eofs
Original Text: We're in negotiations, that's correct. 
Reconstructed Text: sofs negotiation correct eofs
Original Text: Look, I'm in here.  You call this a career move? 
Reconstructed Text: sofs look call career move eofs
Original Text: And isn't there a movie in the works about you? 
Reconstru

In [238]:
# Selecting the needed columns including IDs
training_data_final = training_data[['ID_Input', 'Padded_Input_Sequences', 'ID_Response', 'Padded_Target_Sequences']]

## Saving the DataFrame

In [239]:
# Saving the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')
file_path_parquet = os.path.join(data_dir, 'training_df_s2s.parquet')
training_data_final.to_parquet(file_path_parquet)

## Loading the DataFrame

In [402]:
# Loading the DataFrame
file_path_parquet = os.path.join(data_dir, 'training_df_s2s.parquet')
training_data_final = pd.read_parquet(file_path_parquet)

training_data_final.head(10)

Unnamed: 0,ID_Input,Padded_Input_Sequences,ID_Response,Padded_Target_Sequences
0,L1044,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]",L1045,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
1,L984,"[1, 38, 2, 0, 0, 0, 0, 0, 0, 0]",L985,"[1, 235, 2, 0, 0, 0, 0, 0, 0, 0]"
2,L924,"[1, 791, 2, 0, 0, 0, 0, 0, 0, 0]",L925,"[1, 28, 11, 2, 0, 0, 0, 0, 0, 0]"
3,L871,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]",L872,"[1, 38, 45, 36, 42, 514, 421, 2, 0, 0]"
4,L870,"[1, 541, 3, 349, 590, 3, 663, 2, 0, 0]",L871,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
5,L868,"[1, 131, 2, 0, 0, 0, 0, 0, 0, 0]",L869,"[1, 4, 725, 813, 2, 0, 0, 0, 0, 0]"
6,L867,"[1, 19, 205, 2, 0, 0, 0, 0, 0, 0]",L868,"[1, 131, 2, 0, 0, 0, 0, 0, 0, 0]"
7,L866,"[1, 664, 5, 19, 205, 1913, 2, 0, 0, 0]",L867,"[1, 19, 205, 2, 0, 0, 0, 0, 0, 0]"
8,L864,"[1, 6220, 2142, 4, 1551, 2, 0, 0, 0, 0]",L865,"[1, 117, 97, 132, 10, 213, 2, 0, 0, 0]"
9,L863,"[1, 913, 2, 0, 0, 0, 0, 0, 0, 0]",L864,"[1, 6220, 2142, 4, 1551, 2, 0, 0, 0, 0]"


## Checking if GPU available

In [403]:
import tensorflow

from tensorflow.python.client import device_lib

def get_gpu_details():
    devices = device_lib.list_local_devices()
    for device in devices:
        if device.device_type == 'GPU':
            print(f"Device Name: {device.name}")
            print(f"Memory Limit: {device.memory_limit} bytes")
            print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


## Encoder-decoder architecture

In [404]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

### Splitting the data

In [405]:
input_sequences = np.array(training_data_final['Padded_Input_Sequences'].tolist())
target_sequences = np.array(training_data_final['Padded_Target_Sequences'].tolist())

In [406]:
print(type(input_sequences))
print(type(target_sequences))
print(input_sequences[88])
print(target_sequences[88])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[   1  141 1798 6223   34  314  139  127    2    0]
[   1  309 5414 8245    6  321 2001   11    2    0]


In [407]:
# Splitting the data into training and validation sets
input_train, input_val, target_train, target_val = train_test_split(input_sequences, target_sequences, test_size=0.1, random_state=22)

## Building the Model

## Variables for configuration

In [408]:
learning_rate = 0.01

In [409]:
# Building the model
vocab_size = len(tokenizer.word_index) + 1

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, 50, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

encoder_model = Model(encoder_inputs, encoder_states)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, 50, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_states = [state_h, state_c]
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + encoder_states, [decoder_outputs] + decoder_states)

# Main Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_32"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_55 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_56 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_46 (Embedding)       (None, None, 50)     2050800     ['input_55[0][0]']               
                                                                                                  
 embedding_47 (Embedding)       (None, None, 50)     2050800     ['input_56[0][0]']               
                                                                                           

## Variables for configuration

In [410]:
batch_size = 64
epochs = 2

## Training the model

In [411]:
from tensorflow.keras.utils import to_categorical

# Prepare decoder input data that just contains the start token
decoder_input_train = np.hstack([np.zeros((target_train.shape[0], 1)), target_train[:, :-1]])  # shift target sequences
decoder_input_val = np.hstack([np.zeros((target_val.shape[0], 1)), target_val[:, :-1]])

# Ensure targets are expanded in dimension to match the output shape expected by sparse_categorical_crossentropy
target_train_exp = np.expand_dims(target_train, -1)
target_val_exp = np.expand_dims(target_val, -1)

# Fit the model using the original integer labels
model.fit(
    [input_train, decoder_input_train], target_train_exp,
    validation_data=([input_val, decoder_input_val], target_val_exp),
    epochs=epochs, batch_size=batch_size
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x285a8cbc6a0>

In [412]:
# # Prepare decoder input data that just contains the start token
# decoder_input_train = np.hstack([np.zeros((target_train.shape[0], 1)), target_train[:, :-1]])  # shift target sequences
# decoder_input_val = np.hstack([np.zeros((target_val.shape[0], 1)), target_val[:, :-1]])

# # Fit model
# # model.fit([input_train, decoder_input_train], np.expand_dims(target_train, -1),
# #           validation_data=([input_val, decoder_input_val], np.expand_dims(target_val, -1)),
# #           epochs=epochs, batch_size=batch_size)
# model.fit(
#     [input_train, decoder_input_train], decoder_target_data,
#     validation_data=([input_val, decoder_input_val], to_categorical(target_val, num_classes=vocab_size)),
#     epochs=epochs, batch_size=batch_size)

## Generate responses

In [425]:
initial_preprocessing = False # Excepts spaCy to detect and remove names from the text

def generate_response(input_text: str) -> str:
    processed_text = preprocess_text(input_text)
    input_seq = tokenizer.texts_to_sequences([processed_text])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

    print(f"Processed Text: {processed_text}")
    print(f"Input Sequence: {input_seq}")

    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['sofs']  # Start token index

    stop_condition = False
    decoded_sentence = ''
    tokens_generated = 0

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word.get(sampled_token_index, '')

        print(f"Sampled Token Index: {sampled_token_index}")
        print(f"Sampled Char: {sampled_char}")

        if sampled_token_index == tokenizer.word_index['eofs'] or tokens_generated > 10:  # Stop condition
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_char
            tokens_generated += 1

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

    print(f"Decoded Sentence: {decoded_sentence.strip()}")
    return decoded_sentence.strip()
    

## Testing

In [426]:
# Testing
print("User: Is she okay?")
print("Bot:", generate_response('she okay?'))

print("User: How are you feeling today?")
print("Bot:", generate_response('How are you feeling today?'))

print("User: Hi there!")
print("Bot:", generate_response('Hi there!'))

print("User: Can you tell me the weather forecast for today?")
print("Bot:", generate_response('Can you tell me the weather forecast for today?'))

print("User: I think artificial intelligence is changing the world.")
print("Bot:", generate_response('I think artificial intelligence is changing the world.'))

print("User: Any good movie recommendations?")
print("Bot:", generate_response('Any good movie recommendations?'))

print("User: What do you mean by that?")
print("Bot:", generate_response('What do you mean by that?'))

print("User: I'm feeling really sad today.")
print("Bot:", generate_response("I'm feeling really sad today."))

print("User: What are the implications of quantum computing on cybersecurity?")
print("Bot:", generate_response('What are the implications of quantum computing on cybersecurity?'))

print("User: Why did the chicken cross the road?")
print("Bot:", generate_response('Why did the chicken cross the road?'))

print("User: Can you explain the plot of The Matrix?")
print("Bot:", generate_response('Can you explain the plot of The Matrix?'))

User: Is she okay?
Processed Text: sofs okay eofs
Input Sequence: [[ 1 38  2  0  0  0  0  0  0  0]]
Sampled Token Index: 2
Sampled Char: eofs
Decoded Sentence: 
Bot: 
User: How are you feeling today?
Processed Text: sofs feeling today eofs
Input Sequence: [[  1 339 226   2   0   0   0   0   0   0]]
Sampled Token Index: 2
Sampled Char: eofs
Decoded Sentence: 
Bot: 
User: Hi there!
Processed Text: sofs hi eofs
Input Sequence: [[  1 258   2   0   0   0   0   0   0   0]]
Sampled Token Index: 2
Sampled Char: eofs
Decoded Sentence: 
Bot: 
User: Can you tell me the weather forecast for today?
Processed Text: sofs tell weather forecast today eofs
Input Sequence: [[   1   22 1134  226    2    0    0    0    0    0]]
Sampled Token Index: 2
Sampled Char: eofs
Decoded Sentence: 
Bot: 
User: I think artificial intelligence is changing the world.
Processed Text: sofs think artificial intelligence changing world eofs
Input Sequence: [[   1    9 7217 1759 2191  147    2    0    0    0]]
Sampled Token 

## Save the model

In [415]:
data_dir = os.path.join(os.getcwd(), 'data')
file_path_tf = os.path.join(data_dir, 's2s_model')
model.save(file_path_tf, save_format='tf')
file_path_h5 = os.path.join(data_dir, 's2s_model.h5')
model.save(file_path_h5)



INFO:tensorflow:Assets written to: C:\Users\tomui\Desktop\capstone_project\data\s2s_model\assets


INFO:tensorflow:Assets written to: C:\Users\tomui\Desktop\capstone_project\data\s2s_model\assets


In [416]:
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Assume you have already defined your model architecture
# vocab_size = len(tokenizer.word_index) + 1
# encoder_inputs = Input(shape=(None,))
# encoder_embedding = Embedding(vocab_size, 100, mask_zero=True)(encoder_inputs)
# encoder_outputs, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)
# encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(None,))
# decoder_embedding = Embedding(vocab_size, 100, mask_zero=True)(decoder_inputs)
# decoder_lstm = LSTM(256, return_sequences=True, return_state=False)(decoder_embedding, initial_state=encoder_states)
# decoder_dense = Dense(vocab_size, activation='softmax')
# output = decoder_dense(decoder_outputs)

# model = Model([encoder_inputs, decoder_inputs], output)
# model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Setup the ModelCheckpoint callback to save the model after each epoch
# checkpoint_callback = ModelCheckpoint(
#     'path/to/save/model_epoch_{epoch:02d}.h5',
#     save_weights_only=False,  # Set to True if you only need to save the weights, not the full model
#     save_freq='epoch',  # Save after each epoch
#     verbose=1  # Logs a message each time the model is saved
# )

# # Train the model with the checkpoint callback
# model.fit(
#     [input_train, decoder_input_train], 
#     target_train,
#     validation_data=([input_val, decoder_input_val], target_val),
#     epochs=10,  # Or however many epochs you need
#     batch_size=64,
#     callbacks=[checkpoint_callback]  # Pass the callback to the fit method
# )
