In [1]:
# Check if GPU is available
from tensorflow.python.client import device_lib

def get_gpu_details():
    devices = device_lib.list_local_devices()
    for device in devices:
        if device.device_type == 'GPU':
            print(f"Device Name: {device.name}")
            print(f"Memory Limit: {device.memory_limit} bytes")
            print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

## Loading the DataFrame

In [3]:
# Loading the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')

file_path_parquet = os.path.join(data_dir, 'utterances.parquet')
df_loaded_parquet = pd.read_parquet(file_path_parquet)

df_loaded_parquet.head(5)

Unnamed: 0,id,conversation_id,text,speaker,reply_to,timestamp,movie_id,tok_0_token,tok_0_tag,tok_0_dep,...,tok_121_dep,tok_122_token,tok_122_tag,tok_122_dep,tok_123_token,tok_123_tag,tok_123_dep,tok_124_token,tok_124_tag,tok_124_dep
0,L1045,L1044,They do not!,u0,L1044,,m0,They,PRP,nsubj,...,,,,,,,,,,
1,L1044,L1044,They do to!,u2,,,m0,They,PRP,nsubj,...,,,,,,,,,,
2,L985,L984,I hope so.,u0,L984,,m0,I,PRP,nsubj,...,,,,,,,,,,
3,L984,L984,She okay?,u2,,,m0,She,PRP,nsubj,...,,,,,,,,,,
4,L925,L924,Let's go.,u0,L924,,m0,Let,VB,ROOT,...,,,,,,,,,,


## Data cleaning

### Leaving only necessary data for simple Sec2Seq model
Id, conversation_id for tracking the flow of conversations and reply_to for understanding the sequence within the dialogue, and conversation text ofcourse.

In [4]:
conversations = df_loaded_parquet[['text', 'id', 'conversation_id', 'reply_to']]
conversations.head(5)

Unnamed: 0,text,id,conversation_id,reply_to
0,They do not!,L1045,L1044,L1044
1,They do to!,L1044,L1044,
2,I hope so.,L985,L984,L984
3,She okay?,L984,L984,
4,Let's go.,L925,L924,L924


## Create prepocessing functions for initial text and later response generation preprocessing

In [5]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [6]:
import re
import string
import unicodedata
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer
nltk.download('wordnet')  # Lemmatizer
nltk.download('stopwords')  # Stopwords
nltk.download('omw-1.4') # Ensures multilingual contexts

# Stopwords list
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

initial_preprocessing = True

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
def normalize_text(text: str) -> str:
    # Normalize Unicode string to NFKD form, remove non-ASCII characters, and then decode it back to a UTF-8 string
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    # Convert to lowercase
    text = text.lower()
    # Add a space before any punctuation mark (., !, or ?)
    text = re.sub(r"([.!?])", r" \1", text)
    # Handle contractions correctly by not adding space before apostrophe
    text = re.sub(r"(\b\w+)'(d|s|t|ll|ve|re)", r"\1'\2", text)
    # Replace any sequence of characters that are not letters, keep basic punctuation
    text = re.sub(r"[^a-z.,'!? ]", ' ', text)
    # Replace any sequence of whitespace characters with a single space and remove leading and trailing whitespace
    text = re.sub(r"\s+", r" ", text).strip()
    return text

def remove_names(text: str) -> str:
    # Use spaCy to detect and remove names from the text
    doc = nlp(text)
    filtered_text = ' '.join([token.text for token in doc if token.ent_type_ != 'PERSON']) # Takes really long time, exlude from chatbot input preprocessing
    return filtered_text

def preprocess_text(text: str) -> str:
    # Normalize text
    text = normalize_text(text)
    # Remove names using spaCy's NER
    if initial_preprocessing:
        text = remove_names(text)
    # # Remove punctuation
    # text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords and tokenize
    # words = word_tokenize(text) # More intelligent splitting
    # filtered_words = [word for word in words if word not in stop_words]
    # # Lemmatize words
    # lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    # Add <SOS> and <EOS> tokens, and join the list into a single string
    # return ' '.join(['sofs'] + lemmatized_words + ['eofs'])
    return 'sofs ' + text + ' eofs' # Chosen ['sofs', 'eofs'] because tokenizer removes everthing what is in <> or || and are not in dataset vocabulary

## Data preprocessing

In [9]:
# Apply the preprocessing function to each row in the 'text' column
conversations['preprocessed_text'] = conversations['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations['preprocessed_text'] = conversations['text'].apply(preprocess_text)


In [10]:
conversations[1140: 1150]

Unnamed: 0,text,id,conversation_id,reply_to,preprocessed_text
1140,Why'd you help me back there with the Chief? ...,L3229,L3229,,sofs why 'd you help me back there with the ch...
1141,How you go out on a limb for somebody is by gi...,L3228,L3223,L3227,sofs how you go out on a limb for somebody is ...
1142,"But, I mean, didn't you ever go out on a limb ...",L3227,L3223,L3226,"sofs but , i mean , did n't you ever go out on..."
1143,"Well, it's not up to you to decide whether she...",L3226,L3223,L3225,"sofs well , it 's not up to you to decide whet..."
1144,"I told you, you know, I thought I was doing th...",L3225,L3223,L3224,"sofs i told you , you know , i thought i was d..."
1145,"No, I don't think you were a fool, I just thin...",L3224,L3223,L3223,"sofs no , i do n't think you were a fool , i j..."
1146,"Yeah, just her in the shower. Nothing happene...",L3223,L3223,,"sofs yeah , just her in the shower . nothing h..."
1147,Just a shower?,L3222,L3219,L3221,sofs just a shower ? eofs
1148,I took her there for a shower and that's it.,L3221,L3219,L3220,sofs i took her there for a shower and that 's...
1149,"Well, you shoulda because nobody's gonna belie...",L3220,L3219,L3219,"sofs well , you shoulda because nobody 's gon ..."


## Saving the DataFrame

In [11]:
# !pip install pyarrow

In [12]:
# !pip install fastparquet

In [13]:
# Saving the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')
file_path_parquet = os.path.join(data_dir, 'preprocessed_s2s.parquet')
conversations.to_parquet(file_path_parquet)

## Loading the DataFrame

In [14]:
# Loading the DataFrame
file_path_parquet = os.path.join(data_dir, 'preprocessed_s2s.parquet')
conversations = pd.read_parquet(file_path_parquet)

In [15]:
conversations[1140: 1150]

Unnamed: 0,text,id,conversation_id,reply_to,preprocessed_text
1140,Why'd you help me back there with the Chief? ...,L3229,L3229,,sofs why 'd you help me back there with the ch...
1141,How you go out on a limb for somebody is by gi...,L3228,L3223,L3227,sofs how you go out on a limb for somebody is ...
1142,"But, I mean, didn't you ever go out on a limb ...",L3227,L3223,L3226,"sofs but , i mean , did n't you ever go out on..."
1143,"Well, it's not up to you to decide whether she...",L3226,L3223,L3225,"sofs well , it 's not up to you to decide whet..."
1144,"I told you, you know, I thought I was doing th...",L3225,L3223,L3224,"sofs i told you , you know , i thought i was d..."
1145,"No, I don't think you were a fool, I just thin...",L3224,L3223,L3223,"sofs no , i do n't think you were a fool , i j..."
1146,"Yeah, just her in the shower. Nothing happene...",L3223,L3223,,"sofs yeah , just her in the shower . nothing h..."
1147,Just a shower?,L3222,L3219,L3221,sofs just a shower ? eofs
1148,I took her there for a shower and that's it.,L3221,L3219,L3220,sofs i took her there for a shower and that 's...
1149,"Well, you shoulda because nobody's gonna belie...",L3220,L3219,L3219,"sofs well , you shoulda because nobody 's gon ..."


## Initialize the tokenizer

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from collections import OrderedDict

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(conversations['preprocessed_text'])  # <SOS> and <EOS> == sofs an eofs  == <start> and <end>

# Sort the word_counts dictionary by frequency in descending order
sorted_word_counts = OrderedDict(sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True))
print(f"\nTop 15 most frequent words:\n {list(sorted_word_counts.items())[:15]}")
print(f"\nLast 100 words:\n {list(sorted_word_counts.items())[-100:]}")


Top 15 most frequent words:
 [('sofs', 304713), ('eofs', 304713), ('you', 148729), ('i', 142169), ('the', 99290), ('to', 80761), ('a', 71534), ("'s", 66252), ('it', 66206), ("n't", 55106), ('do', 47246), ('that', 46706), ('and', 46114), ('of', 39474), ('what', 37876)]

Last 100 words:
 [('nihilistic', 1), ('freelancing', 1), ('gatherer', 1), ('overview', 1), ('retardant', 1), ('deploys', 1), ('beastie', 1), ('ozzfest', 1), ('russkie', 1), ('shavers', 1), ('mersh', 1), ('slovo', 1), ('dawning', 1), ('tshirt', 1), ('dishonorably', 1), ('vandals', 1), ('grozny', 1), ('lamborghini', 1), ('genoa', 1), ('pizda', 1), ('filament', 1), ('replicate', 1), ('solider', 1), ('secaucus', 1), ('athletics', 1), ('herded', 1), ('wolverine', 1), ('absorbs', 1), ('definitively', 1), ('poppycock', 1), ('rumous', 1), ('disinfectant', 1), ('celery', 1), ('cerebrum', 1), ('unashamedly', 1), ('dien', 1), ('gerhart', 1), ('mending', 1), ('galvanism', 1), ('equalize', 1), ('cerebrospinal', 1), ('madein', 1), ('

## Filter rare words - 10000 vocabulary OK

In [17]:
from collections import OrderedDict

# Set a frequency threshold
threshold = 10

# Filter out rare words
filtered_words = {word: count for word, count in sorted_word_counts.items() if count >= threshold}

# Display the number of words before and after filtering
print(f"Total words before filtering: {len(sorted_word_counts)}")
print(f"Total words after filtering: {len(filtered_words)}")
# Display the sorted word counts
print(f"\nTop 15 most frequent words:\n {list(filtered_words.items())[:15]}")
print(f"\nLast 100 words:\n {list(filtered_words.items())[-100:]}")

Total words before filtering: 47579
Total words after filtering: 10338

Top 15 most frequent words:
 [('sofs', 304713), ('eofs', 304713), ('you', 148729), ('i', 142169), ('the', 99290), ('to', 80761), ('a', 71534), ("'s", 66252), ('it', 66206), ("n't", 55106), ('do', 47246), ('that', 46706), ('and', 46114), ('of', 39474), ('what', 37876)]

Last 100 words:
 [('warmed', 10), ('misty', 10), ('deviant', 10), ('rollo', 10), ('checkbook', 10), ('shelby', 10), ('cooperating', 10), ('puppets', 10), ('venza', 10), ('grable', 10), ('overwhelmed', 10), ('temporal', 10), ('beamed', 10), ('deflector', 10), ('cloaked', 10), ('antiques', 10), ('inoperative', 10), ('gracie', 10), ("ba'ku", 10), ('riker', 10), ('silo', 10), ('enthusiastic', 10), ('dispose', 10), ('endings', 10), ('signatures', 10), ('zander', 10), ('stirred', 10), ('sax', 10), ('roma', 10), ('commune', 10), ('eyelash', 10), ('georgina', 10), ('magneto', 10), ('dade', 10), ('throats', 10), ('productive', 10), ('kuato', 10), ('bixby', 10

## Save the tokenizer

In [18]:
import pickle

# Determine the directory where the tokenizer will be saved
data_dir = os.path.join(os.getcwd(), 'data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Save the tokenizer using pickle
tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Tokenizer saved to {tokenizer_path}")

Tokenizer saved to C:\Users\tomui\Desktop\capstone_project\data\tokenizer.pickle


## Load the Tokenizer

In [19]:
# Load the tokenizer from file
data_dir = os.path.join(os.getcwd(), 'data')
tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [20]:
print(tokenizer.word_index['sofs'], tokenizer.word_index['eofs']) # Checking if <start> and <end> tokens are in index (vocabulary)

1 2


## Pairing messages - input with responses

In [21]:
# Merging the DataFrame with itself to form pairs
pairs = pd.merge(
    conversations, conversations,
    left_on='id',
    right_on='reply_to',
    suffixes=('_input', '_response')
)

In [22]:
pairs.head()

Unnamed: 0,text_input,id_input,conversation_id_input,reply_to_input,preprocessed_text_input,text_response,id_response,conversation_id_response,reply_to_response,preprocessed_text_response
0,They do to!,L1044,L1044,,sofs they do to ! eofs,They do not!,L1045,L1044,L1044,sofs they do not ! eofs
1,She okay?,L984,L984,,sofs she okay ? eofs,I hope so.,L985,L984,L984,sofs i hope so . eofs
2,Wow,L924,L924,,sofs wow eofs,Let's go.,L925,L924,L924,sofs let 's go . eofs
3,No,L871,L870,L870,sofs no eofs,Okay -- you're gonna need to learn how to lie.,L872,L870,L871,sofs okay you 're gon na need to learn how to ...
4,I'm kidding. You know how sometimes you just ...,L870,L870,,sofs i 'm kidding . you know how sometimes you...,No,L871,L870,L870,sofs no eofs


In [23]:
# Selecting the needed columns including IDs
training_data = pairs[['id_input', 'text_input', 'preprocessed_text_input', 'id_response', 'text_response', 'preprocessed_text_response']]

# Renaming columns for clarity
training_data.columns = ['ID_Input', 'Original_Text_Input', 'Text_Input', 'ID_Response', 'Original_Text_Response', 'Text_Response']

In [24]:
training_data

Unnamed: 0,ID_Input,Original_Text_Input,Text_Input,ID_Response,Original_Text_Response,Text_Response
0,L1044,They do to!,sofs they do to ! eofs,L1045,They do not!,sofs they do not ! eofs
1,L984,She okay?,sofs she okay ? eofs,L985,I hope so.,sofs i hope so . eofs
2,L924,Wow,sofs wow eofs,L925,Let's go.,sofs let 's go . eofs
3,L871,No,sofs no eofs,L872,Okay -- you're gonna need to learn how to lie.,sofs okay you 're gon na need to learn how to ...
4,L870,I'm kidding. You know how sometimes you just ...,sofs i 'm kidding . you know how sometimes you...,L871,No,sofs no eofs
...,...,...,...,...,...,...
221611,L666520,"Well I assure you, Sir, I have no desire to cr...","sofs well i assure you , sir , i have no desir...",L666521,"And I assure you, you do not In fact I'd be ob...","sofs and i assure you , you do not in fact i '..."
221612,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems to want me to stay ...,L666372,I think Chelmsford wants a good man on the bor...,sofs i think chelmsford wants a good man on th...
221613,L666370,I'm to take the Sikali with the main column to...,sofs i 'm to take the sikali with the main col...,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems to want me to stay ...
221614,L666369,"Your orders, Mr Vereker?","sofs your orders , mr vereker ? eofs",L666370,I'm to take the Sikali with the main column to...,sofs i 'm to take the sikali with the main col...


In [25]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221616 entries, 0 to 221615
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   ID_Input                221616 non-null  object
 1   Original_Text_Input     221616 non-null  object
 2   Text_Input              221616 non-null  object
 3   ID_Response             221616 non-null  object
 4   Original_Text_Response  221616 non-null  object
 5   Text_Response           221616 non-null  object
dtypes: object(6)
memory usage: 10.1+ MB


## Variables for configuration

In [26]:
max_length = 15 # Variable for padding

## Converting to indices and input-output sequences

In [27]:
# Convert texts to sequences
input_sequences = tokenizer.texts_to_sequences(training_data['Text_Input'])
target_sequences = tokenizer.texts_to_sequences(training_data['Text_Response'])

# Pad sequences with pre-padding and truncating
input_padded = pad_sequences(input_sequences, maxlen=max_length, padding='pre', truncating='post')
target_padded = pad_sequences(target_sequences, maxlen=max_length, padding='pre', truncating='post')

# Store numpy arrays directly in the DataFrame
training_data['Padded_Input_Sequences'] = list(map(np.array, input_padded))
training_data['Padded_Target_Sequences'] = list(map(np.array, target_padded))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['Padded_Input_Sequences'] = list(map(np.array, input_padded))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['Padded_Target_Sequences'] = list(map(np.array, target_padded))


In [28]:
training_data

Unnamed: 0,ID_Input,Original_Text_Input,Text_Input,ID_Response,Original_Text_Response,Text_Response,Padded_Input_Sequences,Padded_Target_Sequences
0,L1044,They do to!,sofs they do to ! eofs,L1045,They do not!,sofs they do not ! eofs,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 37, 11, 6, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 37, 11, 31, 2]"
1,L984,She okay?,sofs she okay ? eofs,L985,I hope so.,sofs i hope so . eofs,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 51, 111, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 347, 46, 2]"
2,L924,Wow,sofs wow eofs,L925,Let's go.,sofs let 's go . eofs,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 897, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 95, 8, 63, 2]"
3,L871,No,sofs no eofs,L872,Okay -- you're gonna need to learn how to lie.,sofs okay you 're gon na need to learn how to ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 32, 2]","[0, 0, 1, 111, 3, 26, 118, 117, 129, 6, 650, 5..."
4,L870,I'm kidding. You know how sometimes you just ...,sofs i 'm kidding . you know how sometimes you...,L871,No,sofs no eofs,"[1, 4, 24, 671, 3, 25, 55, 464, 3, 38, 711, 21...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 32, 2]"
...,...,...,...,...,...,...,...,...
221611,L666520,"Well I assure you, Sir, I have no desire to cr...","sofs well i assure you , sir , i have no desir...",L666521,"And I assure you, you do not In fact I'd be ob...","sofs and i assure you , you do not in fact i '...","[0, 1, 65, 4, 2406, 3, 151, 4, 23, 32, 2040, 6...","[1, 13, 4, 2406, 3, 3, 11, 31, 16, 517, 4, 80,..."
221612,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems to want me to stay ...,L666372,I think Chelmsford wants a good man on the bor...,sofs i think chelmsford wants a good man on th...,"[0, 0, 0, 1, 757, 539, 6, 56, 17, 6, 244, 94, ...","[1, 4, 58, 334, 7, 74, 100, 30, 5, 2433, 68, 2..."
221613,L666370,I'm to take the Sikali with the main column to...,sofs i 'm to take the sikali with the main col...,L666371,Lord Chelmsford seems to want me to stay back ...,sofs lord chelmsford seems to want me to stay ...,"[0, 1, 4, 24, 6, 103, 5, 36, 5, 1473, 3579, 6,...","[0, 0, 0, 1, 757, 539, 6, 56, 17, 6, 244, 94, ..."
221614,L666369,"Your orders, Mr Vereker?","sofs your orders , mr vereker ? eofs",L666370,I'm to take the Sikali with the main column to...,sofs i 'm to take the sikali with the main col...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 28, 1074, 13...","[0, 1, 4, 24, 6, 103, 5, 36, 5, 1473, 3579, 6,..."


## Checking if conversion was successfull

In [29]:
def sequences_to_text(sequence):
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # Directly map sequence of indices back to words
    return ' '.join(index_to_word.get(idx, '') for idx in sequence if idx != 0)

# Print original and reverse-tokenized text for entries
for index, row in training_data[1130:1135].iterrows():
    print("Original Text:", row['Original_Text_Input'], 
          "\nReconstructed Text:", sequences_to_text(row['Padded_Input_Sequences']))
    print("\nOriginal Text:", row['Original_Text_Response'], 
          "\nReconstructed Text:", sequences_to_text(row['Padded_Target_Sequences']))


Original Text: But doesn't the Son of Sam Law prevent criminals from profiting from their crimes? 
Reconstructed Text: sofs but does n't the son of law prevent criminals from from their crimes eofs

Original Text: That doesn't apply to me because I'm not a criminal.  I'm not a criminal!  I wasn't convicted. 
Reconstructed Text: sofs that does n't apply to me because i 'm not a criminal i 'm
Original Text: We're in negotiations, that's correct. 
Reconstructed Text: sofs we 're in negotiations that 's correct eofs

Original Text: But doesn't the Son of Sam Law prevent criminals from profiting from their crimes? 
Reconstructed Text: sofs but does n't the son of law prevent criminals from from their crimes eofs
Original Text: And isn't there a movie in the works about you? 
Reconstructed Text: sofs and is n't there a movie in the works about you eofs

Original Text: We're in negotiations, that's correct. 
Reconstructed Text: sofs we 're in negotiations that 's correct eofs
Original Text: L

In [30]:
# Selecting the needed columns including IDs
training_data_final = training_data[['ID_Input', 'Padded_Input_Sequences', 'ID_Response', 'Padded_Target_Sequences']]

## Saving the DataFrame

In [31]:
# Saving the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')
file_path_parquet = os.path.join(data_dir, 'training_df_s2s.parquet')
training_data_final.to_parquet(file_path_parquet)

## Loading the DataFrame

In [50]:
# Loading the DataFrame
file_path_parquet = os.path.join(data_dir, 'training_df_s2s.parquet')
training_data_final = pd.read_parquet(file_path_parquet)

training_data_final.head(10)

Unnamed: 0,ID_Input,Padded_Input_Sequences,ID_Response,Padded_Target_Sequences
0,L1044,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 37, 11, 6, 2]",L1045,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 37, 11, 31, 2]"
1,L984,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 51, 111, 2]",L985,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 347, 46, 2]"
2,L924,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 897, 2]",L925,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 95, 8, 63, 2]"
3,L871,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 32, 2]",L872,"[0, 0, 1, 111, 3, 26, 118, 117, 129, 6, 650, 5..."
4,L870,"[1, 4, 24, 671, 3, 25, 55, 464, 3, 38, 711, 21...",L871,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 32, 2]"
5,L868,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 224, 3, 2]",L869,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 40, 29, 885, 14, 9..."
6,L867,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 15, 74, 309, 2]",L868,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 224, 3, 2]"
7,L866,"[0, 0, 0, 1, 4, 772, 3, 80, 44, 6, 5, 74, 309,...",L867,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 15, 74, 309, 2]"
8,L864,"[0, 0, 0, 0, 1, 17, 21, 6852, 2510, 4, 24, 40,...",L865,"[0, 1, 210, 197, 49, 4, 102, 6, 226, 57, 115, ..."
9,L863,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 15, 1075, 2]",L864,"[0, 0, 0, 0, 1, 17, 21, 6852, 2510, 4, 24, 40,..."


## Checking if GPU available

In [51]:
import tensorflow

from tensorflow.python.client import device_lib

def get_gpu_details():
    devices = device_lib.list_local_devices()
    for device in devices:
        if device.device_type == 'GPU':
            print(f"Device Name: {device.name}")
            print(f"Memory Limit: {device.memory_limit} bytes")
            print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


# Encoder-decoder architecture with Attention Layer

In [52]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf

## Attention Layer

In [53]:
class Attention(Layer):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class AttentionLayer(Layer):
    def __init__(self, units):
        super(AttentionLayer, self).__init__()
        self.attention = Attention(units)

    def call(self, inputs):
        decoder_outputs, encoder_outputs = inputs
        context_vectors, _ = tf.map_fn(lambda x: self.attention(x[0], x[1]),
                                       (decoder_outputs, tf.tile(tf.expand_dims(encoder_outputs, axis=1),
                                                                 [1, tf.shape(decoder_outputs)[1], 1, 1])),
                                       fn_output_signature=(tf.TensorSpec(shape=(None, encoder_outputs.shape[-1]), dtype=tf.float32),
                                                            tf.TensorSpec(shape=(None, None, 1), dtype=tf.float32)))
        return context_vectors

## Define the Model

In [54]:
input_sequences = np.array(training_data_final['Padded_Input_Sequences'].tolist())
target_sequences = np.array(training_data_final['Padded_Target_Sequences'].tolist())

# Splitting the data into training and validation sets
input_train, input_val, target_train, target_val = train_test_split(input_sequences, target_sequences, test_size=0.1, random_state=22)

# Building the model
vocab_size = len(tokenizer.word_index) + 1

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, 50, mask_zero=True)(encoder_inputs)
encoder_lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(256, return_state=True, return_sequences=True))(encoder_embedding)
encoder_states = [Concatenate()([forward_h, backward_h]), Concatenate()([forward_c, backward_c])]
encoder_outputs = encoder_lstm

# Attention Mechanism
attention_units = 10
attention_layer = AttentionLayer(attention_units)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, 50, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Apply attention to each time step in the decoder
context_vectors = attention_layer([decoder_lstm_outputs, encoder_outputs])

decoder_concat_input = Concatenate(axis=-1)([context_vectors, decoder_lstm_outputs])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Main Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, None, 50)     2379000     ['input_7[0][0]']                
                                                                                                  
 input_8 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 bidirectional_3 (Bidirectional  [(None, None, 512),  628736     ['embedding_6[0][0]']            
 )                               (None, 256),                                               

## Train the Model

In [None]:
batch_size = 64
epochs = 12

# Prepare decoder input data that just contains the start token
decoder_input_train = np.hstack([np.zeros((target_train.shape[0], 1)), target_train[:, :-1]])
decoder_input_val = np.hstack([np.zeros((target_val.shape[0], 1)), target_val[:, :-1]])

# Ensure targets are expanded in dimension to match the output shape expected by sparse_categorical_crossentropy
target_train_exp = np.expand_dims(target_train, -1)
target_val_exp = np.expand_dims(target_val, -1)

# Checkpoint callback
checkpoint_filepath = 'model_checkpoint_epoch_{epoch:02d}.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    save_freq='epoch',
    period=5
)

# Fit the model using the original integer labels
model.fit(
    [input_train, decoder_input_train], target_train_exp,
    validation_data=([input_val, decoder_input_val], target_val_exp),
    epochs=epochs, batch_size=batch_size, verbose=1,
    callbacks=[model_checkpoint_callback]
)

Epoch 1/12

## Save the model

In [None]:
data_dir = os.path.join(os.getcwd(), 'data')
file_path_h5 = os.path.join(data_dir, 's2s_model.h5')
model.save(file_path_h5)

## Generate responses

In [None]:
initial_preprocessing = False # Excepts spaCy to detect and remove names from the text

def generate_response(input_text: str) -> str:
    processed_text = preprocess_text(input_text)
    input_seq = tokenizer.texts_to_sequences([processed_text])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

    # Get the encoder states and encoder outputs
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_model.predict(input_seq)
    state_h = np.concatenate([forward_h, backward_h], axis=-1)
    state_c = np.concatenate([forward_c, backward_c], axis=-1)
    states_value = [state_h, state_c]

    # Prepare the target sequence with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['sofs']  # Start token index

    stop_condition = False
    decoded_sentence = ''
    tokens_generated = 0

    while not stop_condition:
        decoder_output, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        
        # Attention mechanism
        context_vector, _ = attention_layer(decoder_output, encoder_outputs)
        decoder_output_with_context = np.concatenate([context_vector, decoder_output], axis=-1)
        
        sampled_token_index = np.argmax(decoder_output_with_context[0, -1, :])
        sampled_char = tokenizer.index_word.get(sampled_token_index, '')

        if sampled_token_index == tokenizer.word_index['eofs'] or tokens_generated > 10:  # Stop condition
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_char
            tokens_generated += 1

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

    return decoded_sentence.strip()


## Testing

In [None]:
# Testing
print("\nUser:     Is she okay?")
print("Bot:          ", generate_response('she okay?'))
print("-----------------------------")
print("\nUser:     How are you feeling today?")
print("Bot:          ", generate_response('How are you feeling today?'))
print("-----------------------------")
print("\nUser:     Hi there!")
print("Bot:          ", generate_response('Hi there!'))
print("-----------------------------")
print("\nUser:     Can you tell me the weather forecast for today?")
print("Bot:          ", generate_response('Can you tell me the weather forecast for today?'))
print("-----------------------------")
print("\nUser:     I think artificial intelligence is changing the world.")
print("Bot:          ", generate_response('I think artificial intelligence is changing the world.'))
print("-----------------------------")
print("\nUser:     Any good movie recommendations?")
print("Bot:          ", generate_response('Any good movie recommendations?'))
print("-----------------------------")
print("\nUser:     What do you mean by that?")
print("Bot:          ", generate_response('What do you mean by that?'))
print("-----------------------------")
print("\nUser:     I'm feeling really sad today.")
print("Bot:          ", generate_response("I'm feeling really sad today."))
print("-----------------------------")
print("\nUser:     What are the implications of quantum computing on cybersecurity?")
print("Bot:          ", generate_response('What are the implications of quantum computing on cybersecurity?'))
print("-----------------------------")
print("\nUser:     Why did the chicken cross the road?")
print("Bot:          ", generate_response('Why did the chicken cross the road?'))
print("-----------------------------")
print("\nUser:     Can you explain the plot of The Matrix?")
print("Bot:          ", generate_response('Can you explain the plot of The Matrix?'))

## Save the model

In [None]:
data_dir = os.path.join(os.getcwd(), 'data')
file_path_h5 = os.path.join(data_dir, 's2s_model_2.h5')
model.save(file_path_h5)