# TESTING SIMPLE MODEL

## GPU info

In [1]:
# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Check if GPU is available
from tensorflow.python.client import device_lib

def get_gpu_details():
    devices = device_lib.list_local_devices()
    for device in devices:
        if device.device_type == 'GPU':
            print(f"Device Name: {device.name}")
            print(f"Memory Limit: {device.memory_limit} bytes")
            print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


In [4]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [5]:
# !pip install pyarrow

In [6]:
# !pip install fastparquet

### Config

In [7]:
max_length = 15 # Length of input and target sequences, padding

### Import libraries

In [8]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict

import re
import string
import unicodedata
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf

import pickle


In [9]:
# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer
nltk.download('wordnet')  # Lemmatizer
nltk.download('stopwords')  # Stopwords
nltk.download('omw-1.4') # Ensures multilingual contexts

# Stopwords list
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

initial_preprocessing = True

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Create prepocessing functions for initial text and later response generation preprocessing

In [10]:
contractions = {
    "’": "'",
    "‘": "'",
    "“": '"',
    "”": '"',
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "i'm": "i am",
    "i'd": "i would",
    "thats's": "that is",
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "you're": "you are",
    "they're": "they are",
    "we're": "we are",
    "i've": "i have",
    "you've": "you have",
    "they've": "they have",
    "we've": "we have",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "couldn't": "could not",
    "mightn't": "might not",
    "mustn't": "must not",
    "she'd": "she would",
    "he'd": "he would",
    "they'd": "they would",
    "we'd": "we would",
    "that'll": "that will",
    "there'll": "there will",
    "who'll": "who will",
    "it'll": "it will",
    "that'd": "that would",
    "there'd": "there would",
    "who'd": "who would",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "let's": "let us",
    "ma'am": "madam",
    "o'clock": "of the clock",
    "ain't": "is not",
    "could've": "could have",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    "who've": "who have",
    "oughtn't": "ought not",
    "daren't": "dare not",
    "needn't": "need not",
    "usedn't": "used not"
}

max_length_sentences = max_length - 2

def normalize_text(text: str) -> str:
    # Normalize Unicode string to NFKD form, remove non-ASCII characters, and then decode it back to a UTF-8 string
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    # Convert to lowercase
    text = text.lower()
    # Remove spaces around apostrophes
    text = re.sub(r"\s*'\s*", "'", text)
    # Add a space before and after any punctuation mark (., !, or ?)
    text = re.sub(r"\s*([.!?])\s*", r" \1 ", text)
    # Correct contractions
    for contraction, replacement in contractions.items():
        text = re.sub(re.escape(contraction), replacement, text)
    # Replace any sequence of characters that are not letters, basic punctuation
    text = re.sub(r"[^a-z.,'!? ]", ' ', text)
    # Replace any sequence of whitespace characters with a single space and remove leading and trailing whitespace
    text = re.sub(r"\s+", ' ', text).strip()
    return text

def remove_names(text: str) -> str:
    # Use spaCy to detect and remove names from the text
    doc = nlp(text)
    filtered_text = ' '.join([token.text for token in doc if token.ent_type_ != 'PERSON']) # Takes really long time, exlude from chatbot input preprocessing
    return filtered_text

def preprocess_text(text: str) -> str:
    # Normalize text
    text = normalize_text(text)
    # Remove names using spaCy's NER
    if initial_preprocessing:
        text = remove_names(text)
    # # Remove punctuation
    # text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords and tokenize
    # words = word_tokenize(text) # More intelligent splitting
    # filtered_words = [word for word in words if word not in stop_words]
    # # Lemmatize words
    # lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    # Add <SOS> and <EOS> tokens, and join the list into a single string
    # return ' '.join(['sofs'] + lemmatized_words + ['eofs'])
        # Trim the text to the desired length
    words = text.split()[:max_length_sentences]
    trimmed_text = ' '.join(words)
    return 'sofs ' + trimmed_text + ' eofs' # Chosen ['sofs', 'eofs'] because tokenizer removes everthing what is in <> or || and are not in dataset vocabulary

### Load data
https://huggingface.co/datasets/daily_dialog/tree/refs%2Fconvert%2Fparquet/default

In [11]:
def load_parquet_files(file_paths: Dict[str, str]) -> Dict[str, pd.DataFrame]:
    dataframes = {}
    for key, file_path in file_paths.items():
        try:
            df = pd.read_parquet(file_path)
            dataframes[key] = df
            print(f"Contents of {file_path}:")
            print(df.head(), "\n")
        except Exception as e:
            print(f"An error occurred while loading {file_path}: {e}")
    return dataframes

file_paths = {
    'train': 'data_dd/0000.parquet',
    'validation': 'data_dd/default_validation_0000.parquet',
    'test': 'data_dd/default_test_0000.parquet'
}

dataframes = load_parquet_files(file_paths)


Contents of data_dd/0000.parquet:
                                              dialog  \
0  [Say , Jim , how about going for a few beers a...   
1  [Can you do push-ups ? ,  Of course I can . It...   
2  [Can you study with the radio on ? ,  No , I l...   
3  [Are you all right ? ,  I will be all right so...   
4  [Hey John , nice skates . Are they new ? ,  Ye...   

                              act                         emotion  
0  [3, 4, 2, 2, 2, 3, 4, 1, 3, 4]  [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]  
1              [2, 1, 2, 2, 1, 1]              [0, 0, 6, 0, 0, 0]  
2                 [2, 1, 2, 1, 1]                 [0, 0, 0, 0, 0]  
3                    [2, 1, 1, 1]                    [0, 0, 0, 0]  
4     [2, 1, 2, 1, 1, 2, 1, 3, 4]     [0, 0, 0, 0, 0, 6, 0, 6, 0]   

Contents of data_dd/default_validation_0000.parquet:
                                              dialog  \
0  [Good morning , sir . Is there a bank near her...   
1  [Good afternoon . This is Michelle Li speaking...  

### Review data

In [12]:
dataframes.keys()

dict_keys(['train', 'validation', 'test'])

In [13]:
train_df = dataframes['train']
val_df = dataframes['validation']
test_df = dataframes['test']
print(len(train_df),len(val_df), len(test_df))
train_df

11118 1000 1000


Unnamed: 0,dialog,act,emotion
0,"[Say , Jim , how about going for a few beers a...","[3, 4, 2, 2, 2, 3, 4, 1, 3, 4]","[0, 0, 0, 0, 0, 0, 4, 4, 4, 4]"
1,"[Can you do push-ups ? , Of course I can . It...","[2, 1, 2, 2, 1, 1]","[0, 0, 6, 0, 0, 0]"
2,"[Can you study with the radio on ? , No , I l...","[2, 1, 2, 1, 1]","[0, 0, 0, 0, 0]"
3,"[Are you all right ? , I will be all right so...","[2, 1, 1, 1]","[0, 0, 0, 0]"
4,"[Hey John , nice skates . Are they new ? , Ye...","[2, 1, 2, 1, 1, 2, 1, 3, 4]","[0, 0, 0, 0, 0, 6, 0, 6, 0]"
...,...,...,...
11113,"[Hello , I bought a pen in your shop just befo...","[1, 1, 1, 2, 3, 2, 1, 4, 1]","[0, 4, 0, 0, 0, 0, 0, 0, 4]"
11114,"[Do you have any seats available ? , Yes . Th...","[2, 1, 2, 1, 3, 4]","[0, 0, 0, 0, 0, 4]"
11115,"[Uncle Ben , how did the Forbidden City get th...","[2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 4]","[0, 0, 6, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0]"
11116,"[May I help you , sir ? , I want a pair of lo...","[2, 3, 4, 3]","[0, 0, 0, 0]"


In [14]:
for dialogue in test_df['dialog'][101: 110]:
    print(dialogue)

['Can you tell me where the pots and pans are ? '
 ' Pots and pans are right over there . ' ' Oh , thank you . '
 ' Could I interest you in our store credit card ? '
 ' No , thanks . I already have credit cards . '
 ' But our credit card saves you 10 percent . '
 " That's a nice discount . "
 ' Here . Let me give you an application form . '
 " Thank you , but I'm just browsing today . "
 ' Okay . Enjoy your browsing . ']
['Here is the fish counter . Look at the lobsters and crabs . Shall we have some ? '
 " I'm allergic to these things , you know . "
 ' Sorry , I forgot . I don ’ t like seafood , neither . '
 ' Let ’ s go over there and get some milk , a couple dozen eggs and some orange juice . '
 " Let's get frozen juice . It is really good . We ’ Ve got enough food . Let ’ s go over to the check-out stand . "
 ' OK . But just let me pick up a bottle of cooking wine and oil as we go by . ']
['Good morning , may I speak with Professor Clark , please ? '
 ' You are speaking with Profes

In [15]:
# Cheking lengths of dialogs
lengths = []
for dialogue in train_df['dialog']:
    length = len(dialogue)
    lengths.append(length)

lengths_series = pd.Series(lengths)
print(lengths_series.describe())


count    11118.000000
mean         7.840439
std          4.007963
min          2.000000
25%          4.000000
50%          7.000000
75%         10.000000
max         35.000000
dtype: float64


### Clean data
For the first such project I will not use additional data provided such as 'act' and 'emotion'. I will use my own data split, therefore I will concatinate all data and take only dialogs.

In [16]:
all_dialogs = pd.concat([train_df['dialog'], val_df['dialog'], test_df['dialog']], ignore_index=True)
data = pd.DataFrame({'dialog': all_dialogs})
data

Unnamed: 0,dialog
0,"[Say , Jim , how about going for a few beers a..."
1,"[Can you do push-ups ? , Of course I can . It..."
2,"[Can you study with the radio on ? , No , I l..."
3,"[Are you all right ? , I will be all right so..."
4,"[Hey John , nice skates . Are they new ? , Ye..."
...,...
13113,"[Frank ’ s getting married , do you believe th..."
13114,"[OK . Come back into the classroom , class . ,..."
13115,"[Do you have any hobbies ? , Yes , I like col..."
13116,"[Jenny , what's wrong with you ? Why do you ke..."


### Preprocess data

In [17]:
print(f"The type of single dialog: {type(data['dialog'][0])}")
print(f"The type of the sentence within dialog: {type(data['dialog'][0][0])}")
data['dialog'].info()

The type of single dialog: <class 'numpy.ndarray'>
The type of the sentence within dialog: <class 'str'>
<class 'pandas.core.series.Series'>
RangeIndex: 13118 entries, 0 to 13117
Series name: dialog
Non-Null Count  Dtype 
--------------  ----- 
13118 non-null  object
dtypes: object(1)
memory usage: 102.6+ KB


In [18]:
# Dialogs are ndarrays, my preprocessing funcions are for strings
def preprocess_text_array(arr):
    dialog = arr.tolist()
    return [preprocess_text(text) for text in dialog]

In [19]:
data.loc[:, "preprocessed_dialog"] = data.loc[:, "dialog"].apply(preprocess_text_array)
data

Unnamed: 0,dialog,preprocessed_dialog
0,"[Say , Jim , how about going for a few beers a...","[sofs say , , how about going for a few beers ..."
1,"[Can you do push-ups ? , Of course I can . It...","[sofs can you do push ups ? eofs, sofs of cour..."
2,"[Can you study with the radio on ? , No , I l...","[sofs can you study with the radio on ? eofs, ..."
3,"[Are you all right ? , I will be all right so...","[sofs are you all right ? eofs, sofs i will be..."
4,"[Hey John , nice skates . Are they new ? , Ye...","[sofs , nice skates . are they new ? eofs, sof..."
...,...,...
13113,"[Frank ’ s getting married , do you believe th...","[sofs married , do you believe this ? eofs, so..."
13114,"[OK . Come back into the classroom , class . ,...","[sofs ok . come back into the classroom , clas..."
13115,"[Do you have any hobbies ? , Yes , I like col...","[sofs do you have any hobbies ? eofs, sofs yes..."
13116,"[Jenny , what's wrong with you ? Why do you ke...","[sofs , what 's wrong with you ? why do you ke..."


In [20]:
for dialogue in data['preprocessed_dialog'][121: 130]:
    print(dialogue)

['sofs please excuse me , but i really have to be going . eofs', 'sofs yes , of course . it was nice to see you . eofs', 'sofs it was nice to see you , too . and please give my eofs']
['sofs excuse me . is this seat taken ? eofs', 'sofs i am afraid so . eofs']
['sofs what do you think of the coming match ? eofs', 'sofs winning is a piece of cake to me . eofs', 'sofs you are bragging again . eofs']
['sofs what would you reckon the taxing increases ? eofs', 'sofs well , the state will benefit a lot , i suppose . eofs', 'sofs but what do most people think about it ? eofs', 'sofs ah , it s hard to say . eofs']
['sofs are you still coming to my place for dinner tomorrow night ? eofs', 'sofs of course . is the dinner still on ? eofs', 'sofs yes , i was just wondering how you and your roommate were planning eofs', 'sofs we were planning on walking both ways since the weather is still nice eofs', "sofs that 's what i thought you would do . listen , i live eofs", 'sofs it can not be that bad . 

### Pairing messages - input with responses

In [21]:
# Function to create input-response pairs
def create_pairs(dialogues):
    input_responses = []
    for dialogue in dialogues:
        for i in range(len(dialogue) - 1):
            input_responses.append((dialogue[i], dialogue[i + 1]))
    return input_responses

# Create input-response pairs
pairs = create_pairs(data['preprocessed_dialog'])

# Convert pairs to DataFrame
pairs_df = pd.DataFrame(pairs, columns=['input', 'response'])

In [22]:
pairs_df

Unnamed: 0,input,response
0,"sofs say , , how about going for a few beers a...",sofs you know that is tempting but is really n...
1,sofs you know that is tempting but is really n...,sofs what do you mean ? it will help us to rel...
2,sofs what do you mean ? it will help us to rel...,sofs do you really think so ? i do not . it wi...
3,sofs do you really think so ? i do not . it wi...,sofs i guess you are right . but what shall we...
4,sofs i guess you are right . but what shall we...,sofs i suggest a walk over to the gym where we...
...,...,...
89856,sofs why not go again to celebrate out one yea...,sofs are you kidding ? can you afford it ? do ...
89857,sofs are you kidding ? can you afford it ? do ...,"sofs never mind that , i will take care of it ..."
89858,"sofs never mind that , i will take care of it ...","sofs yeah , i think so . eofs"
89859,"sofs yeah , i think so . eofs",sofs ok . i will make the arrangements . it wi...


In [23]:
# Cheking length of sentences
lengths = []
for sentence in pairs_df['input']:
    length = len(sentence.split())
    lengths.append(length)

lengths_series = pd.Series(lengths)
print(lengths_series.describe())

count    89861.000000
mean        12.044235
std          3.269560
min          2.000000
25%          9.000000
50%         13.000000
75%         15.000000
max         15.000000
dtype: float64


In [24]:
long_monologues = pairs_df[pairs_df['input'].str.split().str.len() > 30]
print(long_monologues)
print(pairs_df['input'][89769])
len(long_monologues)

Empty DataFrame
Columns: [input, response]
Index: []
sofs besides these , the center staff also works with community organizations which provide eofs


0

### Initialize the tokenizer

In [41]:
from collections import OrderedDict

tokenizer = Tokenizer(num_words=10000, filters=' ')
tokenizer.fit_on_texts(pairs_df['input'].tolist() + pairs_df['response'].tolist())

# Sort the word_counts dictionary by frequency in descending order
sorted_word_counts = OrderedDict(sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True))
print(f"\nTop 15 most frequent words:\n {list(sorted_word_counts.items())[:15]}")
print(f"\nLast 100 words:\n {list(sorted_word_counts.items())[-100:]}")


Top 15 most frequent words:
 [('sofs', 179722), ('eofs', 179722), ('.', 141525), ('i', 80118), (',', 75544), ('you', 67065), ('?', 51858), ('the', 45147), ('to', 38350), ('a', 34425), ('it', 32654), ('is', 31942), ('that', 22683), ('do', 21815), ('have', 21121)]

Last 100 words:
 [("frills'business", 1), ('lifts', 1), ('drills', 1), ('ty', 1), ('grudge', 1), ('pilferage', 1), ('irritation', 1), ('antiseptic', 1), ("the'great", 1), ("outdoors'is", 1), ('crouch', 1), ('labyrinth', 1), ('burns', 1), ('targets', 1), ('huangguoshu', 1), ('communicational', 1), ('assiduously', 1), ('rarest', 1), ('codes', 1), ('dongle', 1), ('resolving', 1), ('multitasking', 1), ('constipation', 1), ('recarpeted', 1), ('uncite', 1), ('goodnight', 1), ('singers', 1), ('reunification', 1), ('yearning', 1), ('testimonials', 1), ('informing', 1), ('robson', 1), ('macchiato', 1), ('backers', 1), ('montezuma', 1), ('revenge', 1), ('thans', 1), ('ultra', 1), ('brushed', 1), ('titanium', 1), ('kaohsiung', 1), ('mou

### Filter rare words - 10000 vocabulary OK

In [55]:
# Just reviewing what are the last words in vocabulary - do they still usable and recognizable
from collections import OrderedDict

# Set a frequency threshold
threshold = 3

# Filter out rare words
filtered_words = {word: count for word, count in sorted_word_counts.items() if count >= threshold}

# Display the number of words before and after filtering
print(f"Total words before filtering: {len(sorted_word_counts)}")
print(f"Total words after filtering: {len(filtered_words)}")
# Display the sorted word counts
print(f"\nTop 15 most frequent words:\n {list(filtered_words.items())[:15]}")
print(f"\nLast 100 words:\n {list(filtered_words.items())[-100:]}")

Total words before filtering: 14190
Total words after filtering: 9537

Top 15 most frequent words:
 [('sofs', 179722), ('eofs', 179722), ('.', 141525), ('i', 80118), (',', 75544), ('you', 67065), ('?', 51858), ('the', 45147), ('to', 38350), ('a', 34425), ('it', 32654), ('is', 31942), ('that', 22683), ('do', 21815), ('have', 21121)]

Last 100 words:
 [('trends', 3), ('chan', 3), ('greater', 3), ('workmanship', 3), ('creditor', 3), ('equity', 3), ('curd', 3), ('jeep', 3), ('theft', 3), ('danshui', 3), ('believer', 3), ('disaster', 3), ('truant', 3), ('clarity', 3), ('blanca', 3), ('frappuccino', 3), ('trail', 3), ('flames', 3), ('unfit', 3), ('trev', 3), ('pamphlets', 3), ('hasty', 3), ('expire', 3), ('overcoming', 3), ('banked', 3), ('unmarried', 3), ('scaring', 3), ('seniors', 3), ('alex', 3), ('stylus', 3), ('leaky', 3), ('advisor', 3), ('realise', 3), ('occasional', 3), ('introductory', 3), ('lithium', 3), ('bluemingdails', 3), ('spectator', 3), ('fang', 3), ('insight', 3), ('spotles

### Save the tokenizer

In [43]:
import pickle

# Determine the directory where the tokenizer will be saved
data_dir = os.path.join(os.getcwd(), 'data_dd')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Save the tokenizer using pickle
tokenizer_path = os.path.join(data_dir, 'tokenizer_dd.pickle')
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Tokenizer saved to {tokenizer_path}")

Tokenizer saved to C:\Users\tomui\Desktop\daily_dialogue\data_dd\tokenizer_dd.pickle


### Load the Tokenizer

In [44]:
# Load the tokenizer from file
data_dir = os.path.join(os.getcwd(), 'data_dd')
tokenizer_path = os.path.join(data_dir, 'tokenizer_dd.pickle')
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

### Converting to indices and input-target sequences

In [47]:
# Convert texts to sequences
input_sequences = tokenizer.texts_to_sequences(pairs_df['input'])
target_sequences = tokenizer.texts_to_sequences(pairs_df['response'])

# Pad sequences with pre-padding and truncating
input_padded = pad_sequences(input_sequences, maxlen=max_length, padding='pre', truncating='post')
target_padded = pad_sequences(target_sequences, maxlen=max_length, padding='pre', truncating='post')

# Store numpy arrays directly in the DataFrame
pairs_df['Padded_Input_Sequences'] = input_padded.tolist()
pairs_df['Padded_Target_Sequences'] = target_padded.tolist()

In [48]:
pairs_df

Unnamed: 0,input,response,Padded_Input_Sequences,Padded_Target_Sequences
0,"sofs say , , how about going for a few beers a...",sofs you know that is tempting but is really n...,"[1, 138, 5, 5, 36, 41, 77, 22, 10, 211, 3423, ...","[1, 6, 49, 13, 12, 3335, 32, 12, 58, 16, 48, 2..."
1,sofs you know that is tempting but is really n...,sofs what do you mean ? it will help us to rel...,"[1, 6, 49, 13, 12, 3335, 32, 12, 58, 16, 48, 2...","[0, 1, 19, 14, 6, 155, 7, 11, 25, 105, 95, 9, ..."
2,sofs what do you mean ? it will help us to rel...,sofs do you really think so ? i do not . it wi...,"[0, 1, 19, 14, 6, 155, 7, 11, 25, 105, 95, 9, ...","[1, 14, 6, 58, 46, 43, 7, 4, 14, 16, 3, 11, 25..."
3,sofs do you really think so ? i do not . it wi...,sofs i guess you are right . but what shall we...,"[1, 14, 6, 58, 46, 43, 7, 4, 14, 16, 3, 11, 25...","[1, 4, 217, 6, 18, 55, 3, 32, 19, 304, 23, 14,..."
4,sofs i guess you are right . but what shall we...,sofs i suggest a walk over to the gym where we...,"[1, 4, 217, 6, 18, 55, 3, 32, 19, 304, 23, 14,...","[1, 4, 639, 10, 434, 144, 9, 8, 978, 107, 23, ..."
...,...,...,...,...
89856,sofs why not go again to celebrate out one yea...,sofs are you kidding ? can you afford it ? do ...,"[1, 82, 16, 64, 213, 9, 1719, 89, 65, 222, 159...","[1, 18, 6, 563, 7, 24, 6, 994, 11, 7, 14, 6, 4..."
89857,sofs are you kidding ? can you afford it ? do ...,"sofs never mind that , i will take care of it ...","[1, 18, 6, 563, 7, 24, 6, 994, 11, 7, 14, 6, 4...","[1, 177, 212, 13, 5, 4, 25, 76, 349, 20, 11, 3..."
89858,"sofs never mind that , i will take care of it ...","sofs yeah , i think so . eofs","[1, 177, 212, 13, 5, 4, 25, 76, 349, 20, 11, 3...","[0, 0, 0, 0, 0, 0, 0, 1, 104, 5, 4, 46, 43, 3, 2]"
89859,"sofs yeah , i think so . eofs",sofs ok . i will make the arrangements . it wi...,"[0, 0, 0, 0, 0, 0, 0, 1, 104, 5, 4, 46, 43, 3, 2]","[1, 70, 3, 4, 25, 110, 8, 3714, 3, 11, 25, 37,..."


### Checking if conversion was successfull

In [49]:
def sequences_to_text(sequence):
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # Directly map sequence of indices back to words
    return ' '.join(index_to_word.get(idx, '') for idx in sequence if idx != 0)

# Print original and reverse-tokenized text for entries
for index, row in pairs_df[1130:1135].iterrows():
    print("\nOriginal Text:", row['input'], 
          "\nReconstructed Text:", sequences_to_text(row['Padded_Input_Sequences']))
    print("\nOriginal Text:", row['response'], 
          "\nReconstructed Text:", sequences_to_text(row['Padded_Target_Sequences']))



Original Text: sofs no , so it is usually boring to join my friends in the eofs 
Reconstructed Text: sofs no , so it is usually boring to join my friends in the eofs

Original Text: sofs what kind of things would you like to see on the menu ? eofs 
Reconstructed Text: sofs what kind of things would you like to see on the menu ? eofs

Original Text: sofs what kind of things would you like to see on the menu ? eofs 
Reconstructed Text: sofs what kind of things would you like to see on the menu ? eofs

Original Text: sofs maybe a fruit salad and a few different hot sandwiches at least . eofs 
Reconstructed Text: sofs maybe a fruit salad and a few different hot sandwiches at least . eofs

Original Text: sofs maybe a fruit salad and a few different hot sandwiches at least . eofs 
Reconstructed Text: sofs maybe a fruit salad and a few different hot sandwiches at least . eofs

Original Text: sofs that should not be too difficult . since this is a small neighborhood eofs 
Reconstructed Text: 

## Saving the DataFrame

In [50]:
# Saving the DataFrame
data_dir = os.path.join(os.getcwd(), 'data_dd')
file_path_parquet = os.path.join(data_dir, 'training_df_dd.parquet')
pairs_df.to_parquet(file_path_parquet)

## Loading the DataFrame

In [89]:
# Loading the DataFrame
data_dir = os.path.join(os.getcwd(), 'data_dd')
file_path_parquet = os.path.join(data_dir, 'training_df_dd.parquet')
training_data_final = pd.read_parquet(file_path_parquet)

training_data_final.head(10)

Unnamed: 0,input,response,Padded_Input_Sequences,Padded_Target_Sequences
0,"sofs say , , how about going for a few beers a...",sofs you know that is tempting but is really n...,"[1, 138, 5, 5, 36, 41, 77, 22, 10, 211, 3423, ...","[1, 6, 49, 13, 12, 3335, 32, 12, 58, 16, 48, 2..."
1,sofs you know that is tempting but is really n...,sofs what do you mean ? it will help us to rel...,"[1, 6, 49, 13, 12, 3335, 32, 12, 58, 16, 48, 2...","[0, 1, 19, 14, 6, 155, 7, 11, 25, 105, 95, 9, ..."
2,sofs what do you mean ? it will help us to rel...,sofs do you really think so ? i do not . it wi...,"[0, 1, 19, 14, 6, 155, 7, 11, 25, 105, 95, 9, ...","[1, 14, 6, 58, 46, 43, 7, 4, 14, 16, 3, 11, 25..."
3,sofs do you really think so ? i do not . it wi...,sofs i guess you are right . but what shall we...,"[1, 14, 6, 58, 46, 43, 7, 4, 14, 16, 3, 11, 25...","[1, 4, 217, 6, 18, 55, 3, 32, 19, 304, 23, 14,..."
4,sofs i guess you are right . but what shall we...,sofs i suggest a walk over to the gym where we...,"[1, 4, 217, 6, 18, 55, 3, 32, 19, 304, 23, 14,...","[1, 4, 639, 10, 434, 144, 9, 8, 978, 107, 23, ..."
5,sofs i suggest a walk over to the gym where we...,sofs that 's a good idea . i hear mary and sal...,"[1, 4, 639, 10, 434, 144, 9, 8, 978, 107, 23, ...","[1, 13, 34, 10, 48, 169, 3, 4, 229, 476, 17, 5..."
6,sofs that 's a good idea . i hear mary and sal...,sofs sounds great to me ! if they are willing ...,"[1, 13, 34, 10, 48, 169, 3, 4, 229, 476, 17, 5...","[1, 142, 100, 9, 30, 29, 63, 61, 18, 1155, 5, ..."
7,sofs sounds great to me ! if they are willing ...,sofs good . let us go now . eofs,"[1, 142, 100, 9, 30, 29, 63, 61, 18, 1155, 5, ...","[0, 0, 0, 0, 0, 0, 1, 48, 3, 75, 95, 64, 80, 3..."
8,sofs good . let us go now . eofs,sofs all right . eofs,"[0, 0, 0, 0, 0, 0, 1, 48, 3, 75, 95, 64, 80, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 53, 55, 3, 2]"
9,sofs can you do push ups ? eofs,sofs of course i can . it is a piece of cake !...,"[0, 0, 0, 0, 0, 0, 0, 1, 24, 6, 14, 1595, 2298...","[1, 20, 119, 4, 24, 3, 11, 12, 10, 762, 20, 93..."


In [90]:
len(tokenizer.word_index)

14190

### Splitting the Data

In [91]:
input_sequences = np.array(training_data_final['Padded_Input_Sequences'].tolist())
target_sequences = np.array(training_data_final['Padded_Target_Sequences'].tolist())

# Splitting the data into training and validation sets
input_train, input_val, target_train, target_val = train_test_split(input_sequences, target_sequences, test_size=0.1, random_state=22)

### Defining the Model

In [92]:
# Define model parameters
latent_dim = 256
num_encoder_tokens = np.max(input_sequences) + 1
num_decoder_tokens = np.max(target_sequences) + 1
learning_rate = 0.001

# Define encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Define decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')

# Define checkpoint callback
checkpoint = ModelCheckpoint('seq2seq_dd_model.h5', save_best_only=False, monitor='val_loss', mode='min', verbose=1)

# Summary of the model
model.summary()


Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_16 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, None, 256)    2560000     ['input_15[0][0]']               
                                                                                                  
 embedding_7 (Embedding)        (None, None, 256)    2560000     ['input_16[0][0]']               
                                                                                           

### Training the Model

In [None]:
# Training the model
history = model.fit(
    [input_train, target_train[:, :-1]],
    target_train[:, 1:, np.newaxis],
    batch_size=64,
    epochs=25,
    validation_data=([input_val, target_val[:, :-1]], target_val[:, 1:, np.newaxis]),
    # callbacks=[checkpoint]
)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25

In [None]:
# Visualize training history
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Generating Responses

In [None]:
# Define encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Define decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Save token index mappings
target_token_index = tokenizer.word_index
reverse_target_token_index = {v: k for k, v in target_token_index.items()}

In [None]:
# Function to generate responses
def generate_response(input_seq: np.ndarray, max_decoder_seq_length: int) -> str:
    # Encode the input sequence to get the internal states
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['sofs']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token and add the corresponding character to the decoded sentence
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_token_index[sampled_token_index]
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find stop token
        if (sampled_char == 'eofs' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip('sofs').strip('eofs').strip()



### Test the Model

In [None]:
initial_preprocessing = False  # Expects spaCy to detect and remove names from the text
max_length = 15

# Define ten examples to test the model
test_examples = [
    "How are you doing today?",
    "What is your name?",
    "Can you help me with my homework?",
    "What is the weather like?",
    "Tell me a joke.",
    "Who is the president of the United States?",
    "What is the capital of France?",
    "Do you like pizza?",
    "What is your favorite color?",
    "Goodbye!"
]

# Preprocess input text (assuming preprocess_text is defined elsewhere)
input_text = [preprocess_text(text) for text in test_examples]
print(f"Preprocessed text: {input_text}")

# Tokenize and pad the test examples
test_sequences = tokenizer.texts_to_sequences(input_text)
print(f"Tokenizer sequences: {test_sequences}")
padded_test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='pre', truncating='post')
print(f"Padded sequences: {padded_test_sequences}")

# Generate responses
for test_seq in padded_test_sequences:
    input_seq = np.array([test_seq])
    response = generate_response(input_seq, max_length)
    print(f"Input: {test_examples[padded_test_sequences.tolist().index(test_seq.tolist())]}")
    print(f"Response: {response}")
    print("-" * 50)
