In [1]:
# Check if GPU is available
from tensorflow.python.client import device_lib

def get_gpu_details():
    devices = device_lib.list_local_devices()
    for device in devices:
        if device.device_type == 'GPU':
            print(f"Device Name: {device.name}")
            print(f"Memory Limit: {device.memory_limit} bytes")
            print(f"Description: {device.physical_device_desc}")

get_gpu_details()


Device Name: /device:GPU:0
Memory Limit: 4158652416 bytes
Description: device: 0, name: NVIDIA GeForce GTX 1660 Ti with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


In [2]:
import os
import pandas as pd
import numpy as np

## Loading the DataFrame

In [3]:
# Loading the DataFrame
data_dir = os.path.join(os.getcwd(), 'data')

file_path_parquet = os.path.join(data_dir, 'utterances.parquet')
df_loaded_parquet = pd.read_parquet(file_path_parquet)

df_loaded_parquet.head(5)

Unnamed: 0,id,conversation_id,text,speaker,reply_to,timestamp,movie_id,tok_0_token,tok_0_tag,tok_0_dep,...,tok_121_dep,tok_122_token,tok_122_tag,tok_122_dep,tok_123_token,tok_123_tag,tok_123_dep,tok_124_token,tok_124_tag,tok_124_dep
0,L1045,L1044,They do not!,u0,L1044,,m0,They,PRP,nsubj,...,,,,,,,,,,
1,L1044,L1044,They do to!,u2,,,m0,They,PRP,nsubj,...,,,,,,,,,,
2,L985,L984,I hope so.,u0,L984,,m0,I,PRP,nsubj,...,,,,,,,,,,
3,L984,L984,She okay?,u2,,,m0,She,PRP,nsubj,...,,,,,,,,,,
4,L925,L924,Let's go.,u0,L924,,m0,Let,VB,ROOT,...,,,,,,,,,,


## Data cleaning

### Leaving only necessary data for simple Sec2Seq model
Id, conversation_id for tracking the flow of conversations and reply_to for understanding the sequence within the dialogue, and conversation text ofcourse.

In [4]:
conversations = df_loaded_parquet[['text', 'id', 'conversation_id', 'reply_to']]
conversations.head(5)

Unnamed: 0,text,id,conversation_id,reply_to
0,They do not!,L1045,L1044,L1044
1,They do to!,L1044,L1044,
2,I hope so.,L985,L984,L984
3,She okay?,L984,L984,
4,Let's go.,L925,L924,L924


## Initialize and save the tokenizer

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

tokenizer = Tokenizer(num_words=20000)

# Fit the tokenizer on the modified text data that includes <start> and <end> tokens
tokenizer.fit_on_texts(['saaat ennnd'] + conversations['text'])

# Determine the directory where the tokenizer will be saved
data_dir = os.path.join(os.getcwd(), 'data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Save the tokenizer using pickle
tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Tokenizer saved to {tokenizer_path}")
conversations.head(5)

Tokenizer saved to C:\Users\tomui\Desktop\capstone_project\data\tokenizer.pickle


Unnamed: 0,text,id,conversation_id,reply_to
0,They do not!,L1045,L1044,L1044
1,They do to!,L1044,L1044,
2,I hope so.,L985,L984,L984
3,She okay?,L984,L984,
4,Let's go.,L925,L924,L924


## Load the Tokenizer

In [6]:
# # Load the tokenizer from file
# data_dir = os.path.join(os.getcwd(), 'data')
# tokenizer_path = os.path.join(data_dir, 'tokenizer.pickle')
# with open(tokenizer_path, 'rb') as handle:
#     tokenizer = pickle.load(handle)

## Conversations text preprocessing

In [7]:
import re
import unicodedata
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer
nltk.download('wordnet')  # Lemmatizer
nltk.download('stopwords')  # Stopwords

# Stopwords list
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Define preprocessing functions common for training and then for response generation with model

max_length = 40

def normalize_text(text: str) -> str:
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9.',!? ]", ' ', text) # Shall be applied if remove punctuation is not applied, keeps basic punctuation
    text = re.sub(r'\d+', '<num>', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text

def preprocess_text(text: str) -> str:
    # Normalize text
    text = normalize_text(text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords and tokenize
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    # Add <start> and <end> tokens
    filtered_words = ['saaat'] + filtered_words + ['ennnd']
    # Convert words to a sequence of indices
    sequences = tokenizer.texts_to_sequences([filtered_words])
    # Pad sequences to a fixed length
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences


In [9]:
print(tokenizer.word_index['saaat'], tokenizer.word_index['ennnd'])

1 72


In [10]:
# Apply the preprocessing function to each row in the 'text' column
conversations['preprocessed_text'] = conversations['text'].apply(preprocess_text)
conversations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations['preprocessed_text'] = conversations['text'].apply(preprocess_text)


Unnamed: 0,text,id,conversation_id,reply_to,preprocessed_text
0,They do not!,L1045,L1044,L1044,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,They do to!,L1044,L1044,,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,I hope so.,L985,L984,L984,"[[1, 410, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,She okay?,L984,L984,,"[[1, 176, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,Let's go.,L925,L924,L924,"[[1, 2338, 59, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
304708,Lord Chelmsford seems to want me to stay back ...,L666371,L666369,L666370,"[[1, 971, 662, 48, 298, 85, 72, 0, 0, 0, 0, 0,..."
304709,I'm to take the Sikali with the main column to...,L666370,L666369,L666369,"[[1, 2011, 101, 1712, 4107, 1351, 72, 0, 0, 0,..."
304710,"Your orders, Mr Vereker?",L666369,L666369,,"[[1, 1247, 158, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
304711,"Good ones, yes, Mr Vereker. Gentlemen who can ...",L666257,L666256,L666256,"[[1, 89, 853, 249, 158, 1362, 765, 650, 72, 0,..."


In [11]:
# Define a function to convert sequences back to text
def sequences_to_text(sequence):
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # Directly map sequence of indices back to words
    return ' '.join(index_to_word.get(idx, '') for idx in sequence if idx != 0)


# Print original and reverse-tokenized text for the first five entries
conversations[1130:1140].apply(lambda x: print("Original Text:", x['text'], "\nReconstructed Text:", sequences_to_text(x['preprocessed_text'][0])), axis=1)


Original Text: Do you have any Czech girls working for you? 
Reconstructed Text: saaat czech girls working ennnd
Original Text: Yeah.  He wanted a girl from Czechoslovakia, but I sent him Honey 'cause once they get there, you know, it doesn't really matter - Honey was killed...?  Poor girl... 
Reconstructed Text: saaat yeah wanted girl czechoslovakia sent honey cause get know doesnt really matter honey killed poor girl ennnd
Original Text: Oh my G-d.  Honey!  Honey's dead? 
Reconstructed Text: saaat oh g honey honeys dead ennnd
Original Text: We don't have her I.D. yet, but one of your girls was killed last night at the King Edward Hotel. 
Reconstructed Text: saaat dont id yet one girls killed last night king edward hotel ennnd
Original Text: What's wrong? 
Reconstructed Text: saaat whats wrong ennnd
Original Text: Homicide, Miss Hearn.  It's Detective Eddie Flemming.  Open up. 
Reconstructed Text: saaat homicide miss detective eddie flemming open ennnd
Original Text: I have nothin' to

1130    None
1131    None
1132    None
1133    None
1134    None
1135    None
1136    None
1137    None
1138    None
1139    None
dtype: object

## Pairing messages - input with responses

In [12]:
# Merging the DataFrame with itself to form pairs
pairs = pd.merge(
    conversations, conversations,
    left_on='id',
    right_on='reply_to',
    suffixes=('_input', '_response')
)

In [13]:
pairs.head()

Unnamed: 0,text_input,id_input,conversation_id_input,reply_to_input,preprocessed_text_input,text_response,id_response,conversation_id_response,reply_to_response,preprocessed_text_response
0,They do to!,L1044,L1044,,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",They do not!,L1045,L1044,L1044,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,She okay?,L984,L984,,"[[1, 176, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",I hope so.,L985,L984,L984,"[[1, 410, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,Wow,L924,L924,,"[[1, 2492, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",Let's go.,L925,L924,L924,"[[1, 2338, 59, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,No,L871,L870,L870,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",Okay -- you're gonna need to learn how to lie.,L872,L870,L871,"[[1, 176, 2071, 10429, 129, 754, 723, 72, 0, 0..."
4,I'm kidding. You know how sometimes you just ...,L870,L870,,"[[1, 2011, 775, 17, 627, 819, 19320, 1587, 17,...",No,L871,L870,L870,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [14]:
# Selecting the needed columns including IDs
training_data = pairs[['id_input', 'text_input', 'preprocessed_text_input', 'id_response', 'text_response', 'preprocessed_text_response']]

# Renaming columns for clarity
training_data.columns = ['ID_Input', 'Input', 'Tokens_Input', 'ID_Response', 'Response', 'Tokens_Response']

In [15]:
training_data

Unnamed: 0,ID_Input,Input,Tokens_Input,ID_Response,Response,Tokens_Response
0,L1044,They do to!,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",L1045,They do not!,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,L984,She okay?,"[[1, 176, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",L985,I hope so.,"[[1, 410, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,L924,Wow,"[[1, 2492, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",L925,Let's go.,"[[1, 2338, 59, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,L871,No,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",L872,Okay -- you're gonna need to learn how to lie.,"[[1, 176, 2071, 10429, 129, 754, 723, 72, 0, 0..."
4,L870,I'm kidding. You know how sometimes you just ...,"[[1, 2011, 775, 17, 627, 819, 19320, 1587, 17,...",L871,No,"[[1, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...,...,...,...,...
221611,L666520,"Well I assure you, Sir, I have no desire to cr...","[[1, 134, 2778, 182, 2328, 2244, 10869, 72, 0,...",L666521,"And I assure you, you do not In fact I'd be ob...","[[1, 2778, 602, 3056, 6315, 324, 1378, 8386, 3..."
221612,L666371,Lord Chelmsford seems to want me to stay back ...,"[[1, 971, 662, 48, 298, 85, 72, 0, 0, 0, 0, 0,...",L666372,I think Chelmsford wants a good man on the bor...,"[[1, 50, 390, 89, 99, 2818, 6117, 1076, 4232, ..."
221613,L666370,I'm to take the Sikali with the main column to...,"[[1, 2011, 101, 1712, 4107, 1351, 72, 0, 0, 0,...",L666371,Lord Chelmsford seems to want me to stay back ...,"[[1, 971, 662, 48, 298, 85, 72, 0, 0, 0, 0, 0,..."
221614,L666369,"Your orders, Mr Vereker?","[[1, 1247, 158, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",L666370,I'm to take the Sikali with the main column to...,"[[1, 2011, 101, 1712, 4107, 1351, 72, 0, 0, 0,..."


In [17]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221616 entries, 0 to 221615
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ID_Input         221616 non-null  object
 1   Input            221616 non-null  object
 2   Tokens_Input     221616 non-null  object
 3   ID_Response      221616 non-null  object
 4   Response         221616 non-null  object
 5   Tokens_Response  221616 non-null  object
dtypes: object(6)
memory usage: 10.1+ MB
