# CAPSTONE PROJECT - CHATBOT

In [1]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [2]:
# !pip install scikit-learn

In [3]:
# Check if GPU is available
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Device: NVIDIA GeForce GTX 1660 Ti with Max-Q Design


## DATA
### Data sources
https://convokit.cornell.edu/documentation/movie.html <br>
https://www.cs.cornell.edu/~cristian/Chameleons_in_imagined_conversations.html 

### Install ConvoKit

In [4]:
#!pip install convokit

### Load data from source and save to 'data' folder

In [5]:
# from convokit import Corpus, download
# import os

# # Directory where to save the corpus
# data_dir = os.path.join(os.getcwd(), 'data')

# # Ensure the directory exists
# if not os.path.exists(data_dir):
#     os.makedirs(data_dir)

# # Downloading and saving the corpus
# corpus = Corpus(filename=download("movie-corpus", data_dir=data_dir))

# # Saving the corpus to the 'data' folder
# corpus_path = os.path.join(data_dir, "movie_corpus")
# corpus.dump(corpus_path)

Downloading movie-corpus to C:\Users\tomui\Desktop\capstone_project\data\movie-corpus  
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done

### Load data from 'data' folder

In [6]:
from convokit import Corpus
import os

# Directory where to load the corpus
data_dir = os.path.join(os.getcwd(), 'data')

# Load the corpus from the specified folder
loaded_corpus = Corpus(filename=os.path.join(data_dir, "movie_corpus"))
loaded_corpus.print_summary_stats()

Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [7]:
type(loaded_corpus)

convokit.model.corpus.Corpus

### Data Structure and Organization
```plaintext
data/
└── movie_corpus/
    ├── conversations.json
    ├── corpus.json
    ├── index.json
    ├── speakers.json
    └── utterances.jsonl
```

Description [here](https://convokit.cornell.edu/documentation/movie.html).


### Choice for exploration
The files I need from ConvoKit corpus for my chatbot project depend on the specific functionalities I want to implement in my chatbot. I'll most likely need `utterances.json` because it contains the dialogue data. This is what I'll use to train chatbot to understand and generate human-like responses.

Description from source:  
> "Utterance-level information <br>
> For each utterance, we provide:
> - id: index of the utterance
> - speaker: the speaker who authored the utterance
> - conversation_id: id of the first utterance in the conversation this utterance belongs to
> - reply_to: id of the utterance to which this utterance replies to (None if the utterance is not a reply)
> - timestamp: time of the utterance
> - text: textual content of the utterance
> 
> Metadata for utterances include:
> - movie_idx: index of the movie from which this utterance occurs
> - parsed: parsed version of the utterance text, represented as a SpaCy Doc"


### Understanding data from `utterances.jsonl`

In [8]:
import json
from pprint import pprint as pp

# Initialize a list to hold all the utterances
utterances = []

# Open the file and read line by line
with open(os.path.join(data_dir, 'movie_corpus', 'utterances.jsonl'), 'r') as file:
    
    for line in file:
        utterance = json.loads(line)
        utterances.append(utterance)

In [9]:
print(type(utterances))

<class 'list'>


In [10]:
print(f'There are a total of {len(utterances)} lines\n')
pp(utterances[: 3])

There are a total of 304713 lines

[{'conversation_id': 'L1044',
  'id': 'L1045',
  'meta': {'movie_id': 'm0',
           'parsed': [{'rt': 1,
                       'toks': [{'dep': 'nsubj',
                                 'dn': [],
                                 'tag': 'PRP',
                                 'tok': 'They',
                                 'up': 1},
                                {'dep': 'ROOT',
                                 'dn': [0, 2, 3],
                                 'tag': 'VBP',
                                 'tok': 'do'},
                                {'dep': 'neg',
                                 'dn': [],
                                 'tag': 'RB',
                                 'tok': 'not',
                                 'up': 1},
                                {'dep': 'punct',
                                 'dn': [],
                                 'tag': '.',
                                 'tok': '!',
                           

### Understanding data from other dataset json files

In [11]:
# Load the data
# with open(os.path.join(data_dir, 'movie_corpus', 'conversations.json'), 'r') as file:
#     conversations = json.load(file)
# with open(os.path.join(data_dir, 'movie_corpus', 'corpus.json'), 'r') as file:
#     conversations = json.load(file)
# with open(os.path.join(data_dir, 'movie_corpus', 'index.json'), 'r') as file:
#     conversations = json.load(file)
# with open(os.path.join(data_dir, 'movie_corpus', 'speakers.json'), 'r') as file:
#     conversations = json.load(file)

# print(type(conversations))

# print(f'There are a total of {len(conversations)} keys in the dictionary\n')
# first_three_items = list(conversations.items())[:3]
# pp(first_three_items)

## Decision regarding data

In developing the chatbot I made the decision to collect only data from the utterances.json file to ensure the chatbot can effectively manage and understand multi-turn conversations. The essential data elements to be gathered include `'text'` for generating responses, `'conversation_id'` for tracking the flow of conversations, and `'reply_to'` for understanding response sequences within the dialogue. While initially, the chatbot will not utilize complex NLP features like parsed linguistic data, the architecture will allow for the integration of these advanced features in the future. While initially I will collect `'parsed'` and `'toks'` information from the utterances.json file, the decision on whether to use this pre-parsed data directly, generate similar data anew, or conduct comparisons between the two will be made later as the project evolves. This approach ensures flexibility in utilizing advanced NLP features as required, maintaining the adaptability of the architecture for future enhancements.

## Converting utterances data to DataFrame
Pandas provides a powerful and easy-to-use interface for data manipulation, filtering, transformation, and analysis, and integration with Python Ecosystem: seamless integration with other Python libraries for data analysis, machine learning (e.g., scikit-learn, TensorFlow), and visualization (e.g., Matplotlib, Seaborn), as well fast processing for datasets that fit comfortably in memory.

In [12]:
import numpy as np
import pandas as pd

# Flatten the data
def flatten_data(data):
    flattened_data = []
    for entry in data:
        flat_entry = {
            'id': entry['id'],
            'conversation_id': entry['conversation_id'],
            'text': entry['text'],
            'speaker': entry['speaker'],
            'reply_to': entry.get('reply-to'),
            'timestamp': entry['timestamp'],
            'movie_id': entry['meta']['movie_id'],
        }
        # Handle nested parsed data
        for parsed in entry['meta']['parsed']:
            for idx, tok in enumerate(parsed['toks']):
                flat_entry[f'tok_{idx}_token'] = tok['tok']
                flat_entry[f'tok_{idx}_tag'] = tok['tag']
                flat_entry[f'tok_{idx}_dep'] = tok['dep']
                # Add other fields from tokens as needed
        flattened_data.append(flat_entry)
    return flattened_data

# Convert to DataFrame
flattened_data = flatten_data(utterances)
df = pd.DataFrame(flattened_data)

In [13]:
# Show DataFrame to check structure
df.head()

Unnamed: 0,id,conversation_id,text,speaker,reply_to,timestamp,movie_id,tok_0_token,tok_0_tag,tok_0_dep,...,tok_121_dep,tok_122_token,tok_122_tag,tok_122_dep,tok_123_token,tok_123_tag,tok_123_dep,tok_124_token,tok_124_tag,tok_124_dep
0,L1045,L1044,They do not!,u0,L1044,,m0,They,PRP,nsubj,...,,,,,,,,,,
1,L1044,L1044,They do to!,u2,,,m0,They,PRP,nsubj,...,,,,,,,,,,
2,L985,L984,I hope so.,u0,L984,,m0,I,PRP,nsubj,...,,,,,,,,,,
3,L984,L984,She okay?,u2,,,m0,She,PRP,nsubj,...,,,,,,,,,,
4,L925,L924,Let's go.,u0,L924,,m0,Let,VB,ROOT,...,,,,,,,,,,


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Columns: 382 entries, id to tok_124_dep
dtypes: object(382)
memory usage: 888.1+ MB


In [15]:
# Temporarily adjust display settings to show all columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.isnull().sum())
#print(df.isnull().sum())

id                      0
conversation_id         0
text                    0
speaker                 0
reply_to            83097
timestamp          304713
movie_id                0
tok_0_token           267
tok_0_tag             267
tok_0_dep             267
tok_1_token           625
tok_1_tag             625
tok_1_dep             625
tok_2_token         21729
tok_2_tag           21729
tok_2_dep           21729
tok_3_token         38515
tok_3_tag           38515
tok_3_dep           38515
tok_4_token         63316
tok_4_tag           63316
tok_4_dep           63316
tok_5_token         92404
tok_5_tag           92404
tok_5_dep           92404
tok_6_token        121962
tok_6_tag          121962
tok_6_dep          121962
tok_7_token        149962
tok_7_tag          149962
tok_7_dep          149962
tok_8_token        175281
tok_8_tag          175281
tok_8_dep          175281
tok_9_token        196627
tok_9_tag          196627
tok_9_dep          196627
tok_10_token       214433
tok_10_tag  

## Saving the DataFrame

In [16]:
# Saving the DataFrame
file_path_parquet = os.path.join(data_dir, 'utterances.parquet')
df.to_parquet(file_path_parquet)

`utterances.jsonl` - 351 404 KB, `utterances.parquet` - 28 409 KB

## Loading the DataFrame

In [17]:
# Loading the DataFrame
file_path_parquet = os.path.join(data_dir, 'utterances.parquet')
df_loaded_parquet = pd.read_parquet(file_path_parquet)

df_loaded_parquet.head(20)

Unnamed: 0,id,conversation_id,text,speaker,reply_to,timestamp,movie_id,tok_0_token,tok_0_tag,tok_0_dep,...,tok_121_dep,tok_122_token,tok_122_tag,tok_122_dep,tok_123_token,tok_123_tag,tok_123_dep,tok_124_token,tok_124_tag,tok_124_dep
0,L1045,L1044,They do not!,u0,L1044,,m0,They,PRP,nsubj,...,,,,,,,,,,
1,L1044,L1044,They do to!,u2,,,m0,They,PRP,nsubj,...,,,,,,,,,,
2,L985,L984,I hope so.,u0,L984,,m0,I,PRP,nsubj,...,,,,,,,,,,
3,L984,L984,She okay?,u2,,,m0,She,PRP,nsubj,...,,,,,,,,,,
4,L925,L924,Let's go.,u0,L924,,m0,Let,VB,ROOT,...,,,,,,,,,,
5,L924,L924,Wow,u2,,,m0,Wow,UH,ROOT,...,,,,,,,,,,
6,L872,L870,Okay -- you're gonna need to learn how to lie.,u0,L871,,m0,Okay,UH,intj,...,,,,,,,,,,
7,L871,L870,No,u2,L870,,m0,No,UH,ROOT,...,,,,,,,,,,
8,L870,L870,I'm kidding. You know how sometimes you just ...,u0,,,m0,And,CC,cc,...,,,,,,,,,,
9,L869,L866,Like my fear of wearing pastels?,u0,L868,,m0,Like,IN,ROOT,...,,,,,,,,,,


## Data cleaning

### Leaving only necessary data for initial stage of the project
Id, conversation_id for tracking the flow of conversations and reply_to for understanding the sequence within the dialogue, and conversation text ofcourse.

In [18]:
conversations = df_loaded_parquet[['text', 'id', 'conversation_id', 'reply_to']]
conversations.head(30)

Unnamed: 0,text,id,conversation_id,reply_to
0,They do not!,L1045,L1044,L1044
1,They do to!,L1044,L1044,
2,I hope so.,L985,L984,L984
3,She okay?,L984,L984,
4,Let's go.,L925,L924,L924
5,Wow,L924,L924,
6,Okay -- you're gonna need to learn how to lie.,L872,L870,L871
7,No,L871,L870,L870
8,I'm kidding. You know how sometimes you just ...,L870,L870,
9,Like my fear of wearing pastels?,L869,L866,L868


In [19]:
# Cheking if counts of None are the same with 'id' == 'conversation_id'
print("Total entries where 'id' equals 'conversation_id':", (df['id'] == df['conversation_id']).sum())
print("Total entries where 'reply_to' is None:", df['reply_to'].isnull().sum())

Total entries where 'id' equals 'conversation_id': 83097
Total entries where 'reply_to' is None: 83097


## Conclusion
Looks like the dataset is well-structured and prepared for further processing: <br>
Conversation_id and id:  <br>
When conversation_id and id are the same and there's no reply_to, this indicates the start of a new conversation, this allows to understand where each conversation begins.  <br>
Counts of None in reply_to:  <br>
The count of None in the reply_to field matches the number of conversations (83,097). This confirms that each conversation starts with a message that does not reply to any previous message, this is the first message in the thread.  <br>
Data Cleanliness:  <br>
The alignment of these counts and the consistency of data formatting suggest that dataset is clean and structured. Each message within the dataset is correctly linked to its conversation, and the flow of conversations is well-defined.  <br>

In [20]:
conversations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   text             304713 non-null  object
 1   id               304713 non-null  object
 2   conversation_id  304713 non-null  object
 3   reply_to         221616 non-null  object
dtypes: object(4)
memory usage: 9.3+ MB


In [21]:
conversations.text.describe()

count     304713
unique    265774
top        What?
freq        1684
Name: text, dtype: object

## Analyze text of conversations

In [22]:
# !pip install textblob

In [23]:
from nltk import FreqDist, word_tokenize
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
import nltk

# Ensure that the punkt tokenizer is available
nltk.download('punkt')

# Basic statistics
conversations.loc[:, 'msg_length'] = conversations.loc[:, 'text'].apply(len)
conversations.loc[:, 'word_count'] = conversations.loc[:, 'text'].apply(lambda x: len(word_tokenize(x)))


print("Average message length (characters):", np.mean(conversations['msg_length']))
print("Average message length (words):", np.mean(conversations['word_count']))
print("Min message length (characters):", np.min(conversations['msg_length']))
print("Max message length (characters):", np.max(conversations['msg_length']))
print("Standard deviation (characters):", np.std(conversations['msg_length']))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations.loc[:, 'msg_length'] = conversations.loc[:, 'text'].apply(len)


Average message length (characters): 55.25953930419772
Average message length (words): 13.721094931952361
Min message length (characters): 0
Max message length (characters): 3046
Standard deviation (characters): 64.06661834805733


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations.loc[:, 'word_count'] = conversations.loc[:, 'text'].apply(lambda x: len(word_tokenize(x)))


### Word frequency analysis

In [24]:
all_words = ' '.join(conversations['text']).lower()
words = word_tokenize(all_words)
freq_dist = FreqDist(words)
print("Most common words:", freq_dist.most_common(50))

Most common words: [('.', 332912), (',', 170188), ('you', 148400), ('i', 140952), ('?', 110240), ('the', 99132), ('to', 80649), ('a', 70839), ("'s", 66538), ('it', 66076), ("n't", 55224), ('...', 50796), ('do', 47049), ('that', 46582), ('and', 45934), ('of', 39338), ('!', 37866), ('what', 37719), ('in', 34129), ('me', 32203), ('is', 31639), ('we', 29291), ('he', 27408), ('--', 26662), ('this', 24616), ('for', 23415), ('have', 22934), ("'m", 22578), ("'re", 21717), ('know', 21657), ('was', 21407), ('your', 20962), ('my', 20824), ('not', 19883), ('on', 19560), ('no', 19425), ('be', 19414), ('are', 17600), ('but', 17321), ('with', 17249), ('they', 16942), ('just', 15853), ('all', 15392), ('like', 15007), ("'ll", 14613), ('did', 14547), ('there', 14446), ('get', 14152), ('about', 14000), ('so', 13447)]


### Sentiment analysis

In [25]:
conversations.loc[:, 'sentiment'] = conversations.loc[:, 'text'].apply(lambda x: TextBlob(x).sentiment.polarity)
print("Average sentiment (polarity):", np.mean(conversations['sentiment']))
print("Sentiment distribution:", conversations['sentiment'].describe())

Average sentiment (polarity): 0.04174547982992158
Sentiment distribution: count    304713.000000
mean          0.041745
std           0.246197
min          -1.000000
25%           0.000000
50%           0.000000
75%           0.013889
max           1.000000
Name: sentiment, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations.loc[:, 'sentiment'] = conversations.loc[:, 'text'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [26]:
# Calculating zero characters messages
zero_length_messages = conversations[conversations['text'].apply(len) == 0]
print("Number of zero-length messages:", len(zero_length_messages))
zero_length_messages.sample(10)

Number of zero-length messages: 267


Unnamed: 0,text,id,conversation_id,reply_to,msg_length,word_count,sentiment
153452,,L128985,L128985,,0,0,0.0
154160,,L127954,L127954,,0,0,0.0
153539,,L128742,L128742,,0,0,0.0
153661,,L128579,L128562,L128578,0,0,0.0
154007,,L128722,L128721,L128721,0,0,0.0
153331,,L129413,L129409,L129412,0,0,0.0
101507,,L541062,L541061,L541061,0,0,0.0
154244,,L129469,L129463,L129468,0,0,0.0
213440,,L352107,L352107,,0,0,0.0
154010,,L128708,L128708,,0,0,0.0


Same id and conversation_id with None in reply_to - these messages likely represent the start of a conversation. Removing them could impact the structure of the conversation as it might remove the entry point for a conversational thread.
Different id and conversation_id with a specific reply_to - these are responses within a conversation. Their removal might disrupt the sequence, making it difficult to follow the flow of the conversation.
Messages with a specific reply_to - these indicate replies within the conversation sequence. Removing these could create gaps in the conversation history. I've decided to leave zero text conversations for now, besides thera are only 267 of these.

In [27]:
# Calculating long messages
long_messages = conversations[conversations['msg_length'] > 300]
print("Number of long messages:", len(long_messages))


Number of long messages: 3151


In [28]:
# Print sample long messages
sampled_text = long_messages.sample(1)['text'].iloc[0]
print(sampled_text)

This is what I call my secret place 'cause I come out here when I feel like bein' by myself. I used to come here with Karen Cross. She's kind of like my girlfriend, or used to be. She says she likes Jerry Maroney now. But I'm gonna get her back 'cause I love her. We used to come here and hold hands and talk and read books to each other with a flashlight. She didn't want to have anything to do with me in front of other people 'cause I don't have any money. Well, mama and me, I mean. She seemed to like me a whole lot when we were out here though. She said she loved me, too. Out here. Settin' right on that stump you're on. See, her daddy's a dentist so they're rich. So's Jerry Maroney's daddy. He owns the ice plant. Was your folks well off?


Decided to leave for now long messages - considering to use advanced NLP models such as BERT or GPT (from the transformer family), which are adept at understanding context over longer stretches of text.

## Conversations text preprocessing

### Normalize text

In [29]:
import re
import unicodedata

# Convert to lowercase
conversations.loc[:, 'text'] = conversations.loc[:, 'text'].str.lower().str.strip()
# Function to apply the regex and normalization transformations row-wise
def normalize_text(text):
    # Remove non-alphanumeric characters except for basic punctuation
    text = re.sub(r"[^a-z0-9.',!? ]", ' ', text)
    # Replace numbers with a special token
    text = re.sub(r'\d+', '<num>', text)
    # Normalize accented characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text

# Apply the normalization function to each row in the 'text' column
conversations.loc[:, 'text'] = conversations.loc[:, 'text'].apply(normalize_text)
long_messages = conversations[conversations['msg_length'] > 300]
sampled_text = long_messages.sample(1)['text'].iloc[0]
print(sampled_text)

no, ninety ninety percent of them are full of baloney. they're into the power trip, not the damage. what scares me is that this guy is so sophisticated he could blow up whatever he wants, then disappear. the worst of the bunch, they love the challenge of creating the wildest device ever... and they love the carnage.


### Remove punctuation

In [30]:
# import string
# conversations.loc[:, 'text'] = conversations.loc[:, 'text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
# long_messages = conversations[conversations['msg_length'] > 300]
# sampled_text = long_messages.sample(1)['text'].iloc[0]
# print(sampled_text)

### Remove Stopwords

In [31]:
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# conversations.loc[:, 'text'] = conversations['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
# long_messages = conversations[conversations['msg_length'] > 300]
# sampled_text = long_messages.sample(1)['text'].iloc[0]
# print(sampled_text)

## Analyze again text of conversations

In [32]:
# # Basic statistics
# conversations.loc[:, 'msg_length'] = conversations['text'].apply(len)
# conversations.loc[:, 'word_count'] = conversations['text'].apply(lambda x: len(word_tokenize(x)))


# print("Average message length (characters):", np.mean(conversations['msg_length']))
# print("Average message length (words):", np.mean(conversations['word_count']))
# print("Min message length (characters):", np.min(conversations['msg_length']))
# print("Max message length (characters):", np.max(conversations['msg_length']))
# print("Standard deviation (characters):", np.std(conversations['msg_length']))

### Tokenize text

In [33]:
conversations.loc[:, 'tokens'] = conversations.loc[:, 'text'].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations.loc[:, 'tokens'] = conversations.loc[:, 'text'].apply(word_tokenize)


In [34]:
conversations[1200: 1210]

Unnamed: 0,text,id,conversation_id,reply_to,msg_length,word_count,sentiment,tokens
1200,or czechoslovakia. the slavs have been fighti...,L2897,L2895,L2896,163,29,0.13,"[or, czechoslovakia, ., the, slavs, have, been..."
1201,eastern europe. like what? romania? hungary?,L2896,L2895,L2895,46,10,0.0,"[eastern, europe, ., like, what, ?, romania, ?..."
1202,maybe it's a ritual thing or someone trying to...,L2895,L2895,,228,48,-0.216667,"[maybe, it, 's, a, ritual, thing, or, someone,..."
1203,"look, i'm not even sure she has anything to do...",L2893,L2892,L2892,192,44,0.25,"[look, ,, i, 'm, not, even, sure, she, has, an..."
1204,what would you call her?,L2892,L2892,,24,6,0.0,"[what, would, you, call, her, ?]"
1205,who says she's a suspect?,L2891,L2887,L2890,25,7,0.0,"[who, says, she, 's, a, suspect, ?]"
1206,maybe you don't care about that either. prett...,L2890,L2887,L2889,78,17,0.0,"[maybe, you, do, n't, care, about, that, eithe..."
1207,hmmmm.,L2889,L2887,L2888,6,2,0.0,"[hmmmm, .]"
1208,pretty.,L2888,L2887,L2887,7,2,0.25,"[pretty, .]"
1209,the super said he'd seen her before but she di...,L2887,L2887,,61,15,0.234848,"[the, super, said, he, 'd, seen, her, before, ..."


### Lemmatize tokens

In [35]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
conversations.loc[:, 'tokens'] = conversations['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
conversations[1200: 1210]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tomui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,id,conversation_id,reply_to,msg_length,word_count,sentiment,tokens
1200,or czechoslovakia. the slavs have been fighti...,L2897,L2895,L2896,163,29,0.13,"[or, czechoslovakia, ., the, slav, have, been,..."
1201,eastern europe. like what? romania? hungary?,L2896,L2895,L2895,46,10,0.0,"[eastern, europe, ., like, what, ?, romania, ?..."
1202,maybe it's a ritual thing or someone trying to...,L2895,L2895,,228,48,-0.216667,"[maybe, it, 's, a, ritual, thing, or, someone,..."
1203,"look, i'm not even sure she has anything to do...",L2893,L2892,L2892,192,44,0.25,"[look, ,, i, 'm, not, even, sure, she, ha, any..."
1204,what would you call her?,L2892,L2892,,24,6,0.0,"[what, would, you, call, her, ?]"
1205,who says she's a suspect?,L2891,L2887,L2890,25,7,0.0,"[who, say, she, 's, a, suspect, ?]"
1206,maybe you don't care about that either. prett...,L2890,L2887,L2889,78,17,0.0,"[maybe, you, do, n't, care, about, that, eithe..."
1207,hmmmm.,L2889,L2887,L2888,6,2,0.0,"[hmmmm, .]"
1208,pretty.,L2888,L2887,L2887,7,2,0.25,"[pretty, .]"
1209,the super said he'd seen her before but she di...,L2887,L2887,,61,15,0.234848,"[the, super, said, he, 'd, seen, her, before, ..."


## Learning conversation id structure more accurate

In [36]:
filtered_sorted_conversations = conversations[conversations['id'].apply(lambda x: 840 <= int(x[1:]) <= 870 if x[1:].isdigit() else False)
].sort_values(by='id', key=lambda x: x.str.extract('(\d+)', expand=False).astype(int))

filtered_sorted_conversations


Unnamed: 0,text,id,conversation_id,reply_to,msg_length,word_count,sentiment,tokens
428,you're amazingly self assured. has anyone ever...,L840,L834,L839,61,12,0.6,"[you, 're, amazingly, self, assured, ., ha, an..."
427,go to the prom with me,L841,L834,L840,22,6,0.0,"[go, to, the, prom, with, me]"
426,is that a request or a command?,L842,L842,,31,8,0.0,"[is, that, a, request, or, a, command, ?]"
425,you know what i mean,L843,L842,L842,20,5,-0.3125,"[you, know, what, i, mean]"
424,no.,L844,L842,L843,3,2,0.0,"[no, .]"
423,no what?,L845,L842,L844,8,3,0.0,"[no, what, ?]"
422,"no, i won't go with you",L846,L842,L845,23,8,0.0,"[no, ,, i, wo, n't, go, with, you]"
421,why not?,L847,L842,L846,8,3,0.0,"[why, not, ?]"
420,because i don't want to. it's a stupid tradition.,L848,L842,L847,49,13,-0.8,"[because, i, do, n't, want, to, ., it, 's, a, ..."
419,create a little drama? start a new rumor? what?,L852,L852,,49,12,-0.025568,"[create, a, little, drama, ?, start, a, new, r..."


### Adding `<start>` and `<end>` tokens to elements of conversations

In [37]:
conversations.loc[:, 'text_with_tokens'] = '<start> ' + conversations.loc[:, 'text'].astype(str) + ' <end>'
conversations.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conversations.loc[:, 'text_with_tokens'] = '<start> ' + conversations.loc[:, 'text'].astype(str) + ' <end>'


Unnamed: 0,text,id,conversation_id,reply_to,msg_length,word_count,sentiment,tokens,text_with_tokens
0,they do not!,L1045,L1044,L1044,12,4,0.0,"[they, do, not, !]",<start> they do not! <end>
1,they do to!,L1044,L1044,,11,4,0.0,"[they, do, to, !]",<start> they do to! <end>
2,i hope so.,L985,L984,L984,10,4,0.0,"[i, hope, so, .]",<start> i hope so. <end>
3,she okay?,L984,L984,,9,3,0.5,"[she, okay, ?]",<start> she okay? <end>
4,let's go.,L925,L924,L924,9,4,0.0,"[let, 's, go, .]",<start> let's go. <end>
5,wow,L924,L924,,3,1,0.1,[wow],<start> wow <end>
6,okay you're gonna need to learn how to lie.,L872,L870,L871,46,13,0.5,"[okay, you, 're, gon, na, need, to, learn, how...",<start> okay you're gonna need to learn how...
7,no,L871,L870,L870,2,1,0.0,[no],<start> no <end>
8,i'm kidding. you know how sometimes you just ...,L870,L870,,101,25,0.0,"[i, 'm, kidding, ., you, know, how, sometimes,...",<start> i'm kidding. you know how sometimes y...
9,like my fear of wearing pastels?,L869,L866,L868,32,7,0.0,"[like, my, fear, of, wearing, pastel, ?]",<start> like my fear of wearing pastels? <end>


id: Unique identifier for each message. <br>
conversation_id: Identifier for the conversation to which the message belongs. All messages within the same conversation share this ID. <br>
reply_to: ID of the message to which the current message is a response. If this is None, the message is the start of a conversation thread.

## Pairing messages - input with responses

In [38]:
# Merging the DataFrame with itself to form pairs
pairs = pd.merge(
    conversations, conversations,
    left_on='id',
    right_on='reply_to',
    suffixes=('_input', '_response')
)

In [39]:
pairs.head()

Unnamed: 0,text_input,id_input,conversation_id_input,reply_to_input,msg_length_input,word_count_input,sentiment_input,tokens_input,text_with_tokens_input,text_response,id_response,conversation_id_response,reply_to_response,msg_length_response,word_count_response,sentiment_response,tokens_response,text_with_tokens_response
0,they do to!,L1044,L1044,,11,4,0.0,"[they, do, to, !]",<start> they do to! <end>,they do not!,L1045,L1044,L1044,12,4,0.0,"[they, do, not, !]",<start> they do not! <end>
1,she okay?,L984,L984,,9,3,0.5,"[she, okay, ?]",<start> she okay? <end>,i hope so.,L985,L984,L984,10,4,0.0,"[i, hope, so, .]",<start> i hope so. <end>
2,wow,L924,L924,,3,1,0.1,[wow],<start> wow <end>,let's go.,L925,L924,L924,9,4,0.0,"[let, 's, go, .]",<start> let's go. <end>
3,no,L871,L870,L870,2,1,0.0,[no],<start> no <end>,okay you're gonna need to learn how to lie.,L872,L870,L871,46,13,0.5,"[okay, you, 're, gon, na, need, to, learn, how...",<start> okay you're gonna need to learn how...
4,i'm kidding. you know how sometimes you just ...,L870,L870,,101,25,0.0,"[i, 'm, kidding, ., you, know, how, sometimes,...",<start> i'm kidding. you know how sometimes y...,no,L871,L870,L870,2,1,0.0,[no],<start> no <end>


In [40]:
# Selecting the needed columns including IDs
training_data = pairs[['id_input', 'text_with_tokens_input', 'tokens_input', 'sentiment_input', 'id_response', 'text_with_tokens_response', 'tokens_response', 'sentiment_response']]

# Renaming columns for clarity
training_data.columns = ['ID_Input', 'Input', 'Tokens_Input', 'Sentiment_Input', 'ID_Response', 'Response', 'Tokens_Response', 'Sentiment_Response']

In [41]:
training_data

Unnamed: 0,ID_Input,Input,Tokens_Input,Sentiment_Input,ID_Response,Response,Tokens_Response,Sentiment_Response
0,L1044,<start> they do to! <end>,"[they, do, to, !]",0.000000,L1045,<start> they do not! <end>,"[they, do, not, !]",0.000000
1,L984,<start> she okay? <end>,"[she, okay, ?]",0.500000,L985,<start> i hope so. <end>,"[i, hope, so, .]",0.000000
2,L924,<start> wow <end>,[wow],0.100000,L925,<start> let's go. <end>,"[let, 's, go, .]",0.000000
3,L871,<start> no <end>,[no],0.000000,L872,<start> okay you're gonna need to learn how...,"[okay, you, 're, gon, na, need, to, learn, how...",0.500000
4,L870,<start> i'm kidding. you know how sometimes y...,"[i, 'm, kidding, ., you, know, how, sometimes,...",0.000000,L871,<start> no <end>,[no],0.000000
...,...,...,...,...,...,...,...,...
221611,L666520,"<start> well i assure you, sir, i have no desi...","[well, i, assure, you, ,, sir, ,, i, have, no,...",0.000000,L666521,"<start> and i assure you, you do not in fact i...","[and, i, assure, you, ,, you, do, not, in, fac...",1.000000
221612,L666371,<start> lord chelmsford seems to want me to st...,"[lord, chelmsford, seems, to, want, me, to, st...",0.000000,L666372,<start> i think chelmsford wants a good man on...,"[i, think, chelmsford, want, a, good, man, on,...",0.355556
221613,L666370,<start> i'm to take the sikali with the main c...,"[i, 'm, to, take, the, sikali, with, the, main...",0.166667,L666371,<start> lord chelmsford seems to want me to st...,"[lord, chelmsford, seems, to, want, me, to, st...",0.000000
221614,L666369,"<start> your orders, mr vereker? <end>","[your, order, ,, mr, vereker, ?]",0.000000,L666370,<start> i'm to take the sikali with the main c...,"[i, 'm, to, take, the, sikali, with, the, main...",0.166667


In [42]:
# Checking how many pairs I shall get
len(conversations) - len(conversations.loc[:, 'conversation_id'].unique())

221616

In [43]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221616 entries, 0 to 221615
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ID_Input            221616 non-null  object 
 1   Input               221616 non-null  object 
 2   Tokens_Input        221616 non-null  object 
 3   Sentiment_Input     221616 non-null  float64
 4   ID_Response         221616 non-null  object 
 5   Response            221616 non-null  object 
 6   Tokens_Response     221616 non-null  object 
 7   Sentiment_Response  221616 non-null  float64
dtypes: float64(2), object(6)
memory usage: 13.5+ MB


In [44]:
training_data.describe()

Unnamed: 0,Sentiment_Input,Sentiment_Response
count,221616.0,221616.0
mean,0.03737,0.042546
std,0.24235,0.24588
min,-1.0,-1.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.025
max,1.0,1.0


### Checking token length distribution

In [45]:
# Assuming 'data' is your DataFrame
token_lengths_input = training_data['Tokens_Input'].apply(len)
token_lengths_response = training_data['Tokens_Response'].apply(len)

print("Input Token Lengths - Statistics:")
print(token_lengths_input.describe())

print("\nResponse Token Lengths - Statistics:")
print(token_lengths_response.describe())


Input Token Lengths - Statistics:
count    221616.000000
mean         13.191800
std          13.791245
min           0.000000
25%           5.000000
50%           9.000000
75%          16.000000
max         369.000000
Name: Tokens_Input, dtype: float64

Response Token Lengths - Statistics:
count    221616.000000
mean         13.674523
std          14.757182
min           0.000000
25%           5.000000
50%           9.000000
75%          17.000000
max         673.000000
Name: Tokens_Response, dtype: float64


## Saving the DataFrame

In [46]:
file_path_parquet = os.path.join(data_dir, 'training_data.parquet')
training_data.to_parquet(file_path_parquet)

## Loading the DataFrame

In [47]:
file_path_parquet = os.path.join(data_dir, 'training_data.parquet')
data = pd.read_parquet(file_path_parquet)
data.head(5)

Unnamed: 0,ID_Input,Input,Tokens_Input,Sentiment_Input,ID_Response,Response,Tokens_Response,Sentiment_Response
0,L1044,<start> they do to! <end>,"[they, do, to, !]",0.0,L1045,<start> they do not! <end>,"[they, do, not, !]",0.0
1,L984,<start> she okay? <end>,"[she, okay, ?]",0.5,L985,<start> i hope so. <end>,"[i, hope, so, .]",0.0
2,L924,<start> wow <end>,[wow],0.1,L925,<start> let's go. <end>,"[let, 's, go, .]",0.0
3,L871,<start> no <end>,[no],0.0,L872,<start> okay you're gonna need to learn how...,"[okay, you, 're, gon, na, need, to, learn, how...",0.5
4,L870,<start> i'm kidding. you know how sometimes y...,"[i, 'm, kidding, ., you, know, how, sometimes,...",0.0,L871,<start> no <end>,[no],0.0


In [48]:
# Check if GPU is available
import torch
import numpy as np
import pandas as pd

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Device: NVIDIA GeForce GTX 1660 Ti with Max-Q Design


## Create vocabulary and dataset for chatbot

In [49]:
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import random

class Vocabulary:
    def __init__(self):
        self.word2index = {"<pad>": 0, "<start>": 1, "<end>": 2}
        self.index2word = {0: "<pad>", 1: "<start>", 2: "<end>"}
        self.num_words = 3

    def add_sentence(self, sentence: str) -> None:
        for word in sentence.split():
            if word not in self.word2index:
                self.word2index[word] = self.num_words
                self.index2word[self.num_words] = word
                self.num_words += 1

    def sentence_to_indices(self, sentence: str) -> list:
        return [self.word2index[word] for word in sentence.split() if word in self.word2index]

class ChatDataset(Dataset):
    def __init__(self, data, vocab):
        self.inputs = [torch.tensor(vocab.sentence_to_indices(sentence)) for sentence in data['Input']]
        self.responses = [torch.tensor(vocab.sentence_to_indices(sentence)) for sentence in data['Response']]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.responses[idx]

def collate_fn(batch):
    inputs, targets = zip(*batch)
    # Pad the sequences with 0 (the index for <pad>)
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return padded_inputs, padded_targets

vocab = Vocabulary()
for sentence in data['Input'] + data['Response']:
    vocab.add_sentence(sentence)

dataset = ChatDataset(data, vocab)
indices = list(range(len(dataset)))

## Splitting the data

In [50]:
train_indices, test_indices = train_test_split(indices, test_size=0.15, random_state=22)
train_indices, val_indices = train_test_split(train_indices, test_size=0.176, random_state=22)  # Adjusting to maintain 70-15-15 split

train_loader = DataLoader([dataset[i] for i in train_indices], batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader([dataset[i] for i in val_indices], batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader([dataset[i] for i in test_indices], batch_size=2, shuffle=False, collate_fn=collate_fn)

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input)  # [batch_size, seq_len, hidden_size]
        output, hidden = self.gru(embedded, hidden)  # hidden [1, batch_size, hidden_size]
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)  # [1, batch_size, hidden_size]

class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        embedded = self.embedding(input)  # Embed the input, expecting [batch_size, seq_len] where seq_len=1
        output, hidden = self.gru(embedded, hidden)  # Process the GRU step
        output = self.softmax(self.out(output.squeeze(1)))  # Adjust softmax layer
        return output, hidden


# Define the size of the hidden layer
hidden_size = 256
encoder = Encoder(vocab.num_words, hidden_size)
decoder = Decoder(hidden_size, vocab.num_words)

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    batch_size = input_tensor.size(0)
    encoder_hidden = encoder.init_hidden(batch_size)

    # Encoder forward pass
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)

    # Initializing decoder input properly as [batch_size, 1]
    decoder_input = torch.tensor([[vocab.word2index["<start>"]] for _ in range(batch_size)], dtype=torch.long, device=input_tensor.device)
    decoder_hidden = encoder_hidden  # Direct transfer of hidden state from encoder to decoder

    loss = 0
    for di in range(target_tensor.size(1)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        # Correctly reshaping decoder_input to maintain [batch_size, 1]
        decoder_input = topi.squeeze().detach().unsqueeze(1)  # Ensure this remains [batch_size, 1]

        loss += criterion(decoder_output, target_tensor[:, di])

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_tensor.size(1)

encoder_optimizer = Adam(encoder.parameters())
decoder_optimizer = Adam(decoder.parameters())
criterion = nn.NLLLoss()

# Example training loop
for epoch in range(1, 11):  # Training for 10 epochs for demonstration
    total_loss = 0
    for input_tensor, target_tensor in train_loader:
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss
    print(f'Epoch {epoch}, Loss: {total_loss / len(train_loader)}')