In [1]:
import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.snowball import EnglishStemmer
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
import string
import sklearn
from sklearn.metrics import classification_report
from collections import namedtuple, defaultdict, Counter
from math import log10, sqrt
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [2]:
#vader_model = SentimentIntensityAnalyzer()
#nlp = spacy.load('en_core_web_sm') # 'en_core_web_sm'

In [3]:
# Read cleaned_data.csv, discard the leftover 'Unnamed: 0' column and
# lowercase every entry of the 'Text' column.
df = pd.read_csv("cleaned_data.csv")
df = df.drop(columns=['Unnamed: 0'])
df['Text'] = df['Text'].str.lower()
display(df)

Unnamed: 0,Text,Speaker
0,there's nothing to tell! he's just some guy i...,monica
1,"c'mon, you're going out with the guy! there's...",joey
2,so does he have a hump? a hump and a hairpiece?,chandler
3,"wait, does he eat chalk?",phoebe
4,"just, 'cause, i don't want her to go through ...",phoebe
...,...,...
52557,"no, no, no, no, no!",rachel
52558,"no, no, no, no, no!",phoebe
52559,"no, no, no, no, no!",joey
52560,"no, no, no, no, no!",chandler


In [4]:
# Text per person, tokenized and stemmed.
# Each *_text list holds one list of cleaned, stemmed tokens per row of `df`.
stemmer = EnglishStemmer()
monica_text = []
chandler_text = []
joey_text = []
phoebe_text = []
rachel_text = []
ross_text = []

# Dispatch table: speaker label -> the list that collects that speaker's sentences.
text_by_speaker = {
    "monica": monica_text,
    "chandler": chandler_text,
    "joey": joey_text,
    "phoebe": phoebe_text,
    "rachel": rachel_text,
    "ross": ross_text,
}

# Built once (it does not depend on the row): maps every punctuation code
# point to '' so that str.translate() deletes those characters.
table = {ord(char): '' for char in string.punctuation}

# Iterate over *every* row. The previous `range(1, len(df))` started at 1 and
# therefore silently dropped the first row of the data.
for text, speaker in zip(df["Text"], df["Speaker"]):
    tokenized_text = nltk.word_tokenize(text)  # tokenize text

    # Strip punctuation from each token, then stem what is left.
    cleaned_messy_sentence = [
        stemmer.stem(messy_word.translate(table))
        for messy_word in tokenized_text
    ]
    # Tokens that consisted only of punctuation are now empty; drop them.
    cleaned_sentence = [token for token in cleaned_messy_sentence if token != '']

    if speaker not in text_by_speaker:
        # Unknown speaker label: report it and stop processing further rows.
        print("SOMETHING WENT WRONG!!!")
        break
    text_by_speaker[speaker].append(cleaned_sentence)

#print(monica_text)

## Document Frequency and Term Frequency

In [5]:
# For each group, the term frequency.
# [0] = monica, [1]=chandler, etc.
all_tokens_tf = []

# All classes gathered into one list, one entry per speaker, in the order
# monica, chandler, joey, phoebe, rachel, ross.
all_text_classified = [
    monica_text,
    chandler_text,
    joey_text,
    phoebe_text,
    rachel_text,
    ross_text,
]

In [6]:
class_names = ['monica', 'chandler', 'joey', 'phoebe', 'rachel', 'ross']
number_of_speakers = len(class_names)
inverted_index = defaultdict(list)   # term -> names of the speakers who use it
tf_matrix = defaultdict(Counter)     # speaker name -> Counter of that speaker's tokens

for position, sentences in enumerate(all_text_classified):
    # `sentences` = all sentences of one speaker; monica, chandler, etc.
    speaker_name = class_names[position]

    # Flatten the speaker's sentences into one long token list.
    speaker_tokens = [token for sentence in sentences for token in sentence]

    # Document Frequency: register the speaker once for each distinct term.
    for unique_term in set(speaker_tokens):
        inverted_index[unique_term].append(speaker_name)

    # Term Frequency: raw token counts for this speaker.
    tf_matrix[speaker_name] = Counter(speaker_tokens)
    
def tf(term,character):
    """Return the term frequency of `term` for `character` as a float.

    Args:
        term: the (stemmed) token to look up.
        character: speaker name used as key in the module-level `tf_matrix`.

    Returns:
        The count stored in `tf_matrix[character]` for `term`, or 0.0 when
        either the speaker or the term is unknown.
    """
    # Use .get() rather than tf_matrix[character]: tf_matrix is a defaultdict,
    # so indexing with an unknown speaker would insert an empty entry as a
    # side effect of what should be a read-only lookup.
    counts = tf_matrix.get(character)
    if counts is None:
        return 0.0
    return float(counts.get(term, 0))

def df(term):
    """Return the document frequency of `term` as a float.

    The document frequency is the number of speakers listed for `term` in the
    module-level `inverted_index`; a term that is not in the index yields 0.0.

    NOTE: this definition rebinds the module-level name `df`, which the
    data-loading cell also uses for the DataFrame. After this cell has run,
    re-run the loading cell before re-running any cell that needs the
    DataFrame. The name is kept because other cells call `df(term)`.
    """
    # .get() instead of inverted_index[term]: inverted_index is a defaultdict,
    # so indexing with an unseen term would add an empty list to the index.
    return float(len(inverted_index.get(term, ())))

In [7]:
# Example TF lookup for the token stored in `term`, per speaker.
# In other words: how many times does each speaker say `term` (in our data)?
# NOTE: if a speaker never says the term -> (tf = 0, thus tfidf = 0)
term = 'sweeti'
for character in class_names:
    term_count = tf(term, character)
    print(character, term, term_count)

# Example DF lookup for the same token.
# The number is how many speakers say the term at least once.
# NOTE: if all speakers say the term -> (idf = 0, thus tfidf = 0)
speakers_using_term = df(term)
print('Number of speakers saying the term: ', speakers_using_term)

monica sweeti 49.0
chandler sweeti 3.0
joey sweeti 2.0
phoebe sweeti 4.0
rachel sweeti 14.0
ross sweeti 24.0
Number of speakers saying the term:  6.0


In [8]:
# IDF and TF-IDF, with add-one smoothing (taking null values into account)
# NOTE: idf-score = 0 (and thus tfidf = 0) when all speakers say the term
def idf(term):
    """Return the add-one-smoothed inverse document frequency of `term`.

    Computed as log10((number_of_speakers + 1) / (df(term) + 1)), so the
    result is 0 when df(term) equals number_of_speakers.
    """
    smoothed_speaker_count = number_of_speakers + 1
    smoothed_document_frequency = df(term) + 1
    return log10(smoothed_speaker_count / smoothed_document_frequency)

def tfidf(term, speaker):
    """Return tf(term, speaker) multiplied by idf(term)."""
    term_frequency = tf(term, speaker)
    inverse_document_frequency = idf(term)
    return term_frequency * inverse_document_frequency

In [9]:
def score(query_terms, speaker):  # ntn.nnn
    """Return the sum of tfidf(term, speaker) over all terms in `query_terms`.

    A term that occurs several times in `query_terms` is counted each time.
    An empty `query_terms` gives 0.
    """
    return sum(tfidf(query_term, speaker) for query_term in query_terms)

In [10]:
def retrieve_speakers(query):
    """Return the speakers who use at least one term of `query`.

    Args:
        query: iterable of (cleaned, stemmed) query terms.

    Returns:
        A list of speaker names without duplicates, in the order in which
        they are first encountered while walking the query terms and, per
        term, that term's entry in the module-level `inverted_index`.
        Terms that are not in the index contribute nothing.
    """
    # A dict keeps insertion order and gives constant-time duplicate checks.
    seen = {}
    for term in query:
        # .get() instead of inverted_index[term]: inverted_index is a
        # defaultdict, so indexing with an unknown query term would insert an
        # empty list and make every query grow the index.
        for character in inverted_index.get(term, ()):
            seen.setdefault(character, None)
    return list(seen)

In [11]:
def scoring_query(query):
    """Rank speakers for a free-text query by their summed TF-IDF score.

    The query is lowercased, tokenized with nltk.word_tokenize, stripped of
    punctuation and stemmed with the module-level `stemmer`, i.e. the same
    steps the notebook applies to the corpus text.

    Args:
        query: the raw query string.

    Returns:
        A list of (speaker, score) tuples for the speakers returned by
        retrieve_speakers(), sorted by score from highest to lowest.
    """
    # Lowercase *before* tokenizing, as is done for the corpus. The previous
    # version built a lowercased token list but never used it, so the
    # lowercasing step was dead code.
    tokenized_query = nltk.word_tokenize(query.lower())

    # Maps every punctuation code point to '' so translate() deletes it.
    table = {ord(char): '' for char in string.punctuation}
    cleaned_query = []
    for messy_word in tokenized_query:
        cleaned_word_stemmed = stemmer.stem(messy_word.translate(table))
        # Tokens that were pure punctuation are empty by now; skip them.
        if cleaned_word_stemmed != '':
            cleaned_query.append(cleaned_word_stemmed)

    speakers_and_score = [
        (speaker, score(cleaned_query, speaker))
        for speaker in retrieve_speakers(cleaned_query)
    ]

    # Highest score first; sorted() is stable, so ties keep retrieval order.
    return sorted(speakers_and_score, key=lambda pair: pair[1], reverse=True)

In [12]:
# Example: rank the speakers for the two-term query 'hi bold'.
# Score is 0 either when:
# 1. The speaker does not say the particular term (tf = 0, thus tfidf = 0)
# 2. All speakers say the particular term (idf = 0, thus tfidf = 0)
scoring_query('hi bold')

[('chandler', 0.36797678529459443),
 ('ross', 0.36797678529459443),
 ('monica', 0.0),
 ('joey', 0.0),
 ('phoebe', 0.0),
 ('rachel', 0.0)]

## Training BERT

In [13]:
# Model configuration # https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model 
#model_args = ClassificationArgs()

#model_args.overwrite_output_dir=True # overwrite existing saved models in the same directory
#model_args.evaluate_during_training=True # to perform evaluation while training the model
# (eval data should be passed to the training method)

#model_args.num_train_epochs=10 # number of epochs
#model_args.train_batch_size=32 # batch size
#model_args.learning_rate=4e-6 # learning rate
#model_args.max_seq_length=256 # maximum sequence length
# Note! Increasing max_seq_len may provide better performance, but training time will increase. 
# For educational purposes, we set max_seq_len to 256.

# Early stopping to combat overfitting: https://simpletransformers.ai/docs/tips-and-tricks/#using-early-stopping
#model_args.use_early_stopping=True
#model_args.early_stopping_delta=0.01 # "The improvement over best_eval_loss necessary to count as a better checkpoint"
#model_args.early_stopping_metric='eval_loss'
#model_args.early_stopping_metric_minimize=True
#model_args.early_stopping_patience=2
#model_args.evaluate_during_training_steps=32 # how often you want to run validation in terms of training steps (or batches)

In [14]:
#train_data = []
#train_target = []
#for i in range(100):
#    for word in monica_text[i]:
#        string = ''
#        string.append(word)
#        print(string)
#        train_data.append(string)
#        train_target.append('monica')
#    for word in rachel_text[i]:
#        string = ''
#        string.append(word)
#        print(string)
#        train_data.append(string)
#        train_target.append('rachel')
#    for word in phoebe_text[i]:
#        string = ''
#        string.append(word)
#        print(string)
#        train_data.append(string)
#        train_target.append('phoebe')
#train = pd.DataFrame({'text': train_data, 'labels': train_target})

#test_data = []
#test_target = []
#for i in range(100):
#    for word in monica_text[i]:
#        string = ''
#        string.append(word)
#        print(string)
#        test_data.append(string)
#        test_target.append('monica')
#    for word in rachel_text[i]:
#        string = ''
#        string.append(word)
#        print(string)
#        test_data.append(string)
#        test_target.append('rachel')
#    for word in phoebe_text[i]:
#        string = ''
#        string.append(word)
#        print(string)
#        test_data.append(string)
#        test_target.append('phoebe')
#test = pd.DataFrame({'text': test_data, 'labels': test_target})

In [15]:
# Checking steps per epoch
#steps_per_epoch = int(np.ceil(len(train) / float(model_args.train_batch_size)))
#print('Each epoch will have {:,} steps.'.format(steps_per_epoch)) # 64 steps = validating 2 times per epoch

In [16]:
#from sklearn.model_selection import train_test_split

#train, dev = train_test_split(train, test_size=0.1, random_state=0, 
#                               stratify=train[['labels']])

In [17]:
#model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args=model_args, use_cuda=False) # CUDA is enabled

In [18]:
#_, history = model.train_model(train, eval_df=dev) 

In [19]:
# Training and evaluation loss
#train_loss = history['train_loss']
#eval_loss = history['eval_loss']
#plt.plot(train_loss, label='Training loss')
#plt.plot(eval_loss, label='Evaluation loss')
#plt.title('Training and evaluation loss')
#plt.legend()


# Evaluate the model
#result, model_outputs, wrong_predictions = model.eval_model(dev)
#result

In [20]:
#predicted, probabilities = model.predict(test.text.to_list())
#test['predicted'] = predicted