# Setup

In [None]:
from data_utils import character_dict
from data_utils import random_state
from frequency_utils import filter_by_weights, get_word_frequency, get_tfidfs, FrequencyChatbotClassifier
from metric_utils import freq_pairwise_sim
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Names of all characters known to character_dict, without the 'Default' entry.
characters = [*character_dict]
characters.remove('Default')

In [None]:
# Detect Google Colab. When running there, mount Google Drive, point
# base_folder at the project folder on Drive and pip-install extra packages.
# Otherwise use the parent of the current working directory.
import os
try:
    import google.colab  # imported only to detect the Colab runtime
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; a bare `except:` would also
    # swallow KeyboardInterrupt and unrelated errors.
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = '..'

# Root folder of the per-character data: <base_folder>/Data/Characters.
out_folder = os.path.join(base_folder, 'Data', 'Characters')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords

In [None]:
# Download the NLTK 'stopwords' corpus (the data behind nltk.corpus.stopwords).
nltk.download('stopwords')

In [None]:
# For every character, read <out_folder>/<character>/<character>.csv and split
# its rows 67% / 33% into train and test; only the 'response' column is kept.
character_docs = {}
for character in characters:
    csv_path = os.path.join(out_folder, character, f'{character}.csv')
    df = pd.read_csv(csv_path)
    # random_state is imported from data_utils (presumably a fixed seed, so the
    # split is reproducible -- confirm there).
    df_train, df_test = train_test_split(df, test_size=0.33, random_state=random_state)
    character_docs[character] = {
        split: frame['response'].tolist()
        for split, frame in (('train', df_train), ('test', df_test))
    }

In [None]:
# Sanity check: number of train and test responses for the character 'Vader'.
len(character_docs['Vader']['train']), len(character_docs['Vader']['test'])

In [None]:
# Patterns are compiled once instead of being looked up for every response.
_NON_LETTER = re.compile(r'[^A-Za-z\s]')
_WHITESPACE_RUN = re.compile(r'\s+')


def _clean_text(text):
    """Replace every character that is neither an ASCII letter nor whitespace
    with a space, then collapse each run of whitespace into a single space."""
    return _WHITESPACE_RUN.sub(' ', _NON_LETTER.sub(' ', text))


# Clean both splits of every character with the same routine. Slice assignment
# keeps the existing list objects, so the lists are updated in place as before.
for character in tqdm(characters):
    for split in ('train', 'test'):
        docs = character_docs[character][split]
        docs[:] = [_clean_text(doc) for doc in docs]

# Word Frequency

In [None]:
# Word frequencies per character, computed separately for each split. All
# responses of a split are first joined into one space-separated string.
wordfreqs_train = {
    name: get_word_frequency(' '.join(character_docs[name]['train']), f_sorted=True)
    for name in tqdm(characters)
}

wordfreqs_test = {
    name: get_word_frequency(' '.join(character_docs[name]['test']), f_sorted=True)
    for name in tqdm(characters)
}

In [None]:
# Reduced word-frequency tables, one per character and split.
# NOTE(review): mass=0.3 presumably keeps only the terms that make up 30% of
# the total weight -- confirm against filter_by_weights in frequency_utils.
wordfreqs_reduced_train = {
    name: filter_by_weights(wordfreqs_train[name], mass=0.3)
    for name in characters
}

wordfreqs_reduced_test = {
    name: filter_by_weights(wordfreqs_test[name], mass=0.3)
    for name in characters
}

# TF-IDF

In [None]:
# TF-IDF vectorizer fed with raw strings (input='content'), using
# scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english')

In [None]:
# One document per character: all of its training responses joined by spaces.
# The documents are passed in the same order as `characters`.
tfidfs = get_tfidfs(
    [' '.join(character_docs[name]['train']) for name in characters],
    characters,
    tfidf_vectorizer,
)

In [None]:
# Reduced TF-IDF table per character, filtered with the same mass=0.3 setting
# used for the word-frequency tables.
tfidfs_reduced = {
    name: filter_by_weights(tfidfs[name], mass=0.3)
    for name in characters
}

# WordCloud Plot

In [None]:
from wordcloud import WordCloud

def plot_word_cloud(freqdict, cmap='viridis', title=None, plot=False):
    """Draw a word cloud of weighted terms on a new matplotlib figure.

    Args:
        freqdict: Mapping from word to weight; handed unchanged to
            ``WordCloud.generate_from_frequencies``.
        cmap: Colormap name forwarded to ``WordCloud(colormap=...)``.
        title: Optional figure title; nothing is drawn when it is falsy.
        plot: Currently unused. Kept so existing calls that pass it still work.

    Returns:
        None. The cloud is drawn on the current pyplot figure.
    """
    wordcloud = WordCloud(background_color='black', width=800, height=400,
                          colormap=cmap, max_words=180, contour_width=3,
                          max_font_size=80, contour_color='steelblue',
                          random_state=0)
    wordcloud.generate_from_frequencies(freqdict)

    # Open the figure BEFORE drawing. Calling plt.figure() after imshow (as
    # before) draws onto whatever figure happens to be current and leaves an
    # empty extra figure behind.
    plt.figure()
    if title:
        plt.title(title)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

In [None]:
# Word cloud of Barney's reduced TF-IDF weights (default colormap, no title).
plot_word_cloud(tfidfs_reduced['Barney'])

# Frequency Pairwise Similarity

In [None]:
# Compare Fry's and Barney's reduced TF-IDF weights with freq_pairwise_sim.
freq_pairwise_sim(tfidfs_reduced['Fry'], tfidfs_reduced['Barney'])

# Frequency Classifier

In [None]:
# Fit a classifier in 'word frequency' mode on every character's training
# responses (one list per character, in `characters` order), then print its
# prediction for Barney's held-out responses.
wf_classifier = FrequencyChatbotClassifier(characters, mode='word frequency')
wf_classifier.train(
    [character_docs[name]['train'] for name in characters]
)
print(
    wf_classifier.predict(character_docs['Barney']['test'], mass=0.3)
)

In [None]:
# Same experiment in 'tf-idf' mode: fit on every character's training
# responses, then print the prediction for Barney's held-out responses.
tfidf_classifier = FrequencyChatbotClassifier(characters, mode='tf-idf')
tfidf_classifier.train(
    [character_docs[name]['train'] for name in characters]
)
print(
    tfidf_classifier.predict(character_docs['Barney']['test'], mass=0.3)
)

# Test performance

In [None]:
from sklearn.metrics import confusion_matrix

## Word Frequency

In [None]:
# Re-create and fit the 'word frequency' classifier used for the evaluation
# below; training data is one list of responses per character.
wf_classifier = FrequencyChatbotClassifier(characters, mode='word frequency')
wf_classifier.train(
    [character_docs[name]['train'] for name in characters]
)

In [None]:
# One evaluation sample per character: the true label is the character's index
# in `characters`, the predicted label is the position of the highest value in
# the dict returned by predict() for that character's whole test set.
# NOTE(review): this assumes predict() returns its values in the same order as
# `characters` -- confirm in FrequencyChatbotClassifier.
y_true = range(len(characters))
y_pred = [
    np.argmax(
        list(wf_classifier.predict(character_docs[name]['test'], mass=0.3).values())
    )
    for name in characters
]

In [None]:
# Confusion matrix over character indices; every character contributes exactly
# one (true, predicted) pair.
confusion_matrix(y_true, y_pred)

## TF-IDF

In [None]:
# Fit the classifier used for the TF-IDF evaluation below.
# NOTE: the variable is still called wf_classifier, but it now holds a
# classifier in 'tf-idf' mode and replaces the word-frequency one.
wf_classifier = FrequencyChatbotClassifier(characters, mode='tf-idf')
wf_classifier.train(
    [character_docs[name]['train'] for name in characters]
)

In [None]:
# Same evaluation as for word frequency: one sample per character, predicted
# label = position of the highest value in predict()'s returned dict.
# NOTE(review): assumes the dict values follow the order of `characters`.
y_true = range(len(characters))
y_pred = [
    np.argmax(
        list(wf_classifier.predict(character_docs[name]['test'], mass=0.3).values())
    )
    for name in characters
]

In [None]:
# Confusion matrix for the tf-idf classifier; one (true, predicted) pair per
# character.
confusion_matrix(y_true, y_pred)