# Setup

In [None]:
import pandas as pd
from tqdm import tqdm
import re
import json
import nltk
import numpy as np
import matplotlib.pyplot as plt

from random import shuffle

from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, TFAutoModelForCausalLM, AdamWeightDecay
from nltk.corpus import stopwords
from os.path import join, exists

In [None]:
from data_utils import character_dict, model_name, preprocess_function, load_df, get_predictions_cached, get_dataframe_for_metrics

from metrics import freq_pairwise_sim, filter_by_weights, get_word_frequency, get_tfidfs, FrequencyChatbotClassifier

In [None]:
# Fetch the NLTK 'stopwords' resource (needed by the nltk.corpus.stopwords import above).
nltk.download('stopwords')

In [None]:
# Character names are the keys of character_dict, minus the 'Default' entry.
characters = [*character_dict]
characters.remove('Default')

# Passed as the `mass` argument to filter_by_weights and to the classifiers' predict calls.
mass_value = 0.3

In [None]:
# Mount google drive, if in Colaboratory environment
import os
try:
    import google.colab
except ImportError:
    # Only a failed import means "not running on Colab". A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    IN_COLAB = False
else:
    IN_COLAB = True
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    # Install the extra packages, one shell command per package.
    for pip_command in (
        "pip install datasets",
        "pip install transformers",
        "pip install rouge_score",
        "pip install -U sentence-transformers",
    ):
        os.system(pip_command)
else:
    # base_folder = os.getcwd()
    base_folder = '..'

# Root folder holding one sub-folder per character.
out_folder = os.path.join(base_folder, 'Data', 'Characters')

In [None]:
def sentence_preprocessing(sentence):
    """Reduce a sentence to ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is replaced
    by a space, then each run of whitespace is collapsed into one space.
    Case is preserved and leading/trailing spaces are not stripped.
    """
    letters_and_spaces = re.sub(r'[^A-Za-z\s]', ' ', sentence)
    return re.sub(r'\s+', ' ', letters_and_spaces)

In [None]:
# Map each character to the list of values in the 'response' column of its CSV,
# read from <out_folder>/<character>/<character>.csv.
character_docs = {}
for character in characters:
    csv_path = os.path.join(out_folder, character, f'{character}.csv')
    df = pd.read_csv(csv_path)
    character_docs[character] = df['response'].tolist()

In [None]:
# Clean every line in place, so character_docs keeps the very same list objects.
for character in characters:
    docs = character_docs[character]
    for i, raw_sentence in enumerate(tqdm(docs)):
        docs[i] = sentence_preprocessing(raw_sentence)

Split into train and test sets

In [None]:
# Fraction of each character's lines held out for testing.
test_size = 0.33
character_docs_train, character_docs_test = {}, {}
for c in characters:
    docs = character_docs[c]
    # In-place shuffle: character_docs[c] itself gets reordered.
    shuffle(docs)
    end_idx = int(len(docs) * test_size)
    # The first end_idx shuffled lines are the test split, the remainder the train split.
    character_docs_test[c] = docs[:end_idx]
    character_docs_train[c] = docs[end_idx:]

# Word Frequency

In [None]:
# Per-character word frequencies, computed over all of the character's lines
# joined into one space-separated string.
wordfreqs = {
    character: get_word_frequency(' '.join(character_docs[character]))
    for character in tqdm(characters)
}

In [None]:
# Word frequencies after filter_by_weights is applied with mass=mass_value.
wordfreqs_reduced = {
    character: filter_by_weights(wordfreqs[character], mass=mass_value)
    for character in characters
}

# TF-IDF

In [None]:
# TF-IDF vectorizer that receives raw strings (input='content') and drops
# scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english')

In [None]:
# One document per character: all of that character's lines joined by spaces.
joined_docs = [' '.join(character_docs[name]) for name in characters]
tfidfs = get_tfidfs(joined_docs, characters, tfidf_vectorizer)

In [None]:
# TF-IDF weights after filter_by_weights is applied with mass=mass_value.
tfidfs_reduced = {
    character: filter_by_weights(tfidfs[character], mass=mass_value)
    for character in characters
}

# WordCloud Plot

In [None]:
from wordcloud import WordCloud

def plot_word_cloud(freqdict, cmap='viridis', title=None, plot=False):
    """Draw a word cloud for a word -> weight mapping on a fresh figure.

    Args:
        freqdict: Mapping handed to ``WordCloud.generate_from_frequencies``.
        cmap: Matplotlib colormap name used to colour the words.
        title: Optional plot title; skipped when falsy.
        plot: Currently unused; kept so existing callers keep working.
    """
    wordcloud = WordCloud(background_color='black', width=800, height=400,
                          colormap=cmap, max_words=180, contour_width=3,
                          max_font_size=80, contour_color='steelblue',
                          random_state=0)
    wordcloud.generate_from_frequencies(freqdict)

    # Open the figure *before* drawing. Creating it afterwards drew the cloud
    # on whatever figure happened to be current and left an empty one behind.
    plt.figure()
    if title:
        plt.title(title)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

In [None]:
# Word cloud of Barney's filtered TF-IDF weights.
plot_word_cloud(tfidfs_reduced['Barney'])

# Frequency Pairwise Similarity

In [None]:
# Pairwise similarity between Fry's and Barney's filtered TF-IDF weights
# (the value is shown as the cell output).
freq_pairwise_sim(tfidfs_reduced['Fry'], tfidfs_reduced['Barney'])

# Test classifiers on test set 

In [None]:
# Train the word-frequency classifier on the train split, then check for each
# character whether its test split is attributed to that same character.
wf_classifier = FrequencyChatbotClassifier(characters, mode='word frequency')
train_docs = list(character_docs_train.values())
wf_classifier.train(train_docs)
predictions = []
for c in tqdm(characters):
    prediction = wf_classifier.predict(character_docs_test[c], mass=mass_value)
    # Key with the highest value in the returned mapping.
    best_match = max(prediction, key=prediction.get)
    predictions.append(int(best_match == c))

accuracy = sum(predictions)/len(predictions)
print('Frequency classifier test accuracy: {:.2f}'.format(accuracy))

In [None]:
# Same evaluation as for the word-frequency classifier, in 'tf-idf' mode:
# train on the train split, score 1 per character whose test split is
# attributed to that character, 0 otherwise.
tfidf_classifier = FrequencyChatbotClassifier(characters, mode='tf-idf')
tfidf_train_docs = list(character_docs_train.values())
tfidf_classifier.train(tfidf_train_docs)
predictions = []
for c in characters:
    prediction = tfidf_classifier.predict(character_docs_test[c], mass=mass_value)
    top_character = max(prediction, key=prediction.get)
    predictions.append(int(top_character == c))

tfidf_accuracy = sum(predictions)/len(predictions)
print('TF-IDF classifier test accuracy: {:.2f}'.format(tfidf_accuracy))

# Test classifiers on chatbot sentences

In [None]:
# Re-create the TF-IDF classifier and train it on every character's full set
# of lines (train and test splits together).
tfidf_classifier = FrequencyChatbotClassifier(characters, mode='tf-idf')
all_character_docs = list(character_docs.values())
tfidf_classifier.train(all_character_docs)

In [None]:
# Tokenizer for the model identified by model_name, cached under "cache".
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=join("cache"))
# '#' is used as the padding token.
tokenizer.pad_token = '#'
# Collator with mlm=False that returns TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors='tf',
)

In [None]:
### create dataset
# Generate n_tests rounds of chatbot replies for every character and collect
# the cleaned sentences in doc_test.
n_tests = 10
# Filled in run-major order: entry r * len(characters) + k holds the sentences
# of run r for characters[k].
doc_test = []
batch_size = 8
override_predictions = True
predictions = {c:[] for c in characters}
raw_predictions = {c:[] for c in characters}
print('Creating dataset...')
if n_tests > 1 and not override_predictions:
    # Every run writes to the same prediction file name, so repeated runs
    # only make sense when the previous predictions are overridden.
    raise ValueError('must override previous predictions if you need more tests')

for i in range(n_tests):
    # 1-based label, so the last run is reported as "n_tests/n_tests".
    print(f'Run {i + 1}/{n_tests}')
    for character in tqdm(characters):
        character_checkpoint = join(out_folder, character, character_dict[character]['checkpoint_folder'])
        # The fine-tuned model is only loaded when predictions are regenerated.
        if override_predictions:
            model_chatbot = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=character_checkpoint)
            model_chatbot.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))
        else:
            model_chatbot = None

        character_hg = load_df(character)
        # Turn our dataset into sequences of tokens
        tokenized_character_hg = character_hg.map(preprocess_function, batched=False)

        # Define tensorflow datasets
        # NOTE(review): encoded_test_set is not used anywhere else in this cell -
        # confirm whether this tokenisation step can be dropped.
        encoded_test_set = tokenized_character_hg["test"].to_tf_dataset(
            columns=["input_ids", "attention_mask", "labels"],
            shuffle=False,
            batch_size=batch_size,
            collate_fn=data_collator,
        )

        # Use the test set's 'context/0' column as the sample questions
        sample_questions = character_hg['test']['context/0']

        # Sampling generation method
        predictions_sampling = get_predictions_cached(
            sample_questions,
            model_chatbot,
            character_dict[character]['prediction_filename'] + '_sampling.json',
            "Sampling",
            character,
            tokenizer,
            override_predictions=override_predictions
        )

        # Keep only the 'prd_sampling' column and clean it like the training lines.
        sentences = get_dataframe_for_metrics(character_hg['test'], None, None, predictions_sampling, tokenizer)['prd_sampling'].tolist()
        doc_test.append([sentence_preprocessing(s) for s in sentences])

In [None]:
### prediction on last test
print('Classification...')
n_characters = len(characters)
if len(doc_test) < n_characters:
    raise ValueError('doc_test has fewer entries than characters; re-run the dataset creation cell')
# doc_test gets one entry per (run, character), appended run after run, so the
# last run is the trailing n_characters entries. Indexing doc_test[c] from the
# front would classify the first run instead.
last_run_docs = doc_test[len(doc_test) - n_characters:]
for character, docs in tqdm(zip(characters, last_run_docs), total=n_characters):
    prediction = tfidf_classifier.predict(docs, mass=mass_value)
    raw_predictions[character].append(prediction)
    # 1 when the key with the highest value is the character itself, else 0.
    predictions[character].append(
        int(max(prediction, key=prediction.get) == character)
    )

In [None]:
# Show the classifier outputs collected per character.
raw_predictions

In [None]:
# Show the 0/1 outcomes collected per character.
predictions

In [None]:
# Accuracy over characters, using only the most recent outcome of each one.
latest_outcomes = [char_pred[-1] for char_pred in predictions.values()]
print('TF-IDF classifier test accuracy: {:.2f}'.format(sum(latest_outcomes)/len(predictions)))

In [None]:
### save predictions
# append_predictions: extend the results already stored in the file, if any.
# override_predictions: start from empty results even when the file exists.
append_predictions = True
override_predictions = False
# Store the results under base_folder like the rest of the data. base_folder is
# '..' locally and the mounted Drive project folder on Colab, where a
# hard-coded '..' would not point into the project.
predictions_file = join(base_folder, 'Data', 'tfidf_predictions.json')

if append_predictions and exists(predictions_file):
    with open(predictions_file, 'r', encoding='utf-8') as file:
        predictions_dict = json.load(file)
elif override_predictions or not exists(predictions_file):
    predictions_dict = {'one_hot':{c:[] for c in characters}, 'raw_predictions': {c:[] for c in characters}}
else:
    raise NotImplementedError(
        'predictions file exists but neither append_predictions nor override_predictions is set'
    )

for c in characters:
    # setdefault keeps this working when the loaded file lacks a section or a
    # character that is present in `characters`.
    predictions_dict.setdefault('one_hot', {}).setdefault(c, []).extend(predictions[c])
    predictions_dict.setdefault('raw_predictions', {}).setdefault(c, []).extend(raw_predictions[c])

with open(predictions_file, 'w', encoding='utf-8') as file:
    json.dump(predictions_dict, file)



In [None]:
# Show everything that was just written to the predictions file.
predictions_dict