In [1]:
# Mount google drive
import os
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)
out_folder = os.path.join(base_folder, "out")
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
character_dict = {
    'Barney':{
        'classifier_name': 'barney_classifier',
        'series_df_filename': 'HIMYM_preprocessed.csv',
        'classifier_df': 'barney_classifier.csv',
        'encoded_lines_filename': 'barney_encoded_lines.npy'
    },
    'Sheldon':{
        'classifier_name': 'sheldon_classifier'
    },
    'Harry':{
        'classifier_name': 'harry_classifier'
    },
    'Fry':{
        'classifier_name': 'fry_classifier'
    }
             }

In [4]:
batch_size = 16
epochs = 15
lr = 1e-6

train_size = 0.85
test_size = 0.10

from_saved_embeddings = False

character = 'Barney'

In [5]:
character_folder = os.path.join(base_folder, "Datasets", "Characters", character)

model_path = os.path.join(character_folder, character_dict[character]['classifier_name'])

# Dataset

In [6]:
series_df = pd.read_csv(os.path.join(character_folder, character_dict[character]['series_df_filename']))

In [7]:
series_df[series_df['character']==character]

Unnamed: 0,episode,line,character
14,01x01,"hey, so you know how I've always had a thing f...",Barney
16,01x01,"Okay, meet me at the bar in fifteen minutes, a...",Barney
18,01x01,Where's your suit!? Just once when I say suit ...,Barney
20,01x01,It was a blazer!,Barney
22,01x01,I see what this is about. Have you forgotten w...,Barney
...,...,...,...
31210,08x24,I'm probably saying some political stuff right...,Barney
31214,08x24,Whoa. Is there going to be a fight?,Barney
31449,09x10,"Karate Kid bad boy Billy Zabka, a shifty-eyed ...",Barney
31557,09x15,me or you?,Barney


In [8]:
series_df['character'] = series_df['character'].apply(lambda x: 1 if x==character else 0)

In [9]:
series_df[series_df['character']==1]

Unnamed: 0,episode,line,character
14,01x01,"hey, so you know how I've always had a thing f...",1
16,01x01,"Okay, meet me at the bar in fifteen minutes, a...",1
18,01x01,Where's your suit!? Just once when I say suit ...,1
20,01x01,It was a blazer!,1
22,01x01,I see what this is about. Have you forgotten w...,1
...,...,...,...
31210,08x24,I'm probably saying some political stuff right...,1
31214,08x24,Whoa. Is there going to be a fight?,1
31449,09x10,"Karate Kid bad boy Billy Zabka, a shifty-eyed ...",1
31557,09x15,me or you?,1


In [10]:
series_df = series_df.drop(columns=['episode'])

In [11]:
classes = (0, 1)

# Model

## Sentence Transformer

In [12]:
from sentence_transformers import SentenceTransformer

if not from_saved_embeddings:
    sentence_transformer = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

In [13]:
if not from_saved_embeddings:
    sentence_transformer

## Sentence Encoding

In [14]:
if not from_saved_embeddings:

    series_df['encoded_line'] = [sentence_transformer.encode(line) for line in tqdm(series_df['line'])]

    # save sentences dataset
    series_df[['line', 'character']].to_csv(
        os.path.join(character_folder, character_dict[character]['classifier_df']), 
        index = False
    )

    np.save(
        os.path.join(character_folder, character_dict[character]['encoded_lines_filename']),
        series_df['encoded_line'].to_numpy()
    )

100%|████████████████████████████████████████████████████████████████████████████| 31776/31776 [06:06<00:00, 86.59it/s]


In [15]:
from numba import cuda

if not from_saved_embeddings: 
    # free gpu memory
    cuda.select_device(0)
    cuda.close()

In [16]:
# read sentences dataaset
series_df = pd.read_csv(
    os.path.join(character_folder, character_dict[character]['classifier_df']),
    dtype={'line': str,
           'character': int
          }
)

series_df['encoded_line'] = np.load(
    os.path.join(character_folder, character_dict[character]['encoded_lines_filename']), 
    allow_pickle=True
)

In [17]:
series_df

Unnamed: 0,line,character,encoded_line
0,"Kids, I'm going to tell you an incredible stor...",0,"[-0.0060349987, 0.3398651, -0.013500607, 0.051..."
1,Are we being punished for something?,0,"[0.24517804, 0.06231432, -0.015702646, 0.03040..."
2,No,0,"[0.010471302, 0.074272856, -0.015337698, 0.099..."
3,"Yeah, is this going to take a while?",0,"[0.25085106, 0.3401538, -0.0066933706, -0.1009..."
4,"Yes. Twenty-five years ago, before I was dad,...",0,"[0.07280163, 0.24518006, -0.009924358, -0.1859..."
...,...,...,...
31771,Aunt Lily wasn't wrong.,0,"[0.008160515, 0.09477018, -0.01352677, 0.13088..."
31772,"00 a.m. Christmas morning, every sleepy Sunday...",0,"[-0.17137574, -0.0025946363, -0.008149227, 0.0..."
31773,T.M.,0,"[-0.025203055, -0.27510342, -0.010736528, 0.03..."
31774,T.M.,0,"[-0.025203055, -0.27510342, -0.010736528, 0.03..."


In [18]:
type(series_df['encoded_line'][0])

numpy.ndarray

## Create Classification Dataset

In [19]:
series_shuffle_df = series_df.sample(frac=1).reset_index(drop=True)

In [20]:
tot_len = len(series_df)
train_len = int(tot_len*train_size)
test_len = int(tot_len*test_size)
val_len = tot_len - train_len - test_len

print(tot_len, train_len, test_len, val_len)

31776 27009 3177 1590


In [21]:
X_train = np.array([[float(e) for e in s] for s in series_shuffle_df['encoded_line'][:train_len]])
y_train = np.array([c for c in series_shuffle_df['character'][:train_len]])

X_test = np.array([[float(e) for e in s] for s in series_shuffle_df['encoded_line'][:test_len]])
y_test = np.array([c for c in series_shuffle_df['character'][:test_len]])

X_val = np.array([[float(e) for e in s] for s in series_shuffle_df['encoded_line'][:val_len]])
y_val = np.array([c for c in series_shuffle_df['character'][:val_len]])

## Classification Model

In [22]:
# Import keras/tensorflow libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

In [23]:
# create model
inputs = keras.Input(shape=(len(X_train[0],)))
x = layers.Dense(256, activation='relu')(inputs)
x = layers.Dense(128, activation='relu')(inputs)
out = layers.Dense(1, activation='sigmoid')(x)

classifier_model = keras.Model(inputs, out)
classifier_model.compile(
    loss = keras.losses.BinaryCrossentropy(),
    optimizer = keras.optimizers.Adam(learning_rate = lr),
    metrics = ['binary_accuracy']
)

## Training

In [24]:
earlystop_callback = callbacks.EarlyStopping(
        monitor="val_binary_accuracy",
        min_delta=0,
        patience=2,
        verbose=0,
        mode="max",
        baseline=None,
        restore_best_weights=True,
    )

In [25]:
train_history = classifier_model.fit(
    X_train, 
    y_train,
    validation_data = (X_val, y_val),
    epochs= epochs,
    verbose = 1, 
    callbacks=[earlystop_callback],
    batch_size = batch_size
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15


In [26]:
classifier_path = os.path.join(character_folder, character_dict[character]['classifier_name'])
classifier_model.save(classifier_path)

INFO:tensorflow:Assets written to: C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\Datasets\Characters\Barney\barney_classifier\assets
