In [1]:
# Mount google drive
import os
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)
out_folder = os.path.join(base_folder, "out")
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import pandas as pd

In [121]:
batch_size = 8
epochs = 1

# Dataset

In [4]:
himym_path = os.path.join(base_folder, "Datasets", "Characters", "Barney")
himym_df = pd.read_csv(os.path.join(himym_path, "HIMYM_preprocessed.csv"))

In [5]:
himym_df[himym_df['character']=='Barney']

Unnamed: 0,episode,line,character
14,01x01,"hey, so you know how I've always had a thing f...",Barney
16,01x01,"Okay, meet me at the bar in fifteen minutes, a...",Barney
18,01x01,Where's your suit!? Just once when I say suit ...,Barney
20,01x01,It was a blazer!,Barney
22,01x01,I see what this is about. Have you forgotten w...,Barney
...,...,...,...
31210,08x24,I'm probably saying some political stuff right...,Barney
31214,08x24,Whoa. Is there going to be a fight?,Barney
31449,09x10,"Karate Kid bad boy Billy Zabka, a shifty-eyed ...",Barney
31557,09x15,me or you?,Barney


In [6]:
himym_df['character'] = himym_df['character'].apply(lambda x: 1 if x=='Barney' else 0)

In [7]:
# himym_df['line'] = himym_df['line'].apply(lambda x: x[:128])

In [8]:
himym_df[himym_df['character']==1]

Unnamed: 0,episode,line,character
14,01x01,"hey, so you know how I've always had a thing f...",1
16,01x01,"Okay, meet me at the bar in fifteen minutes, a...",1
18,01x01,Where's your suit!? Just once when I say suit ...,1
20,01x01,It was a blazer!,1
22,01x01,I see what this is about. Have you forgotten w...,1
...,...,...,...
31210,08x24,I'm probably saying some political stuff right...,1
31214,08x24,Whoa. Is there going to be a fight?,1
31449,09x10,"Karate Kid bad boy Billy Zabka, a shifty-eyed ...",1
31557,09x15,me or you?,1


In [9]:
himym_df = himym_df.drop(columns=['episode'])

In [10]:
from sklearn.model_selection import train_test_split
himym_train, himym_test = train_test_split(himym_df, test_size=0.15)

In [11]:
himym_train = [(line['line'], line['character']) for _,line in himym_train.iterrows()]

In [12]:
himym_test = [(line['line'], line['character']) for _,line in himym_test.iterrows()]

In [125]:
trainloader = torch.utils.data.DataLoader(himym_train, 
                                          batch_size=batch_size,
                                          shuffle=True, 
                                          num_workers=2)

In [126]:
testloader = torch.utils.data.DataLoader(himym_test, 
                                         batch_size=batch_size,
                                         shuffle=False, 
                                         num_workers=2)

In [127]:
classes = (0, 1)

# Model

In [66]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

In [67]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [69]:
model.encode('ciao').shape

(768,)

In [138]:
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
#         super(MyModel, self).__init__()
        super().__init__()
        self.fc = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        y = x['sentence_embedding']
        print('\n\n x shape:\t', y.shape, '\n\n')
        y = self.fc(y)
        y = self.sigmoid(y)
        # x = nn.Softmax()
        x['classifier'] = y
        return x

<bound method Module.parameters of Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel >

In [139]:
model.add_module('2', MyModel())
model[0].train(mode=False)
# for param in model[0].parameters():
#     param.requires_grad = True
model[1].train(mode=False)
# for param in model[1].parameters():
#     param.requires_grad = True
model[2].train(mode=True)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): MyModel(
    (fc): Linear(in_features=768, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)

# Training

In [136]:
import torch.nn.functional as F
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [141]:
for epoch in range(epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0): 
        
        # print(data)
        
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        inputs = list(inputs)
        
#         print('\n\ninputs:\n', inputs,
#               '\nlabels:\n', labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # print('input shape:\t', inputs.shape)
        outputs = model.encode(inputs, output_value='classifier')
        outputs = torch.concat(outputs)
        labels = labels.type(torch.FloatTensor)
#         outputs = model.encode('inputs')
        print(outputs, labels)
        loss = criterion(outputs, labels)
        loss.requires_grad = True
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')



inputs:
 ["Yeah. I'll be bringing the crudits.", 'Working? You are retired.', 'They tracked!', 'I think so.', 'And... send!', 'Unsubscribe.', "Yeah, hell, yeah. I got 'em all: Teamwork, Courage, Awesomeness...", 'Come on, man. People want to see Bruce Banner.They want Hulk. What?'] 
labels:
 tensor([0, 0, 0, 0, 0, 0, 1, 1])


 x shape:	 torch.Size([8, 768]) 


tensor([0.4871, 0.4938, 0.4782, 0.4909, 0.4890, 0.4955, 0.4862, 0.5026]) tensor([0., 0., 0., 0., 0., 0., 1., 1.])


inputs:
 ["They're barbecue.", 'Probably!', 'He will watch it.', 'Bluntly.', 'Your move.', 'Mmm. Your dad?', 'And it does not stop there. We argue non-stop since.', "OK, I don't really know how to tap-dance."] 
labels:
 tensor([0, 0, 0, 0, 0, 0, 1, 0])


 x shape:	 torch.Size([8, 768]) 


tensor([0.4851, 0.4940, 0.5036, 0.5002, 0.5050, 0.5053, 0.5061, 0.5160]) tensor([0., 0., 0., 0., 0., 0., 1., 0.])


inputs:
 ['Dear boy, what else?', 'We need it here. It is in hiding.', 'You hate that job.', "Oh, my God, that su

KeyboardInterrupt: 