In [1]:
# Mount google drive
import os
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)
out_folder = os.path.join(base_folder, "out")
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import pandas as pd

In [26]:
batch_size = 1
epochs = 1

# Dataset

In [4]:
himym_path = os.path.join(base_folder, "Datasets", "Characters", "Barney")
himym_df = pd.read_csv(os.path.join(himym_path, "HIMYM_preprocessed.csv"))

In [5]:
himym_df[himym_df['character']=='Barney']

Unnamed: 0,episode,line,character
14,01x01,"hey, so you know how I've always had a thing f...",Barney
16,01x01,"Okay, meet me at the bar in fifteen minutes, a...",Barney
18,01x01,Where's your suit!? Just once when I say suit ...,Barney
20,01x01,It was a blazer!,Barney
22,01x01,I see what this is about. Have you forgotten w...,Barney
...,...,...,...
30279,07x03,Because you are sat here all night. You could ...,Barney
30281,07x03,I will prove it. I'll show you how much I'm se...,Barney
30283,07x03,I do not laugh. It is a restaurant 24h 24. I w...,Barney
30307,07x03,Oh! Hi guys! Oh! My neck! Uh... oh! Hey! Look ...,Barney


In [6]:
himym_df['character'] = himym_df['character'].apply(lambda x: 1 if x=='Barney' else 0)

In [7]:
# himym_df['line'] = himym_df['line'].apply(lambda x: x[:128])

In [8]:
himym_df[himym_df['character']==1]

Unnamed: 0,episode,line,character
14,01x01,"hey, so you know how I've always had a thing f...",1
16,01x01,"Okay, meet me at the bar in fifteen minutes, a...",1
18,01x01,Where's your suit!? Just once when I say suit ...,1
20,01x01,It was a blazer!,1
22,01x01,I see what this is about. Have you forgotten w...,1
...,...,...,...
30279,07x03,Because you are sat here all night. You could ...,1
30281,07x03,I will prove it. I'll show you how much I'm se...,1
30283,07x03,I do not laugh. It is a restaurant 24h 24. I w...,1
30307,07x03,Oh! Hi guys! Oh! My neck! Uh... oh! Hey! Look ...,1


In [9]:
himym_df = himym_df.drop(columns=['episode'])

In [10]:
from sklearn.model_selection import train_test_split
himym_train, himym_test = train_test_split(himym_df, test_size=0.15)

In [11]:
himym_train = [(line['line'], line['character']) for _,line in himym_train.iterrows()]

In [12]:
himym_test = [(line['line'], line['character']) for _,line in himym_test.iterrows()]

In [27]:
trainloader = torch.utils.data.DataLoader(himym_train, 
                                          batch_size=batch_size,
                                          shuffle=True, 
                                          num_workers=2)

In [28]:
testloader = torch.utils.data.DataLoader(himym_test, 
                                         batch_size=batch_size,
                                         shuffle=False, 
                                         num_workers=2)

In [15]:
classes = (0, 1)

# Model

In [16]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

In [17]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [22]:
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = x['sentence_embedding']
        print('\n\n x shape:\t', x.shape, '\n\n')
        x = self.fc(x)
        x = self.sigmoid(x)
        # x = nn.Softmax()
        return x

In [23]:
model.add_module('2', MyModel())
model[0].train(mode=False)
model[1].train(mode=False)
model[2].train(mode=True)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): MyModel(
    (fc): Linear(in_features=768, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)

# Training

In [29]:
import torch.nn.functional as F
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [32]:
for epoch in range(epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0): 
        
        # print(data)
        
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        inputs = list(inputs)
        
        print('\n\ninputs:\n', inputs,
              '\nlabels:\n', labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # print('input shape:\t', inputs.shape)
        outputs = model.encode(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')



inputs:
 ["No, this is good. Let's get it all out of the way. Robin, I'm looking for something serious."] 
labels:
 tensor([0])


 x shape:	 torch.Size([1, 768]) 




IndexError: too many indices for tensor of dimension 2