In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt

In [17]:
# Read the preprocessed movie table from CSV and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written out with the CSV — consider index_col=0; confirm.
data = pd.read_csv('../../data/preprocessed_data.csv')
data.head()

Unnamed: 0,movie,plot,genres
0,"""#7DaysLater"" (2013)",days later interactive comedy series featuring...,['Comedy']
1,"""#Cake"" (2015)",cake hour long serial narrative comedy manhunt...,['Comedy']
2,"""#DaddyLeaks"" (????)",life four close friends late thirties change f...,['Comedy']
3,"""#Elmira"" (2014)",elmira follows story bunch strangers respond c...,['Comedy']
4,"""#Fuga"" (2016)",months apocalyptic event group survivors find ...,"['Action', 'Drama', 'Horror']"


In [18]:
# Longest plot, measured in whitespace-separated words (this is a word count,
# not a count of tokenizer tokens).
plot_word_counts = data['plot'].apply(lambda plot_text: len(plot_text.split()))
max_len = plot_word_counts.max()
max_len

269

In [19]:
# Load the pretrained fast DistilBERT tokenizer for the
# 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [20]:
def tokenize(batch):
    """Run the module-level ``tokenizer`` on ``batch`` and return its output.

    The tokenizer is called with ``padding=True`` and ``truncation=True``.

    Parameters
    ----------
    batch
        Text to tokenize; it is forwarded to the tokenizer unchanged.

    Returns
    -------
    The value returned by ``tokenizer(...)``.
    """
    encoding = tokenizer(batch, truncation=True, padding=True)
    return encoding

In [21]:
# Replace each raw plot string with its tokenizer output.
# NOTE: this overwrites the 'plot' column in place, so re-running the cell
# would feed already-tokenized values back into `tokenize`.
data['plot'] = data['plot'].apply(tokenize)

In [22]:
from transformers import AutoModel

# Name of the pretrained checkpoint used for the encoder.
model_ckpt = "distilbert-base-uncased"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using {} device".format(device))

# Load the pretrained weights, then move the model onto the chosen device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

Using cuda device


In [23]:
# Sanity check: tokenize one short sentence and show both the shape of the
# id tensor and the tokens those ids map back to.
text = "this is a test"
sample_inputs = tokenizer(text, return_tensors="pt")
summary = (
    f"Input tensor shape: {sample_inputs['input_ids'].size()}\n"
    f"Tokenized text: {tokenizer.convert_ids_to_tokens(sample_inputs['input_ids'][0])}"
)
print(summary)

Input tensor shape: torch.Size([1, 6])
Tokenized text: ['[CLS]', 'this', 'is', 'a', 'test', '[SEP]']


In [24]:
# Move every tensor in the sample encoding onto the same device as the model.
inputs = dict((key, value.to(device)) for key, value in sample_inputs.items())

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[-0.1565, -0.1862,  0.0528,  ..., -0.1188,  0.0662,  0.5470],
         [-0.3575, -0.6484, -0.0618,  ..., -0.3040,  0.3508,  0.5221],
         [-0.2772, -0.4459,  0.1818,  ..., -0.0948, -0.0076,  0.9958],
         [-0.2841, -0.3917,  0.3753,  ..., -0.2151, -0.1173,  1.0526],
         [ 0.2661, -0.5094, -0.3180,  ..., -0.4203,  0.0144, -0.2149],
         [ 0.9441,  0.0112, -0.4714,  ...,  0.1439, -0.7288, -0.1619]]],
       device='cuda:0'), hidden_states=None, attentions=None)


In [25]:
# Shape of the hidden vector at sequence position 0 (the [CLS] token in the
# sample above) for each sequence in the batch.
outputs.last_hidden_state[:, 0, :].shape

torch.Size([1, 768])

In [49]:
# Pull the token ids out of each stored encoding and keep them as tensors on
# `device`, one tensor per row.
data['input_ids'] = data['plot'].apply(
    lambda encoding: torch.tensor(encoding['input_ids']).to(device)
)

In [52]:
def extract_hidden_states(batch):
    """Return the position-0 ([CLS]) hidden state for the given token ids.

    The previous version ignored ``batch`` and ran the model on the global
    ``inputs`` sample instead, so every call returned the same vector. This
    version encodes the ids that are actually passed in.

    Parameters
    ----------
    batch : torch.Tensor
        Token ids. A 1-D tensor ``(seq_len,)`` is treated as a single
        sequence and gets a leading batch axis; a 2-D tensor
        ``(n_sequences, seq_len)`` is used as-is.

    Returns
    -------
    torch.Tensor
        ``last_hidden_state[:, 0]``: one row per input sequence, located on
        the module-level ``device``.

    Notes
    -----
    No attention mask is passed to the model. That is fine for a single
    unpadded sequence; for padded 2-D batches the padding positions are not
    masked out here.
    """
    input_ids = batch.to(device)

    # The model expects a batch axis; rows in the DataFrame hold 1-D id tensors.
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        last_hidden_state = model(input_ids=input_ids).last_hidden_state

    return last_hidden_state[:, 0]

In [53]:
# Encode every row's token ids; this calls the model once per row.
data['hidden state'] = data['input_ids'].apply(extract_hidden_states)

In [64]:
# Join the per-row hidden states along dim 0 into a single tensor.
embeddings = torch.cat(list(data['hidden state']), dim=0)

In [67]:
# Inspect the size of the concatenated embedding tensor.
embeddings.size()

torch.Size([259557, 768])

In [69]:
# Copy the embeddings to host memory, convert them to a NumPy array, and
# write that array to disk in .npy format.
np.save('../../vectorised_data/X_bert.npy', embeddings.cpu().numpy())

: 