In [45]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../../data/preprocessed_data.csv')
data.head()

Unnamed: 0,movie,plot,genres
0,"""#7DaysLater"" (2013)",days later interactive comedy series featuring...,['Comedy']
1,"""#Cake"" (2015)",cake hour long serial narrative comedy manhunt...,['Comedy']
2,"""#DaddyLeaks"" (????)",life four close friends late thirties change f...,['Comedy']
3,"""#Elmira"" (2014)",elmira follows story bunch strangers respond c...,['Comedy']
4,"""#Fuga"" (2016)",months apocalyptic event group survivors find ...,"['Action', 'Drama', 'Horror']"


In [3]:
# get maximum length of the sentences in data['plot']
max_len = np.max(data['plot'].apply(lambda x: len(x.split())))
max_len


269

In [54]:
# load the data into torch datasets
class MovieDataset(Dataset):
    def __init__(self, plot, genre, tokenizer, max_len):
        self.plot = plot
        self.genre = genre
        self.tokenizer = tokenizer
        self.max_len = max_len
        # tokenize whole data at once and store in self.encodings
        self.encodings = self.tokenizer.batch_encode_plus(
            # convert self.plot to list
            self.plot.tolist(),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.plot)

    def __getitem__(self, item):
        plot = str(self.plot[item])
        genre = self.genre[item]

        # encoding = self.tokenizer.encode_plus(
        #     plot,
        #     add_special_tokens=True,
        #     max_length=self.max_len,
        #     return_token_type_ids=False,
        #     pad_to_max_length=True,
        #     return_attention_mask=True,
        #     return_tensors='pt'
        # )

        return {
            'plot': plot,
            'input_ids': self.encodings['input_ids'].flatten(),
            'attention_mask': self.encodings['attention_mask'].flatten(),
            'genre': torch.tensor(genre, dtype=torch.long)
        }

In [55]:
# load data into MovieDataset
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [56]:
def tokenize(batch):
    return tokenizer(batch, padding=True, truncation=True)

In [57]:
# apply tokenize function to data['plot'] in batches
batch_size = 16
train_size = 0.8

train_data, test_data = train_test_split(data, train_size=train_size, random_state=42)
train_data, val_data = train_test_split(train_data, train_size=train_size, random_state=42)


In [58]:
data_loader = MovieDataset(
    plot=data['plot'].to_numpy(),
    genre=data['genres'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: batch_text_or_text_pairs has to be a list or a tuple (got <class 'numpy.ndarray'>)

In [24]:
from transformers import AutoModel

model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))

model = AutoModel.from_pretrained(model_ckpt).to(device)

Using cuda device
