# Importing Libraries

In [35]:
import os

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder

# Preparing data

## Loading dataset

In [36]:
# Read the cleaned movie table and discard the 'Unnamed: 0' column
# (presumably a leftover saved index -- verify against the CSV writer).
df = (
    pd.read_csv("../Classical NLP Approach/Data/cleaned_data.csv")
    .drop(columns=['Unnamed: 0'])
)
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
1,1908,The Call of the Wild,American,D. W. Griffith,Charles Inslee,adventure,https://en.wikipedia.org/wiki/The_Call_of_the_...,A white girl (Florence Lawrence) rejects a pro...
2,1908,The Fight for Freedom,American,D. W. Griffith,"Florence Auer, John G. Adolfi",western,https://en.wikipedia.org/wiki/The_Fight_for_Fr...,The film opens in a town on the Mexican border...
3,1912,Dr. Jekyll and Mr. Hyde,American,Lucius Henderson,James Cruze,horror,https://en.wikipedia.org/wiki/Dr._Jekyll_and_M...,White-haired Dr. Jekyll has secretly locked hi...
4,1913,Dr. Jekyll and Mr. Hyde,American,Herbert Brenon and Carl Laemmle,King Baggot,horror,https://en.wikipedia.org/wiki/Dr._Jekyll_and_M...,Dr. Henry Jekyll (King Baggot) sends a note to...


In [37]:
# Convert the pandas DataFrame into a Hugging Face `datasets.Dataset`.
data = datasets.Dataset.from_pandas(df)
data

Dataset({
    features: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot'],
    num_rows: 7874
})

## Remove unnecessary columns

In [38]:
# Drop every metadata column that is not used downstream; the output below
# shows that only 'Genre' and 'Plot' remain.
used_data = data.remove_columns(
    [
        'Release Year',
        'Title',
        'Origin/Ethnicity',
        'Director',
        'Cast',
        'Wiki Page',
    ]
)
used_data

Dataset({
    features: ['Genre', 'Plot'],
    num_rows: 7874
})

## Train Test Split

In [39]:
# Hold out 20% of the rows as a test split. The fixed seed makes the shuffle
# reproducible, so the arrays and dataset saved later in the notebook do not
# change from one run to the next.
used_data = used_data.train_test_split(test_size=0.2, shuffle=True, seed=42)
used_data

DatasetDict({
    train: Dataset({
        features: ['Genre', 'Plot'],
        num_rows: 6299
    })
    test: Dataset({
        features: ['Genre', 'Plot'],
        num_rows: 1575
    })
})

# Preparing Model and Tokenizer

In [40]:
# Tokenizer and encoder are loaded from the same 'distilbert-base-uncased' checkpoint.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Bare DistilBERT encoder; later cells read its `last_hidden_state` output.
model = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
# Show the tokenizer's vocabulary size together with the maximum input length it reports.
tokenizer.vocab_size, tokenizer.model_max_length

(30522, 512)

## Tokenizing Data

In [43]:
def tokenize(batch):
    """Tokenize the 'Plot' texts of ``batch`` with padding and truncation enabled.

    ``batch`` must provide a 'Plot' entry; whatever the module-level
    ``tokenizer`` returns for it is handed back unchanged.
    """
    plots = batch['Plot']
    encoded = tokenizer(plots, padding=True, truncation=True)
    return encoded

# Tokenize both splits with `tokenize`. Per the `datasets` map documentation,
# batch_size=None should hand each split to the function as one single batch,
# so padding=True pads all examples of a split to a common length -- confirm
# against the installed `datasets` version.
used_data_encoded = used_data.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/6299 [00:00<?, ? examples/s]

Map:   0%|          | 0/1575 [00:00<?, ? examples/s]

## Extracting last hidden state

In [None]:
def extract_hidden_states(batch):
    """Return the model's first-token hidden state for every example in ``batch``.

    Only the entries of ``batch`` whose keys appear in
    ``tokenizer.model_input_names`` are forwarded to the module-level
    ``model``; each of them must support ``.to(device)``.

    Returns:
        A dict with the single key ``"hidden_state"`` holding a NumPy array
        built from ``last_hidden_state[:, 0]`` (the first token position of
        each sequence).
    """
    # Follow the device the model currently lives on instead of hard-coding
    # 'cpu', so the function keeps working if the model is moved to a GPU.
    device = model.device
    inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Move back to the CPU before converting, since NumPy cannot read device tensors.
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

In [None]:
# Switch the encoded splits to torch output for the listed columns, then run
# the feature extractor over them in batches of 500 examples.
used_data_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "Genre"],
)
used_data_hidden = used_data_encoded.map(
    extract_hidden_states,
    batched=True,
    batch_size=500,
)

Map:   0%|          | 0/6299 [00:00<?, ? examples/s]

Map:   0%|          | 0/1575 [00:00<?, ? examples/s]

In [None]:
# Display the DatasetDict returned by the hidden-state extraction `map` call.
used_data_hidden

DatasetDict({
    train: Dataset({
        features: ['Genre', 'Plot', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 6299
    })
    test: Dataset({
        features: ['Genre', 'Plot', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 1575
    })
})

In [None]:
# Feature matrices: the extracted hidden states of each split as NumPy arrays.
x_train, x_test = (
    np.array(used_data_hidden[split]['hidden_state'])
    for split in ('train', 'test')
)

In [None]:
# Label vectors: the 'Genre' column of each split as NumPy arrays.
y_train, y_test = (
    np.array(used_data_hidden[split]['Genre'])
    for split in ('train', 'test')
)

In [None]:
# Fit the genre -> integer mapping on the training labels and reuse it for the
# test labels. Both arrays are encoded straight from `used_data_hidden` rather
# than from `y_train`/`y_test` themselves, so re-running this cell never
# re-fits the encoder on labels that were already turned into integers.
# (`LabelEncoder` is imported in the notebook's top import cell.)
en = LabelEncoder()
y_train = en.fit_transform(np.array(used_data_hidden['train']['Genre']))
y_test = en.transform(np.array(used_data_hidden['test']['Genre']))

In [None]:
# np.save does not create missing parent directories, so make sure the output
# folder exists before writing the feature and label arrays.
os.makedirs("Data", exist_ok=True)
np.save("Data/xtrain.npy", x_train)
np.save("Data/xtest.npy", x_test)
np.save("Data/ytrain.npy", y_train)
np.save("Data/ytest.npy", y_test)

In [None]:
# Persist the tokenized splits (`used_data_encoded`, not the hidden-state
# version) to disk -- presumably for later BERT training, judging by the folder name.
used_data_encoded.save_to_disk('Data/bert training data')

Saving the dataset (0/1 shards):   0%|          | 0/6299 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1575 [00:00<?, ? examples/s]

In [49]:
# The fitted encoder already stores its sorted, unique labels in `classes_`,
# which is exactly what decoding `y_train` and de-duplicating would rebuild.
# Copy so later changes to `a` cannot alter the encoder's state.
a = en.classes_.copy()

In [50]:
# Show the distinct genre labels recovered from the encoder.
a

array(['action', 'adventure', 'crime', 'film noir', 'horror', 'musical',
       'mystery', 'romance', 'science fiction', 'thriller', 'western'],
      dtype='<U15')

In [51]:
# Map every integer label to its genre name. Enumerating `a` directly sizes the
# mapping to however many genres exist instead of assuming exactly 11.
keys_dict = dict(enumerate(a))

keys_dict

{0: 'action',
 1: 'adventure',
 2: 'crime',
 3: 'film noir',
 4: 'horror',
 5: 'musical',
 6: 'mystery',
 7: 'romance',
 8: 'science fiction',
 9: 'thriller',
 10: 'western'}