# Importing Libraries

In [12]:
import transformers
import pandas as pd
import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import *
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Preparing Data

## Loading dataset

In [4]:
# Read the cleaned table and discard the 'Unnamed: 0' column (presumably a row
# index written out by an earlier save -- confirm against the cleaning step).
df = pd.read_csv("cleaned_data.csv").drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
1,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
2,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
3,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
4,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...


In [18]:
# Wrap the pandas frame in a Hugging Face `Dataset`; the bare name on the last
# line displays its feature names and row count.
data = datasets.Dataset.from_pandas(df=df)
data

Dataset({
    features: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot'],
    num_rows: 17562
})

## Removing Unnecessary Columns

In [11]:
# Drop every metadata column so that only 'Genre' and 'Plot' remain.
unused_columns = ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Wiki Page']
used_data = data.remove_columns(unused_columns)
used_data

Dataset({
    features: ['Genre', 'Plot'],
    num_rows: 17562
})

## Train Test Split

In [20]:
# Hold out 20% of the rows for testing. The shuffle seed is fixed so that a
# fresh "Restart & Run All" reproduces exactly the same train/test partition
# (without it, every run yields a different split and different results).
# NOTE: `used_data` is rebound here from a single Dataset to a train/test
# DatasetDict, so re-run the column-removal cell before re-running this one.
SPLIT_SEED = 42
used_data = used_data.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)
used_data

DatasetDict({
    train: Dataset({
        features: ['Genre', 'Plot'],
        num_rows: 14049
    })
    test: Dataset({
        features: ['Genre', 'Plot'],
        num_rows: 3513
    })
})

# Preparing Model and Tokenizer

In [22]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint,
# so the name is written once and shared by both calls.
checkpoint = 'distilbert-base-uncased'
model = transformers.DistilBertModel.from_pretrained(checkpoint)
tokenizer = transformers.DistilBertTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [23]:
# Display the tokenizer's vocabulary size and its maximum model input length.
(tokenizer.vocab_size, tokenizer.model_max_length)

(30522, 512)

## Tokenizing Data

In [25]:
def tokenize(batch):
    """Tokenize the 'Plot' texts of a batch.

    Args:
        batch: Mapping with a 'Plot' entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, produced with `padding=True`
        and `truncation=True`.
    """
    plots = batch['Plot']
    return tokenizer(plots, padding=True, truncation=True)

# batch_size=None passes each split to `tokenize` as one single batch (per the
# `datasets` map documentation), so `padding=True` pads every row of a split to
# the same length.
used_data_encoded = used_data.map(function=tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/14049 [00:00<?, ? examples/s]

Map:   0%|          | 0/3513 [00:00<?, ? examples/s]

## Extracting the Last Hidden State

In [31]:
def extract_hidden_states(batch):
    """Run the model on a batch and return its first-position hidden states.

    Args:
        batch: Mapping of column name to tensor. Only the entries whose key
            appears in `tokenizer.model_input_names` are passed to the model.

    Returns:
        A dict with the single key "hidden_state", holding a NumPy array
        built from `last_hidden_state[:, 0]`.
    """
    # Send the inputs to whichever device the model lives on rather than a
    # hard-coded 'cpu', so the function keeps working after `model.to(...)`.
    device = model.device
    inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Keep index 0 along the second axis (presumably the [CLS] token position
    # -- confirm), and move it to the CPU so it can be converted to NumPy.
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

In [32]:
# Switch the listed columns to the "torch" format. The format is applied to
# `used_data_encoded` itself (the call's return value is not used).
used_data_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "Genre"])
# Compute the hidden states 500 rows at a time.
used_data_hidden = used_data_encoded.map(function=extract_hidden_states, batched=True, batch_size=500)

Map:   0%|          | 0/14049 [00:00<?, ? examples/s]