In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

In [2]:
movies = pd.read_csv("movies-extended.csv")

In [3]:
import re

key_genres = {
    'Action': ['action', 'martial arts', 'kung fu', 'fight', 'superhero', 'spy', 'war', 'swashbuckler'],
    'Adventure': ['adventure', 'treasure', 'survival', 'expedition'],
    'Comedy': ['comedy', 'satire', 'slapstick', 'parody'],
    'Drama': ['drama', 'melodrama', 'biographical', 'biography', 'biopic', 'historical drama', 'period drama'],
    'Horror': ['horror', 'slasher', 'supernatural'],
    'Romance': ['romance', 'romantic'],
    'Sci-Fi': ['sci-fi', 'science fiction', 'space', 'cyberpunk', 'time travel'],
    'Documentary': ['documentary', 'docu'],
    'Animation': ['animation', 'animated', 'cartoon', 'anime'],
    'Mystery/Thriller': ['mystery', 'thriller', 'noir', 'crime', 'suspense'],
    'Western': ['western'],
    'Musical': ['musical', 'music'],
    'Fantasy': ['fantasy', 'myth', 'fairy tale', 'magic', 'supernatural fantasy'],
    'War': ['war', 'ww1', 'ww2', 'military', 'propaganda'],
}

# Function to map to broad category
def map_key_genre(genre_str):
    genre = genre_str.lower()
    for category, keywords in key_genres.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', genre):
                return category
    return 'Other'

# Apply mapping
movies['New-Genre'] = movies['Genre'].apply(map_key_genre)

# Check mapping results
broad_genre_counts = movies['New-Genre'].value_counts()
broad_genre_counts


New-Genre
Drama               6132
Comedy              5655
Mystery/Thriller    2091
Other               1831
Action              1611
Horror              1148
Western              881
Adventure            679
Animation            640
Sci-Fi               634
Musical              472
Romance              284
Fantasy              191
Documentary           81
War                   16
Name: count, dtype: int64

In [4]:
# For sub-genre classification, we'll keep one label per movie
# If multiple genres are listed in the original, we'll just take the first meaningful one

def extract_primary_subgenre(genre_str):
    if pd.isna(genre_str):
        return None
    # Normalize separators
    cleaned = re.sub(r'[,/–]', ' ', genre_str.lower())
    parts = cleaned.split()
    # Return the first significant term (not 'short', 'film', etc.)
    for p in parts:
        if p not in ['short', 'film', 'movie', 'series', 'mini-series']:
            return p.strip()
    return None

movies['Sub-Genre'] = movies['Genre'].apply(extract_primary_subgenre)

# Drop rows with no sub-genre
subgenre_df = movies.dropna(subset=['Sub-Genre'])

# Check how many unique sub-genres remain
unique_subgenres = subgenre_df['Sub-Genre'].unique()
len(unique_subgenres), list(unique_subgenres)[:50]


(211,
 ['unknown',
  'western',
  'comedy',
  'action',
  'biographical',
  'drama',
  'adventure',
  'fantasy',
  'silent',
  'horror',
  'crime',
  'historical',
  'documentary',
  'epic',
  'biography',
  'romantic',
  'mystery',
  'romance',
  'sexual',
  'war',
  'spy',
  'propaganda',
  'ww1',
  'biopic',
  'animated',
  'melodrama',
  'period',
  'swashbuckler',
  'thriller',
  'dramatic',
  'american',
  'semi-staged',
  'biblical',
  'race',
  'musical',
  'operetta',
  'detective',
  'costume',
  'prison',
  'noir',
  'sports',
  'animation',
  'science',
  'sci-fi',
  'exploitation',
  'murder',
  'comedy-drama',
  'sport',
  'serial',
  'military'])

In [5]:
movies

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Title-And-Plot,Genre-And-Plot,New-Genre,Sub-Genre
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",Kansas Saloon Smashers : A bartender is workin...,"A bartender is working at a saloon, serving dr...",Other,unknown
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","Love by the Light of the Moon : The moon, pain...","The moon, painted with a smiling face hangs ov...",Other,unknown
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The Martyred Presidents : The film, just over ...","The film, just over a minute long, is composed...",Other,unknown
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"Terrible Teddy, the Grizzly King : Lasting jus...",Lasting just 61 seconds and consisting of two ...,Other,unknown
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,Jack and the Beanstalk : The earliest known ad...,The earliest known adaptation of the classic f...,Other,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
22341,2017,"Hochelaga, Land of Souls (Hochelaga terre des ...",Canadian,François Girard,"Raoul Max Trujillo, Tanaya Beatty, David La Haye",historical drama,"https://en.wikipedia.org/wiki/Hochelaga,_Land_...","One night on the campus of McGill University, ...","Hochelaga, Land of Souls (Hochelaga terre des ...",historical drama : One night on the campus of ...,Drama,historical
22342,2017,Indian Horse,Canadian,Stephen Campanelli,"Forrest Goodluck, Michiel Huisman, Michael Mur...",drama,https://en.wikipedia.org/wiki/Indian_Horse_(film),"The Indian Horse family, including six-year-ol...","Indian Horse : The Indian Horse family, includ...","drama : The Indian Horse family, including six...",Drama,drama
22343,2017,The Little Girl Who Was Too Fond of Matches (L...,Canadian,Simon Lavoie,,unknown,https://en.wikipedia.org/wiki/The_Little_Girl_...,"In rural 1930s Quebec, Alice lives in house wi...",The Little Girl Who Was Too Fond of Matches (L...,"In rural 1930s Quebec, Alice lives in house wi...",Other,unknown
22344,2017,Meditation Park,Canadian,Mina Shum,"Sandra Oh, Liane Balaban, Don McKellar",drama,https://en.wikipedia.org/wiki/Meditation_Park,"Opened by Mandarin theme song, Meditation Park...",Meditation Park : Opened by Mandarin theme son...,"drama : Opened by Mandarin theme song, Meditat...",Drama,drama


In [6]:
movies = movies.drop(columns=['Title-And-Plot', 'Genre-And-Plot'])

In [7]:
movies['Title-Genre-Plot'] = movies['Title'] + ' - ' + movies['Genre'] + ' : ' + movies['Plot']

In [8]:
training_df = movies.dropna(subset=['Plot', 'New-Genre', 'Sub-Genre'])
training_df = training_df[training_df['Sub-Genre'] != 'Unknown']

In [9]:
training_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,New-Genre,Sub-Genre,Title-Genre-Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",Other,unknown,Kansas Saloon Smashers - unknown : A bartender...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",Other,unknown,Love by the Light of the Moon - unknown : The ...
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",Other,unknown,"The Martyred Presidents - unknown : The film, ..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,Other,unknown,"Terrible Teddy, the Grizzly King - unknown : L..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,Other,unknown,Jack and the Beanstalk - unknown : The earlies...


In [10]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, TensorDataset

In [11]:
tokeniser = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder_key = LabelEncoder()
label_encoder_sub = LabelEncoder()

training_df['key_label'] = label_encoder_key.fit_transform(training_df['New-Genre'])
training_df['sub_label'] = label_encoder_sub.fit_transform(training_df['Sub-Genre'])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(training_df['Title-Genre-Plot'], training_df['key_label'], test_size=0.2, random_state=42)

In [14]:
train_encoded = tokeniser(list(X_train), truncation=True, padding=True, max_length=512, return_tensors='pt')
test_encoded = tokeniser(list(X_test), truncation=True, padding=True, max_length=512, return_tensors='pt')

In [15]:
train_dataset = TensorDataset(train_encoded['input_ids'], train_encoded['attention_mask'], torch.tensor(y_train.values))
test_dataset = TensorDataset(test_encoded['input_ids'], test_encoded['attention_mask'], torch.tensor(y_test.values))

In [16]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(key_genres))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
tokeniser

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [19]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Training

In [21]:
class Movie_Classification(nn.Module):
    def __init__(self, num_labels):
        super(Movie_Classification, self).__init__()
        self.num_labels = num_labels
        self.classifier = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.classifier(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
