In [2]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

In [3]:
# Alias table: lower-cased genre spelling -> canonical label.
# Several aliases may share one canonical label (e.g. every sci-fi spelling
# collapses to "Sci-Fi"), so downstream label sets stay small and consistent.
key_genres = {
    # Science fiction spellings
    "sci-fi": "Sci-Fi",
    "science fiction": "Sci-Fi",
    "scifi": "Sci-Fi",
    # Romance, including romantic-comedy shorthands
    "romcom": "Romance",
    "rom-com": "Romance",
    "romance": "Romance",
    # Non-fiction / biographical
    "doc": "Documentary",
    "bio": "Biopic",
    "biography": "Biopic",
    "history": "History",
    # One-to-one genres
    "war": "War",
    "thriller": "Thriller",
    "mystery": "Mystery",
    "crime": "Crime",
    "western": "Western",
    "fantasy": "Fantasy",
    "horror": "Horror",
    "comedy": "Comedy",
    "drama": "Drama",
    "action": "Action",
    "adventure": "Adventure",
    "family": "Family",
    # Animation
    "animation": "Animation",
    "anime": "Animation",
    # Music and musicals share a single label; "musical" previously mapped to
    # the typo "Musical/Musical", which split them into two different classes.
    "music": "Music/Musical",
    "musical": "Music/Musical",
    # Sport
    "sport": "Sport",
    "sports": "Sport",
}

In [4]:
# Load the raw movie table from the CSV file that sits next to the notebook.
movies = pd.read_csv(filepath_or_buffer="movies-extended.csv")

# Text Cleanup

In [14]:
import re
# Separator between genre entries inside one cell: any run of commas, pipes,
# slashes, semicolons, ampersands and/or whitespace counts as a single break.
# Because whitespace is part of the class, a multi-word genre such as
# "short film" is cut into one token per word.
sep_regex = re.compile(r"[,\|/;&\s]+")

In [15]:
def normalise_genre_token(token):
    """Map one raw genre token to its canonical label.

    Surrounding whitespace is ignored. A token that is empty after trimming
    yields ``""``. Otherwise the lower-cased token is looked up in
    ``key_genres``; tokens without an alias fall back to the trimmed token in
    title case.
    """
    cleaned = token.strip()
    if cleaned == "":
        return ""
    alias = cleaned.lower()
    if alias in key_genres:
        return key_genres[alias]
    return cleaned.title()

def parse_genre(cell):
    """Convert a raw genre cell into a sorted list of unique canonical labels.

    The cell is first cut on the hard delimiters ``, | / ; &``. Inside each
    fragment the words are scanned left to right; at every position the longest
    multi-word alias found in ``key_genres`` (e.g. ``"science fiction"``) wins,
    and any word that starts no such alias is normalised on its own with
    ``normalise_genre_token``. Splitting on whitespace up front would make
    multi-word aliases unreachable, which is why phrases are tried first.

    Parameters
    ----------
    cell : object
        Raw value of the genre column. Anything that is not a ``str``
        (e.g. NaN) yields an empty list.

    Returns
    -------
    list of str
        Canonical labels, de-duplicated and sorted alphabetically.
    """
    if not isinstance(cell, str):
        return []

    # Longest alias measured in words; bounds the phrase look-ahead below.
    longest_alias = max((len(alias.split()) for alias in key_genres), default=1)

    labels = set()
    # Same delimiter set as sep_regex, minus whitespace, so phrases stay intact.
    for fragment in re.split(r"[,\|/;&]+", cell):
        words = fragment.split()
        pos = 0
        while pos < len(words):
            # Greedy: try the longest multi-word alias starting at this word.
            for span in range(min(longest_alias, len(words) - pos), 1, -1):
                phrase = " ".join(words[pos:pos + span]).lower()
                if phrase in key_genres:
                    labels.add(key_genres[phrase])
                    pos += span
                    break
            else:
                # No multi-word alias starts here: treat the word as one token.
                labels.add(normalise_genre_token(words[pos]))
                pos += 1
    return sorted(labels)

In [16]:
# Fail fast when a column the rest of the notebook relies on is absent.
for required_column in ("Plot", "Genre"):
    if required_column not in movies.columns:
        raise ValueError(f"Column '{required_column}' not found in movies data! Columns available: {list(movies.columns)}")

In [17]:
# Fill missing plots BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", which fillna can no longer see and which would survive
# the non-empty filter below as a bogus three-character plot.
movies['Plot'] = movies['Plot'].fillna("").astype(str).str.strip()
# Keep only rows that actually have plot text; copy so later column
# assignments do not act on a view of the original frame.
movies = movies[movies['Plot'].str.len() > 0].copy()

In [18]:
# Picking out all known genres vs unknown genres (simple negation below)
known_genres = movies['Genre'].astype(str).str.lower().ne('unknown')
unknown_genres = ~known_genres

In [19]:
# Independent copy of the rows selected by the known-genre mask, so new
# columns can be added without touching `movies`.
movies_known = movies.loc[known_genres].copy()

In [20]:
# One list of canonical genre labels per movie, derived from the raw Genre text.
movies_known['labels'] = movies_known['Genre'].map(parse_genre)

In [21]:
# Show the labelled frame to eyeball the new `labels` column against `Genre`.
movies_known

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Title-And-Plot,Genre-And-Plot,New-Genre,labels
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,The Great Train Robbery : The film opens with ...,western : The film opens with two bandits brea...,western,[Western]
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,The Suburbanite : The film is about a family w...,comedy : The film is about a family who move t...,comedy,[Comedy]
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...,Dream of a Rarebit Fiend : The Rarebit Fiend g...,short : The Rarebit Fiend gorges on Welsh rare...,,[Short]
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...,From Leadville to Aspen: A Hold-Up in the Rock...,short action/crime western : The film features...,,"[Action, Crime, Short, Western]"
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...,Kathleen Mavourneen : Irish villager Kathleen ...,short film : Irish villager Kathleen is a tena...,,"[Film, Short]"
...,...,...,...,...,...,...,...,...,...,...,...,...
22340,2017,Goon: Last of the Enforcers,Canadian,Jay Baruchel,"Seann William Scott, Liev Schrieber, Elisha Cu...","comedy, sports",https://en.wikipedia.org/wiki/Goon:_Last_of_th...,"During a pro hockey lockout, Doug ""The Thug"" G...",Goon: Last of the Enforcers : During a pro hoc...,"comedy, sports : During a pro hockey lockout, ...",,"[Comedy, Sport]"
22341,2017,"Hochelaga, Land of Souls (Hochelaga terre des ...",Canadian,François Girard,"Raoul Max Trujillo, Tanaya Beatty, David La Haye",historical drama,"https://en.wikipedia.org/wiki/Hochelaga,_Land_...","One night on the campus of McGill University, ...","Hochelaga, Land of Souls (Hochelaga terre des ...",historical drama : One night on the campus of ...,historical,"[Drama, Historical]"
22342,2017,Indian Horse,Canadian,Stephen Campanelli,"Forrest Goodluck, Michiel Huisman, Michael Mur...",drama,https://en.wikipedia.org/wiki/Indian_Horse_(film),"The Indian Horse family, including six-year-ol...","Indian Horse : The Indian Horse family, includ...","drama : The Indian Horse family, including six...",drama,[Drama]
22344,2017,Meditation Park,Canadian,Mina Shum,"Sandra Oh, Liane Balaban, Don McKellar",drama,https://en.wikipedia.org/wiki/Meditation_Park,"Opened by Mandarin theme song, Meditation Park...",Meditation Park : Opened by Mandarin theme son...,"drama : Opened by Mandarin theme song, Meditat...",drama,[Drama]


In [24]:
# Spot check: first row whose raw Genre mentions "sci-fi" / "sci fi" / "scifi"
# (case-insensitive; rows with a missing Genre never match).
movies_known[
    movies_known['Genre'].str.contains(r'\bsci[-\s]?fi\b', case=False, na=False)
].iloc[:1]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Title-And-Plot,Genre-And-Plot,New-Genre,labels
1346,1932,The Mask of Fu Manchu,American,Charles Brabin,"Boris Karloff, Lewis Stone, Karen Morley","horror, sci-fi",https://en.wikipedia.org/wiki/The_Mask_of_Fu_M...,Sir Denis Nayland Smith (Lewis Stone) of the B...,The Mask of Fu Manchu : Sir Denis Nayland Smit...,"horror, sci-fi : Sir Denis Nayland Smith (Lewi...",,"[Horror, Science Fiction]"
