In [1]:
import torch
import torchmetrics
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
from ast import literal_eval

In [2]:
df1 = pd.read_csv('movie_5000/tmdb_5000_movies.csv')
df2 = pd.read_csv('movie_45000/movies_metadata.csv')

In [3]:
df1['genres'] = df1['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
df2['genres'] = df2['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df2 = df2[df2.genres.apply(lambda x: len(x) > 0)]
df2.reset_index(inplace = True)
df2 = df2.drop(columns='index')

In [5]:
df1 = df1.drop(columns='id')
df2 = df2.drop(columns='id')

In [6]:
# Check if all df1 movies are in df2
df1['title'] = df1['title'].str.lower()
df2['title'] = df2['title'].str.lower()

# Make every movie in df1 unique by title
df1 = df1.drop_duplicates(subset=['title'], keep='first')
df1.reset_index(inplace = True)
df1 = df1.drop(columns='index')

# Make every movie in df2 unique by title
df2 = df2.drop_duplicates(subset=['title'], keep='first')
df2.reset_index(inplace = True)
df2 = df2.drop(columns='index')

len(df1), len(df2)

(4800, 39990)

In [7]:
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates(subset=['title'], keep='first')
df.reset_index(inplace = True)
df.drop(columns='index', inplace=True)
df

Unnamed: 0,title,genres,overview
0,avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di..."
1,pirates of the caribbean: at world's end,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha..."
2,spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...
3,the dark knight rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...
4,john carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca..."
...,...,...,...
40192,shadow of the blair witch,"[Mystery, Horror]","In this true-crime documentary, we delve into ..."
40193,the burkittsville 7,[Horror],A film archivist revisits the story of Rustin ...
40194,caged heat 3000,[Science Fiction],It's the year 3000 AD. The world's most danger...
40195,subdue,"[Drama, Family]",Rising and falling between a man and woman.


In [8]:
# Remove movies in df that has no overview
final_df = df.copy()
final_df = final_df[final_df['overview'].notnull()]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')
final_df

Unnamed: 0,title,genres,overview
0,avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di..."
1,pirates of the caribbean: at world's end,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha..."
2,spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...
3,the dark knight rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...
4,john carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca..."
...,...,...,...
39527,shadow of the blair witch,"[Mystery, Horror]","In this true-crime documentary, we delve into ..."
39528,the burkittsville 7,[Horror],A film archivist revisits the story of Rustin ...
39529,caged heat 3000,[Science Fiction],It's the year 3000 AD. The world's most danger...
39530,subdue,"[Drama, Family]",Rising and falling between a man and woman.


In [9]:
# Get all genres
genres = ["Crime", "Thriller", "Fantasy", "Horror", "Sci-Fi", "Comedy", "Documentary", "Adventure", "Film-Noir", "Animation", "Romance", "Drama", "Western", "Musical", "Action", "Mystery", "War", "Children\'s"]
mapping = {}
for i in range(len(genres)):
    mapping[i] = genres[i]
mapping

for i in range(len(final_df)):
    gens = final_df['genres'][i]
    for g in gens:
        if g == 'Science Fiction':
            final_df['genres'][i].remove(g)
            final_df['genres'][i].append('Sci-Fi')
        else:
            if g not in genres:
                final_df['genres'][i].remove(g)

# Remove movies in df that has no genres
final_df = final_df[final_df['genres'].apply(lambda x: len(x) > 0)]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')
final_df

Unnamed: 0,title,genres,overview
0,avatar,"[Action, Adventure, Fantasy, Sci-Fi]","In the 22nd century, a paraplegic Marine is di..."
1,pirates of the caribbean: at world's end,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha..."
2,spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...
3,the dark knight rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...
4,john carter,"[Action, Adventure, Sci-Fi]","John Carter is a war-weary, former military ca..."
...,...,...,...
39274,shadow of the blair witch,"[Mystery, Horror]","In this true-crime documentary, we delve into ..."
39275,the burkittsville 7,[Horror],A film archivist revisits the story of Rustin ...
39276,caged heat 3000,[Sci-Fi],It's the year 3000 AD. The world's most danger...
39277,subdue,[Drama],Rising and falling between a man and woman.


In [10]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model1 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model1.config.id2label = mapping
model2 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model2.config.id2label = mapping

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess(df, genres = genres) -> pd.DataFrame:
    df['label'] = df.genres.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genres'], inplace=True)
    df = df.reset_index(drop=True)
    return df

In [12]:
final_df = preprocess(final_df)

In [13]:
final_df

Unnamed: 0,title,overview,label
0,avatar,"In the 22nd century, a paraplegic Marine is di...","[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
1,pirates of the caribbean: at world's end,"Captain Barbossa, long believed to be dead, ha...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
2,spectre,A cryptic message from Bond’s past sends him o...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,the dark knight rises,Following the death of District Attorney Harve...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ..."
4,john carter,"John Carter is a war-weary, former military ca...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...
39274,shadow of the blair witch,"In this true-crime documentary, we delve into ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39275,the burkittsville 7,A film archivist revisits the story of Rustin ...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39276,caged heat 3000,It's the year 3000 AD. The world's most danger...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39277,subdue,Rising and falling between a man and woman.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [14]:
#split df into train and test datasets with 90% and 10% of the data randomly
trainset = final_df.sample(frac=0.85, random_state=42)
testset = final_df.drop(trainset.index).reset_index(drop=True)

In [15]:
trainset

Unnamed: 0,title,overview,label
11905,pretty maids all in a row,"In a California high school, a married teacher...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36465,fist fight,"When one school teacher gets the other fired, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5662,somebody is waiting,An alcoholic father must take control of his w...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
8844,ringu 2,While investigating the horrifying death of he...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25031,white cargo,"In Africa early in World War II, a British rub...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...
8624,company business,"An aging agent is called back by ""the Company""...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
25494,intimidation,"Koreyoshi Kurahara’s ingeniously plotted, pock...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
22821,charlie chan's chance,"Charlie is the intended murder victim here, an...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5565,the quiet man,Sean Thornton has returned from America to rec...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [16]:
#change title and overview columns to list of string type
trainset['title'] = trainset['title'].astype(str).tolist()
trainset['overview'] = trainset['overview'].astype(str).tolist()
testset['title'] = testset['title'].astype(str).tolist()
testset['overview'] = testset['overview'].astype(str).tolist()

# Hard Code

In [17]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row['title']
        overview = row['overview']
        label = row['label']
        title_encoding = self.tokenizer(title, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        overview_encoding = self.tokenizer(overview, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        return {
            'title': title,
            'overview': overview,
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'overview_input_ids': overview_encoding['input_ids'].squeeze(),
            'overview_attention_mask': overview_encoding['attention_mask'].squeeze(),
            'label': torch.FloatTensor(label)
        }

In [18]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [19]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model1.to(device)
model2.to(device)
device

device(type='cuda')

In [21]:
class multimodel(torch.nn.Module):
    def __init__(self, model1, model2):
        super().__init__()
        self.model1 = model1
        self.model2 = model2
        self.linear = torch.nn.Linear(18, 18)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask):
        title_output = self.model1(title_input_ids, title_attention_mask)
        overview_output = self.model2(overview_input_ids, overview_attention_mask)
        output = (title_output.logits + overview_output.logits) / 2
        output = self.linear(output)
        output = self.sigmoid(output)
        return output

In [22]:
model = multimodel(model1, model2)
model.to(device)

multimodel(
  (model1): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn

In [23]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [24]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        title_input_ids = data['title_input_ids'].to(device)
        title_attention_mask = data['title_attention_mask'].to(device)
        overview_input_ids = data['overview_input_ids'].to(device)
        overview_attention_mask = data['overview_attention_mask'].to(device)
        targets = data['label'].to(device)

        outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
        loss = loss_fn(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} loss: {loss.item()}")

In [25]:
for epoch in range(4):
    train(epoch)

  0%|          | 0/1044 [00:00<?, ?it/s]

  1%|          | 8/1044 [00:37<1:21:32,  4.72s/it]


KeyboardInterrupt: 

In [None]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            title_input_ids = data['title_input_ids'].to(device)
            title_attention_mask = data['title_attention_mask'].to(device)
            overview_input_ids = data['overview_input_ids'].to(device)
            overview_attention_mask = data['overview_attention_mask'].to(device)
            targets = data['label'].to(device)

            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 185/185 [01:52<00:00,  1.65it/s]


In [None]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.)

In [None]:
# Save model
model.save_pretrained('model')

In [None]:
# Inferencing title and overview
def inference(title, overview = None):
    model.eval()
    with torch.no_grad():
        title_encoding = tokenizer(title, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
        title_input_ids = title_encoding['input_ids'].to(device)
        title_attention_mask = title_encoding['attention_mask'].to(device)
        if overview is not None:
            overview_encoding = tokenizer(overview, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
            overview_input_ids = overview_encoding['input_ids'].to(device)
            overview_attention_mask = overview_encoding['attention_mask'].to(device)
            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
        else:
            outputs = model(title_input_ids, title_attention_mask, None, None)
        outputs = outputs.cpu().detach().numpy().tolist()
        outputs = np.array(outputs) >= 0.5
        return outputs

In [None]:
_overview = "1920's prohibition Chicago is corrupt from the judges downward. So in going up against Al Capone, Treasury Agent Eliot Ness picks just two cops to help him and his accountant colleague. One is a sharp-shooting rookie, the other a seen-it-all beat man. The four of them are ready to battle Capone and his empire, but it could just be that guns are not the best way to get him."
inference('The Untouchables (1987)', overview=_overview)