In [1]:
import torch
import torchmetrics
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
from ast import literal_eval

In [2]:
df1 = pd.read_csv('movie_5000/tmdb_5000_movies.csv')
df2 = pd.read_csv('movie_45000/movies_metadata.csv')

In [3]:
df1['genres'] = df1['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df2['genres'] = df2['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df2 = df2[df2.genres.apply(lambda x: len(x) > 0)]
df2.reset_index(inplace = True)
df2 = df2.drop(columns='index')

In [4]:
df1 = df1.drop(columns='id')
df2 = df2.drop(columns='id')

In [5]:
# Check if all df1 movies are in df2
df1['title'] = df1['title'].str.lower()
df2['title'] = df2['title'].str.lower()

# Make every movie in df1 unique by title
df1 = df1.drop_duplicates(subset=['title'], keep='first')
df1.reset_index(inplace = True)
df1 = df1.drop(columns='index')

# Make every movie in df2 unique by title
df2 = df2.drop_duplicates(subset=['title'], keep='first')
df2.reset_index(inplace = True)
df2 = df2.drop(columns='index')

len(df1), len(df2)

(4800, 39990)

In [6]:
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates(subset=['title'], keep='first')
df.reset_index(inplace = True)
df.drop(columns='index', inplace=True)

In [7]:
# Remove movies in df that has no overview
final_df = df.copy()
final_df = final_df[final_df['overview'].notnull()]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')
final_df = final_df[final_df['genres'].apply(lambda x: len(x) > 0)]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')
final_df

Unnamed: 0,title,genres,overview
0,avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di..."
1,pirates of the caribbean: at world's end,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha..."
2,spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...
3,the dark knight rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...
4,john carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca..."
...,...,...,...
39499,shadow of the blair witch,"[Mystery, Horror]","In this true-crime documentary, we delve into ..."
39500,the burkittsville 7,[Horror],A film archivist revisits the story of Rustin ...
39501,caged heat 3000,[Science Fiction],It's the year 3000 AD. The world's most danger...
39502,subdue,"[Drama, Family]",Rising and falling between a man and woman.


In [8]:
genres = ["Crime", "Thriller", "Fantasy", "Horror", "Sci-Fi", "Comedy", "Documentary", "Adventure", "Film-Noir", "Animation", "Romance", "Drama", "Western", "Musical", "Action", "Mystery", "War", "Children\'s"]
mapping = {}
for i in range(len(genres)):
    mapping[i] = genres[i]
mapping

for i in range(len(final_df)):
    gens = final_df['genres'][i]
    for g in gens:
        if g == 'Science Fiction':
            final_df['genres'][i].remove(g)
            final_df['genres'][i].append('Sci-Fi')
        else:
            if g not in genres:
                final_df['genres'][i].remove(g)

final_df = final_df[final_df['genres'].apply(lambda x: len(x) > 0)]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')
final_df

Unnamed: 0,title,genres,overview
0,avatar,"[Action, Adventure, Fantasy, Sci-Fi]","In the 22nd century, a paraplegic Marine is di..."
1,pirates of the caribbean: at world's end,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha..."
2,spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...
3,the dark knight rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...
4,john carter,"[Action, Adventure, Sci-Fi]","John Carter is a war-weary, former military ca..."
...,...,...,...
39274,shadow of the blair witch,"[Mystery, Horror]","In this true-crime documentary, we delve into ..."
39275,the burkittsville 7,[Horror],A film archivist revisits the story of Rustin ...
39276,caged heat 3000,[Sci-Fi],It's the year 3000 AD. The world's most danger...
39277,subdue,[Drama],Rising and falling between a man and woman.


In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model1 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model1.config.id2label = mapping
model2 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model2.config.id2label = mapping

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def preprocess(df, genres = genres) -> pd.DataFrame:
    df['label'] = df.genres.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genres'], inplace=True)
    df = df.reset_index(drop=True)
    return df

In [11]:
final_df = preprocess(final_df)
final_df

Unnamed: 0,title,overview,label
0,avatar,"In the 22nd century, a paraplegic Marine is di...","[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
1,pirates of the caribbean: at world's end,"Captain Barbossa, long believed to be dead, ha...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
2,spectre,A cryptic message from Bond’s past sends him o...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,the dark knight rises,Following the death of District Attorney Harve...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ..."
4,john carter,"John Carter is a war-weary, former military ca...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...
39274,shadow of the blair witch,"In this true-crime documentary, we delve into ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39275,the burkittsville 7,A film archivist revisits the story of Rustin ...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39276,caged heat 3000,It's the year 3000 AD. The world's most danger...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39277,subdue,Rising and falling between a man and woman.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [12]:
trainset = final_df.sample(frac=0.85, random_state=42)
testset = final_df.drop(trainset.index).reset_index(drop=True)

In [13]:
testset

Unnamed: 0,title,overview,label
0,batman v superman: dawn of justice,Fearing the actions of a god-like Super Hero l...,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
1,the avengers,When an unexpected enemy emerges and threatens...,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
2,brave,Brave is set in the mystical Scottish Highland...,"[0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, ..."
3,a christmas carol,Miser Ebenezer Scrooge is awakened on Christma...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ..."
4,up,Carl Fredricksen spent his entire life dreamin...,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, ..."
...,...,...,...
5887,pro lyuboff,"У девушки Даши, приехавшей с подругой «покорят...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
5888,kuka,A woman and a young girl in different cities a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
5889,"a black rose is an emblem of sorrow, a red ros...","The story happens mainly around ""The Old Arbat...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
5890,phobos. fear kills,Rainy summer evening... Young people are arriv...,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
#change title and overview columns to list of string type
trainset['title'] = trainset['title'].astype(str).tolist()
trainset['overview'] = trainset['overview'].astype(str).tolist()
testset['title'] = testset['title'].astype(str).tolist()
testset['overview'] = testset['overview'].astype(str).tolist()

In [15]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row['title']
        overview = row['overview']
        label = row['label']
        title_encoding = self.tokenizer(title, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        overview_encoding = self.tokenizer(overview, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        return {
            'title': title,
            'overview': overview,
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'overview_input_ids': overview_encoding['input_ids'].squeeze(),
            'overview_attention_mask': overview_encoding['attention_mask'].squeeze(),
            'label': torch.FloatTensor(label)
        }

In [16]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [17]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [18]:
class multimodel(torch.nn.Module):
    def __init__(self, model1, model2):
        super().__init__()
        self.model1 = model1
        self.model2 = model2
        self.linear1 = torch.nn.Linear(18, 18)
        self.linear2 = torch.nn.Linear(18, 18)

    def forward(self, title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask):
        title_output = self.model1(title_input_ids, title_attention_mask)
        overview_output = self.model2(overview_input_ids, overview_attention_mask)
        title_output = self.linear1(title_output[0])
        overview_output = self.linear2(overview_output[0])
        output = torch.add(title_output, overview_output)
        return output

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = multimodel(model1, model2)
model.load_state_dict(torch.load("model.pt"))
model.to(device)

multimodel(
  (model1): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn

In [20]:
#using the three above functions to calculate the those scores for the model
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            title_input_ids = data['title_input_ids'].to(device)
            title_attention_mask = data['title_attention_mask'].to(device)
            overview_input_ids = data['overview_input_ids'].to(device)
            overview_attention_mask = data['overview_attention_mask'].to(device)
            targets = data['label'].to(device)

            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [21]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 185/185 [01:04<00:00,  2.86it/s]


In [22]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='micro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.5324)

In [35]:
# calculate Normalized Discounted Cumulative Gain at k
from sklearn.metrics import ndcg_score
ndcg_score(targets, outputs, ignore_ties=True)


0.7212866732043367

In [36]:
# calculate Mean Average Precision
from sklearn.metrics import average_precision_score
average_precision_score(targets, outputs, average='micro')

0.3621910459646464