In [1]:
import torch
import torchmetrics
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
from ast import literal_eval
from torchmetrics.classification import MultilabelF1Score

In [2]:
df1 = pd.read_csv('movie_5000/tmdb_5000_movies.csv')
df2 = pd.read_csv('movie_45000/movies_metadata.csv')
df3 = pd.read_csv('data.csv')

In [3]:
df1['genres'] = df1['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df2['genres'] = df2['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df2 = df2[df2.genres.apply(lambda x: len(x) > 0)]
df3['context'] = df3['context'].apply(lambda x: x.replace('<pad>',''))
df3['context'] = df3['context'].apply(lambda x: x.replace('</s>',''))
df2.reset_index(inplace = True)
df2 = df2.drop(columns='index')

In [4]:
#change the position of the columns 'label' and 'context' in df3 so that the result is text -> context -> label
cols = list(df3.columns)
cols = [cols[0]] + [cols[-1]] + cols[1:-1]
df3 = df3[cols]
df3['label'] = df3['label'].apply(literal_eval)
df3

Unnamed: 0,text,context,label
0,Washington Square,Washington Square is a movie about a man name...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,"Net, The",Net is a science fiction movie about a group ...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Batman Returns,Batman Returns is a superhero movie about a y...,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,"Boys from Brazil, The",Boys from Brazil is a movie about a group of ...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Dear Jesse,Dear Jesse is a movie about a man named Jesse...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
27800,Kein Bund für's Leben,Kein Bund für's Leben is a German film about ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27801,"Feuer, Eis & Dosenbier","Feuer, Eis & Dosenbier is a movie about a gro...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27802,The Pirates,The Pirates is a movie about a group of pirat...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
27803,Rentun Ruusu,Rentun Ruusu is a movie about a young woman n...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
df1 = df1.drop(columns='id')
df2 = df2.drop(columns='id')

In [6]:
# Check if all df1 movies are in df2
df1['title'] = df1['title'].str.lower()
df2['title'] = df2['title'].str.lower()

# Make every movie in df1 unique by title
df1 = df1.drop_duplicates(subset=['title'], keep='first')
df1.reset_index(inplace = True)
df1 = df1.drop(columns='index')

# Make every movie in df2 unique by title
df2 = df2.drop_duplicates(subset=['title'], keep='first')
df2.reset_index(inplace = True)
df2 = df2.drop(columns='index')

len(df1), len(df2)

(4800, 39990)

In [7]:
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates(subset=['title'], keep='first')
df.reset_index(inplace = True)
df.drop(columns='index', inplace=True)

In [8]:
# Remove movies in df that has no overview
final_df = df.copy()
final_df = final_df[final_df['overview'].notnull()]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')
final_df = final_df[final_df['genres'].apply(lambda x: len(x) > 0)]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')

In [9]:
genres = ["Crime", "Thriller", "Fantasy", "Horror", "Sci-Fi", "Comedy", "Documentary", "Adventure", "Film-Noir", "Animation", "Romance", "Drama", "Western", "Musical", "Action", "Mystery", "War", "Children\'s"]
mapping = {}
for i in range(len(genres)):
    mapping[i] = genres[i]
mapping

for i in range(len(final_df)):
    gens = final_df['genres'][i]
    for g in gens:
        if g == 'Science Fiction':
            final_df['genres'][i].remove(g)
            final_df['genres'][i].append('Sci-Fi')
        else:
            if g not in genres:
                final_df['genres'][i].remove(g)

final_df = final_df[final_df['genres'].apply(lambda x: len(x) > 0)]
final_df.reset_index(inplace = True)
final_df = final_df.drop(columns='index')

In [10]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model.config.id2label = mapping

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess(df, genres = genres) -> pd.DataFrame:
    df['label'] = df.genres.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genres'], inplace=True)
    df = df.reset_index(drop=True)
    return df

In [12]:
final_df = preprocess(final_df)

In [13]:
#merge df3 and final_df
final_df = final_df.rename(columns={'title':'text'})
final_df = final_df.rename(columns={'overview':'context'})
final_df

Unnamed: 0,text,context,label
0,avatar,"In the 22nd century, a paraplegic Marine is di...","[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
1,pirates of the caribbean: at world's end,"Captain Barbossa, long believed to be dead, ha...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
2,spectre,A cryptic message from Bond’s past sends him o...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,the dark knight rises,Following the death of District Attorney Harve...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ..."
4,john carter,"John Carter is a war-weary, former military ca...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...
39274,shadow of the blair witch,"In this true-crime documentary, we delve into ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39275,the burkittsville 7,A film archivist revisits the story of Rustin ...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39276,caged heat 3000,It's the year 3000 AD. The world's most danger...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
39277,subdue,Rising and falling between a man and woman.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [14]:
#add df3 to final_df
final_df = pd.concat([final_df, df3], ignore_index=True)
final_df = final_df.reset_index(drop=True)
final_df

Unnamed: 0,text,context,label
0,avatar,"In the 22nd century, a paraplegic Marine is di...","[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
1,pirates of the caribbean: at world's end,"Captain Barbossa, long believed to be dead, ha...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
2,spectre,A cryptic message from Bond’s past sends him o...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,the dark knight rises,Following the death of District Attorney Harve...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ..."
4,john carter,"John Carter is a war-weary, former military ca...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...
67079,Kein Bund für's Leben,Kein Bund für's Leben is a German film about ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67080,"Feuer, Eis & Dosenbier","Feuer, Eis & Dosenbier is a movie about a gro...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67081,The Pirates,The Pirates is a movie about a group of pirat...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
67082,Rentun Ruusu,Rentun Ruusu is a movie about a young woman n...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
#split final_df into train, val and test with the portion 0.8, 0.1, 0.1 randomly
trainset = final_df
validset = final_df.sample(frac=0.1, random_state=42)
testset = final_df.sample(frac=0.1, random_state=32)

In [16]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        context = row.context
        label = row.label

        if len(context) > self.max_len:
            context = context[:self.max_len]
        
        encoding = self.tokenizer.encode_plus(
            context,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'context': context,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [17]:
trainset = Poroset(trainset, tokenizer)
validset = Poroset(validset, tokenizer)
testset = Poroset(testset, tokenizer)

In [18]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
validloader = torch.utils.data.DataLoader(validset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load('model.pt'))
model.to(device)
device

device(type='cuda')

In [20]:
#validate the model with the testset
def test(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0), total=len(testing_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, attention_mask)
            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [21]:
outputs, targets = test(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 210/210 [00:35<00:00,  5.88it/s]


In [22]:
# Multi-label F1 score, macro-averaged
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.9887)

In [23]:
#create config.json file for the model
model.config.save_pretrained('model')

In [24]:
#push model to huggingface
model.push_to_hub('distilBert-movie-genre-classification')

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NowT-coding/distilBert-movie-genre-classification/commit/c86fb1413e6c28361fbc33d7b5c0194de9fc6d6d', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='c86fb1413e6c28361fbc33d7b5c0194de9fc6d6d', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
tokenizer.push_to_hub('distilBert-movie-genre-classification')

CommitInfo(commit_url='https://huggingface.co/NowT-coding/distilBert-movie-genre-classification/commit/fba33b557de41c07c42458e8808872bc68b88df6', commit_message='Upload tokenizer', commit_description='', oid='fba33b557de41c07c42458e8808872bc68b88df6', pr_url=None, pr_revision=None, pr_num=None)