# Importing the required libraries

In [1]:
import torch
import pandas as pd
import numpy as np
import os
import warnings

from transformers import AutoTokenizer, DistilBertForSequenceClassification, AutoModelForSeq2SeqLM
from tqdm import tqdm
from torchvision import models
from torchvision.transforms import v2
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing import image
from torchmetrics.classification import MultilabelF1Score
from ast import literal_eval




In [2]:
warnings.filterwarnings("ignore")

# Data Preprocessing

In [3]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

In [4]:
#read testset and trainset into dataframes
train_df = pd.read_csv("trainset.csv", sep="|")
test_df = pd.read_csv("testset.csv", sep="|")

In [5]:
def adjustData(df: pd.DataFrame) -> pd.DataFrame:
    #remove special characters from the dataframes
    df['context'] = df['context'].apply(lambda x: x.replace('<pad>',''))
    df['context'] = df['context'].apply(lambda x: x.replace('</s>',''))
    #change every "\" in the image links to "/"
    df['img_path'] = df['img_path'].apply(lambda x: x.replace('\\','/'))
    #make the label in df a python list
    df['label'] = df['label'].apply(literal_eval)
    return df

In [6]:
train_df = adjustData(train_df)
test_df = adjustData(test_df)

In [7]:
len(train_df), len(test_df)

(3106, 777)

In [8]:
tokenizer_title = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_title = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model_title.config.id2label = mapping

tokenizer_overview = AutoTokenizer.from_pretrained("NowT-coding/distilBert-movie-genre-classification")
model_overview = DistilBertForSequenceClassification.from_pretrained("NowT-coding/distilBert-movie-genre-classification", problem_type="multi_label_classification", num_labels=18)
model_overview.config.id2label = mapping

model_image = models.resnet101(pretrained=False)
model_image.fc = torch.nn.Linear(2048, len(genres))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_title.to(device)
model_overview.to(device)
model_image.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [9]:
class Multimodal(torch.nn.Module):
    def __init__(self, model1, model2, model3):
        super().__init__()
        self.model1 = model1
        self.model2 = model2
        self.model3 = model3
        self.fc1 = torch.nn.Linear(18, 18)
        self.fc2 = torch.nn.Linear(18, 18)
        self.fc3 = torch.nn.Linear(18, 18)

    def forward(self, title_input_ids, title_attention_mask, context_input_ids, context_attention_mask, image_input):
        title_output = self.model1(title_input_ids, title_attention_mask)
        context_output = self.model2(context_input_ids, context_attention_mask)
        image_output = self.model3(image_input)

        title_output = self.fc1(title_output.logits)
        context_output = self.fc2(context_output.logits)
        image_output = self.fc3(image_output)
        
        output = torch.add(title_output, context_output)
        output = torch.add(output, image_output)
        return output

In [10]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer1, tokenizer2, max_len_title=64, max_len_context=256,
                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        self.df = df
        self.tokenizer1 = tokenizer1
        self.tokenizer2 = tokenizer2
        self.max_len_title = max_len_title
        self.max_len_context = max_len_context
        self.image = image
        self.device = device
        self.transform = v2.Compose([
            v2.Resize((224, 224)),
            v2.ToTensor(),
            v2.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row['title']
        context = row['context']
        image_path = row['img_path']
        label = row['label']
        title_encoding = self.tokenizer1(
            title, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len_title, 
            return_tensors='pt'
        )
        context_encoding = self.tokenizer2(
            context,
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len_context, 
            return_tensors='pt'
        )

        if os.path.exists(image_path):
            image_input = image.load_img(image_path)
            image_input = self.transform(image_input)
        else:
            image_input = torch.zeros((3, 224, 224))
        
        return {
            'title': title,
            'context': context,
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'context_input_ids': context_encoding['input_ids'].squeeze(),
            'context_attention_mask': context_encoding['attention_mask'].squeeze(),
            'image_input': image_input,
            'label': torch.FloatTensor(label)
        }

In [11]:
trainset = Poroset(train_df, tokenizer_title, tokenizer_overview)
testset = Poroset(test_df, tokenizer_title, tokenizer_overview)

In [12]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [13]:
sample = next(iter(trainloader))

# First sample of the batch
print('Title: ', sample['title'][0])
print('Context: ', sample['context'][0])
print('Label: ', sample['label'][0])
print('Image: ', sample['image_input'][0].shape)

Title:  Adventures of Sebastian Cole, The 
Context:   The movie Adventures of Sebastian Cole is about a young boy named Sebastian who embarks on a journey to find his missing father, who is a notorious criminal. Along the way, he meets a group of adventurers who help him navigate the dangerous world of the criminal underworld.
Label:  tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])
Image:  torch.Size([3, 224, 224])


In [14]:
model = Multimodal(model_title, model_overview, model_image)
model.to(device)

#for param in model.model2.parameters():
#    param.requires_grad = False
    
model

Multimodal(
  (model1): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn

In [15]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [16]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        
        title_input_ids = data['title_input_ids'].to(device)
        title_attention_mask = data['title_attention_mask'].to(device)
        context_input_ids = data['context_input_ids'].to(device)
        context_attention_mask = data['context_attention_mask'].to(device)
        image_input = data['image_input'].to(device)
        label = data['label'].to(device)

        optimizer.zero_grad()
        outputs = model(
            title_input_ids, title_attention_mask,
            context_input_ids, context_attention_mask,
            image_input
        )
        
        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Train Loss: {loss.item()}')

In [17]:
for epoch in range(128):
    train(epoch)

100%|██████████| 98/98 [13:49<00:00,  8.47s/it]


Epoch: 0, Train Loss: 0.25306209921836853


100%|██████████| 98/98 [07:35<00:00,  4.65s/it]


Epoch: 1, Train Loss: 0.3106345236301422


100%|██████████| 98/98 [07:05<00:00,  4.34s/it]


Epoch: 2, Train Loss: 0.18095718324184418


100%|██████████| 98/98 [07:03<00:00,  4.32s/it]


Epoch: 3, Train Loss: 0.0801863968372345


100%|██████████| 98/98 [07:20<00:00,  4.50s/it]


Epoch: 4, Train Loss: 0.0805177316069603


100%|██████████| 98/98 [07:20<00:00,  4.50s/it]


Epoch: 5, Train Loss: 0.18221081793308258


100%|██████████| 98/98 [07:29<00:00,  4.59s/it]


Epoch: 6, Train Loss: 0.021626271307468414


100%|██████████| 98/98 [07:17<00:00,  4.47s/it]


Epoch: 7, Train Loss: 0.01438826322555542


100%|██████████| 98/98 [06:49<00:00,  4.18s/it]


Epoch: 8, Train Loss: 0.03247864916920662


 21%|██▏       | 21/98 [01:57<07:10,  5.60s/it]


KeyboardInterrupt: 

In [None]:
# Validation
def test(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            title_input_ids = data['title_input_ids'].to(device)
            title_attention_mask = data['title_attention_mask'].to(device)
            context_input_ids = data['context_input_ids'].to(device)
            context_attention_mask = data['context_attention_mask'].to(device)
            image_input = data['image_input'].to(device)
            label = data['label'].to(device)

            outputs = model(
                title_input_ids, title_attention_mask,
                context_input_ids, context_attention_mask,
                image_input
            )

            fin_targets.extend(label.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [None]:
outputs, targets = test(testloader)

outputs = np.array(outputs) >= 0.5

In [None]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

In [None]:
# Save model
torch.save(model.state_dict(), 'model.pt')

In [None]:
# Map@k score
def apk(actual, predicted, k=10):
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
mapk(targets, outputs, 18)

In [None]:
# Inferencing title and overview
def inference(title, overview = None, genres = genres):
    model.eval()
    with torch.no_grad():
        title_encoding = tokenizer(title, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
        title_input_ids = title_encoding['input_ids'].to(device)
        title_attention_mask = title_encoding['attention_mask'].to(device)
        if overview is not None:
            overview_encoding = tokenizer(overview, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
            overview_input_ids = overview_encoding['input_ids'].to(device)
            overview_attention_mask = overview_encoding['attention_mask'].to(device)
            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
        else:
            outputs = model(title_input_ids, title_attention_mask, None, None)
        outputs = outputs.cpu().detach().numpy().tolist()
        outputs = np.array(outputs) >= 0.5
        print([genres[i] for i in range(len(genres)) if outputs[0][i] == 1])

In [None]:
print(testset[3]['title'])
#print genres of the movie in testset in word
for i in range(len(genres)):
    if testset[3]['label'][i] == 1:
        print(genres[i])

In [None]:
inference(testset[3]['title'],testset[3]['overview'])