# Importing the required libraries

In [1]:
import torch
import pandas as pd
import numpy as np
import os
import warnings

from transformers import AutoTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
from torchvision import models
from torchvision.transforms import v2
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing import image
from torchmetrics.classification import MultilabelF1Score

### Setting up the environment
***

In [2]:
warnings.filterwarnings("ignore")

***

In [3]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

In [4]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

*** 
Must add overview column to this table via generator

In [5]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['img_path'] = df.apply(lambda x: os.path.join(path, str(x.name) + '.jpg'), axis=1)
    df = df[df.img_path.apply(lambda x: os.path.exists(x))]
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df = df.reset_index(drop=True)
    return df

***

In [6]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)

In [7]:
trainset.head(5)

Unnamed: 0,title,img_path,label
0,Washington Square (1997),ml1m/content/dataset/ml1m-images\1650.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,"Net, The (1995)",ml1m/content/dataset/ml1m-images\185.jpg,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Batman Returns (1992),ml1m/content/dataset/ml1m-images\1377.jpg,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,"Boys from Brazil, The (1978)",ml1m/content/dataset/ml1m-images\3204.jpg,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Police Academy 5: Assignment: Miami Beach (1988),ml1m/content/dataset/ml1m-images\2382.jpg,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Model Implementation

### Sub-models
***

In [8]:
tokenizer1 = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model1 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model1.config.id2label = mapping

tokenizer2 = AutoTokenizer.from_pretrained("bert-base-uncased")
model2 = DistilBertForSequenceClassification.from_pretrained("bert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model2.config.id2label = mapping

model3 = models.resnet101(pretrained=False)
model3.fc = torch.nn.Linear(2048, len(genres))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['transformer.layer.3.attention.q_lin.weight', 'transformer.layer.10.attention.k_lin.bias', 'transformer.layer.9.attention.out_lin.bias', 'transformer.layer.6.attention.k_lin.weight', 'transformer.layer.2.ffn.lin1.weight', 'transformer.layer.10.attention.q_lin.weight', 'transformer.layer.4.output_layer_norm.bias', 'transformer.layer.5.attentio

### Deep Fusion Multimodal Model
***

In [9]:
class Multimodal(torch.nn.Module):
    def __init__(self, model1, model2, model3):
        super().__init__()
        self.model1 = model1
        self.model2 = model2
        self.model3 = model3
        self.fc1 = torch.nn.Linear(18, 18)
        self.fc2 = torch.nn.Linear(18, 18)
        self.fc3 = torch.nn.Linear(18, 18)

    def forward(self, 
                title_input_ids, title_attention_mask,
                plot_input_ids, plot_attention_mask,
                image_input):
        title_output = self.model1(title_input_ids, title_attention_mask)
        plot_output = self.model2(plot_input_ids, plot_attention_mask)
        image_output = self.model3(image_input)

        title_output = self.fc1(title_output.logits)
        plot_output = self.fc2(plot_output.logits)
        image_output = self.fc3(image_output)
        
        output = torch.add(title_output, plot_output, image_output)
        return output

# Custom Datasets & Data Loaders

***
### Custom Dataset
***

In [10]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer1, tokenizer2, max_len=256):
        self.df = df
        self.tokenizer1 = tokenizer1
        self.tokenizer2 = tokenizer2
        self.max_len = max_len
        self.image = image

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row['title']
        overview = row['overview']
        label = row['label']
        title_encoding = self.tokenizer1(title, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        overview_encoding = self.tokenizer2(overview, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        image_input = image.open(row['img_path'])
        image_input = v2.Compose([v2.Resize((224, 224)), v2.ToTensor()])(image_input)
        return {
            'title': title,
            'overview': overview,
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'overview_input_ids': overview_encoding['input_ids'].squeeze(),
            'overview_attention_mask': overview_encoding['attention_mask'].squeeze(),
            'image_input': image_input,
            'label': torch.FloatTensor(label)
        }

In [11]:
trainset = Poroset(trainset, tokenizer1, tokenizer2)
testset = Poroset(testset, tokenizer1, tokenizer2)

***

***
### Custom Data Loader
***

In [12]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

Check if the data loader is working properly

In [13]:
sample = next(iter(trainloader))
sample

KeyError: 'overview'

# Setting up the Trainer

***
### GPU & Model Configuration
***

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Multimodal(model1, model2, model3).to(device)

# Freeze model2 since it's pretrained
for param in model.model2.parameters():
    param.requires_grad = False

***
### Setting up loss function & optimizer
***

In [17]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

NameError: name 'model' is not defined

***
### Trainer
***

In [24]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        title_input_ids = data['title_input_ids'].to(device)
        title_attention_mask = data['title_attention_mask'].to(device)
        overview_input_ids = data['overview_input_ids'].to(device)
        overview_attention_mask = data['overview_attention_mask'].to(device)
        image_input = data['image_input'].to(device)
        label = data['label'].to(device)

        optimizer.zero_grad()
        outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask, image_input)
        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

# Training the model

In [25]:
for epoch in range(32):
    train(epoch)

100%|██████████| 1044/1044 [13:45<00:00,  1.26it/s]


Epoch 0 loss: 0.5877975225448608


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 1 loss: 0.582400918006897


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 2 loss: 0.5804471373558044


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 3 loss: 0.569242000579834


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 4 loss: 0.5625030398368835


100%|██████████| 1044/1044 [13:31<00:00,  1.29it/s]


Epoch 5 loss: 0.5552199482917786


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 6 loss: 0.5557193756103516


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 7 loss: 0.5441743731498718


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 8 loss: 0.5739604234695435


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 9 loss: 0.549248993396759


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 10 loss: 0.5646395683288574


100%|██████████| 1044/1044 [13:31<00:00,  1.29it/s]


Epoch 11 loss: 0.5462006330490112


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 12 loss: 0.5448351502418518


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 13 loss: 0.5582103133201599


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 14 loss: 0.5416957139968872


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 15 loss: 0.5413329601287842


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 16 loss: 0.5385647416114807


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 17 loss: 0.5503974556922913


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 18 loss: 0.5608019828796387


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 19 loss: 0.553778350353241


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 20 loss: 0.558198869228363


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 21 loss: 0.5483788251876831


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 22 loss: 0.5504338145256042


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 23 loss: 0.5401769280433655


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 24 loss: 0.5429915189743042


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 25 loss: 0.5376887321472168


100%|██████████| 1044/1044 [13:29<00:00,  1.29it/s]


Epoch 26 loss: 0.5472096800804138


100%|██████████| 1044/1044 [13:29<00:00,  1.29it/s]


Epoch 27 loss: 0.5329397916793823


100%|██████████| 1044/1044 [13:29<00:00,  1.29it/s]


Epoch 28 loss: 0.5490307807922363


100%|██████████| 1044/1044 [13:29<00:00,  1.29it/s]


Epoch 29 loss: 0.5395540595054626


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 30 loss: 0.5566591024398804


100%|██████████| 1044/1044 [13:30<00:00,  1.29it/s]


Epoch 31 loss: 0.5341386795043945


In [26]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            title_input_ids = data['title_input_ids'].to(device)
            title_attention_mask = data['title_attention_mask'].to(device)
            overview_input_ids = data['overview_input_ids'].to(device)
            overview_attention_mask = data['overview_attention_mask'].to(device)
            targets = data['label'].to(device)

            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [27]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 185/185 [00:51<00:00,  3.59it/s]


In [28]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.2762)

In [29]:
# Save model
torch.save(model.state_dict(), 'model.pt')

In [30]:
# Inferencing title and overview
def inference(title, overview = None, genres = genres):
    model.eval()
    with torch.no_grad():
        title_encoding = tokenizer(title, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
        title_input_ids = title_encoding['input_ids'].to(device)
        title_attention_mask = title_encoding['attention_mask'].to(device)
        if overview is not None:
            overview_encoding = tokenizer(overview, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
            overview_input_ids = overview_encoding['input_ids'].to(device)
            overview_attention_mask = overview_encoding['attention_mask'].to(device)
            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
        else:
            outputs = model(title_input_ids, title_attention_mask, None, None)
        outputs = outputs.cpu().detach().numpy().tolist()
        outputs = np.array(outputs) >= 0.5
        print([genres[i] for i in range(len(genres)) if outputs[0][i] == 1])

In [31]:
print(testset[3]['title'])
#print genres of the movie in testset in word
for i in range(len(genres)):
    if testset[3]['label'][i] == 1:
        print(genres[i])

a christmas carol
Animation
Drama


In [32]:
inference(testset[3]['title'],testset[3]['overview'])

['Drama']
