# Importing the required libraries

In [1]:
import torch
import pandas as pd
import numpy as np
import os
import warnings

from transformers import AutoTokenizer, DistilBertForSequenceClassification, AutoModelForSeq2SeqLM
from tqdm import tqdm
from torchvision import models
from torchvision.transforms import v2
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing import image
from torchmetrics.classification import MultilabelF1Score

### Setting up the environment
***

In [2]:
warnings.filterwarnings("ignore")

***

# Data Preprocessing

In [3]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

In [4]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

*** 
Must add overview column to this table via generator

In [5]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['img_path'] = df.apply(lambda x: os.path.join(path, str(x.name) + '.jpg'), axis=1)
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df = df.reset_index(drop=True)
    return df

***

In [6]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)
validset = testset.sample(frac=0.5, random_state=42)
testset = testset.drop(validset.index)

In [7]:
print(len(trainset), len(testset), len(validset))

3106 389 388


3883 movies loaded to trainset and testset

# Model Implementation

### Sub-models
***

In [8]:
tokenizer1 = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model1 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model1.config.id2label = mapping

tokenizer2 = AutoTokenizer.from_pretrained("bert-base-uncased")
model2 = DistilBertForSequenceClassification.from_pretrained("bert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model2.config.id2label = mapping

model3 = models.resnet101(pretrained=False)
model3.fc = torch.nn.Linear(2048, len(genres))

tokenizer_gen = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
model_gen = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model1.to(device)
model2.to(device)
model3.to(device)
model_gen.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['transformer.layer.7.output_layer_norm.weight', 'transformer.layer.3.attention.out_lin.bias', 'transformer.layer.11.ffn.lin1.weight', 'transformer.layer.1.ffn.lin2.bias', 'transformer.layer.0.output_layer_norm.weight', 'transformer.layer.10.ffn.lin1.bias', 'transformer.layer.11.sa_layer_norm.bias', 'transformer.layer.10.sa_layer_norm.weight',

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

### Deep Fusion Multimodal Model
***

In [9]:
class Multimodal(torch.nn.Module):
    def __init__(self, model1, model2, model3):
        super().__init__()
        self.model1 = model1
        self.model2 = model2
        self.model3 = model3
        self.fc1 = torch.nn.Linear(18, 18)
        self.fc2 = torch.nn.Linear(18, 18)
        self.fc3 = torch.nn.Linear(18, 18)

    def forward(self, 
                title_input_ids, title_attention_mask,
                plot_input_ids, plot_attention_mask,
                image_input):
        title_output = self.model1(title_input_ids, title_attention_mask)
        plot_output = self.model2(plot_input_ids, plot_attention_mask)
        image_output = self.model3(image_input)

        title_output = self.fc1(title_output.logits)
        plot_output = self.fc2(plot_output.logits)
        image_output = self.fc3(image_output)
        
        output = torch.add(title_output, plot_output)
        output = torch.add(output, image_output)
        return output

# Custom Datasets & Data Loaders

***
### Custom Dataset
***

In [10]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, 
                 tokenizer1, tokenizer2, 
                 generator, tokenizer_gen,
                 max_len=256,
                 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        self.df = df
        self.tokenizer1 = tokenizer1
        self.tokenizer2 = tokenizer2
        self.max_len = max_len
        self.image = image
        self.generator = generator
        self.tokenizer_gen = tokenizer_gen
        self.device = device
        self.transform = v2.Compose([
            v2.Resize((224, 224)),
            v2.ToTensor(),
            v2.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row['title']

        self.generator.eval()
        quote = 'What is the story of the movie {}?'
        input_ids = self.tokenizer_gen.encode(quote.format(title), return_tensors='pt').to(self.device)
        outputs = self.generator.generate(input_ids, max_length=256, do_sample=True, temperature=0.09)
        plot = self.tokenizer_gen.decode(outputs[0], skip_special_tokens=True)

        label = row['label']
        title_encoding = self.tokenizer1(title, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        plot_encoding = self.tokenizer2(plot, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')

        path='ml1m/content/dataset/ml1m-images'
        image_path = os.path.join(path, str(row.name) + '.jpg')
        if os.path.exists(image_path):
            image_input = image.load_img(image_path)
            image_input = self.transform(image_input)
        else:
            image_input = torch.zeros((3, 224, 224))
        
        return {
            'title': title,
            'plot': plot,
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'plot_input_ids': plot_encoding['input_ids'].squeeze(),
            'plot_attention_mask': plot_encoding['attention_mask'].squeeze(),
            'image_input': image_input,
            'label': torch.FloatTensor(label)
        }

In [11]:
trainset = Poroset(trainset, tokenizer1, tokenizer2, model_gen, tokenizer_gen)
validset = Poroset(validset, tokenizer1, tokenizer2, model_gen, tokenizer_gen)
testset = Poroset(testset, tokenizer1, tokenizer2, model_gen, tokenizer_gen)

***
### Custom Data Loader
***

In [12]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
validloader = torch.utils.data.DataLoader(validset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

Check if the data loader is working properly

In [13]:
sample = next(iter(trainloader))

# First sample of the batch
print('Title: ', sample['title'][0])
print('Plot: ', sample['plot'][0])
print('Label: ', sample['label'][0])
print('Image: ', sample['image_input'][0].shape)

Title:  Follow the Bitch (1998)
Plot:  The movie Follow the Bitch is a crime drama film about a man named Jack who is convicted of murdering his wife and her lover. He is sentenced to life in prison and faces numerous challenges, including being convicted of murder and being convicted of murder.
Label:  tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Image:  torch.Size([3, 224, 224])


# Setting up the Trainer

***
### GPU & Model Configuration
***

In [14]:
model = Multimodal(model1, model2, model3)
model.to(device)

for param in model.model2.parameters():
    param.requires_grad = False

***
### Setting up loss function & optimizer
***

In [15]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

***
### Trainer & Validation
***

In [16]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        title_input_ids = data['title_input_ids'].to(device)
        title_attention_mask = data['title_attention_mask'].to(device)
        plot_input_ids = data['plot_input_ids'].to(device)
        plot_attention_mask = data['plot_attention_mask'].to(device)
        image_input = data['image_input'].to(device)
        label = data['label'].to(device)

        optimizer.zero_grad()
        outputs = model(
            title_input_ids, title_attention_mask,
            plot_input_ids, plot_attention_mask,
            image_input
        )
        
        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Train Loss: {loss.item()}')

In [17]:
def valid(epoch):
    model.eval()
    with torch.no_grad():
        for _, data in tqdm(enumerate(validloader, 0), total=len(validloader)):
            title_input_ids = data['title_input_ids'].to(device)
            title_attention_mask = data['title_attention_mask'].to(device)
            plot_input_ids = data['plot_input_ids'].to(device)
            plot_attention_mask = data['plot_attention_mask'].to(device)
            image_input = data['image_input'].to(device)
            label = data['label'].to(device)

            outputs = model(
                title_input_ids, title_attention_mask,
                plot_input_ids, plot_attention_mask,
                image_input
            )

            loss = loss_fn(outputs, label)
    print(f'Epoch: {epoch}, Valid Loss: {loss.item()}')

# Training & Validating Loop

In [18]:
for epoch in range(10):
    train(epoch)
    valid(epoch)

  2%|▏         | 2/98 [03:20<2:40:23, 100.24s/it]


KeyboardInterrupt: 

# Testing the model

In [None]:
# Validation
def test(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader), total=len(testing_loader)):
            title_input_ids = data['title_input_ids'].to(device)
            title_attention_mask = data['title_attention_mask'].to(device)
            plot_input_ids = data['plot_input_ids'].to(device)
            plot_attention_mask = data['plot_attention_mask'].to(device)
            image_input = data['image_input'].to(device)
            label = data['label'].to(device)

            outputs = model(
                title_input_ids, title_attention_mask,
                plot_input_ids, plot_attention_mask,
                image_input
            )

            fin_targets.extend(label.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [None]:
outputs, targets = test(testloader)

outputs = np.array(outputs) >= 0.5

  0%|          | 0/13 [01:12<?, ?it/s]


KeyError: 'overview_input_ids'

In [None]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.2762)

In [None]:
# Save model
torch.save(model.state_dict(), 'model.pt')

In [None]:
# Map@k score
def mapk_score(y_true, y_pred):
    score = 0.0
    for k in range(1, 19):
        score += np.sum(y_pred[:, :k] == y_true) / k
    score /= y_true.shape[0]
    return score

In [None]:
# Inferencing title and overview
def inference(title, overview = None, genres = genres):
    model.eval()
    with torch.no_grad():
        title_encoding = tokenizer(title, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
        title_input_ids = title_encoding['input_ids'].to(device)
        title_attention_mask = title_encoding['attention_mask'].to(device)
        if overview is not None:
            overview_encoding = tokenizer(overview, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
            overview_input_ids = overview_encoding['input_ids'].to(device)
            overview_attention_mask = overview_encoding['attention_mask'].to(device)
            outputs = model(title_input_ids, title_attention_mask, overview_input_ids, overview_attention_mask)
        else:
            outputs = model(title_input_ids, title_attention_mask, None, None)
        outputs = outputs.cpu().detach().numpy().tolist()
        outputs = np.array(outputs) >= 0.5
        print([genres[i] for i in range(len(genres)) if outputs[0][i] == 1])

In [None]:
print(testset[3]['title'])
#print genres of the movie in testset in word
for i in range(len(genres)):
    if testset[3]['label'][i] == 1:
        print(genres[i])

a christmas carol
Animation
Drama


In [None]:
inference(testset[3]['title'],testset[3]['overview'])

['Drama']
