In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [2]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

mapping

{0: 'Crime',
 1: 'Thriller',
 2: 'Fantasy',
 3: 'Horror',
 4: 'Sci-Fi',
 5: 'Comedy',
 6: 'Documentary',
 7: 'Adventure',
 8: 'Film-Noir',
 9: 'Animation',
 10: 'Romance',
 11: 'Drama',
 12: 'Western',
 13: 'Musical',
 14: 'Action',
 15: 'Mystery',
 16: 'War',
 17: "Children's"}

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", problem_type="multi_label_classification", num_labels=18, ignore_mismatched_sizes=True)
model.config.id2label = mapping

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([18]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([18, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

users.age = users.age.astype('int')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('int')
ratings.userid = ratings.userid.astype('int')

In [5]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df.rename(columns={'title': 'text'}, inplace=True)
    df = df.reset_index(drop=True)
    return df

In [6]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)

# Hard Code

In [7]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row.text
        label = row.label
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

    def __len__(self):
        return len(self.df)

In [8]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [9]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [12]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [13]:
for epoch in range(64):
    train(epoch)

100%|██████████| 98/98 [00:11<00:00,  8.58it/s]


Epoch: 0, Loss:  0.460553914308548


100%|██████████| 98/98 [00:11<00:00,  8.59it/s]


Epoch: 1, Loss:  0.22467412054538727


100%|██████████| 98/98 [00:11<00:00,  8.23it/s]


Epoch: 2, Loss:  0.358207643032074


100%|██████████| 98/98 [00:11<00:00,  8.24it/s]


Epoch: 3, Loss:  0.1990063339471817


100%|██████████| 98/98 [00:11<00:00,  8.63it/s]


Epoch: 4, Loss:  0.20099091529846191


100%|██████████| 98/98 [00:11<00:00,  8.68it/s]


Epoch: 5, Loss:  0.26230278611183167


100%|██████████| 98/98 [00:11<00:00,  8.68it/s]


Epoch: 6, Loss:  0.3331564962863922


100%|██████████| 98/98 [00:11<00:00,  8.86it/s]


Epoch: 7, Loss:  0.16517199575901031


100%|██████████| 98/98 [00:11<00:00,  8.41it/s]


Epoch: 8, Loss:  0.5639883279800415


100%|██████████| 98/98 [00:09<00:00, 10.37it/s]


Epoch: 9, Loss:  0.31659600138664246


100%|██████████| 98/98 [00:09<00:00, 10.46it/s]


Epoch: 10, Loss:  0.3302447199821472


100%|██████████| 98/98 [00:09<00:00, 10.57it/s]


Epoch: 11, Loss:  0.23241032660007477


100%|██████████| 98/98 [00:09<00:00, 10.74it/s]


Epoch: 12, Loss:  0.10738591104745865


100%|██████████| 98/98 [00:09<00:00, 10.57it/s]


Epoch: 13, Loss:  0.20457972586154938


100%|██████████| 98/98 [00:09<00:00, 10.52it/s]


Epoch: 14, Loss:  0.22153601050376892


100%|██████████| 98/98 [00:09<00:00, 10.61it/s]


Epoch: 15, Loss:  0.2072191834449768


100%|██████████| 98/98 [00:09<00:00, 10.77it/s]


Epoch: 16, Loss:  0.16541846096515656


100%|██████████| 98/98 [00:09<00:00, 10.63it/s]


Epoch: 17, Loss:  0.3966199457645416


100%|██████████| 98/98 [00:10<00:00,  9.69it/s]


Epoch: 18, Loss:  0.1674673855304718


100%|██████████| 98/98 [00:14<00:00,  6.98it/s]


Epoch: 19, Loss:  0.1905251294374466


100%|██████████| 98/98 [00:13<00:00,  7.10it/s]


Epoch: 20, Loss:  0.28603315353393555


100%|██████████| 98/98 [00:16<00:00,  6.10it/s]


Epoch: 21, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:16<00:00,  5.79it/s]


Epoch: 22, Loss:  0.14992262423038483


100%|██████████| 98/98 [00:15<00:00,  6.47it/s]


Epoch: 23, Loss:  0.0989621952176094


100%|██████████| 98/98 [00:14<00:00,  6.75it/s]


Epoch: 24, Loss:  0.2389724850654602


100%|██████████| 98/98 [00:13<00:00,  7.03it/s]


Epoch: 25, Loss:  0.30379167199134827


100%|██████████| 98/98 [00:13<00:00,  7.26it/s]


Epoch: 26, Loss:  0.10964436084032059


100%|██████████| 98/98 [00:13<00:00,  7.26it/s]


Epoch: 27, Loss:  0.2234857827425003


100%|██████████| 98/98 [00:14<00:00,  6.89it/s]


Epoch: 28, Loss:  0.164289191365242


100%|██████████| 98/98 [00:13<00:00,  7.36it/s]


Epoch: 29, Loss:  0.09500060230493546


100%|██████████| 98/98 [00:13<00:00,  7.22it/s]


Epoch: 30, Loss:  0.07548091560602188


100%|██████████| 98/98 [00:13<00:00,  7.07it/s]


Epoch: 31, Loss:  0.15215083956718445


100%|██████████| 98/98 [00:13<00:00,  7.43it/s]


Epoch: 32, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:13<00:00,  7.44it/s]


Epoch: 33, Loss:  0.2781926393508911


100%|██████████| 98/98 [00:13<00:00,  7.24it/s]


Epoch: 34, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:14<00:00,  6.85it/s]


Epoch: 35, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:14<00:00,  6.69it/s]


Epoch: 36, Loss:  0.15543149411678314


100%|██████████| 98/98 [00:12<00:00,  7.79it/s]


Epoch: 37, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:12<00:00,  7.55it/s]


Epoch: 38, Loss:  0.13584764301776886


100%|██████████| 98/98 [00:13<00:00,  7.40it/s]


Epoch: 39, Loss:  0.09124132245779037


100%|██████████| 98/98 [00:13<00:00,  7.12it/s]


Epoch: 40, Loss:  0.1324339210987091


100%|██████████| 98/98 [00:13<00:00,  7.08it/s]


Epoch: 41, Loss:  0.07117597758769989


100%|██████████| 98/98 [00:12<00:00,  7.57it/s]


Epoch: 42, Loss:  0.09697258472442627


100%|██████████| 98/98 [00:12<00:00,  7.56it/s]


Epoch: 43, Loss:  0.17987172305583954


100%|██████████| 98/98 [00:13<00:00,  7.04it/s]


Epoch: 44, Loss:  0.05156918987631798


100%|██████████| 98/98 [00:13<00:00,  7.21it/s]


Epoch: 45, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:12<00:00,  7.92it/s]


Epoch: 46, Loss:  0.22861085832118988


100%|██████████| 98/98 [00:13<00:00,  7.19it/s]


Epoch: 47, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:14<00:00,  6.56it/s]


Epoch: 48, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:14<00:00,  6.77it/s]


Epoch: 49, Loss:  0.2101527899503708


100%|██████████| 98/98 [00:14<00:00,  6.82it/s]


Epoch: 50, Loss:  0.14353172481060028


100%|██████████| 98/98 [00:12<00:00,  7.66it/s]


Epoch: 51, Loss:  0.04447616636753082


100%|██████████| 98/98 [00:13<00:00,  7.05it/s]


Epoch: 52, Loss:  0.03841649368405342


100%|██████████| 98/98 [00:13<00:00,  7.15it/s]


Epoch: 53, Loss:  0.03960404172539711


100%|██████████| 98/98 [00:14<00:00,  6.61it/s]


Epoch: 54, Loss:  3.605805826123364e+33


100%|██████████| 98/98 [00:14<00:00,  6.82it/s]


Epoch: 55, Loss:  0.1297309845685959


100%|██████████| 98/98 [00:13<00:00,  7.07it/s]


Epoch: 56, Loss:  0.12416666001081467


100%|██████████| 98/98 [00:13<00:00,  7.21it/s]


Epoch: 57, Loss:  0.14005328714847565


100%|██████████| 98/98 [00:14<00:00,  6.70it/s]


Epoch: 58, Loss:  0.2737022936344147


100%|██████████| 98/98 [00:13<00:00,  7.48it/s]


Epoch: 59, Loss:  0.0953470841050148


100%|██████████| 98/98 [00:13<00:00,  7.00it/s]


Epoch: 60, Loss:  0.058052223175764084


100%|██████████| 98/98 [00:13<00:00,  7.16it/s]


Epoch: 61, Loss:  0.10651443153619766


100%|██████████| 98/98 [00:12<00:00,  7.75it/s]


Epoch: 62, Loss:  0.04650930315256119


100%|██████████| 98/98 [00:14<00:00,  6.76it/s]

Epoch: 63, Loss:  0.05049486458301544





In [14]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0), total=len(testing_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [15]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 25/25 [00:01<00:00, 18.68it/s]


In [16]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.1864)

In [17]:
# Save model
model.save_pretrained('model')

In [18]:
# Inferencing
def inference(input, threshold = 0.5, model=model, tokenizer=tokenizer):
    encoding = tokenizer.encode_plus(
        input,
        add_special_tokens=True,
        max_length=64,
        return_token_type_ids=True,
        padding='max_length',
        pad_to_max_length=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    outputs = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()[0]
    outputs = np.array(outputs) >= threshold
    outputs = np.where(outputs == True)[0]
    outputs = [genres[i] for i in outputs]
    print(outputs)

In [19]:
inference('The Untouchables (1987)')

['Drama']
