In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

mapping

{0: 'Crime',
 1: 'Thriller',
 2: 'Fantasy',
 3: 'Horror',
 4: 'Sci-Fi',
 5: 'Comedy',
 6: 'Documentary',
 7: 'Adventure',
 8: 'Film-Noir',
 9: 'Animation',
 10: 'Romance',
 11: 'Drama',
 12: 'Western',
 13: 'Musical',
 14: 'Action',
 15: 'Mystery',
 16: 'War',
 17: "Children's"}

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model.config.id2label = mapping

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
config.json: 100%|██████████| 483/483 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 941kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 770kB/s]
model.safetensors: 100%|██████████| 268M/268M [00:42<00:00, 6.23MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

users.age = users.age.astype('int')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('int')
ratings.userid = ratings.userid.astype('int')

In [5]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df.rename(columns={'title': 'text'}, inplace=True)
    df = df.reset_index(drop=True)
    return df

In [6]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)

# Hard Code

In [7]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row.text
        label = row.label
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

    def __len__(self):
        return len(self.df)

In [8]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [9]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [12]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [13]:
for epoch in range(64):
    train(epoch)

100%|██████████| 98/98 [05:00<00:00,  3.07s/it]


Epoch: 0, Loss:  0.288343220949173


100%|██████████| 98/98 [04:58<00:00,  3.05s/it]


Epoch: 1, Loss:  0.23165515065193176


100%|██████████| 98/98 [06:10<00:00,  3.79s/it]


Epoch: 2, Loss:  0.27339470386505127


100%|██████████| 98/98 [06:56<00:00,  4.25s/it]


Epoch: 3, Loss:  0.3109546899795532


100%|██████████| 98/98 [06:58<00:00,  4.27s/it]


Epoch: 4, Loss:  0.2761244475841522


100%|██████████| 98/98 [06:57<00:00,  4.26s/it]


Epoch: 5, Loss:  0.3438176214694977


100%|██████████| 98/98 [06:56<00:00,  4.25s/it]


Epoch: 6, Loss:  0.34809577465057373


100%|██████████| 98/98 [06:52<00:00,  4.21s/it]


Epoch: 7, Loss:  0.13059356808662415


100%|██████████| 98/98 [03:11<00:00,  1.96s/it]


Epoch: 8, Loss:  0.1354793757200241


100%|██████████| 98/98 [01:43<00:00,  1.06s/it]


Epoch: 9, Loss:  0.11476833373308182


100%|██████████| 98/98 [03:02<00:00,  1.86s/it]


Epoch: 10, Loss:  0.1987750381231308


100%|██████████| 98/98 [01:43<00:00,  1.06s/it]


Epoch: 11, Loss:  0.1393577605485916


100%|██████████| 98/98 [01:47<00:00,  1.09s/it]


Epoch: 12, Loss:  0.19577626883983612


100%|██████████| 98/98 [01:50<00:00,  1.13s/it]


Epoch: 13, Loss:  0.19761639833450317


100%|██████████| 98/98 [01:53<00:00,  1.15s/it]


Epoch: 14, Loss:  0.03532465919852257


100%|██████████| 98/98 [01:52<00:00,  1.15s/it]


Epoch: 15, Loss:  0.07044047862291336


100%|██████████| 98/98 [01:52<00:00,  1.15s/it]


Epoch: 16, Loss:  0.08114195615053177


100%|██████████| 98/98 [01:49<00:00,  1.12s/it]


Epoch: 17, Loss:  0.10673962533473969


100%|██████████| 98/98 [01:49<00:00,  1.11s/it]


Epoch: 18, Loss:  0.18558372557163239


100%|██████████| 98/98 [01:47<00:00,  1.10s/it]


Epoch: 19, Loss:  0.01926223188638687


100%|██████████| 98/98 [01:48<00:00,  1.11s/it]


Epoch: 20, Loss:  0.03494999557733536


100%|██████████| 98/98 [01:49<00:00,  1.11s/it]


Epoch: 21, Loss:  0.07784207910299301


100%|██████████| 98/98 [01:47<00:00,  1.09s/it]


Epoch: 22, Loss:  0.0752699077129364


100%|██████████| 98/98 [01:49<00:00,  1.11s/it]


Epoch: 23, Loss:  0.08181385695934296


100%|██████████| 98/98 [01:48<00:00,  1.11s/it]


Epoch: 24, Loss:  0.1266384869813919


100%|██████████| 98/98 [01:50<00:00,  1.12s/it]


Epoch: 25, Loss:  0.05455634742975235


100%|██████████| 98/98 [03:41<00:00,  2.26s/it]


Epoch: 26, Loss:  0.18046711385250092


100%|██████████| 98/98 [03:10<00:00,  1.94s/it]


Epoch: 27, Loss:  0.04122469574213028


100%|██████████| 98/98 [01:49<00:00,  1.12s/it]


Epoch: 28, Loss:  0.10111361742019653


100%|██████████| 98/98 [01:52<00:00,  1.15s/it]


Epoch: 29, Loss:  0.05330103635787964


100%|██████████| 98/98 [01:52<00:00,  1.15s/it]


Epoch: 30, Loss:  0.003656969405710697


100%|██████████| 98/98 [01:51<00:00,  1.14s/it]


Epoch: 31, Loss:  0.023019468411803246


100%|██████████| 98/98 [01:53<00:00,  1.15s/it]


Epoch: 32, Loss:  0.08131687343120575


100%|██████████| 98/98 [04:17<00:00,  2.63s/it]


Epoch: 33, Loss:  0.027081338688731194


100%|██████████| 98/98 [01:51<00:00,  1.14s/it]


Epoch: 34, Loss:  0.004520680755376816


100%|██████████| 98/98 [01:59<00:00,  1.22s/it]


Epoch: 35, Loss:  0.008470841683447361


100%|██████████| 98/98 [01:57<00:00,  1.20s/it]


Epoch: 36, Loss:  0.0014163546729832888


100%|██████████| 98/98 [01:55<00:00,  1.18s/it]


Epoch: 37, Loss:  0.015679407864809036


100%|██████████| 98/98 [01:55<00:00,  1.18s/it]


Epoch: 38, Loss:  0.05723736062645912


100%|██████████| 98/98 [01:57<00:00,  1.20s/it]


Epoch: 39, Loss:  0.01447168830782175


100%|██████████| 98/98 [02:02<00:00,  1.25s/it]


Epoch: 40, Loss:  0.002324535511434078


100%|██████████| 98/98 [01:56<00:00,  1.19s/it]


Epoch: 41, Loss:  0.003999894019216299


100%|██████████| 98/98 [01:59<00:00,  1.22s/it]


Epoch: 42, Loss:  0.037031710147857666


100%|██████████| 98/98 [02:08<00:00,  1.31s/it]


Epoch: 43, Loss:  0.015419559553265572


100%|██████████| 98/98 [02:08<00:00,  1.31s/it]


Epoch: 44, Loss:  0.05205089971423149


100%|██████████| 98/98 [02:06<00:00,  1.29s/it]


Epoch: 45, Loss:  0.009257004596292973


100%|██████████| 98/98 [02:09<00:00,  1.32s/it]


Epoch: 46, Loss:  0.043573375791311264


100%|██████████| 98/98 [02:06<00:00,  1.29s/it]


Epoch: 47, Loss:  0.005661619361490011


100%|██████████| 98/98 [02:06<00:00,  1.29s/it]


Epoch: 48, Loss:  0.009531121701002121


100%|██████████| 98/98 [02:07<00:00,  1.30s/it]


Epoch: 49, Loss:  0.0008502462296746671


100%|██████████| 98/98 [02:07<00:00,  1.30s/it]


Epoch: 50, Loss:  0.00904424674808979


100%|██████████| 98/98 [02:14<00:00,  1.37s/it]


Epoch: 51, Loss:  0.039174262434244156


100%|██████████| 98/98 [02:10<00:00,  1.33s/it]


Epoch: 52, Loss:  0.031033344566822052


100%|██████████| 98/98 [02:10<00:00,  1.33s/it]


Epoch: 53, Loss:  0.015105326659977436


100%|██████████| 98/98 [02:13<00:00,  1.37s/it]


Epoch: 54, Loss:  0.031192919239401817


100%|██████████| 98/98 [02:14<00:00,  1.37s/it]


Epoch: 55, Loss:  0.0027654762379825115


100%|██████████| 98/98 [03:05<00:00,  1.89s/it]


Epoch: 56, Loss:  0.013712316751480103


100%|██████████| 98/98 [06:34<00:00,  4.03s/it]


Epoch: 57, Loss:  0.018990052863955498


100%|██████████| 98/98 [02:08<00:00,  1.31s/it]


Epoch: 58, Loss:  0.005869248416274786


100%|██████████| 98/98 [05:10<00:00,  3.17s/it]


Epoch: 59, Loss:  0.0006743945414200425


100%|██████████| 98/98 [06:52<00:00,  4.20s/it]


Epoch: 60, Loss:  0.0012699845246970654


100%|██████████| 98/98 [05:39<00:00,  3.47s/it]


Epoch: 61, Loss:  0.004329721909016371


100%|██████████| 98/98 [05:23<00:00,  3.30s/it]


Epoch: 62, Loss:  0.004370098002254963


100%|██████████| 98/98 [02:38<00:00,  1.62s/it]

Epoch: 63, Loss:  0.04565936699509621





In [14]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0), total=len(testing_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [15]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 25/25 [00:08<00:00,  3.05it/s]


In [16]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.3071)

In [17]:
# Save model
model.save_pretrained('model')

In [18]:
# Inferencing
def inference(input, threshold = 0.5, model=model, tokenizer=tokenizer):
    encoding = tokenizer.encode_plus(
        input,
        add_special_tokens=True,
        max_length=64,
        return_token_type_ids=True,
        padding='max_length',
        pad_to_max_length=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    outputs = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()[0]
    outputs = np.array(outputs) >= threshold
    outputs = np.where(outputs == True)[0]
    outputs = [genres[i] for i in outputs]
    print(outputs)

In [19]:
inference('The Untouchables (1987)')

['Crime', 'Drama', 'Action']
