In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm

In [2]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

mapping

{0: 'Crime',
 1: 'Thriller',
 2: 'Fantasy',
 3: 'Horror',
 4: 'Sci-Fi',
 5: 'Comedy',
 6: 'Documentary',
 7: 'Adventure',
 8: 'Film-Noir',
 9: 'Animation',
 10: 'Romance',
 11: 'Drama',
 12: 'Western',
 13: 'Musical',
 14: 'Action',
 15: 'Mystery',
 16: 'War',
 17: "Children's"}

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model.config.id2label = mapping

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

users.age = users.age.astype('int')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('int')
ratings.userid = ratings.userid.astype('int')

In [5]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df.rename(columns={'title': 'text'}, inplace=True)
    df = df.reset_index(drop=True)
    return df

In [6]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)

# Hard Code

In [7]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row.text
        label = row.label
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

    def __len__(self):
        return len(self.df)

In [8]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [9]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [12]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [13]:
for epoch in range(32):
    train(epoch)

  0%|          | 0/98 [00:00<?, ?it/s]

100%|██████████| 98/98 [00:09<00:00, 10.08it/s]


Epoch: 0, Loss:  0.45576605200767517


100%|██████████| 98/98 [00:08<00:00, 10.94it/s]


Epoch: 1, Loss:  0.2698230743408203


100%|██████████| 98/98 [00:08<00:00, 10.94it/s]


Epoch: 2, Loss:  0.30712011456489563


100%|██████████| 98/98 [00:09<00:00, 10.84it/s]


Epoch: 3, Loss:  0.24804483354091644


100%|██████████| 98/98 [00:08<00:00, 10.95it/s]


Epoch: 4, Loss:  0.15726158022880554


100%|██████████| 98/98 [00:08<00:00, 11.03it/s]


Epoch: 5, Loss:  0.12012694031000137


100%|██████████| 98/98 [00:09<00:00, 10.72it/s]


Epoch: 6, Loss:  0.31045252084732056


100%|██████████| 98/98 [00:09<00:00, 10.15it/s]


Epoch: 7, Loss:  0.2804860770702362


100%|██████████| 98/98 [00:09<00:00, 10.04it/s]


Epoch: 8, Loss:  0.16216301918029785


100%|██████████| 98/98 [00:09<00:00, 10.32it/s]


Epoch: 9, Loss:  0.24716860055923462


100%|██████████| 98/98 [00:09<00:00, 10.74it/s]


Epoch: 10, Loss:  0.1992119401693344


100%|██████████| 98/98 [00:09<00:00, 10.63it/s]


Epoch: 11, Loss:  0.285226434469223


100%|██████████| 98/98 [00:09<00:00, 10.26it/s]


Epoch: 12, Loss:  0.14183856546878815


100%|██████████| 98/98 [00:09<00:00, 10.35it/s]


Epoch: 13, Loss:  0.06837382912635803


100%|██████████| 98/98 [00:09<00:00, 10.37it/s]


Epoch: 14, Loss:  0.12036792188882828


100%|██████████| 98/98 [00:09<00:00, 10.41it/s]


Epoch: 15, Loss:  0.09971380233764648


100%|██████████| 98/98 [00:09<00:00, 10.28it/s]


Epoch: 16, Loss:  0.257861465215683


100%|██████████| 98/98 [00:09<00:00, 10.23it/s]


Epoch: 17, Loss:  0.04670961573719978


100%|██████████| 98/98 [00:09<00:00, 10.21it/s]


Epoch: 18, Loss:  0.12478189915418625


100%|██████████| 98/98 [00:09<00:00, 10.34it/s]


Epoch: 19, Loss:  0.013522242195904255


100%|██████████| 98/98 [00:09<00:00, 10.43it/s]


Epoch: 20, Loss:  0.12125827372074127


100%|██████████| 98/98 [00:09<00:00, 10.37it/s]


Epoch: 21, Loss:  0.08006749302148819


100%|██████████| 98/98 [00:09<00:00, 10.11it/s]


Epoch: 22, Loss:  0.04729028046131134


100%|██████████| 98/98 [00:09<00:00,  9.99it/s]


Epoch: 23, Loss:  0.09062019735574722


100%|██████████| 98/98 [00:09<00:00, 10.26it/s]


Epoch: 24, Loss:  0.016012635082006454


100%|██████████| 98/98 [00:09<00:00, 10.02it/s]


Epoch: 25, Loss:  0.0427311547100544


100%|██████████| 98/98 [00:09<00:00, 10.00it/s]


Epoch: 26, Loss:  0.013237861916422844


100%|██████████| 98/98 [00:09<00:00, 10.33it/s]


Epoch: 27, Loss:  0.074485182762146


100%|██████████| 98/98 [00:09<00:00, 10.24it/s]


Epoch: 28, Loss:  0.05739229917526245


100%|██████████| 98/98 [00:09<00:00, 10.00it/s]


Epoch: 29, Loss:  0.003453482873737812


100%|██████████| 98/98 [00:09<00:00, 10.14it/s]


Epoch: 30, Loss:  0.04404656961560249


100%|██████████| 98/98 [00:09<00:00, 10.22it/s]

Epoch: 31, Loss:  0.032494720071554184





In [14]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0), total=len(testing_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [15]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 25/25 [00:00<00:00, 26.55it/s]


In [19]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.2733)

In [17]:
# Save model
model.save_pretrained('model')

In [42]:
# Inferencing
def inference(input, threshold = 0.5, model=model, tokenizer=tokenizer):
    encoding = tokenizer.encode_plus(
        input,
        add_special_tokens=True,
        max_length=64,
        return_token_type_ids=True,
        padding='max_length',
        pad_to_max_length=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    outputs = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()[0]
    outputs = np.array(outputs) >= threshold
    outputs = np.where(outputs == True)[0]
    outputs = [genres[i] for i in outputs]
    print(outputs)

In [71]:
inference('The Untouchables (1987)')

['Drama', 'Action']
