In [4]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [5]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

mapping

{0: 'Crime',
 1: 'Thriller',
 2: 'Fantasy',
 3: 'Horror',
 4: 'Sci-Fi',
 5: 'Comedy',
 6: 'Documentary',
 7: 'Adventure',
 8: 'Film-Noir',
 9: 'Animation',
 10: 'Romance',
 11: 'Drama',
 12: 'Western',
 13: 'Musical',
 14: 'Action',
 15: 'Mystery',
 16: 'War',
 17: "Children's"}

In [6]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", problem_type="multi_label_classification", num_labels=18)
model.config.id2label = mapping

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

users.age = users.age.astype('int')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('int')
ratings.userid = ratings.userid.astype('int')

In [8]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df.rename(columns={'title': 'text'}, inplace=True)
    df = df.reset_index(drop=True)
    return df

In [9]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)

# Hard Code

In [10]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row.text
        label = row.label
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

    def __len__(self):
        return len(self.df)

In [11]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [12]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [15]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [16]:
for epoch in range(64):
    train(epoch)

  0%|          | 0/98 [00:00<?, ?it/s]

100%|██████████| 98/98 [00:23<00:00,  4.10it/s]


Epoch: 0, Loss:  0.28000563383102417


100%|██████████| 98/98 [00:21<00:00,  4.50it/s]


Epoch: 1, Loss:  0.3310627341270447


100%|██████████| 98/98 [00:21<00:00,  4.62it/s]


Epoch: 2, Loss:  0.17934368550777435


100%|██████████| 98/98 [00:20<00:00,  4.80it/s]


Epoch: 3, Loss:  0.20069929957389832


100%|██████████| 98/98 [00:22<00:00,  4.27it/s]


Epoch: 4, Loss:  0.2902584671974182


100%|██████████| 98/98 [00:25<00:00,  3.82it/s]


Epoch: 5, Loss:  0.6377385854721069


100%|██████████| 98/98 [00:24<00:00,  4.02it/s]


Epoch: 6, Loss:  0.36732789874076843


100%|██████████| 98/98 [00:23<00:00,  4.15it/s]


Epoch: 7, Loss:  0.2727365493774414


100%|██████████| 98/98 [00:24<00:00,  4.06it/s]


Epoch: 8, Loss:  0.30954065918922424


100%|██████████| 98/98 [00:22<00:00,  4.27it/s]


Epoch: 9, Loss:  0.2295275777578354


100%|██████████| 98/98 [00:23<00:00,  4.12it/s]


Epoch: 10, Loss:  0.2563683092594147


100%|██████████| 98/98 [00:23<00:00,  4.23it/s]


Epoch: 11, Loss:  0.17563076317310333


100%|██████████| 98/98 [00:22<00:00,  4.32it/s]


Epoch: 12, Loss:  0.10023944824934006


100%|██████████| 98/98 [00:25<00:00,  3.91it/s]


Epoch: 13, Loss:  0.17225193977355957


100%|██████████| 98/98 [00:23<00:00,  4.15it/s]


Epoch: 14, Loss:  0.09511181712150574


100%|██████████| 98/98 [00:26<00:00,  3.68it/s]


Epoch: 15, Loss:  0.24038733541965485


100%|██████████| 98/98 [00:27<00:00,  3.52it/s]


Epoch: 16, Loss:  0.16122934222221375


100%|██████████| 98/98 [00:25<00:00,  3.88it/s]


Epoch: 17, Loss:  0.31239500641822815


100%|██████████| 98/98 [00:24<00:00,  3.99it/s]


Epoch: 18, Loss:  0.24111594259738922


100%|██████████| 98/98 [00:22<00:00,  4.43it/s]


Epoch: 19, Loss:  0.09189382940530777


100%|██████████| 98/98 [00:21<00:00,  4.49it/s]


Epoch: 20, Loss:  0.08506599813699722


100%|██████████| 98/98 [00:21<00:00,  4.65it/s]


Epoch: 21, Loss:  0.277985155582428


100%|██████████| 98/98 [00:23<00:00,  4.19it/s]


Epoch: 22, Loss:  0.12426655739545822


100%|██████████| 98/98 [00:25<00:00,  3.91it/s]


Epoch: 23, Loss:  0.1233995258808136


100%|██████████| 98/98 [00:24<00:00,  4.08it/s]


Epoch: 24, Loss:  0.06814946234226227


100%|██████████| 98/98 [00:24<00:00,  3.95it/s]


Epoch: 25, Loss:  0.21900013089179993


100%|██████████| 98/98 [00:25<00:00,  3.84it/s]


Epoch: 26, Loss:  0.06775425374507904


100%|██████████| 98/98 [00:25<00:00,  3.79it/s]


Epoch: 27, Loss:  0.19896484911441803


100%|██████████| 98/98 [00:24<00:00,  4.06it/s]


Epoch: 28, Loss:  0.09164009243249893


100%|██████████| 98/98 [00:24<00:00,  3.94it/s]


Epoch: 29, Loss:  0.1128692775964737


100%|██████████| 98/98 [00:25<00:00,  3.90it/s]


Epoch: 30, Loss:  0.13895872235298157


100%|██████████| 98/98 [00:24<00:00,  3.97it/s]


Epoch: 31, Loss:  0.04639572277665138


100%|██████████| 98/98 [00:21<00:00,  4.65it/s]


Epoch: 32, Loss:  0.04343739151954651


100%|██████████| 98/98 [00:19<00:00,  5.16it/s]


Epoch: 33, Loss:  0.07083424180746078


100%|██████████| 98/98 [00:17<00:00,  5.54it/s]


Epoch: 34, Loss:  0.14080291986465454


100%|██████████| 98/98 [00:17<00:00,  5.52it/s]


Epoch: 35, Loss:  0.1443641483783722


100%|██████████| 98/98 [00:17<00:00,  5.49it/s]


Epoch: 36, Loss:  0.02778102271258831


100%|██████████| 98/98 [00:17<00:00,  5.48it/s]


Epoch: 37, Loss:  0.1582098752260208


100%|██████████| 98/98 [00:19<00:00,  5.03it/s]


Epoch: 38, Loss:  0.05556793883442879


100%|██████████| 98/98 [00:19<00:00,  5.15it/s]


Epoch: 39, Loss:  0.008317975327372551


100%|██████████| 98/98 [00:19<00:00,  4.99it/s]


Epoch: 40, Loss:  0.020502246916294098


100%|██████████| 98/98 [00:19<00:00,  5.08it/s]


Epoch: 41, Loss:  0.15879350900650024


100%|██████████| 98/98 [00:19<00:00,  4.98it/s]


Epoch: 42, Loss:  0.009331694804131985


100%|██████████| 98/98 [00:19<00:00,  5.10it/s]


Epoch: 43, Loss:  0.09081263095140457


100%|██████████| 98/98 [00:19<00:00,  5.14it/s]


Epoch: 44, Loss:  0.016007905825972557


100%|██████████| 98/98 [00:18<00:00,  5.21it/s]


Epoch: 45, Loss:  0.027287354692816734


100%|██████████| 98/98 [00:19<00:00,  5.11it/s]


Epoch: 46, Loss:  0.011871249414980412


100%|██████████| 98/98 [00:19<00:00,  5.08it/s]


Epoch: 47, Loss:  0.003766340436413884


100%|██████████| 98/98 [00:19<00:00,  5.07it/s]


Epoch: 48, Loss:  0.02766585536301136


100%|██████████| 98/98 [00:19<00:00,  5.12it/s]


Epoch: 49, Loss:  0.04602060094475746


100%|██████████| 98/98 [00:19<00:00,  5.07it/s]


Epoch: 50, Loss:  0.0025931415148079395


100%|██████████| 98/98 [00:19<00:00,  4.95it/s]


Epoch: 51, Loss:  0.010890803299844265


100%|██████████| 98/98 [00:18<00:00,  5.21it/s]


Epoch: 52, Loss:  0.005596073344349861


100%|██████████| 98/98 [00:17<00:00,  5.52it/s]


Epoch: 53, Loss:  0.04949874430894852


100%|██████████| 98/98 [00:17<00:00,  5.63it/s]


Epoch: 54, Loss:  0.008279169909656048


100%|██████████| 98/98 [00:18<00:00,  5.41it/s]


Epoch: 55, Loss:  0.013570686802268028


100%|██████████| 98/98 [00:17<00:00,  5.61it/s]


Epoch: 56, Loss:  0.004640011582523584


100%|██████████| 98/98 [00:17<00:00,  5.57it/s]


Epoch: 57, Loss:  0.014108349569141865


100%|██████████| 98/98 [00:17<00:00,  5.51it/s]


Epoch: 58, Loss:  0.019709095358848572


100%|██████████| 98/98 [00:19<00:00,  5.07it/s]


Epoch: 59, Loss:  0.024676019325852394


100%|██████████| 98/98 [00:20<00:00,  4.88it/s]


Epoch: 60, Loss:  0.003033487591892481


100%|██████████| 98/98 [00:19<00:00,  5.14it/s]


Epoch: 61, Loss:  0.007170053198933601


100%|██████████| 98/98 [00:19<00:00,  5.13it/s]


Epoch: 62, Loss:  0.003088085912168026


100%|██████████| 98/98 [00:19<00:00,  5.01it/s]

Epoch: 63, Loss:  0.003552255453541875





In [17]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0), total=len(testing_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [18]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 25/25 [00:01<00:00, 14.93it/s]


In [19]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.3089)

In [20]:
# Save model
model.save_pretrained('model')

In [21]:
# Inferencing
def inference(input, threshold = 0.5, model=model, tokenizer=tokenizer):
    encoding = tokenizer.encode_plus(
        input,
        add_special_tokens=True,
        max_length=64,
        return_token_type_ids=True,
        padding='max_length',
        pad_to_max_length=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    outputs = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()[0]
    outputs = np.array(outputs) >= threshold
    outputs = np.where(outputs == True)[0]
    outputs = [genres[i] for i in outputs]
    print(outputs)

In [22]:
inference('The Untouchables (1987)')

['Crime', 'Drama', 'Action']
