In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [2]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

mapping = {}
for genre, i in enumerate(genres):
    mapping[genre] = i

mapping

{0: 'Crime',
 1: 'Thriller',
 2: 'Fantasy',
 3: 'Horror',
 4: 'Sci-Fi',
 5: 'Comedy',
 6: 'Documentary',
 7: 'Adventure',
 8: 'Film-Noir',
 9: 'Animation',
 10: 'Romance',
 11: 'Drama',
 12: 'Western',
 13: 'Musical',
 14: 'Action',
 15: 'Mystery',
 16: 'War',
 17: "Children's"}

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", problem_type="multi_label_classification", num_labels=18)
model.config.id2label = mapping

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

users.age = users.age.astype('int')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('int')
ratings.userid = ratings.userid.astype('int')

In [5]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df.rename(columns={'title': 'text'}, inplace=True)
    df = df.reset_index(drop=True)
    return df

In [6]:
trainset = preprocess(movies_train)
testset = preprocess(movies_test)

# Hard Code

In [7]:
class Poroset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row.text
        label = row.label
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

    def __len__(self):
        return len(self.df)

In [8]:
trainset = Poroset(trainset, tokenizer)
testset = Poroset(testset, tokenizer)

In [9]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [12]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(trainloader, 0), total=len(trainloader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [13]:
for epoch in range(64):
    train(epoch)

100%|██████████| 98/98 [00:16<00:00,  5.99it/s]


Epoch: 0, Loss:  0.3744993507862091


100%|██████████| 98/98 [00:17<00:00,  5.67it/s]


Epoch: 1, Loss:  0.30752578377723694


100%|██████████| 98/98 [00:18<00:00,  5.44it/s]


Epoch: 2, Loss:  0.31439071893692017


100%|██████████| 98/98 [00:18<00:00,  5.40it/s]


Epoch: 3, Loss:  0.5372664332389832


100%|██████████| 98/98 [00:17<00:00,  5.46it/s]


Epoch: 4, Loss:  0.22566673159599304


100%|██████████| 98/98 [00:17<00:00,  5.54it/s]


Epoch: 5, Loss:  0.28515148162841797


100%|██████████| 98/98 [00:17<00:00,  5.54it/s]


Epoch: 6, Loss:  0.3420316278934479


100%|██████████| 98/98 [00:17<00:00,  5.53it/s]


Epoch: 7, Loss:  0.3067784309387207


100%|██████████| 98/98 [00:17<00:00,  5.50it/s]


Epoch: 8, Loss:  0.1840265393257141


100%|██████████| 98/98 [00:17<00:00,  5.51it/s]


Epoch: 9, Loss:  0.19382384419441223


100%|██████████| 98/98 [00:17<00:00,  5.47it/s]


Epoch: 10, Loss:  0.28988465666770935


100%|██████████| 98/98 [00:17<00:00,  5.45it/s]


Epoch: 11, Loss:  0.15191850066184998


100%|██████████| 98/98 [00:17<00:00,  5.49it/s]


Epoch: 12, Loss:  0.278266966342926


100%|██████████| 98/98 [00:18<00:00,  5.41it/s]


Epoch: 13, Loss:  0.17205782234668732


100%|██████████| 98/98 [00:18<00:00,  5.38it/s]


Epoch: 14, Loss:  0.195742666721344


100%|██████████| 98/98 [00:17<00:00,  5.48it/s]


Epoch: 15, Loss:  0.12646067142486572


100%|██████████| 98/98 [00:17<00:00,  5.47it/s]


Epoch: 16, Loss:  0.11076873540878296


100%|██████████| 98/98 [00:18<00:00,  5.42it/s]


Epoch: 17, Loss:  0.1764114648103714


100%|██████████| 98/98 [00:17<00:00,  5.48it/s]


Epoch: 18, Loss:  0.10863284021615982


100%|██████████| 98/98 [00:17<00:00,  5.56it/s]


Epoch: 19, Loss:  0.2405577003955841


100%|██████████| 98/98 [00:17<00:00,  5.68it/s]


Epoch: 20, Loss:  0.03647680953145027


100%|██████████| 98/98 [00:16<00:00,  5.88it/s]


Epoch: 21, Loss:  0.1515214443206787


100%|██████████| 98/98 [00:16<00:00,  5.87it/s]


Epoch: 22, Loss:  0.0459723025560379


100%|██████████| 98/98 [00:16<00:00,  5.86it/s]


Epoch: 23, Loss:  0.1091795563697815


100%|██████████| 98/98 [00:17<00:00,  5.58it/s]


Epoch: 24, Loss:  0.06626064330339432


100%|██████████| 98/98 [00:17<00:00,  5.62it/s]


Epoch: 25, Loss:  0.01724928990006447


100%|██████████| 98/98 [00:16<00:00,  5.91it/s]


Epoch: 26, Loss:  0.01190884131938219


100%|██████████| 98/98 [00:16<00:00,  5.80it/s]


Epoch: 27, Loss:  0.0621895007789135


100%|██████████| 98/98 [00:17<00:00,  5.66it/s]


Epoch: 28, Loss:  0.059174735099077225


100%|██████████| 98/98 [00:17<00:00,  5.74it/s]


Epoch: 29, Loss:  0.02192297950387001


100%|██████████| 98/98 [00:17<00:00,  5.73it/s]


Epoch: 30, Loss:  0.02540813758969307


100%|██████████| 98/98 [00:17<00:00,  5.73it/s]


Epoch: 31, Loss:  0.008669687435030937


100%|██████████| 98/98 [00:17<00:00,  5.61it/s]


Epoch: 32, Loss:  0.07970163971185684


100%|██████████| 98/98 [00:17<00:00,  5.54it/s]


Epoch: 33, Loss:  0.040298521518707275


100%|██████████| 98/98 [00:16<00:00,  5.79it/s]


Epoch: 34, Loss:  0.025854920968413353


100%|██████████| 98/98 [00:16<00:00,  5.87it/s]


Epoch: 35, Loss:  0.023279257118701935


100%|██████████| 98/98 [00:16<00:00,  5.83it/s]


Epoch: 36, Loss:  0.059121403843164444


100%|██████████| 98/98 [00:16<00:00,  5.82it/s]


Epoch: 37, Loss:  0.010300053283572197


100%|██████████| 98/98 [00:17<00:00,  5.60it/s]


Epoch: 38, Loss:  0.033717669546604156


100%|██████████| 98/98 [00:17<00:00,  5.58it/s]


Epoch: 39, Loss:  0.035134103149175644


100%|██████████| 98/98 [00:17<00:00,  5.54it/s]


Epoch: 40, Loss:  0.011988090351223946


100%|██████████| 98/98 [00:17<00:00,  5.52it/s]


Epoch: 41, Loss:  0.04552920535206795


100%|██████████| 98/98 [00:17<00:00,  5.70it/s]


Epoch: 42, Loss:  0.08874118328094482


100%|██████████| 98/98 [00:18<00:00,  5.43it/s]


Epoch: 43, Loss:  0.053148381412029266


100%|██████████| 98/98 [00:17<00:00,  5.55it/s]


Epoch: 44, Loss:  0.09068293869495392


100%|██████████| 98/98 [00:18<00:00,  5.40it/s]


Epoch: 45, Loss:  0.13976506888866425


100%|██████████| 98/98 [00:17<00:00,  5.52it/s]


Epoch: 46, Loss:  0.02715793065726757


100%|██████████| 98/98 [00:17<00:00,  5.72it/s]


Epoch: 47, Loss:  0.034016985446214676


100%|██████████| 98/98 [00:17<00:00,  5.68it/s]


Epoch: 48, Loss:  0.1187867745757103


100%|██████████| 98/98 [00:17<00:00,  5.69it/s]


Epoch: 49, Loss:  0.004840452689677477


100%|██████████| 98/98 [00:17<00:00,  5.70it/s]


Epoch: 50, Loss:  0.03753616660833359


100%|██████████| 98/98 [00:16<00:00,  5.77it/s]


Epoch: 51, Loss:  0.0456959493458271


100%|██████████| 98/98 [00:17<00:00,  5.76it/s]


Epoch: 52, Loss:  0.0013494921149685979


100%|██████████| 98/98 [00:16<00:00,  5.86it/s]


Epoch: 53, Loss:  0.012976637110114098


100%|██████████| 98/98 [00:16<00:00,  5.80it/s]


Epoch: 54, Loss:  0.008827537298202515


100%|██████████| 98/98 [00:17<00:00,  5.76it/s]


Epoch: 55, Loss:  0.05436988174915314


100%|██████████| 98/98 [00:16<00:00,  5.86it/s]


Epoch: 56, Loss:  0.0360587015748024


100%|██████████| 98/98 [00:16<00:00,  5.78it/s]


Epoch: 57, Loss:  0.017268994823098183


100%|██████████| 98/98 [00:16<00:00,  5.78it/s]


Epoch: 58, Loss:  0.04189328849315643


100%|██████████| 98/98 [00:17<00:00,  5.59it/s]


Epoch: 59, Loss:  0.007939830422401428


100%|██████████| 98/98 [00:17<00:00,  5.57it/s]


Epoch: 60, Loss:  0.016234640032052994


100%|██████████| 98/98 [00:17<00:00,  5.70it/s]


Epoch: 61, Loss:  0.0011184604372829199


100%|██████████| 98/98 [00:17<00:00,  5.59it/s]


Epoch: 62, Loss:  0.00940537266433239


100%|██████████| 98/98 [00:17<00:00,  5.59it/s]

Epoch: 63, Loss:  0.015813151374459267





In [14]:
# Validation
def validation(testing_loader):
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0), total=len(testing_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [15]:
outputs, targets = validation(testloader)

outputs = np.array(outputs) >= 0.5

100%|██████████| 25/25 [00:01<00:00, 16.30it/s]


In [16]:
# Multi-label F1 score, macro-averaged
from torchmetrics.classification import MultilabelF1Score
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
f1.update(torch.tensor(outputs), torch.tensor(targets))

f1.compute()

tensor(0.3016)

In [17]:
# Save model
model.save_pretrained('model')

In [18]:
# Inferencing
def inference(input, threshold = 0.5, model=model, tokenizer=tokenizer):
    encoding = tokenizer.encode_plus(
        input,
        add_special_tokens=True,
        max_length=64,
        return_token_type_ids=True,
        padding='max_length',
        pad_to_max_length=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    outputs = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()[0]
    outputs = np.array(outputs) >= threshold
    outputs = np.where(outputs == True)[0]
    outputs = [genres[i] for i in outputs]
    print(outputs)

In [19]:
inference('The Untouchables (1987)')

['Crime', 'Drama', 'Action']
