In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import EvalPrediction
import pickle

In [None]:
# Load only the two columns this notebook uses: the comma-separated genre
# string and the plot description.
data = pd.read_csv(
    './IMDB-Movie-Data.csv',
    usecols=["Genre", "Description"],
)

data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts, to spot missing genres or descriptions.
data.info()

In [None]:
# Number of fully duplicated (Genre, Description) rows.
data.duplicated().sum()

In [None]:
# Distribution of description lengths in characters; gives a feel for how long
# the inputs are before a tokenizer length limit is applied.
ax = data['Description'].str.len().plot.hist(bins=50)
# Label the figure so it stands on its own; the trailing ';' hides the repr.
ax.set(
    title='Description length distribution',
    xlabel='Characters per description',
    ylabel='Number of movies',
);

In [None]:
# Turn the comma-separated genre string into a list of genre names.
# The isinstance guard leaves values that are already lists untouched, so
# re-running this cell is harmless (a second `.str.split` on list values
# would replace every entry with NaN).
data['Genre'] = data['Genre'].map(
    lambda genre: genre.split(',') if isinstance(genre, str) else genre
)

data.head()

In [None]:
# Flatten the per-movie genre lists into one long list of genre occurrences.
genres = [g for genre in data['Genre'] for g in genre]

# Preview only: the full list has one entry per (movie, genre) pair.
genres[:10]

In [None]:
# How often each genre occurs across all movies, most frequent first
# (value_counts sorts in descending order by default).
genre_counts = pd.Series(genres).value_counts()

genre_counts

### Label Encoder

In [None]:
# Encoder that maps each list of genres to a multi-hot row vector.
multilabel = MultiLabelBinarizer()

In [None]:
# Fit the encoder on the genre lists and produce the multi-hot label matrix.
# Cast to float32 because the rows are later wrapped in torch tensors as
# training targets (presumably the multi-label loss expects float targets).
labels = multilabel.fit_transform(data['Genre']).astype('float32')

labels

In [None]:
# Plain Python list of descriptions, aligned row-for-row with `labels`.
texts = data['Description'].tolist()

# Preview only: avoid dumping every description into the notebook output.
texts[:5]

### Model Building

In [None]:
# Hold out 20% of the movies for validation; the fixed random_state keeps the
# split identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
model_ckpt = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

# The classification head is not part of the base checkpoint and is randomly
# initialised, so seed torch first to make the starting weights reproducible.
# 42 matches the random_state used for the train/validation split.
torch.manual_seed(42)

# One output logit per genre; problem_type selects the multi-label setup, in
# which each genre is scored independently.
model = DistilBertForSequenceClassification.from_pretrained(model_ckpt, num_labels = len(labels[0]),
                                            problem_type = 'multi_label_classification')

In [None]:
# Building Custom Dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label row.

    Args:
        texts: Indexable sequence of texts; each item is passed through ``str()``.
        labels: Indexable sequence of per-text label vectors, aligned with ``texts``.
        tokenizer: Callable accepting ``truncation``, ``padding``, ``max_length``
            and ``return_tensors`` keyword arguments and returning a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every encoded text is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len = 128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = str(self.texts[idx])
        # Force float32 targets: the multi-label loss needs floating-point
        # labels, so an integer label matrix must not leak through unchanged.
        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        encoding = self.tokenizer(
            text,
            truncation = True,
            padding = "max_length",
            max_length = self.max_len,
            return_tensors = 'pt',
        )

        # The tokenizer output carries a leading batch dimension of size 1;
        # flatten it away so the default collation can stack the items.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

In [None]:
# Wrap both splits in the tokenizing dataset (max_len is left at its default).
train_dataset = CustomDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = CustomDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

In [None]:
# Sanity check: one encoded training item (input_ids, attention_mask, labels).
train_dataset[0]

In [None]:
# Sanity check: one encoded validation item (input_ids, attention_mask, labels).
val_dataset[0]

### Multi-Label Classification Evaluation Metrics

In [None]:
def multi_labels_metrics(predictions, labels, threshold=0.3):
  """Compute macro ROC AUC, Hamming loss and macro F1 from multi-label logits.

  Args:
    predictions: Raw model logits, one row per sample and one column per label.
    labels: Binary indicator matrix with the same layout as `predictions`.
    threshold: Probability at or above which a label counts as predicted.
      Only F1 and Hamming loss depend on it.

  Returns:
    dict with the keys "roc_auc", "hamming_loss" and "f1".
  """
  # Logits -> independent per-label probabilities.
  probs = torch.sigmoid(torch.Tensor(predictions)).numpy()

  # Hard 0/1 decisions for the threshold-dependent metrics.
  y_pred = (probs >= threshold).astype(float)
  y_true = labels

  f1 = f1_score(y_true, y_pred, average = 'macro')
  # ROC AUC is a ranking metric: score it on the probabilities. Feeding it the
  # thresholded 0/1 predictions collapses the curve to a single operating point.
  roc_auc = roc_auc_score(y_true, probs, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }

  return metrics

def compute_metrics(p:EvalPrediction):
  """Trainer metrics hook: score the predictions with `multi_labels_metrics`.

  `p.predictions` may be a tuple; in that case its first element is used as
  the logits. `p.label_ids` is passed through as the ground-truth labels.
  """
  logits = p.predictions
  if isinstance(logits, tuple):
    logits = logits[0]

  return multi_labels_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Training configuration: 5 epochs with batches of 8, a checkpoint every
# 1000 steps, and at most 2 checkpoints kept under ./results.
args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
)

# compute_metrics is used whenever the trainer evaluates on val_dataset.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on train_dataset with the arguments configured for `trainer`.
trainer.train()

In [None]:
# Score the model on val_dataset; the metrics come from compute_metrics.
trainer.evaluate()

In [None]:
# Persist the fine-tuned model to disk.
# NOTE(review): the Trainer was not given the tokenizer, so it is presumably
# not saved here; confirm, and save it separately if the directory must be
# self-contained.
trainer.save_model('./distilbert_finetuned_multilabel')

In [None]:
# Store the fitted label encoder alongside the model; it is needed to map
# predicted multi-hot vectors back to genre names.
with open("multi-label_binarizer.pkl", "wb") as binarizer_file:
    pickle.dump(multilabel, binarizer_file)

### Predictions

In [None]:
text = "Low-level FBI agent Peter Sutherland works in the basement of the White House manning a phone that never rings - until the night it does, propelling him into a conspiracy that leads all the way to the Oval Office"

# Tokenize as during training: truncate to the dataset's max_len so an
# over-long description cannot exceed what the model was fine-tuned on.
encoding = tokenizer(
    text,
    truncation = True,
    max_length = train_dataset.max_len,
    return_tensors = 'pt',
)
encoding

In [None]:
# Inference only: force eval mode (dropout off) and disable gradient tracking
# so the prediction is deterministic and no autograd graph is built.
trainer.model.eval()
encoding = encoding.to(trainer.model.device)
with torch.no_grad():
    outputs = trainer.model(**encoding)

outputs

In [None]:
# Convert the logits of the first (only) sequence in the batch into
# independent per-label probabilities, on the CPU.
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0].cpu())

probs

In [None]:
# Start from an all-zero multi-hot vector with one slot per label.
preds = np.zeros(probs.shape)

preds

In [None]:
# Switch on every label whose probability reaches 0.3, the same cut-off that
# multi_labels_metrics uses by default.
preds[np.asarray(probs >= 0.3)] = 1

preds

In [None]:
# Genre names in the column order used by the multi-hot label vectors.
multilabel.classes_

In [None]:
# Map the multi-hot prediction back to genre names; inverse_transform expects
# a 2-D indicator matrix, hence the reshape to a single row.
multilabel.inverse_transform(preds.reshape(1, -1))