## Imports

In [1]:
import transformers
from datasets import Dataset
from transformers import DataCollatorWithPadding
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, jaccard_score, hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torch.nn import functional as F
import torch
import random
import os

In [2]:
import sys
sys.path.append('..')

In [3]:
from src.data.nn_utils import get_whole_target, MovieHFDatasetMLL, MovieHFDataset
from src.data.text_processing import replace_labels, partial_clean_text
from src.models.eval_nn_utils import compute_metrics_mlc, multi_label_metrics, compute_metrics, transform_predictions
from src.models.train_predict import evaluate_ml

In [3]:
from dotenv import load_dotenv
import os
load_dotenv()
SEED=int(os.getenv('SEED'))

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def seed_everything(seed):
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

For fine-tuning of big models I'm going to use less transformed dataset - the only transformation I'm going to use is cleaning, without deleting stop words, punctuation and lemmatizing. Later work will be done using standart hugging face methods tokenizer and trainer.

## Data processing

In [5]:
df_filtered = pd.read_csv('../data/interim/data_intermidiate.csv')

In [6]:
replacements = [
    ('animated', 'animation'),
    ('biography', 'biographical'),
    ('biopic', 'biographical'),
    ('com', 'com'),
    ('com', 'comedy'),
    ('docudrama', 'documentary drama'),
    ('dramedy', 'drama comedy'),
    ('sci fi', 'sci_fi'),
    ('science fiction', 'sci_fi'),
    ('film', ''),
    ('world war ii', 'world_ii war'),
    ('rom ', 'romantic '),
    ('romance', 'romantic'),
    ('comedyedy', 'comedy')
]

In [7]:
labels = df_filtered['Genre'].apply(lambda x: replace_labels(x, replacements))

In [8]:
plots_cleaned = df_filtered['Plot'].apply(partial_clean_text)

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
X_train, X_val, y_train, y_val = train_test_split(plots_cleaned, labels, test_size=0.2, stratify=labels)

In [11]:
X_train_tokenized = tokenizer(X_train.to_list(), truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val.to_list(), truncation=True, max_length=512)

In [12]:
labels_unique = labels.unique()
num_labels = len(labels_unique)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels_unique):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [None]:
train_mlc_dataset = MovieHFDataset(X_train_tokenized, y_train, label2id)
val_mlc_dataset = MovieHFDataset(X_val_tokenized, y_val, label2id)

# Training

## Multiclass

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)
model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
! pip install -U accelerate
! pip install -U transformers
! pip install -U transformers[torch]

In [None]:
training_args = TrainingArguments(
    output_dir="Bert_clf",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_mlc = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_mlc_dataset,
    eval_dataset=val_mlc_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics_mlc,
)


In [None]:
trainer_mlc.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.3447,2.277169,0.436622,0.359649
2,1.9944,2.136765,0.457011,0.392894
3,1.8146,2.137366,0.462589,0.400411


TrainOutput(global_step=7797, training_loss=2.1801505045385046, metrics={'train_runtime': 3335.0577, 'train_samples_per_second': 18.703, 'train_steps_per_second': 2.338, 'total_flos': 8261922572612064.0, 'train_loss': 2.1801505045385046, 'epoch': 3.0})

In [None]:
trainer_mlc.save_model("../models/multiclass_models/NN models/bert_multi_class")

## Multilabel

In [None]:
labels = labels.apply(lambda x: x.split())

In [None]:
labels

0                 [western]
1                  [comedy]
2                   [short]
3            [biographical]
4                  [comedy]
                ...        
25986               [drama]
25987               [drama]
25988              [comedy]
25989              [comedy]
25990    [romantic, comedy]
Name: Genre, Length: 25991, dtype: object

In [None]:
mlb = MultiLabelBinarizer()
one_hot_labels = mlb.fit_transform(labels)

In [None]:
one_hot_train = mlb.transform(y_train.apply(lambda x: x.split()))
one_hot_val = mlb.transform(y_val.apply(lambda x: x.split()))

In [None]:
mlb.classes_

array(['action', 'adventure', 'animation', 'anime', 'arts',
       'biographical', 'black', 'comedy', 'costume', 'crime',
       'devotional', 'disaster', 'documentary', 'drama', 'epic', 'family',
       'fantasy', 'feature', 'folklore', 'historical', 'horror', 'kaiju',
       'literary', 'martial', 'masala', 'melodrama', 'mockumentary',
       'musical', 'mystery', 'mythology', 'noir', 'period', 'political',
       'propaganda', 'psychological', 'romantic', 'sci_fi', 'screwball',
       'serial', 'sex', 'short', 'slapstick', 'slasher', 'social',
       'sports', 'spy', 'subject', 'superhero', 'suspense',
       'swashbuckler', 'thriller', 'tokusatsu', 'war', 'western',
       'world_ii'], dtype=object)

In [None]:
id2label = {idx:label for idx, label in enumerate(mlb.classes_)}
label2id = {label:idx for idx, label in enumerate(mlb.classes_)}

There are some classes that are very simmilar - animated and animation, 'biographical', 'biography', 'biopic'; com and comedy, docudrama can be split into documentary and drama, dramedy - to drama and comedy, sci and fi sjhould be merged to sci-fi, 'rom', 'romance', 'romantic' should be merged to romantic, world and ii should be merged to 'world war ii' as this is the only case thay appear.

In [None]:
train_dataset_mll = MovieHFDatasetMLL(X_train_tokenized, one_hot_train, label2id)
val_dataset_mll = MovieHFDatasetMLL(X_val_tokenized, one_hot_val, label2id)

In [None]:
model_mll = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(mlb.classes_))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
args = TrainingArguments(
    'MLL_tuned_Bert',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1'
    #push_to_hub=True,
)

In [None]:
trainer_mll = Trainer(
    model_mll,
    args,
    train_dataset=train_dataset_mll,
    eval_dataset=val_dataset_mll,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer_mll.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy,Jaccard
1,0.0541,0.05163,0.350929,0.624399,0.2814,0.325992
2,0.0444,0.047199,0.478889,0.681938,0.373918,0.441399
3,0.0389,0.046717,0.508579,0.698249,0.39623,0.471357


TrainOutput(global_step=7797, training_loss=0.05480613319540813, metrics={'train_runtime': 3315.0203, 'train_samples_per_second': 18.816, 'train_steps_per_second': 2.352, 'total_flos': 8249586564603120.0, 'train_loss': 0.05480613319540813, 'epoch': 3.0})

In [None]:
trainer_mll.save_model("../models/multilabel_models/NN models/bert_multi-label")

# Evaluation

## Evaluation multiclasss

In [None]:
bert_mlc = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Models/bert_tuned")



In [None]:
trained_mlc = Trainer(
    model=bert_mlc,
    args=training_args,
    train_dataset=train_mlc_dataset,
    eval_dataset=val_mlc_dataset,
    data_collator=data_collator,
)

In [None]:
train_pred = trained_mlc.predict(train_mlc_dataset)

In [None]:
test_pred = trained_mlc.predict(val_mlc_dataset)

In [None]:
print(classification_report(train_pred.label_ids, np.argmax(train_pred.predictions, axis=1), target_names=list(id2label.values())))

                             precision    recall  f1-score   support

                    western       0.78      0.96      0.86       692
                     comedy       0.59      0.75      0.66      3518
                      short       0.00      0.00      0.00        24
               biographical       0.00      0.00      0.00       170
                      drama       0.51      0.82      0.63      4793
                  adventure       0.51      0.58      0.54       421
                     horror       0.58      0.89      0.70       937
                      crime       0.31      0.39      0.35       454
               drama horror       0.00      0.00      0.00        12
           historical drama       0.00      0.00      0.00        52
                    fantasy       0.38      0.02      0.04       163
                       epic       0.00      0.00      0.00         8
                 historical       0.00      0.00      0.00        62
               comedy short      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


From above report I see that vast majority of classes are never predicted but those few that do are predicted in the most accurate way across all models with an accuracy of 41%.

In [None]:
print(classification_report(test_pred.label_ids, np.argmax(test_pred.predictions, axis=1), target_names=list(id2label.values())))

                             precision    recall  f1-score   support

                    western       0.74      0.95      0.83       173
                     comedy       0.54      0.70      0.61       880
                      short       0.00      0.00      0.00         6
               biographical       0.00      0.00      0.00        43
                      drama       0.48      0.78      0.59      1198
                  adventure       0.47      0.51      0.49       106
                     horror       0.56      0.89      0.69       235
                      crime       0.26      0.29      0.27       114
               drama horror       0.00      0.00      0.00         3
           historical drama       0.00      0.00      0.00        13
                    fantasy       0.00      0.00      0.00        41
                       epic       0.00      0.00      0.00         2
                 historical       0.00      0.00      0.00        15
               comedy short      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(f'Number of predicted classes = {len(np.unique(np.argmax(test_pred.predictions, axis=1)))}')

Number of predicted classes = 17


Test report is also not great - only 17 classes out of 138 were at least predicted and overall accuracy is 0.39 which is nearly identical to logistic regression which is surpassed by 1% in terms of weighted f1 score. the best classified classes are traditionally western, horror, drama and comedy.

In [None]:
accuracy_bert_mlc = (test_pred.label_ids == np.argmax(test_pred.predictions, axis=1)).sum()/len(test_pred.predictions,)
print(f'Test accuracy of tuned bert classifier = {accuracy_bert_mlc}')

Test accuracy of tuned bert classifier = 0.385843431429121


Test accuracy appears to be slightly worse then the logistic regression and so few conclusiob could be made. Main one is that nn methods need a lot of high quality data and they cannot learn from just few examples. For the most frequent classes - drama and comedy, bert far surpasses logistic regression which has f1 score for them of 0.48 and 0.49 while bert - 0.59 and 0.61 but it cannot accurately predict less frequent classes and so the result is not great.

## Evaluation multilabel

In [None]:
bert_mll = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Models/mll_bert_tuned")

In [None]:
trained_mll = Trainer(
    bert_mll,
    args,
    train_dataset=train_dataset_mll,
    eval_dataset=val_dataset_mll,
    tokenizer=tokenizer
)

In [None]:
train_pred_mll = trained_mll.predict(train_dataset_mll)

In [None]:
test_pred_mll = trained_mll.predict(val_dataset_mll)

In [None]:
trans_pred_train = transform_predictions(train_pred_mll.predictions)
trans_pred_test = transform_predictions(test_pred_mll.predictions)

In [None]:
print(classification_report(train_pred_mll.label_ids, trans_pred_train, target_names=list(id2label.values())))

               precision    recall  f1-score   support

       action       0.72      0.52      0.60      1526
    adventure       0.79      0.45      0.58       508
    animation       0.88      0.67      0.76       484
        anime       0.00      0.00      0.00        90
         arts       0.00      0.00      0.00        37
 biographical       0.00      0.00      0.00       239
        black       0.00      0.00      0.00        42
       comedy       0.86      0.69      0.77      5386
      costume       0.00      0.00      0.00        13
        crime       0.65      0.27      0.38      1131
   devotional       0.00      0.00      0.00        10
     disaster       0.00      0.00      0.00        17
  documentary       0.00      0.00      0.00        71
        drama       0.74      0.75      0.75      6892
         epic       0.00      0.00      0.00         8
       family       0.00      0.00      0.00       352
      fantasy       0.00      0.00      0.00       246
      fea

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
mll_bert_report = classification_report(test_pred_mll.label_ids, trans_pred_test, target_names=list(id2label.values()))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(mll_bert_report)

               precision    recall  f1-score   support

       action       0.74      0.52      0.61       379
    adventure       0.78      0.33      0.46       127
    animation       0.89      0.66      0.76       122
        anime       0.00      0.00      0.00        22
         arts       0.00      0.00      0.00         9
 biographical       0.00      0.00      0.00        60
        black       0.00      0.00      0.00        10
       comedy       0.86      0.69      0.77      1347
      costume       0.00      0.00      0.00         3
        crime       0.53      0.19      0.28       283
   devotional       0.00      0.00      0.00         2
     disaster       0.00      0.00      0.00         4
  documentary       0.00      0.00      0.00        18
        drama       0.72      0.76      0.74      1722
         epic       0.00      0.00      0.00         2
       family       0.00      0.00      0.00        89
      fantasy       0.00      0.00      0.00        61
      fea

From classiffication report I see that multilabel problem is much more suitable as there are much less classes and classes are more presented. The scores goy much higher - micro f1 score is now 64% and weighted - 59%. The downside is that still most of the classes are predicted poorly, model needs at least 500 instances to start classifying it.

Next I'll get some numeric metrics

In [None]:
evaluate_ml(test_pred_mll.label_ids, trans_pred_test, 'test')

model's test score = 0.48432390844393153
model's jaccard test score = 0.5632974289927549
model's hamming test loss = 0.01372641591914529
model's one match test score = 0.6441623389113291


In [None]:
evaluate_ml(train_pred_mll.label_ids, trans_pred_train, 'train')

model's train score = 0.49850904193920736
model's jaccard train score = 0.5768685071181224
model's hamming train loss = 0.01320612823113785
model's one match train score = 0.6571758368603309


From numeric metrics I see that model is not overfitting, it's full match accuracy is much higher then the best multiclass accuray - 0.48 vs 0.39. More adequete and less strict jaccard score here is 0.56 in test case and in 65% of cases model gives at least some match.

I want to plot number of instances of each class vs it's f1 score.

In [None]:
counts_test = np.sum(test_pred_mll.label_ids == 1, axis=0)
counts_train = np.sum(train_pred_mll.label_ids == 1, axis=0)
counts = counts_train + counts_test

In [None]:
f1_scores = f1_score(test_pred_mll.label_ids, trans_pred_test, average=None)

In [None]:
label_scores = pd.DataFrame(np.array([f1_scores, counts]).T, columns=['f1_score', 'n_instances'], index=list(id2label.values()))

In [None]:
print(f'Correlation between n_instances and f1 score = {label_scores.f1_score.corr(label_scores.n_instances)}')

Correlation between n_instances and f1 score = 0.6250997362051567


Correlation between quality of predictions and number of classes is very high but what number of instances is needed to get at least some non zero predictions qualtity?

In [None]:
label_scores[label_scores['f1_score'] > 0].sort_values(by='n_instances')

Unnamed: 0,f1_score,n_instances
war,0.705882,488.0
animation,0.760563,606.0
adventure,0.464088,635.0
sci_fi,0.662069,743.0
musical,0.491379,788.0
western,0.875676,909.0
horror,0.795699,1378.0
crime,0.28125,1414.0
thriller,0.276042,1503.0
action,0.610853,1905.0


The smallest number of instances with non-zero f1 is 488 and the label is war. so I guess 500 can be called a logical  point to filter out the infrequent classes

Things to do - try models with smaller number of labels - for example only the above, perform same steps on calc

# Multilabel with best labels

Below are the best classes from multilabel ml part. I'll fine tune bert only on those and compare the result.

In [None]:
best_classes = ['spy', 'short', 'fantasy', 'mystery', 'war', 'animation', 'adventure', 'sci_fi', 'musical', 'western', 'horror', 'crime', 'thriller', 'action', 'romantic', 'comedy', 'drama']

In [None]:
genres_shortened = labels.apply(lambda labels: list(filter(lambda label: label in best_classes , labels)))

In [None]:
data_shortened = pd.concat([plots_cleaned, genres_shortened], axis=1)
data_shortened = data_shortened[data_shortened['Genre'].apply(lambda x: len(x) > 0)]

In [None]:
X_train, X_val, y_train, y_val = train_test_split(data_shortened.Plot, data_shortened.Genre, test_size=0.2, stratify=data_shortened.Genre)

In [None]:
X_train_tokenized = tokenizer(X_train.to_list(), truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val.to_list(), truncation=True, max_length=512)

In [None]:
mlb = MultiLabelBinarizer()
one_hot_labels = mlb.fit_transform(data_shortened.Genre)

In [None]:
one_hot_train = mlb.transform(y_train)
one_hot_val = mlb.transform(y_val)

In [None]:
id2label = {idx:label for idx, label in enumerate(mlb.classes_)}
label2id = {label:idx for idx, label in enumerate(mlb.classes_)}

In [None]:
train_dataset_mll_shortened = MovieHFDatasetMLL(X_train_tokenized, one_hot_train, label2id)
val_dataset_mll_shortened = MovieHFDatasetMLL(X_val_tokenized, one_hot_val, label2id)

In [None]:
model_mll_shortened = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(mlb.classes_))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="Bert_clf",
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer_mll_shortened = Trainer(
    model_mll_shortened,
    training_args,
    train_dataset=train_dataset_mll_shortened,
    eval_dataset=val_dataset_mll_shortened,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer_mll_shortened.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1404,0.131868,0.524024,0.704443,0.387289
2,0.1135,0.124194,0.586427,0.746023,0.453928
3,0.0905,0.129072,0.605263,0.763313,0.474907
4,0.0762,0.133059,0.601684,0.764913,0.472645


TrainOutput(global_step=9724, training_loss=0.11261177043373424, metrics={'train_runtime': 4178.7206, 'train_samples_per_second': 18.615, 'train_steps_per_second': 2.327, 'total_flos': 1.0285368944726064e+16, 'train_loss': 0.11261177043373424, 'epoch': 4.0})

In [None]:
trainer_mll_shortened.save_model("/content/drive/MyDrive/Models/mll_bert_tuned_shortened")

In [None]:
train_pred = trainer_mll_shortened.predict(train_dataset_mll_shortened)

In [None]:
test_pred = trainer_mll_shortened.predict(val_dataset_mll_shortened)

In [None]:
trans_pred_train = transform_predictions(train_pred.predictions)
trans_pred_test = transform_predictions(test_pred.predictions)

In [None]:
evaluate_ml(train_pred.label_ids, trans_pred_train, 'train')

model's train score = 0.5806551138993161
model's jaccard train score = 0.6562323237517355
model's hamming train loss = 0.034497986987256465
model's one match train score = 0.7335321643441147


In [None]:
evaluate_ml(test_pred.label_ids, trans_pred_test, 'test')

model's test score = 0.4539284245166598
model's jaccard test score = 0.5217160290689702
model's hamming test loss = 0.05013671449657609
model's one match test score = 0.5913204442616208


Achieved results are 2% better both in terms of jaccard score and full-match score then a corresponding chained logistic regression.

# Conclusions

All in all, fine tuned bert appear to be the most powerful models - they achieve the best results in all - multiclass, multilabel and multuilabel with the most frequent classes, although the difference between it and log_reg is not that huge - 5% in mlc, 3% in mll and only 2% in mll with less classes. And so, because log_reg is much more interpretable, in scripts for prediction I'll have 2 options - bert and log_reg, depending on importance of speed and interpretability for potential user.