# **0. Setting up dependencies**

In [1]:
!nvidia-smi

Sun Jan 12 03:59:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P0             26W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if cuda_available:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    
    current_device = torch.cuda.current_device()
    print(f"Current GPU device: {current_device}")
    print(f"Current GPU device name: {torch.cuda.get_device_name(current_device)}")
else:
    print("No CUDA-enabled GPU detected.")

CUDA available: True
Number of GPUs available: 1
GPU 0: Tesla P100-PCIE-16GB
Current GPU device: 0
Current GPU device name: Tesla P100-PCIE-16GB


In [None]:
!pip install -q transformers[sentencepiece] fastai nbdev  -q

In [None]:
%cd ..
!git clone https://github.com/ohmeow/blurr.git
%cd blurr
!pip install -e ".[dev]" --quiet

In [None]:
import torch
torch.__version__

In [None]:
# !pip install onnx==1.10.0 
!pip install onnx --quiet
!pip install onnxruntime-gpu onnxruntime_tools onnxmltools --quiet
!pip install onnx optimum torch onnxruntime --quiet
# !pip install onnxruntime --upgrade

After installing, restart the kernel and start executing from the import section

# **Importing libraries**

In [5]:
import onnxruntime as rt
from onnxruntime.quantization import quantize_dynamic, QuantType

from transformers import AutoTokenizer
import torch

import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd

from fastai.text.all import *

from transformers import AutoModelForSequenceClassification, AutoConfig
from tqdm.notebook import tqdm

from blurr.text import *
from blurr.text.data.all import *
from blurr.text.modeling import *
from blurr.text.modeling.core import BaseModelWrapper, BaseModelCallback, blurr_splitter

# **Loading and Reading Data**

In [6]:
df = pd.read_csv("/kaggle/input/nlp-project-data/film_details.csv")
df = df.rename(columns={'Plot summary': 'Plot_summary'})
df = df[df['Genres'] != "['']"]
df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot_summary,Genres
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,"This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...",['Drama']
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,"Francis Ford Coppola's epic features Marlon Brando in his Oscar-winning role as the patriarch of the Corleone family. Director Coppola paints a chilling portrait of the Sicilian clan's rise and near fall from power in America, masterfully balancing the story between the Corleone's family life and the ugly crime business in which they are...","['Crime', 'Drama']"
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-arabia-re-release/,100,8,100,442,"The 40th anniversary re-release of David Lean's 1962 masterpiece, starring Peter O'Toole in one of the most electrifying debuts in film history.","['Adventure', 'Biography', 'Drama', 'War']"
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-re-release/,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spectacular 1963 adaptation of Giuseppe di Lampedusa's international bestseller is one of the cinema's greatest evocations of the past, achingly depicting the passing of an ancient order. (Film Forum)","['Drama', 'History']"
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformist-re-release/,100,11,100,105,"Set in Rome in the 1930s, this re-release of Bernardo Bertolucci's 1970 breakthrough feature stars Jean-Louis Trintignant as a Mussolini operative sent to Paris to locate and eliminate an old professor who fled Italy when the fascists came to power.",['Drama']
...,...,...,...,...,...,...,...,...,...
15149,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brother Andy and best friend Nick. Joel is engaged to Kate. Even though he has some problems with her conservative parents accepting him, he is happy with his life and pursuing the American Dream. The pilot is based on the Geico commercials that claim their service is so easy, ""Even a caveman can...","['Comedy', 'Sci-Fi']"
15150,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Koldyke) and Angel Ortiz (Amaury Nolasco) dress up as women to get new jobs as pharmaceutical sales wo(men).",['Comedy']
15151,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-end-of-the-world/,18,11,18,7,"""Category 7: The End of the World"" picks up where ""Category 6: Day of Destruction"" left off. The city of Chicago has been destroyed by a monstrous storm and as the storm gathers strength, it threatens to ravage the rest of the world. A television evangelist and his wife prey on the nation’s fears by broadcasting warnings of biblical pl...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']"
15152,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,"Lt. Beth Davis (Maggie Q) leads the Threat Assessment Unit of the Los Angeles Police Department that includes recent transfer Det. Jack Larsen (McDermott), Det. Ben Caldwell (Victor Rasuk), and Det. Janice Lawrence (Mariana Klaveno).","['Crime', 'Drama', 'Thriller']"


In [7]:
df.shape

(15149, 9)

In [8]:
genres_list = df.Genres.to_list()
genre_count = {}
for genres in genres_list:
  genre_list = eval(genres)
  for genre in genre_list:
    if genre in genre_count.keys():
      genre_count[genre] += 1
    else:
      genre_count[genre] = 1
print(f"Number of Genres: {len(genre_count)}")
print()
print(genre_count)

Number of Genres: 27

{'Drama': 9078, 'Crime': 2788, 'Adventure': 1998, 'Biography': 1306, 'War': 622, 'History': 902, 'Mystery': 2083, 'Romance': 2920, 'Thriller': 4005, 'Film-Noir': 31, 'Comedy': 5257, 'Musical': 356, 'Animation': 619, 'Family': 1036, 'Fantasy': 1522, 'Action': 2606, 'Western': 241, 'Music': 777, 'Documentary': 1354, 'Sport': 462, 'Sci-Fi': 1527, 'Horror': 1579, 'News': 28, 'Unknown': 16, 'Reality-TV': 100, 'Talk-Show': 30, 'Game-Show': 30}


In [9]:
genre_count = df['Genres'].value_counts()
threshold = int(len(df) * 0.01)
rare_genres = [ cat for cat, count in genre_count.items() if count < threshold]
len(rare_genres), rare_genres[:5]

(1807,
 ["['Action', 'Crime', 'Thriller']",
  "['Biography', 'Drama']",
  "['Documentary', 'Biography']",
  "['Horror', 'Thriller']",
  "['Biography', 'Drama', 'History']"])

In [10]:
genres_list = df.Genres.to_list()
revised_genre_list = []
indices_to_drop = []

for idx, genres in enumerate(genres_list):
  genre_list = eval(genres)
  revised_genres = []

  for genre in genre_list:
    if genre not in rare_genres:
      revised_genres.append(genre)

  if len(revised_genres) == 0:
    indices_to_drop.append(idx)
  else:
    revised_genre_list.append(revised_genres)

df = df.drop(indices_to_drop).reset_index(drop=True)
df.shape

(15149, 9)

In [11]:
df['Revised_genres'] = revised_genre_list
df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot_summary,Genres,Revised_genres
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,"This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...",['Drama'],[Drama]
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,"Francis Ford Coppola's epic features Marlon Brando in his Oscar-winning role as the patriarch of the Corleone family. Director Coppola paints a chilling portrait of the Sicilian clan's rise and near fall from power in America, masterfully balancing the story between the Corleone's family life and the ugly crime business in which they are...","['Crime', 'Drama']","[Crime, Drama]"
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-arabia-re-release/,100,8,100,442,"The 40th anniversary re-release of David Lean's 1962 masterpiece, starring Peter O'Toole in one of the most electrifying debuts in film history.","['Adventure', 'Biography', 'Drama', 'War']","[Adventure, Biography, Drama, War]"
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-re-release/,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spectacular 1963 adaptation of Giuseppe di Lampedusa's international bestseller is one of the cinema's greatest evocations of the past, achingly depicting the passing of an ancient order. (Film Forum)","['Drama', 'History']","[Drama, History]"
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformist-re-release/,100,11,100,105,"Set in Rome in the 1930s, this re-release of Bernardo Bertolucci's 1970 breakthrough feature stars Jean-Louis Trintignant as a Mussolini operative sent to Paris to locate and eliminate an old professor who fled Italy when the fascists came to power.",['Drama'],[Drama]
...,...,...,...,...,...,...,...,...,...,...
15144,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brother Andy and best friend Nick. Joel is engaged to Kate. Even though he has some problems with her conservative parents accepting him, he is happy with his life and pursuing the American Dream. The pilot is based on the Geico commercials that claim their service is so easy, ""Even a caveman can...","['Comedy', 'Sci-Fi']","[Comedy, Sci-Fi]"
15145,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Koldyke) and Angel Ortiz (Amaury Nolasco) dress up as women to get new jobs as pharmaceutical sales wo(men).",['Comedy'],[Comedy]
15146,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-end-of-the-world/,18,11,18,7,"""Category 7: The End of the World"" picks up where ""Category 6: Day of Destruction"" left off. The city of Chicago has been destroyed by a monstrous storm and as the storm gathers strength, it threatens to ravage the rest of the world. A television evangelist and his wife prey on the nation’s fears by broadcasting warnings of biblical pl...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']","[Action, Adventure, Drama, Sci-Fi, Thriller]"
15147,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,"Lt. Beth Davis (Maggie Q) leads the Threat Assessment Unit of the Los Angeles Police Department that includes recent transfer Det. Jack Larsen (McDermott), Det. Ben Caldwell (Victor Rasuk), and Det. Janice Lawrence (Mariana Klaveno).","['Crime', 'Drama', 'Thriller']","[Crime, Drama, Thriller]"


In [12]:
revised_genres_list = df.Revised_genres.to_list()
revised_genre_count = {}
for genres in revised_genres_list:
  genre_list = genres
  for genre in genre_list:
    if genre in revised_genre_count.keys():
      revised_genre_count[genre] += 1
    else:
      revised_genre_count[genre] = 1
print(f"Number of Genres: {len(revised_genre_count)}")
print(revised_genre_count)

Number of Genres: 27
{'Drama': 9078, 'Crime': 2788, 'Adventure': 1998, 'Biography': 1306, 'War': 622, 'History': 902, 'Mystery': 2083, 'Romance': 2920, 'Thriller': 4005, 'Film-Noir': 31, 'Comedy': 5257, 'Musical': 356, 'Animation': 619, 'Family': 1036, 'Fantasy': 1522, 'Action': 2606, 'Western': 241, 'Music': 777, 'Documentary': 1354, 'Sport': 462, 'Sci-Fi': 1527, 'Horror': 1579, 'News': 28, 'Unknown': 16, 'Reality-TV': 100, 'Talk-Show': 30, 'Game-Show': 30}


In [13]:
encode_genre_types = { key: idx for idx, (key, value) in enumerate(revised_genre_count.items())}
with open("genre_types_encoded.json", "w") as fp:
  json.dump(encode_genre_types, fp)

In [14]:
# We need this because for multilabel classification all genres have possibility to be present in the predictions
categorical_genre_list = []
revised_genres_list = df.Revised_genres.to_list()

for revised_genres in revised_genres_list:
  categorical_list = [0] * len(encode_genre_types)
  for genre in revised_genres:
    genre_type_index = encode_genre_types[genre] 
    categorical_list[genre_type_index] = 1
  categorical_genre_list.append(categorical_list)

categorical_genre_list[3][:23]

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [15]:
df['genre_cat_list'] = categorical_genre_list
df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot_summary,Genres,Revised_genres,genre_cat_list
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,"This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...",['Drama'],[Drama],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,"Francis Ford Coppola's epic features Marlon Brando in his Oscar-winning role as the patriarch of the Corleone family. Director Coppola paints a chilling portrait of the Sicilian clan's rise and near fall from power in America, masterfully balancing the story between the Corleone's family life and the ugly crime business in which they are...","['Crime', 'Drama']","[Crime, Drama]","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-arabia-re-release/,100,8,100,442,"The 40th anniversary re-release of David Lean's 1962 masterpiece, starring Peter O'Toole in one of the most electrifying debuts in film history.","['Adventure', 'Biography', 'Drama', 'War']","[Adventure, Biography, Drama, War]","[1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-re-release/,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spectacular 1963 adaptation of Giuseppe di Lampedusa's international bestseller is one of the cinema's greatest evocations of the past, achingly depicting the passing of an ancient order. (Film Forum)","['Drama', 'History']","[Drama, History]","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformist-re-release/,100,11,100,105,"Set in Rome in the 1930s, this re-release of Bernardo Bertolucci's 1970 breakthrough feature stars Jean-Louis Trintignant as a Mussolini operative sent to Paris to locate and eliminate an old professor who fled Italy when the fascists came to power.",['Drama'],[Drama],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...
15144,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brother Andy and best friend Nick. Joel is engaged to Kate. Even though he has some problems with her conservative parents accepting him, he is happy with his life and pursuing the American Dream. The pilot is based on the Geico commercials that claim their service is so easy, ""Even a caveman can...","['Comedy', 'Sci-Fi']","[Comedy, Sci-Fi]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
15145,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Koldyke) and Angel Ortiz (Amaury Nolasco) dress up as women to get new jobs as pharmaceutical sales wo(men).",['Comedy'],[Comedy],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
15146,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-end-of-the-world/,18,11,18,7,"""Category 7: The End of the World"" picks up where ""Category 6: Day of Destruction"" left off. The city of Chicago has been destroyed by a monstrous storm and as the storm gathers strength, it threatens to ravage the rest of the world. A television evangelist and his wife prey on the nation’s fears by broadcasting warnings of biblical pl...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']","[Action, Adventure, Drama, Sci-Fi, Thriller]","[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
15147,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,"Lt. Beth Davis (Maggie Q) leads the Threat Assessment Unit of the Los Angeles Police Department that includes recent transfer Det. Jack Larsen (McDermott), Det. Ben Caldwell (Victor Rasuk), and Det. Janice Lawrence (Mariana Klaveno).","['Crime', 'Drama', 'Thriller']","[Crime, Drama, Thriller]","[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Data Split

In [16]:
splitter = RandomSplitter(valid_pct=0.1, seed=42)
train_ids, valid_ids = splitter(df)
len(train_ids), len(valid_ids)

(13635, 1514)

In [17]:
valid_df = df.loc[valid_ids]
valid_df.head()

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot_summary,Genres,Revised_genres,genre_cat_list
10679,The Day the Earth Stood Still,movie,https://www.metacritic.com/movie/the-day-the-earth-stood-still/,40,34,40,134,"The Day the Earth Stood Still is 20th Century Fox’s contemporary reinvention of its 1951 classic. Klaatu, an alien who arrives on our planet, triggers a global upheaval. As governments and scientists race to unravel the mystery behind the visitor’s appearance, a woman and her young stepson get caught up in his mission – and come to unde...","['Adventure', 'Drama', 'Sci-Fi', 'Thriller']","[Adventure, Drama, Sci-Fi, Thriller]","[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
10860,The Life Before Her Eyes,movie,https://www.metacritic.com/movie/the-life-before-her-eyes/,38,25,38,6,"The Life Before Her Eyes is an intense and visually evocative drama about the loss of youth, investigating how a single moment in time can define an entire life. Based on Laura Kasischke's visionary novel, the story hinges on a pivotal confrontation: two high school girls held captive by a gunman and forced to make the terrifying choice ...","['Drama', 'Fantasy', 'Mystery', 'Thriller']","[Drama, Fantasy, Mystery, Thriller]","[1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4059,Digging for Fire,movie,https://www.metacritic.com/movie/digging-for-fire/,69,22,69,24,The discovery of a bone and a gun send a husband and wife on separate adventures over the course of a weekend.,"['Comedy', 'Drama']","[Comedy, Drama]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
12081,Reclaim,movie,https://www.metacritic.com/movie/reclaim/,26,7,26,13,"After their newly adopted daughter goes missing in a small town, Steven and Shannon will stop at nothing to uncover the truth behind her disappearance and the dangerous secret behind the adoption agency they trusted. Risking their own lives, they will discover just what being a parent means and how far they will go to get their child bac...","['Action', 'Adventure', 'Crime', 'Drama', 'Mystery', 'Thriller']","[Action, Adventure, Crime, Drama, Mystery, Thriller]","[1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
60,The Shop Around the Corner,movie,https://www.metacritic.com/movie/the-shop-around-the-corner/,96,15,96,33,"Two employees at a gift shop can barely stand each other, without realizing that they are falling in love through the post as each other's anonymous pen pal.","['Comedy', 'Drama', 'Romance']","[Comedy, Drama, Romance]","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Fastai & Blurr Inference

In [18]:
model_path = "/kaggle/input/final-text-model-files/mctc-lm-stage-1_epoch_5.pkl"
learner_inf = load_learner(model_path)
learner_inf

<fastai.learner.Learner at 0x7a9d11daecb0>

In [19]:
pred_info = learner_inf.blurr_predict("random placeholder")

# pred_info

In [20]:
pred_genre = pred_info[0]['labels']
print(f"Predicted Genres: {pred_genre}")
pred_score = pred_info[0]['scores']
print(f"Confidence Scores: {pred_score}")

Predicted Genres: ['Drama', 'Comedy']
Confidence Scores: [0.6022409200668335, 0.7427214980125427]


In [21]:
preds = []
for idx, row in tqdm(valid_df.iterrows(), total=len(valid_df)):
  desc = row['Plot_summary']
  labels = learner_inf.blurr_predict(desc)[0]['labels']
  pred_genres = [0] * len(encode_genre_types)
  for label in labels:
    pred_genres[encode_genre_types[label]] = 1
  preds.append(pred_genres)

print(preds[0][:50])

  0%|          | 0/1514 [00:00<?, ?it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [22]:
print(valid_df.shape)
print(len(preds))


(1514, 11)
1514


In [23]:
def metric_measures(test_df, preds):

  targets = [np.asarray(target) for target in test_df['genre_cat_list'].to_list()]
  outputs = [np.asarray(pred) for pred in preds]

  # print(len(targets))
  # print()
  # print(targets)
    
  # print(len(outputs))
  # print()
  # print(outputs)
    
  accuracy = accuracy_score(targets, outputs)

  #print(accuracy)

    
  f1_score_micro = f1_score(targets, outputs, average='micro')
  f1_score_macro = f1_score(targets, outputs, average='macro')

  print(f"F1 Score (Micro) = {f1_score_micro}")
  print(f"F1 Score (Macro) = {f1_score_macro}")




metric_measures(valid_df, preds)

F1 Score (Micro) = 0.6506172839506174
F1 Score (Macro) = 0.520508177731966


# Converting to ONNX

In [24]:
model_path = "/kaggle/input/final-text-model-files/mctc-lm-stage-1_epoch_5.pkl"
learner_inf = load_learner(model_path)

In [25]:
learner_inf.model.hf_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [26]:
classifier = learner_inf.model.hf_model.eval()
classifier

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [27]:
type(torch.LongTensor([[0] * 512]))

torch.Tensor

In [28]:
classifier = learner_inf.model.hf_model.eval()

torch.onnx.export(
    classifier,
    (torch.LongTensor([[0] * 512]),),
    "film_genre_classifier.onnx",
    input_names=['input_ids'],
    output_names=['output'],
    #opset_version=11,
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_len'}, 'output': {0: 'batch_size'}}
)

In [29]:
import os

if not os.path.exists('models'):
    mkdir('models')

In [30]:
onnx_model_path = 'film_genre_classifier.onnx'
quantized_onnx_model_path = 'models/film_genre_classifier_quantized.onnx'

quantize_dynamic(
    onnx_model_path,
    quantized_onnx_model_path,
    weight_type=QuantType.QUInt8,
)

### Normal ONNX

In [32]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

class_labels = list(encode_genre_types.keys())

inf_session = rt.InferenceSession('/kaggle/working/film_genre_classifier.onnx')
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name

In [34]:
preds = []
for idx, row in tqdm(valid_df.iterrows(), total=valid_df.shape[0]):
  desc = row['Plot_summary']
  input_ids = tokenizer(desc)['input_ids'][:512]

  probs = inf_session.run([output_name], {input_name: [input_ids]})[0]
  probs = torch.FloatTensor(probs)

  masks = torch.sigmoid(probs) >= 0.5
  labels = [class_labels[idx] for idx, mask in enumerate(masks[0]) if mask]

  pred_genres = [0] * len(encode_genre_types)
  for label in labels:
    pred_genres[encode_genre_types[label]] = 1
  preds.append(pred_genres)


print(preds[0][:50])

  0%|          | 0/1514 [00:00<?, ?it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [35]:
metric_measures(valid_df, preds) 

F1 Score (Micro) = 0.649653121902874
F1 Score (Macro) = 0.5293027975587308


### Quantized ONNX

In [36]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

class_labels = list(encode_genre_types.keys())

inf_session = rt.InferenceSession('/kaggle/working/models/film_genre_classifier_quantized.onnx')
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name

In [37]:
preds = []
for idx, row in tqdm(valid_df.iterrows(), total=valid_df.shape[0]):
  desc = row['Plot_summary']
  input_ids = tokenizer(desc)['input_ids'][:512]

  probs = inf_session.run([output_name], {input_name: [input_ids]})[0]
  probs = torch.FloatTensor(probs)

  masks = torch.sigmoid(probs) >= 0.5
  labels = [class_labels[idx] for idx, mask in enumerate(masks[0]) if mask]

  pred_genres = [0] * len(encode_genre_types)
  for label in labels:
    pred_genres[encode_genre_types[label]] = 1
  preds.append(pred_genres)


print(preds[0][:50])

  0%|          | 0/1514 [00:00<?, ?it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [38]:
metric_measures(valid_df, preds) #

F1 Score (Micro) = 0.6484384798695597
F1 Score (Macro) = 0.5166492196796111
