# Import modules and dependencies

In [1]:
! pip install -q transformers[sentencepiece] datasets evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import transformers

import evaluate
from datasets import Dataset, DatasetDict
from transformers import  AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback

# Loading Data

In [3]:
#Reading the data

df = pd.read_csv("/kaggle/input/nlp-project-data/film_details.csv")
df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot summary,Genres
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,This masterwork by Krzysztof Kieślowski is one...,['Drama']
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,Francis Ford Coppola's epic features Marlon Br...,"['Crime', 'Drama']"
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-a...,100,8,100,442,The 40th anniversary re-release of David Lean'...,"['Adventure', 'Biography', 'Drama', 'War']"
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-r...,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spec...","['Drama', 'History']"
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformis...,100,11,100,105,"Set in Rome in the 1930s, this re-release of B...",['Drama']
...,...,...,...,...,...,...,...,...,...
15149,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brot...","['Comedy', 'Sci-Fi']"
15150,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Kol...",['Comedy']
15151,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-e...,18,11,18,7,"""Category 7: The End of the World"" picks up wh...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th..."
15152,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,Lt. Beth Davis (Maggie Q) leads the Threat Ass...,"['Crime', 'Drama', 'Thriller']"


In [4]:
#Dropping null entries

df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot summary,Genres
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,This masterwork by Krzysztof Kieślowski is one...,['Drama']
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,Francis Ford Coppola's epic features Marlon Br...,"['Crime', 'Drama']"
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-a...,100,8,100,442,The 40th anniversary re-release of David Lean'...,"['Adventure', 'Biography', 'Drama', 'War']"
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-r...,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spec...","['Drama', 'History']"
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformis...,100,11,100,105,"Set in Rome in the 1930s, this re-release of B...",['Drama']
...,...,...,...,...,...,...,...,...,...
15149,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brot...","['Comedy', 'Sci-Fi']"
15150,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Kol...",['Comedy']
15151,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-e...,18,11,18,7,"""Category 7: The End of the World"" picks up wh...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th..."
15152,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,Lt. Beth Davis (Maggie Q) leads the Threat Ass...,"['Crime', 'Drama', 'Thriller']"


In [5]:
#Taking the first genre as single genre

genre_list = []

indices_to_drop = []
for idx, genres in enumerate(df.Genres.to_list()):
  genres_list = eval(genres)
  if len(genres_list):
    genre_list.append(genres_list[0])
  else:
    indices_to_drop.append(idx)

df = df.drop(indices_to_drop).reset_index(drop=True)
df.shape

df['Single_Genre'] = genre_list

df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot summary,Genres,Single_Genre
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,This masterwork by Krzysztof Kieślowski is one...,['Drama'],Drama
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,Francis Ford Coppola's epic features Marlon Br...,"['Crime', 'Drama']",Crime
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-a...,100,8,100,442,The 40th anniversary re-release of David Lean'...,"['Adventure', 'Biography', 'Drama', 'War']",Adventure
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-r...,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spec...","['Drama', 'History']",Drama
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformis...,100,11,100,105,"Set in Rome in the 1930s, this re-release of B...",['Drama'],Drama
...,...,...,...,...,...,...,...,...,...,...
15149,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brot...","['Comedy', 'Sci-Fi']",Comedy
15150,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Kol...",['Comedy'],Comedy
15151,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-e...,18,11,18,7,"""Category 7: The End of the World"" picks up wh...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th...",Action
15152,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,Lt. Beth Davis (Maggie Q) leads the Threat Ass...,"['Crime', 'Drama', 'Thriller']",Crime


In [6]:
df['Single_Genre'].value_counts()

Single_Genre
Comedy         3837
Drama          3470
Action         2437
Documentary    1353
Crime          1199
Biography       825
Adventure       632
Animation       605
Horror          526
Fantasy          63
Reality-TV       48
Thriller         34
Mystery          33
Game-Show        20
Sci-Fi           14
Romance          11
Family           10
Musical           7
Unknown           6
                  5
Music             5
Film-Noir         4
Western           4
Talk-Show         2
History           1
War               1
Sport             1
News              1
Name: count, dtype: int64

In [7]:
len(df['Single_Genre'].value_counts())

28

## Removing rare genres

Rare genres are the genres that has less than 1% frequency in the entire genre count

In [8]:
genre_count = df['Single_Genre'].value_counts()
threshold = int(len(df) * 0.005) #Rare genre threshold
rare_genres = [ cat for cat, count in genre_count.items() if count < threshold]

In [9]:
print(len(rare_genres))
print()
print(rare_genres[:3])

19

['Fantasy', 'Reality-TV', 'Thriller']


In [10]:
rare_indices_to_drop = [idx for idx, row in df.iterrows() if row['Single_Genre'] in rare_genres]
len(rare_indices_to_drop)

270

In [11]:
df = df.drop(rare_indices_to_drop).reset_index(drop=True)
df.shape

(14884, 10)

In [12]:
df['Single_Genre'].value_counts()

Single_Genre
Comedy         3837
Drama          3470
Action         2437
Documentary    1353
Crime          1199
Biography       825
Adventure       632
Animation       605
Horror          526
Name: count, dtype: int64

In [13]:
len(df['Single_Genre'].value_counts())

9

In [14]:
#Seeing final classes

labels = list(set(df.Single_Genre.to_list()))
label_count = len(labels)

print(labels)
print()
print(label_count)

['Drama', 'Animation', 'Comedy', 'Crime', 'Action', 'Biography', 'Adventure', 'Documentary', 'Horror']

9


In [15]:
df

Unnamed: 0,Title,Category,Url,Metascore,Number of critic reviewers,User score,Number of user reviewers,Plot summary,Genres,Single_Genre
0,Dekalog (1988),movie,https://www.metacritic.com/movie/dekalog-1988/,100,13,100,112,This masterwork by Krzysztof Kieślowski is one...,['Drama'],Drama
1,The Godfather,movie,https://www.metacritic.com/movie/the-godfather/,100,16,100,4082,Francis Ford Coppola's epic features Marlon Br...,"['Crime', 'Drama']",Crime
2,Lawrence of Arabia (re-release),movie,https://www.metacritic.com/movie/lawrence-of-a...,100,8,100,442,The 40th anniversary re-release of David Lean'...,"['Adventure', 'Biography', 'Drama', 'War']",Adventure
3,The Leopard (re-release),movie,https://www.metacritic.com/movie/the-leopard-r...,100,12,100,84,"Set in Sicily in 1860, Luchino Visconti's spec...","['Drama', 'History']",Drama
4,The Conformist,movie,https://www.metacritic.com/movie/the-conformis...,100,11,100,105,"Set in Rome in the 1930s, this re-release of B...",['Drama'],Drama
...,...,...,...,...,...,...,...,...,...,...
14879,Cavemen,tv,https://www.metacritic.com/tv/cavemen/,19,13,19,6,"Cavemen revolves around Joel, his younger brot...","['Comedy', 'Sci-Fi']",Comedy
14880,Work It,tv,https://www.metacritic.com/tv/work-it/,19,22,19,44,"After they are laid off, Lee Standish (Ben Kol...",['Comedy'],Comedy
14881,Category 7: The End of the World,tv,https://www.metacritic.com/tv/category-7-the-e...,18,11,18,7,"""Category 7: The End of the World"" picks up wh...","['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th...",Action
14882,Stalker,tv,https://www.metacritic.com/tv/stalker/,17,24,17,137,Lt. Beth Davis (Maggie Q) leads the Threat Ass...,"['Crime', 'Drama', 'Thriller']",Crime


In [16]:
df.describe(include='object')

Unnamed: 0,Title,Category,Url,Plot summary,Genres,Single_Genre
count,14884,14884,14884,14884,14884,14884
unique,14253,2,14824,14822,1746,9
top,Robin Hood,movie,https://www.metacritic.com/movie/battle-for-te...,"A comic spoof in the tradition of ""Airplane,"" ...",['Drama'],Comedy
freq,4,12403,2,2,1063,3837


# Data Processing

In [17]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['Title', 'Category', 'Url', 'Metascore', 'Number of critic reviewers', 'User score', 'Number of user reviewers', 'Plot summary', 'Genres', 'Single_Genre'],
    num_rows: 14884
})

In [18]:
ds[0]

{'Title': 'Dekalog (1988)',
 'Category': 'movie',
 'Url': 'https://www.metacritic.com/movie/dekalog-1988/',
 'Metascore': 100,
 'Number of critic reviewers': 13,
 'User score': 100,
 'Number of user reviewers': 112,
 'Plot summary': 'This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...',
 'Genres': "['Drama']",
 'Single_Genre': 'Drama'}

# Tokenization

### [HF Model Link](https://huggingface.co/albert/albert-base-v1)

In [19]:
model_name = "albert/albert-base-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



AlbertTokenizerFast(name_or_path='albert/albert-base-v1', vocab_size=30000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [20]:
ds[0]['Plot summary']

'This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...'

In [21]:
# distilroberta-base tokenizer operates on subword units rather than full words like most transformer-based tokenizersto handle out-of-vocabulary words.

tokenizer.tokenize(ds[0]['Plot summary'][:150])

['▁this',
 '▁master',
 'work',
 '▁by',
 '▁kr',
 'zy',
 'sz',
 't',
 'of',
 '▁',
 'kie',
 'slow',
 'ski',
 '▁is',
 '▁one',
 '▁of',
 '▁the',
 '▁twentieth',
 '▁century',
 '’',
 's',
 '▁greatest',
 '▁achievements',
 '▁in',
 '▁visual',
 '▁storytelling',
 '.',
 '▁originally',
 '▁made',
 '▁for',
 '▁polish',
 '▁tel']

In [22]:
def tokenize_desc(x) :
  return tokenizer(x['Plot summary'], truncation=True, padding=True)

In [23]:
tokenized_ds = ds.map(tokenize_desc, batched=True)
tokenized_ds

Map:   0%|          | 0/14884 [00:00<?, ? examples/s]

Dataset({
    features: ['Title', 'Category', 'Url', 'Metascore', 'Number of critic reviewers', 'User score', 'Number of user reviewers', 'Plot summary', 'Genres', 'Single_Genre', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 14884
})

In [24]:
tokenized_ds[0]

{'Title': 'Dekalog (1988)',
 'Category': 'movie',
 'Url': 'https://www.metacritic.com/movie/dekalog-1988/',
 'Metascore': 100,
 'Number of critic reviewers': 13,
 'User score': 100,
 'Number of user reviewers': 112,
 'Plot summary': 'This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...',
 'Genres': "['Drama']",
 'Single_Genre': 'Drama',
 'input_ids': [2,
  48,
  1129,
  3783,
  34,
  9645,
  3327,
  6649,
  38,
  1041,
  13,
  8960,
  15680,
  2413,
  25,
  53,
  16,
  14,
  9171,
  428,
  1,
  18,
  3023,
  10446,
  19,
  3458,
  23344,
  9,
  912,
  117,
  26,
  2283,
  633,
  15,
  121,
  657,
  5567,
  7155,
  27,
  14,
  2175,
  16,
  21,
  2123,
  1502,
  19,
  456,
  8,
  19575,
  2040,
  15,
 

In [25]:
row = tokenized_ds[0]

print("Plot")
print()
print(row['Plot summary'])
print()
print("====================================================================================================")
print()
print("Input ids")
print()
print(row['input_ids'])
print("====================================================================================================")
print()
print("Attention Mask")
print()
print(row['attention_mask'])

Plot

This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...


Input ids

[2, 48, 1129, 3783, 34, 9645, 3327, 6649, 38, 1041, 13, 8960, 15680, 2413, 25, 53, 16, 14, 9171, 428, 1, 18, 3023, 10446, 19, 3458, 23344, 9, 912, 117, 26, 2283, 633, 15, 121, 657, 5567, 7155, 27, 14, 2175, 16, 21, 2123, 1502, 19, 456, 8, 19575, 2040, 15, 1196, 1551, 533, 29261, 28613, 28, 59, 276, 6090, 23314, 18, 30, 50, 35, 382, 5206, 840, 9, 9, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Attention Mask

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [26]:
# Vocabulary index, Numericalization like we did in ULMFit
tokenizer.vocab['person']

7276

# Categorize

In [27]:
#final genres

labels

['Drama',
 'Animation',
 'Comedy',
 'Crime',
 'Action',
 'Biography',
 'Adventure',
 'Documentary',
 'Horror']

In [28]:
labels.index('Drama')

0

In [29]:
def categorize(x):
  return {"labels": [labels.index(genre) for genre in x['Single_Genre']]}

In [30]:
categorized_ds = tokenized_ds.map(categorize, batched=True)
categorized_ds

Map:   0%|          | 0/14884 [00:00<?, ? examples/s]

Dataset({
    features: ['Title', 'Category', 'Url', 'Metascore', 'Number of critic reviewers', 'User score', 'Number of user reviewers', 'Plot summary', 'Genres', 'Single_Genre', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 14884
})

In [31]:
print(categorized_ds[0])

{'Title': 'Dekalog (1988)', 'Category': 'movie', 'Url': 'https://www.metacritic.com/movie/dekalog-1988/', 'Metascore': 100, 'Number of critic reviewers': 13, 'User score': 100, 'Number of user reviewers': 112, 'Plot summary': 'This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply person...', 'Genres': "['Drama']", 'Single_Genre': 'Drama', 'input_ids': [2, 48, 1129, 3783, 34, 9645, 3327, 6649, 38, 1041, 13, 8960, 15680, 2413, 25, 53, 16, 14, 9171, 428, 1, 18, 3023, 10446, 19, 3458, 23344, 9, 912, 117, 26, 2283, 633, 15, 121, 657, 5567, 7155, 27, 14, 2175, 16, 21, 2123, 1502, 19, 456, 8, 19575, 2040, 15, 1196, 1551, 533, 29261, 28613, 28, 59, 276, 6090, 23314, 18, 30, 50, 35, 382, 5206, 840, 9, 9, 9, 3, 0, 0, 0, 0

In [32]:
row = categorized_ds[0]
row['labels']

0

# Data Splitting

In [33]:
split_ds = categorized_ds.train_test_split(0.2, seed=42)
split_ds

DatasetDict({
    train: Dataset({
        features: ['Title', 'Category', 'Url', 'Metascore', 'Number of critic reviewers', 'User score', 'Number of user reviewers', 'Plot summary', 'Genres', 'Single_Genre', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11907
    })
    test: Dataset({
        features: ['Title', 'Category', 'Url', 'Metascore', 'Number of critic reviewers', 'User score', 'Number of user reviewers', 'Plot summary', 'Genres', 'Single_Genre', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2977
    })
})

# Modeling

In [34]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count)
model

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=76

In [35]:
batch_size = 32
epochs = 100
lr = 3.75e-4

In [36]:
training_args = TrainingArguments(
    "models",
    learning_rate = lr,
    warmup_ratio = 0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
    load_best_model_at_end = True,
    save_strategy='epoch'
    # save_steps=200,
)




In [37]:
def accuracy(eval_preds):
  metric = evaluate.load("accuracy")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [38]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = split_ds['train'],
    eval_dataset = split_ds['test'],
    tokenizer = tokenizer,
    compute_metrics = accuracy,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0001)]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [41]:
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.407821,0.560296
2,0.798300,1.43194,0.538126
3,0.688400,1.467387,0.533087
4,0.688400,1.577109,0.541149


In [42]:
print(train_results)

TrainOutput(global_step=1492, training_loss=0.7019262032598337, metrics={'train_runtime': 541.1654, 'train_samples_per_second': 2200.252, 'train_steps_per_second': 68.925, 'total_flos': 270927069377424.0, 'train_loss': 0.7019262032598337, 'epoch': 4.0})


In [43]:
metrics = train_results.metrics
max_train_steps = len(trainer.state.log_history) * trainer.args.per_device_train_batch_size
metrics["train_samples"] = max_train_steps

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

# Evaluate and log best model
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)



***** train metrics *****
  epoch                    =        4.0
  total_flos               =   252320GF
  train_loss               =     0.7019
  train_runtime            = 0:09:01.16
  train_samples            =        224
  train_samples_per_second =   2200.252
  train_steps_per_second   =     68.925


***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.5603
  eval_loss               =     1.4078
  eval_runtime            = 0:00:11.60
  eval_samples_per_second =    256.443
  eval_steps_per_second   =      8.097


In [44]:
# train_losses = []
# eval_losses = []

# for log in trainer.state.log_history:
#     try:
#         train_losses.append(log['loss'] )
#     except:
#         pass
#         #eval_losses.append(log['eval_loss'] )
        

# for log in trainer.state.log_history:
#     try:
#         eval_losses.append(log['eval_loss'] )
#     except:
#         pass
        

# plt.figure(figsize=(10, 6))
# plt.plot(train_losses, label='Train Loss')
# plt.plot(eval_losses, label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.title('Training and Validation Loss')
# plt.show()

In [45]:
trainer.save_model('models/film_genre_classifier_albert_base_v1')

In [46]:
import shutil
shutil.make_archive("film_genre_classifier_albert_base_v1", 'zip', 'models')

'/kaggle/working/film_genre_classifier_albert_base_v1.zip'