<a href="https://colab.research.google.com/github/OlgaSeleznova/ML_toolbox/blob/main/Text_classification_with_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The pipeline is adapted for custom dataset from the Hugging face text classification on GLUE notebook: https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb#scrollTo=545PP3o8IrJV

In [15]:
# ! pip install transformers
# ! pip install datasets

# Libraries

In [16]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import DatasetDict, Dataset,load_dataset, load_metric, Metric

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [17]:
# create cuda device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


# Load data

In [18]:
# load data
from google.colab import drive
drive.mount('/content/drive/')

lyrics = pd.read_parquet('/content/drive/My Drive/Colab Notebooks/metrolyrics.parquet').reset_index(drop=True)
lyrics.shape

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


(49976, 8)

In [19]:
#encode genre names
genre_srt_to_int = {'Rock':0, 'Pop':1, 'Hip-Hop':2, 'Metal':3, 'Country':4}
lyrics['genre'] = lyrics['genre'].replace(genre_srt_to_int)

# rename columns to more suitable
lyrics = lyrics.rename(columns={'lyrics':'text','genre':'label'})

data = lyrics.loc[:,['text','label']]

In [20]:
data.head()

Unnamed: 0,text,label
0,[HEALY]\n[spoken] This is Bert Healy saying .....,1
1,[Chorus: repeat 2X] Even when I'm tryin to be ...,2
2,How could you cause me so much pain?\nAnd leav...,1
3,In a scarlet vision\nIn a velvet room\nI come ...,0
4,Sprintime in Savannah\nIt dont get much pretti...,4


In [21]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size= 1 - train_ratio, random_state=42, shuffle=True)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42, shuffle=True) 

def create_datasets(x, y):
    return pd.concat([x, y], axis=1, ignore_index=False, sort=False)

data_train = create_datasets(x_train, y_train)
data_valid = create_datasets(x_val, y_val)
data_test = create_datasets(x_test, y_test)
data_train.shape,data_valid.shape, data_test.shape

((37482, 2), (7496, 2), (4998, 2))

# Preprocessing

In [22]:
# set up parameters
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
num_labels = len(np.unique(data_train['label']))
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [23]:
def df_to_dataset(df):
    return Dataset.from_pandas(df)

def tokenize_function(ds):
    return tokenizer(ds["text"], truncation=True, padding=True,max_length=100,add_special_tokens=True)


dataset_dict = DatasetDict({"train":df_to_dataset(data_train),"valid":df_to_dataset(data_valid)})

encoded_dataset = dataset_dict.map(tokenize_function, batched = True,load_from_cache_file=False)

encoded_dataset

HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'attention_mask', 'input_ids', 'label', 'text'],
        num_rows: 37482
    })
    valid: Dataset({
        features: ['__index_level_0__', 'attention_mask', 'input_ids', 'label', 'text'],
        num_rows: 7496
    })
})

# Training

In [24]:
# create model
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [25]:
# create arguments
args = TrainingArguments(
    "test-glue",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    metric = load_metric('accuracy')
    return metric.compute(predictions=predictions, references=labels)

In [27]:
trainer = Trainer(
    model,
    args,
    train_dataset= encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9451,0.920978,0.627935,24.5649,305.151
2,0.8197,0.909785,0.645544,24.6538,304.051
3,0.6379,0.960862,0.645011,24.6274,304.377
4,0.4855,1.071407,0.642609,24.8218,301.993
5,0.3874,1.154679,0.636339,24.6563,304.02


TrainOutput(global_step=11715, training_loss=0.6705110205578285, metrics={'train_runtime': 2374.0372, 'train_samples_per_second': 4.935, 'total_flos': 7529082467382000.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 268962304, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -43847680, 'train_mem_gpu_alloc_delta': 1082887168, 'train_mem_cpu_peaked_delta': 45916160, 'train_mem_gpu_peaked_delta': 424506368})

In [29]:
trainer.evaluate()

{'epoch': 5.0,
 'eval_accuracy': 0.6455442902881536,
 'eval_loss': 0.9097845554351807,
 'eval_mem_cpu_alloc_delta': -36864,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 89157120,
 'eval_runtime': 23.9541,
 'eval_samples_per_second': 312.932}