In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import pickle
import re
import torch
import math

from datasets import load_dataset
from datasets import load_metric


from sklearn.model_selection import train_test_split


from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer

from transformers import AutoModelForSequenceClassification


In [2]:
dataset=load_dataset("dair-ai/emotion")
dataset

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/Users/phanisingaraju/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [4]:
dataset.set_format(type="pandas")
train_df = dataset['train'][:]
valid_df = dataset['validation'][:]
test_df = dataset['test'][:]

In [5]:
train_df.head()


Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


##### Tokenization

In [4]:
tokenizer=AutoTokenizer.from_pretrained('bert-base-cased')

In [5]:
def tokenize_data(example):
    return tokenizer(example['text'], padding='max_length')

dataset = dataset.map(tokenize_data, batched=True)


Loading cached processed dataset at /Users/phanisingaraju/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-3ae163e423f56b16.arrow
Loading cached processed dataset at /Users/phanisingaraju/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-d07b27401109f4e3.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
remove_columns = ['text']
dataset = dataset.map(remove_columns=remove_columns)

  0%|          | 0/16000 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [18]:
len(dataset['train']['input_ids'])

16000

In [19]:
dataset['train']

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 16000
})

In [10]:
training_args = TrainingArguments("test_trainer", num_train_epochs=3)

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=6)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [13]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=dataset['train'], eval_dataset=dataset['validation'])

In [14]:
%%time
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6000
  Number of trainable parameters = 108314886


Step,Training Loss
500,0.7625
1000,0.3947
1500,0.2933
2000,0.2555
2500,0.1653
3000,0.1726
3500,0.1561
4000,0.1803
4500,0.1147
5000,0.1134


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

CPU times: user 22h 49min 25s, sys: 14h 21min 35s, total: 1d 13h 11min 1s
Wall time: 1d 3h 53min 50s


TrainOutput(global_step=6000, training_loss=0.23516877365112304, metrics={'train_runtime': 100430.0815, 'train_samples_per_second': 0.478, 'train_steps_per_second': 0.06, 'total_flos': 1.2629784231936e+16, 'train_loss': 0.23516877365112304, 'epoch': 3.0})

create an iterator for the dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [15]:
trainer.save_model("../models/bert_finetuned_model_local")

Saving model checkpoint to ../models/bert_finetuned_model_local
Configuration saved in ../models/bert_finetuned_model_local/config.json
Model weights saved in ../models/bert_finetuned_model_local/pytorch_model.bin
