In [7]:
!pip install transformers torch datasets -U



# Main Library

In [2]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import load_dataset, Dataset

2024-07-02 04:06:44.945792: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 04:06:44.945913: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 04:06:45.079605: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Loading Data

In [3]:
data = load_dataset('emotion')
data.set_format(type='pandas')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
number_of_label = len(data['train'][:]['label'].value_counts())
class_labels = data['train'].features['label'].names

number_of_label, class_labels

(6, ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])

# Tokenization

In [5]:
model_name = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=number_of_label)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
data.reset_format()
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
def token_data(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

data_encoded = data.map(token_data, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
data_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

# Building Model

In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000
)

In [11]:
# compute Metrics

def compute_metrics(p):
    pred, label = p
    prediction = pred.argmax(axis=1)
    return {'Accuracy': (prediction == label).mean()}

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_encoded['train'],
    eval_dataset=data_encoded['validation'],
    compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
1000,0.5917
2000,0.133




TrainOutput(global_step=2000, training_loss=0.3623613739013672, metrics={'train_runtime': 904.2283, 'train_samples_per_second': 35.389, 'train_steps_per_second': 2.212, 'total_flos': 4239259140096000.0, 'train_loss': 0.3623613739013672, 'epoch': 2.0})

# Testing Data

In [16]:
evaluation = trainer.evaluate(data_encoded['test'])
evaluation



{'eval_loss': 0.14044873416423798,
 'eval_Accuracy': 0.9345,
 'eval_runtime': 19.3503,
 'eval_samples_per_second': 103.358,
 'eval_steps_per_second': 3.256,
 'epoch': 2.0}

In [20]:
prediction = trainer.predict(data_encoded['test'])[1]
prediction



array([0, 0, 0, ..., 1, 1, 4])

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(data_encoded['test']['label'], prediction)
acc = accuracy_score(data_encoded['test']['label'], prediction)

print('THe Accuracy Of Testing Data is ---> ', acc*100)
cm

THe Accuracy Of Testing Data is --->  100.0


array([[581,   0,   0,   0,   0,   0],
       [  0, 695,   0,   0,   0,   0],
       [  0,   0, 159,   0,   0,   0],
       [  0,   0,   0, 275,   0,   0],
       [  0,   0,   0,   0, 224,   0],
       [  0,   0,   0,   0,   0,  66]])