In [None]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Load the `dair-ai/emotion` dataset via `datasets.load_dataset`.
emotion = load_dataset('dair-ai/emotion')
# Switch the output format to pandas so the exploratory cells below can work
# with DataFrame methods.
emotion.set_format(type='pandas')

# Show the loaded object (its 'train' split is used below).
emotion

In [None]:
# Take the whole training split; it is used as a DataFrame below
# (`.head()`, column assignment, `.boxplot`).
train = emotion['train'][:]

# Preview the first rows.
train.head()

In [None]:
# Names of the `label` feature; later cells index this list by integer label id.
classes = emotion['train'].features['label'].names

classes

In [None]:
# Attach the human-readable class name next to each integer label id.
train['label_name'] = train['label'].map(lambda label_id: classes[label_id])

train.head()

### Data Analysis

In [None]:
# Number of training examples per class, rarest class first.
label_counts = train['label_name'].value_counts(ascending=True)
# Bar chart of the counts; the title is set through the plotting call itself.
label_counts.plot.bar(title='Frequency of Classes')
plt.show()

In [None]:
# Split each tweet on whitespace and record how many pieces it contains.
train['Words per Tweet'] = train['text'].str.split().map(len)

train.head()

In [None]:
# Distribution of tweet length (in words), one box per emotion class.
train.boxplot(column='Words per Tweet', by='label_name')

### Text to Token Conversion

In [None]:
# Checkpoint name passed to every `from_pretrained` call below
# (the tokenizer and both model variants).
model_ckpt = 'distilbert-base-uncased'

In [None]:
# Load the tokenizer that belongs to `model_ckpt`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Tokenization example: encode a single sentence and inspect what the
# tokenizer returns.
text = "I love Machine Learning!. Tokenization is awesome"
encoded_text = tokenizer(text)

encoded_text

In [None]:
# Translate the numeric ids of the example back into their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'])

tokens

In [None]:
# Size of the tokenizer's vocabulary, and the maximum sequence length
# (in tokens) the tokenizer reports for this model.
tokenizer.vocab_size, tokenizer.model_max_length

### Tokenization of the Emotion Data

In [None]:
# Undo the pandas output format set earlier; the tokenization cells below
# index the dataset directly.
emotion.reset_format()

emotion

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Padding and truncation are both enabled, so every encoded sequence in
    the returned batch has the same length.

    Args:
        batch: Mapping with a ``'text'`` entry holding the strings to encode.

    Returns:
        The tokenizer's output for ``batch['text']``.
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


# Sanity check on the first five training examples.
tokenize(emotion['train'][:5])

In [None]:
# Tokenize the dataset with `tokenize`.
# NOTE(review): per the `datasets` documentation for `map`, batched=True with
# batch_size=None should hand each split to `tokenize` as one single batch, so
# `padding=True` pads to the longest text of the whole split — confirm against
# the installed version.
emotions_encoded = emotion.map(tokenize, batched = True, batch_size = None)

emotions_encoded

### Model Building

In [None]:
# Re-display the example sentence that is fed through the model below.
text

In [None]:
# Encode the example sentence, asking for PyTorch tensors (return_tensors='pt')
# so the result can be passed straight to the model.
inputs = tokenizer(text, return_tensors = 'pt')

inputs

In [None]:
# Load the checkpoint with `AutoModel`; the `last_hidden_state` of its output
# is inspected in the following cells.
model = AutoModel.from_pretrained(model_ckpt)

model

In [None]:
# Forward pass with gradient tracking disabled — the outputs are only inspected here.
with torch.no_grad():
    outputs = model(**inputs)

outputs

In [None]:
# Pull the `last_hidden_state` tensor out of the model output.
last_hidden_states = outputs.last_hidden_state

last_hidden_states

In [None]:
# NOTE(review): presumably (batch, tokens, hidden_size) — confirm against the
# displayed shape.
last_hidden_states.shape

### Fine Tuning Transformers

In [None]:
# Number of output classes for the classifier: one per entry in `classes`.
num_labels = len(classes)

num_labels

In [None]:
# Prefer a CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

In [None]:
# Seed PyTorch before loading the model.
# NOTE(review): the sequence-classification head is expected to be freshly
# initialised (it is not part of the base checkpoint), so without a fixed seed
# its starting weights — and therefore the reported metrics — would differ on
# every run. 42 mirrors the documented default of `TrainingArguments.seed`.
torch.manual_seed(42)

# Load the checkpoint with a `num_labels`-way classification head and move it
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels).to(device)

model

In [None]:
# Per-device batch size, used for both training and evaluation below.
batch_size = 64
# Name of the fine-tuned model; appended to "./model/" to form the output directory.
model_name = 'distilbert-finetuned-emotion'

In [None]:
# Hyper-parameters for the fine-tuning run; evaluation is scheduled per epoch.
training_args = TrainingArguments(
    output_dir=f"./model/{model_name}",
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy='epoch',
    disable_tqdm=False,
)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``"Accuracy"`` and ``"F1 Score"``.
    """
    true_ids, predicted_ids = pred.label_ids, pred.predictions.argmax(-1)
    weighted_f1 = f1_score(true_ids, predicted_ids, average='weighted')
    return {
        "Accuracy": accuracy_score(true_ids, predicted_ids),
        "F1 Score": weighted_f1,
    }

In [None]:
# Bundle the model, its training arguments, the train/validation splits, the
# metric function and the tokenizer into a Trainer.
# NOTE(review): recent transformers releases are reported to deprecate the
# `tokenizer=` argument in favour of `processing_class=` — confirm against the
# installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded['train'],
    eval_dataset=emotions_encoded['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above (2 epochs,
# eval_strategy='epoch').
trainer.train()

In [None]:
# Run the fine-tuned model on the test split and show the resulting metrics.
pred_outputs = trainer.predict(emotions_encoded['test'])
pred_outputs.metrics

In [None]:
# Full prediction output; its `predictions` and `metrics` fields are used in
# the neighbouring cells.
pred_outputs

In [None]:
# Predicted class id per test example: index of the largest score in each row.
y_preds = pred_outputs.predictions.argmax(axis=1)

y_preds

In [None]:
# Reference labels of the test split, read from the encoded dataset.
# NOTE(review): `pred_outputs.label_ids` presumably holds the same labels
# without materialising every column of the split — confirm before swapping.
y_true = emotions_encoded['test'][:]['label']

y_true

In [None]:
# Per-class precision / recall / F1 on the test split.
# `target_names` puts the emotion names on the report rows instead of bare
# label ids (which previously had to be matched by eye against a separately
# printed list). The explicit `labels` keeps names aligned with ids even if
# some class never occurs in `y_true` / `y_preds`.
print(classification_report(y_true, y_preds,
                            labels=list(range(len(classes))),
                            target_names=classes))

In [None]:
# Re-display the training-set class counts computed earlier.
label_counts

In [None]:
# Classify one ad-hoc sentence with the fine-tuned model.
text = 'I hate you'

# Put the inputs on whatever device the model currently lives on. The Trainer
# may have moved the model (e.g. to MPS on Apple silicon), so re-deriving the
# device by hand — and overwriting the notebook-wide `device` — is fragile.
input_encoded = tokenizer(text, return_tensors = 'pt').to(model.device)

# Inference only: put the model in evaluation mode (disables dropout) and
# skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**input_encoded)

outputs

In [None]:
# Raw class scores for the single input sentence.
logits = outputs.logits
# Index of the highest-scoring class, converted to a plain Python int.
pred = logits.argmax(dim = 1).item()

pred, classes[pred]