In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn

*Data Exploration*

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the "dair-ai/emotion" dataset and print a summary of the returned object.
emotion = load_dataset("dair-ai/emotion")
print(emotion)


In [None]:
# Switch the dataset output format to pandas so that indexing returns
# pandas objects.
emotion.set_format(type='pandas')

# Slice the complete training split into `df` and preview its first rows.
df = emotion['train'][:]
df.head()

In [None]:
# Class names stored on the `label` feature of the training split.
classes = emotion['train'].features['label'].names
classes

Dataset Analysis


In [None]:
import matplotlib.pyplot as plt

# `label_name` is not created anywhere earlier in the notebook, so build it
# here from the integer `label` ids; without it this cell (and the box plot
# that groups by `label_name`) fails with a KeyError on a fresh kernel.
df['label_name'] = df['label'].map(dict(enumerate(classes)))

# Horizontal bar chart of how many training examples each class has,
# sorted from the rarest class to the most frequent one.
label_counts = df['label_name'].value_counts(ascending=True)
label_counts.plot.barh()
plt.title('Frequency of Classes')
plt.show()

In [None]:
# Number of whitespace-separated words in each text, followed by a box plot
# of that count grouped by `label_name`.
df['Words Per Tweet'] = df['text'].str.split().map(len)
df.boxplot(column="Words Per Tweet", by='label_name')

**Text to Tokens Conversion**

Transformer models like DistilBERT cannot receive raw strings as input; instead, they assume the text has been tokenized and encoded as numerical vectors.

Tokenization is the step of breaking down a string into the atomic units used in the model.

In [None]:
from transformers import AutoTokenizer
# Checkpoint identifier; the same name is reused below when loading the models.
model_ckpt = "distilbert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Run the tokenizer on a sample sentence and print the resulting encoding.
text = "I love Machine Learning!. Tokenization is awesome"
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the encoded ids back to their token strings to inspect how the
# sample sentence was split.
sample_ids = encoded_text["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(sample_ids)
print(tokens)

*Tokenization of the Emotion Data*


In [None]:
# Reset the output format that was switched to pandas earlier in the notebook.
emotion.reset_format()


In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Padding and truncation are both enabled, so the encodings of one batch
    can be stacked into rectangular arrays.

    Args:
        batch: mapping that exposes the raw strings under the ``'text'`` key.

    Returns:
        Whatever ``tokenizer`` returns for that list of strings.
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


# Quick check on the first five training examples; the function is applied
# to the whole dataset with map() in the next cell.
print(tokenize(emotion["train"][:5]))

In [None]:
# Apply `tokenize` to every split. With batched=True and batch_size=None the
# `datasets` map() documentation states that each split is handed over as one
# single batch, so the padding inside `tokenize` is computed per split.
emotions_encoded = emotion.map(tokenize, batched=True, batch_size=None)


In [None]:
# Display the object returned by the map() call above.
emotions_encoded


*Model Building*

In [None]:
# Display the sample sentence defined earlier.
text

In [None]:
# Tokenize the sample sentence again, this time requesting PyTorch tensors
# ('pt') so the result can be fed to the model.
inputs = tokenizer(text, return_tensors='pt')
inputs

In [None]:
from transformers import AutoModel
import torch

# Load the pretrained weights of `model_ckpt` through the generic AutoModel class.
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Forward pass on the sample inputs with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Keep the `last_hidden_state` field of the output for inspection below.
last_hidden_states = outputs.last_hidden_state

In [None]:
# Display the raw model output.
outputs

In [None]:
# Shape of the last hidden state tensor extracted above.
last_hidden_states.shape


*Fine-Tuning Transformers*

The `AutoModelForSequenceClassification` model has a classification head on top of the pretrained model outputs.
* The first thing we need is a pretrained DistilBERT model like the one we used in the feature-based approach.
* The only slight modification is that we use the AutoModelForSequenceClassification model instead of AutoModel.
* The difference is that the AutoModelForSequenceClassification model has a classification head on top of the pretrained model outputs, which can be easily trained with the base model.

In [None]:
from transformers import AutoModelForSequenceClassification

# One output per class name collected from the dataset.
num_labels = len(classes)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint with a sequence-classification head and move it to
# the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
# Display the device that was selected above.
device

In [None]:
pip install --upgrade transformers

In [None]:
# Print the version of the transformers package that is actually imported.
import transformers
print(transformers.__version__)

In [None]:
from transformers import TrainingArguments

In [None]:
# Per-device batch size (shared by training and evaluation) and the output
# directory used for this run.
batch_size = 64
model_name = "distilbert-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy="epoch",  # evaluate once per epoch
    disable_tqdm=False,     # keep the progress bars visible
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose last axis is arg-maxed to get
            the predicted class id).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_ref = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_ref, y_hat),
        "f1": f1_score(y_ref, y_hat, average='weighted'),
    }


In [None]:
from transformers import Trainer

# The tokenizer is passed as `processing_class`: the older `tokenizer=`
# keyword is deprecated in recent transformers releases and scheduled for
# removal, and this notebook installs the latest release.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded['train'],
    eval_dataset=emotions_encoded['validation'],
    processing_class=tokenizer,
)

In [None]:
# Run the training loop configured on the Trainer above.
trainer.train()

In [None]:
# Run prediction on the tokenized test split and show the metrics that
# come back with the predictions.
preds_outputs = trainer.predict(emotions_encoded['test'])
preds_outputs.metrics

In [None]:
import numpy as np

# Reference labels of the test split.
y_true = emotions_encoded['test'][:]['label']

# Predicted class id = position of the largest score along the class axis
# (the last axis of the 2-D prediction array).
y_preds = np.argmax(preds_outputs.predictions, axis=-1)

In [None]:
from sklearn.metrics import classification_report

# Label every row of the report with its class name instead of the bare
# integer id (this replaces the separate `print(classes)` lookup table).
# `labels` pins the id -> name alignment, so the report stays valid even if
# one class is missing from both `y_true` and `y_preds`.
print(
    classification_report(
        y_true,
        y_preds,
        labels=list(range(len(classes))),
        target_names=classes,
    )
)

In [None]:
# Display the per-class counts computed during the dataset analysis.
label_counts

In [None]:
# Classify one custom sentence with the fine-tuned model.
text = 'i want to kill you'
input_encoded = tokenizer(text, return_tensors='pt').to(device)

# Put the model in evaluation mode explicitly so the prediction does not
# depend on whichever mode the Trainer happened to leave it in.
model.eval()
with torch.no_grad():
    outputs = model(**input_encoded)

# Highest-scoring logit -> class id -> class name.
logits = outputs.logits
pred = torch.argmax(logits, dim=1).item()
pred, classes[pred]