In [1]:
pip install datasets



In [2]:
# Load the "emotion" dataset by name; the result is a DatasetDict keyed by split.
from datasets import load_dataset
emotion_dataset = load_dataset("emotion")
# Bare last expression: display the available splits, their columns and row counts.
emotion_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
# Peek at the first training example: a dict holding the raw text and its integer label.
emotion_dataset['train'][0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [4]:
# Second training example, for comparison with the first one.
emotion_dataset['train'][1]

{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'label': 0}

In [5]:
# Convert the training split to a pandas DataFrame for easier inspection.
emotion_df = emotion_dataset['train'].to_pandas()
# Show the first rows only rather than the whole frame.
emotion_df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [6]:
# Feature schema of the training split; the 'label' feature carries the class names.
features = emotion_dataset['train'].features
features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [7]:
# Translate integer label 0 into its class name.
features['label'].int2str(0)

'sadness'

In [8]:
# Map every integer class id to its readable name. The mapping is built from the
# ClassLabel's own list of names instead of a hard-coded class count, so it stays
# correct if the label set ever changes.
id2label = dict(enumerate(features['label'].names))
id2label

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [9]:
# Inverse lookup: class name -> integer id, derived from id2label.
label2id = dict(zip(id2label.values(), id2label.keys()))
label2id

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [10]:
# Relative frequency of each label id in the training split, ordered by id.
emotion_df['label'].value_counts(normalize=True).sort_index()

0    0.291625
1    0.335125
2    0.081500
3    0.134937
4    0.121063
5    0.035750
Name: label, dtype: float64

In [11]:
pip install transformers



In [12]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused when the classification model is loaded.
model_ckpt = "microsoft/MiniLM-L12-H384-uncased"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [13]:
# Smoke-test the tokenizer on the first training text. Rows are sliced before the
# column is selected so only one example is read, instead of materialising the whole
# 'text' column just to keep its first item.
tokenizer(emotion_dataset['train'][:1]['text'])

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

In [14]:
def tokenize_text(examples):
  """Tokenize the 'text' entry of `examples`, truncating to at most 512 tokens.

  Uses the notebook-level `tokenizer` and returns its encoding unchanged.
  """
  texts = examples['text']
  encoded = tokenizer(texts, truncation=True, max_length=512)
  return encoded


In [15]:
# Apply tokenize_text to every split in batches; the result gains the
# input_ids, token_type_ids and attention_mask columns next to text and label.
emotion_dataset = emotion_dataset.map(tokenize_text,batched = True)
emotion_dataset

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [16]:
# Weight each class by (1 - its share of the training rows), so rarer classes
# receive a larger weight in the loss.
label_counts = emotion_df['label'].value_counts().sort_index()
label_share = label_counts / len(emotion_df)
class_weights = (1 - label_share).values
class_weights

array([0.708375 , 0.664875 , 0.9185   , 0.8650625, 0.8789375, 0.96425  ])

In [17]:
import torch

# Use the GPU when one is available and fall back to the CPU otherwise, instead of
# failing outright on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.as_tensor accepts both the numpy array and an already-converted tensor, so this
# cell can be re-run safely (torch.from_numpy would reject the tensor on a second run).
class_weights = torch.as_tensor(class_weights, dtype=torch.float32, device=device)
class_weights

tensor([0.7084, 0.6649, 0.9185, 0.8651, 0.8789, 0.9643], device='cuda:0')

In [18]:
# The custom trainer reads the targets under the 'labels' key, so rename the column.
# The guard makes the cell re-runnable: renaming a column that no longer exists raises.
if 'label' in emotion_dataset['train'].column_names:
  emotion_dataset = emotion_dataset.rename_column('label','labels')

In [19]:
from torch import nn
import torch
from transformers import Trainer

In [20]:
class WeightedLossTrainer(Trainer):
  """Trainer that replaces the default loss with class-weighted cross-entropy.

  The per-class weights are read from the notebook-level `class_weights` tensor.
  """

  def compute_loss(self,model,inputs,return_outputs=False,num_items_in_batch=None):
    """Compute weighted cross-entropy between the model's logits and `inputs['labels']`.

    Args:
      model: Model to run; it is called with the batch entries as keyword arguments.
      inputs: Mapping with the batch tensors, including the 'labels' entry.
      return_outputs: When True, return `(loss, outputs)` instead of only the loss.
      num_items_in_batch: Accepted only for compatibility with newer `Trainer`
        releases that pass this keyword to `compute_loss`; it is not used here.

    Returns:
      The loss tensor, or a `(loss, outputs)` tuple when `return_outputs` is True.
    """
    labels = inputs.get('labels')
    # Leave `inputs` untouched but do not forward the labels: the weighted loss is
    # computed below, so the model does not need to compute an extra unweighted one.
    model_inputs = {name: value for name, value in inputs.items() if name != 'labels'}
    outputs = model(**model_inputs)
    logits = outputs.get('logits')
    # Keep the weights on the same device as the logits to avoid a device-mismatch error.
    loss_func = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
    loss = loss_func(logits,labels)
    return (loss,outputs) if return_outputs else loss



In [21]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the label mapping instead of a hard-coded 6 so the
# number of outputs can never drift away from id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
  """Return the weighted F1 score of a prediction object as `{"f1": value}`.

  `pred.predictions` is reduced with argmax over its last axis to get the predicted
  class ids, which are then scored against `pred.label_ids`.
  """
  predicted_ids = pred.predictions.argmax(-1)
  score = f1_score(pred.label_ids, predicted_ids, average='weighted')
  return {"f1": score}

In [23]:
!pip install -q transformers einops accelerate langchain bitsandbytes

In [24]:
pip install transformers[torch]




In [29]:
from transformers import TrainingArguments

batch_size = 64
# Number of full batches in the training split, so the training loss is logged
# about once per pass over the data.
logging_steps = len(emotion_dataset['train']) // batch_size
output_dir = "minilm-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    logging_steps=logging_steps,
    fp16=True,  # make it train fast
)
# Note: push_to_hub=True is intentionally left out of the arguments above.

In [30]:
# Wire the model, training arguments, data splits, tokenizer and metric function
# into the class-weighted trainer.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=emotion_dataset['train'],
    eval_dataset=emotion_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [31]:
# Fine-tune the model; validation loss and F1 are reported after each epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.3895,1.074062,0.579502
2,0.9192,0.747651,0.812544
3,0.657,0.534364,0.871002
4,0.4946,0.441812,0.89493
5,0.4241,0.408983,0.902276


TrainOutput(global_step=1250, training_loss=0.7768841430664063, metrics={'train_runtime': 144.8863, 'train_samples_per_second': 552.157, 'train_steps_per_second': 8.627, 'total_flos': 582422632630272.0, 'train_loss': 0.7768841430664063, 'epoch': 5.0})

In [35]:
# Colab only: mount Google Drive so files can be written under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [37]:
# Save the fine-tuned model to a folder on the mounted Google Drive.
output_directory = '/content/drive/MyDrive/finetune'
trainer.save_model(output_directory)

In [39]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the trained model and tokenizer for inference.
# Both are read from the same saved folder, so the path is spelled out only once.
finetuned_dir = "/content/drive/MyDrive/finetune"
model = AutoModelForSequenceClassification.from_pretrained(finetuned_dir)
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)


In [44]:
# Tokenize a single sentence and request PyTorch tensors ("pt") for the model call.
input_text = "Hi, I'm Sean. Pleased to meet you, Mr. Turner."
inputs = tokenizer(input_text, return_tensors="pt")


In [45]:
import torch

# Perform inference
# Gradients are not needed for prediction, so skip building the autograd graph.
with torch.no_grad():
  outputs = model(**inputs)
predictions = outputs.logits.argmax(1)

# Decode the predicted label
predicted_label_id = predictions.item()
# Use the mapping stored in the loaded model's config, so decoding always matches the
# model actually being used and does not rely on the notebook-level id2label.
predicted_label = model.config.id2label[predicted_label_id]

print("Predicted label:", predicted_label)


Predicted label: joy
