In [52]:
!pip install datasets
!pip install datasets transformers accelerate



In [53]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification,DataCollatorWithPadding,TrainingArguments,Trainer
import torch
from peft import LoraConfig,get_peft_model
import numpy as np

In [54]:
dataset=load_dataset('emotion')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [55]:
model_name='distilbert-base-uncased'
tokenizer=AutoTokenizer.from_pretrained(model_name,add_prefix_space=True)

In [56]:
id2label={0:'sadness',1:'joy',2:'love',3:'anger',4:'fear',5:'surprise'}
label2id={'sadness':0,'joy':1,'love':2,'anger':3,'fear':4,'surprise':5}
model=AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=6,id2label=id2label,label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
def tokenize_text(dataset):
  text=dataset['text']

  tokenizer.truncation_side='left'
  tokenized_input=tokenizer(text,return_tensors='np',padding='max_length',truncation=True,max_length=512)
  return tokenized_input

  if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token':['PAD']})
    model.resize_token_embeddings(len(tokenizer))

tokenize_dataset=dataset.map(tokenize_text,batched=True)
tokenize_dataset

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [58]:
tokenize_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [59]:
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

In [60]:
print('Untrained Result')
text_list=['i didnt feel humiliated','i have been with petronas for years i feel that petronas has performed well and made a huge profit','i become overwhelmed and feel defeated','i remember feeling acutely distressed for a few days','i have seen heard and read over the past couple of days i am left feeling impressed by more than a few companies']

for text in text_list:
  input_tokenized=tokenizer.encode(text,return_tensors='pt')
  logits=model(input_tokenized).logits
  prediction=torch.argmax(logits)
  print(text,' - ',id2label[prediction.tolist()])

Untrained Result
i didnt feel humiliated  -  surprise
i have been with petronas for years i feel that petronas has performed well and made a huge profit  -  surprise
i become overwhelmed and feel defeated  -  surprise
i remember feeling acutely distressed for a few days  -  surprise
i have seen heard and read over the past couple of days i am left feeling impressed by more than a few companies  -  surprise


In [61]:
lora=LoraConfig(
    task_type='SEQ_CLS',r=4,lora_alpha=16,lora_dropout=0.1,target_modules=['q_lin']
)

In [62]:
model=get_peft_model(model,lora)
model.print_trainable_parameters()

trainable params: 632,070 || all params: 67,590,156 || trainable%: 0.9352


In [63]:
training_args=TrainingArguments(
    output_dir="-lora-text-classification",
    learning_rate=0.001,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.1,
    evaluation_strategy='epoch',
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [64]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
  logits=eval_pred.predictions
  labels=eval_pred.label_ids
  prediction=np.argmax(logits,axis=1)
  accuracy=accuracy_score(labels,prediction)
  return {"Accuracy":accuracy}

In [66]:
trainer=Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenize_dataset['train'],
    eval_dataset=tokenize_dataset['validation']
)

  trainer=Trainer(


In [67]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.456,0.348029,0.8995
2,0.3208,0.328636,0.919
3,0.2625,0.274033,0.926
4,0.2238,0.233596,0.929
5,0.1528,0.212065,0.93


TrainOutput(global_step=20000, training_loss=0.3327621963500977, metrics={'train_runtime': 3124.3633, 'train_samples_per_second': 25.605, 'train_steps_per_second': 6.401, 'total_flos': 1.075348537344e+16, 'train_loss': 0.3327621963500977, 'epoch': 5.0})

In [87]:
text='today was sucks'
input_encoded=tokenizer.encode(text,return_tensors='pt')

if torch.cuda.is_available():
  input_encoded=input_encoded.to('cuda')

logits=model(input_encoded).logits
prediction=torch.argmax(logits).item()
print(text,' - ',id2label[prediction])

today was sucks  -  sadness
