Sangyun Lee, Sreyas Dhulipala, Ye Yu, Yue

In [27]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from helper import Transformer_Dataset, preprocess
import numpy as np
import evaluate
import torch

Labels (Emotions)

In [2]:
# Emotion classes listed in class-id order: the position in the list is the
# integer label. The reverse mapping is derived so the two cannot drift apart.
id2label = dict(enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"]))
label2id = {emotion: class_id for class_id, emotion in id2label.items()}


## Load Model

Loading Distilbert Model

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained DistilBERT encoder with a 6-way classification head; the label
# maps are passed in so the model config carries the emotion names.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=6,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading Tokenizer for Distilbert Model

In [4]:
# Tokenizer for the same DistilBERT checkpoint the model was loaded from.
checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Collator that pads every batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Arguments for training/fine-tuning the model

In [5]:
training_args = TrainingArguments(
    # Directory that receives the checkpoints.
    output_dir="output_model",
    # Optimisation schedule: 3 epochs, with the learning rate warmed up over
    # the first 500 steps, plus weight decay.
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device for training and for evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


## Preparing the Dataset

In [6]:
# Training split: read the CSV, run the helper preprocessing on the text column
# (per the helper's description: strips URLs from the tweets and tokenizes the
# sentences), then pair the encoded inputs with their labels.
df = pd.read_csv("Datasets/training.csv")
training_input = preprocess(
    list(df["text"]),
    tokenizer=tokenizer,
)
training_dataset = Transformer_Dataset(
    input_data=training_input,
    output_labels=list(df["label"]),
    device=device,
)


In [7]:
# Sanity check: number of examples in the training dataset.
len(training_dataset)

16000

In [8]:
# Validation split, evaluated at the end of every epoch. It goes through the
# same helper preprocessing as the training split.
df2 = pd.read_csv("Datasets/validation.csv")
eval_input = preprocess(
    list(df2["text"]),
    tokenizer=tokenizer,
)
eval_dataset = Transformer_Dataset(
    input_data=eval_input,
    output_labels=list(df2["label"]),
    device=device,
)


## Trainer

Function for evaluating during training

In [9]:
# Accuracy metric object from the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn the model's evaluation output into an accuracy score.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds the
            per-class scores and is reduced with an argmax along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


Initialize Trainer

In [10]:
# Wire the model, data, and metric function into a single Trainer.
trainer = Trainer(
    model=model,                      # model to train / fine-tune
    args=training_args,               # training arguments defined earlier
    train_dataset=training_dataset,   # dataset used for training
    eval_dataset=eval_dataset,        # dataset used for per-epoch evaluation
    tokenizer=tokenizer,              # tokenizer
    data_collator=data_collator,      # pads each batch to its longest sequence
    compute_metrics=compute_metrics,  # accuracy computed at each evaluation
)


Training/Fine tuning

In [11]:
# Run fine-tuning; with the per-epoch strategies set in training_args, the
# trainer evaluates and saves a checkpoint at the end of each epoch.
trainer.train()


 17%|█▋        | 500/3000 [18:56<1:37:25,  2.34s/it]

{'loss': 1.1686, 'grad_norm': 9.327946662902832, 'learning_rate': 2e-05, 'epoch': 0.5}


 33%|███▎      | 1000/3000 [38:23<1:20:31,  2.42s/it]

{'loss': 0.3097, 'grad_norm': 11.198054313659668, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 1000/3000 [40:04<1:20:31,  2.42s/it]

{'eval_loss': 0.22844649851322174, 'eval_accuracy': 0.9255, 'eval_runtime': 100.9474, 'eval_samples_per_second': 19.812, 'eval_steps_per_second': 1.238, 'epoch': 1.0}


 50%|█████     | 1500/3000 [1:00:34<1:00:23,  2.42s/it]

{'loss': 0.1704, 'grad_norm': 0.17488795518875122, 'learning_rate': 1.2e-05, 'epoch': 1.5}


 67%|██████▋   | 2000/3000 [1:20:48<40:53,  2.45s/it]  

{'loss': 0.1573, 'grad_norm': 5.540460586547852, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.0}


                                                     
 67%|██████▋   | 2000/3000 [1:22:27<40:53,  2.45s/it]

{'eval_loss': 0.16139106452465057, 'eval_accuracy': 0.9375, 'eval_runtime': 98.7164, 'eval_samples_per_second': 20.26, 'eval_steps_per_second': 1.266, 'epoch': 2.0}


 83%|████████▎ | 2500/3000 [1:42:33<19:21,  2.32s/it]  

{'loss': 0.1164, 'grad_norm': 0.061852917075157166, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.5}


100%|██████████| 3000/3000 [2:02:05<00:00,  2.30s/it]

{'loss': 0.0969, 'grad_norm': 0.8338232040405273, 'learning_rate': 0.0, 'epoch': 3.0}


                                                     
100%|██████████| 3000/3000 [2:03:39<00:00,  2.30s/it]

{'eval_loss': 0.15153935551643372, 'eval_accuracy': 0.938, 'eval_runtime': 93.5121, 'eval_samples_per_second': 21.388, 'eval_steps_per_second': 1.337, 'epoch': 3.0}


100%|██████████| 3000/3000 [2:03:40<00:00,  2.47s/it]

{'train_runtime': 7420.6844, 'train_samples_per_second': 6.468, 'train_steps_per_second': 0.404, 'train_loss': 0.33654150772094726, 'epoch': 3.0}





TrainOutput(global_step=3000, training_loss=0.33654150772094726, metrics={'train_runtime': 7420.6844, 'train_samples_per_second': 6.468, 'train_steps_per_second': 0.404, 'train_loss': 0.33654150772094726, 'epoch': 3.0})

Save the Fine-tuned model Locally

In [12]:
# Write the fine-tuned model and its tokenizer into the same directory.
model_path = "./my_fine_tuned_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('./my_fine_tuned_model_final\\tokenizer_config.json',
 './my_fine_tuned_model_final\\special_tokens_map.json',
 './my_fine_tuned_model_final\\vocab.txt',
 './my_fine_tuned_model_final\\added_tokens.json',
 './my_fine_tuned_model_final\\tokenizer.json')

Testing Trained Model

In [41]:
# Held-out test split, preprocessed like the other splits.
# NOTE: this rebinds `training_input` / `training_dataset` to the TEST data;
# the testing cells below read the test examples through `training_dataset`.
df_test = pd.read_csv("Datasets/test.csv")
training_input = preprocess(
    list(df_test["text"]),
    tokenizer=tokenizer,
)
training_dataset = Transformer_Dataset(
    input_data=training_input,
    output_labels=list(df_test["label"]),
    device=device,
)


Testing single examples

In [45]:
def single_data_test(index):
    """Classify one test example and print it next to its true label.

    Args:
        index: Row position of the example in ``df_test``; the same index is
            used to fetch the encoded example from ``training_dataset`` (which
            holds the test split at this point in the notebook).

    Returns:
        Tensor holding the predicted class id (argmax over the logits).
    """
    single_text = df_test["text"][index]
    real_label = df_test["label"][index]
    print("input_text: ", single_text)
    print("output_label: ", real_label)

    # Send the inputs to whatever device the model lives on, so the function
    # also works on CPU-only machines instead of assuming CUDA.
    target_device = model.device
    data = {key: value.to(target_device) for key, value in training_dataset[index].items()}

    model.eval()           # inference mode: dropout off, predictions deterministic
    with torch.no_grad():  # no gradients are needed for a prediction
        output = model(**data)
    label_ = output["logits"].argmax(-1)
    print("predicted_label: ", label_)
    return label_

In [46]:
# Spot-check the test example at row 10.
index = 10
single_data_test(index)


input_text:  i don t feel particularly agitated
output_label:  4
predicted_label:  tensor([3], device='cuda:0')


tensor([3], device='cuda:0')

In [47]:
# Spot-check the test example at row 100.
index = 100
single_data_test(index)

input_text:  i feel needy but comfortable with it i feel vulnerable but secure i feel the urge to cum hard but i get no relief
output_label:  0
predicted_label:  tensor([0], device='cuda:0')


tensor([0], device='cuda:0')

In [35]:
# Spot-check the test example at row 1022.
index = 1022
single_data_test(index)

input_text:  i don t know if this helps at all but writing all of this has made me feel somewhat regretful of ashamed of who i was and while i have more to share i just don t think i can right now
output_label:  0
predicted_label:  

  data = {key: torch.tensor(value).to("cuda") for key, value in data.items()}


tensor([0], device='cuda:0')


tensor([0], device='cuda:0')

In [48]:
# Spot-check the test example at row 1245.
index = 1245
single_data_test(index)

input_text:  i feel the hearts decision to stop caring can it be reversed
output_label:  2
predicted_label:  tensor([2], device='cuda:0')


tensor([2], device='cuda:0')

Whole Dataset Testing

In [51]:
# Accuracy over the whole test split (`training_dataset` holds the test data
# at this point in the notebook).
real_label = np.array(df_test["label"])
# One predicted class id per test row, sized from the data rather than hard-coded.
outputs = np.zeros(len(real_label))

# Run on whatever device the model lives on instead of assuming CUDA.
eval_device = model.device
model.eval()           # inference mode: dropout off
with torch.no_grad():  # no gradients are needed for predictions
    for i, data in enumerate(training_dataset):
        # The model only needs the encoded inputs; the label is left out and
        # compared afterwards against `real_label`.
        data = {key: value.to(eval_device) for key, value in data.items() if key != "labels"}
        output = model(**data)
        # .item() converts the single-element prediction tensor to a Python number.
        outputs[i] = output["logits"].argmax(-1).item()

total_right = (outputs == real_label).sum()
acc = total_right / len(real_label)
print("accuracy: ", acc)

accuracy:  0.9315
