In [1]:
import torch 
from datasets import load_dataset
from transformers import AutoTokenizer
import json
import csv
from datasets import Dataset
from datasets import DatasetDict

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


### preprocess data

In [2]:
def load_data_for_task_0(
        json_path,
        save_path,
        test=False
        ):

    def process_dataset(example):
        class_mapping = {
            "anger":0,
            "disgust":1,
            "fear":2,
            "joy":3,
            "sadness":4,
            "surprise":5,
            "neutral":6
            }

        return {
            "text":example["text"],
            "emotion":example["emotion"],
            "label":class_mapping[example["emotion"]]
        }

    with open(json_path, 'r') as file:
        dataset = json.load(file)

    all_test_data = []

    for val in dataset["conversation"].values():
        all_test_data.extend([(v["text"],v["emotion"]) for v in val])

    is_test = "test" if test else "train"

    with open('{}/task_0_{}.csv'.format(save_path, is_test), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["text","emotion"])
        writer.writerows(all_test_data)

    raw_dataset = Dataset.from_csv('{}/task_0_{}.csv'.format(save_path, is_test), delimiter=",")

    return raw_dataset.map(process_dataset)
    

#### Load test and train data

In [3]:
test_data_task_0 = load_data_for_task_0(
    json_path = 'test_dataset.json',
    save_path="/media/sml0/RaresPatrascu/projects/Project2",
    test=True)

train_data_task_0 = load_data_for_task_0(
    json_path = 'train_dataset.json',
    save_path="/media/sml0/RaresPatrascu/projects/Project2",
    test=False)

Generating train split: 3373 examples [00:00, 256665.23 examples/s]
Map: 100%|██████████| 3373/3373 [00:00<00:00, 26969.65 examples/s]
Generating train split: 10246 examples [00:00, 241033.57 examples/s]
Map: 100%|██████████| 10246/10246 [00:00<00:00, 27759.62 examples/s]


### Create Dict class with both train and test data 

In [4]:
dataset_task0 = DatasetDict({
    'train': train_data_task_0,
    'test': test_data_task_0
})

dataset_task0

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'label'],
        num_rows: 10246
    })
    test: Dataset({
        features: ['text', 'emotion', 'label'],
        num_rows: 3373
    })
})

#### Train Setup 

##### Setup 1

In [5]:
model_name = "distilbert-base-uncased"
num_labels = 7
batch_size = 16
results = "results/{}".format(model_name)
lr = 2e-5
num_epochs = 2 # [8,5,2]

##### Setup 2

In [62]:
model_name = "bert-base-uncased"
num_labels = 7
batch_size = 16
results = "results/{}".format(model_name)
lr = 2e-5
num_epochs = 3 # [8,5,2]

##### Setup 3

In [29]:
model_name = "roberta-base"
num_labels = 7
batch_size = 16
results = "results/{}".format(model_name)
lr = 2e-5
num_epochs = 5 # [2,5]

##### Setup 4

In [33]:
model_name = "bert-base-multilingual-uncased"
num_labels = 7
batch_size = 16
results = "results/{}".format(model_name)
lr = 2e-5
num_epochs = 3 # [2,5]

##### Setup 5

In [36]:
model_name = "xlnet/xlnet-base-cased"
num_labels = 7
batch_size = 16
results = "results/{}".format(model_name)
lr = 2e-5
num_epochs = 3 # [2,5]

#### Tokenize data

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding=True,
        truncation=True
        )

task_0_data_encoded = dataset_task0.map(tokenize, batched=True, batch_size=None)
task_0_data_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
task_0_data_encoded

Map: 100%|██████████| 10246/10246 [00:00<00:00, 20143.25 examples/s]
Map: 100%|██████████| 3373/3373 [00:00<00:00, 39562.27 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10246
    })
    test: Dataset({
        features: ['text', 'emotion', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3373
    })
})

### Create method to extract certain metrics dict

In [41]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1_weighted": f1}

### Instantiate Model for pretraining

In [64]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
    ).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Set training environment

In [43]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [65]:
logging_steps = len(task_0_data_encoded["train"]) // batch_size

training_args = TrainingArguments(output_dir=results,
                                  num_train_epochs=num_epochs,
                                  learning_rate=lr,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1_weighted",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False)

trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=task_0_data_encoded["train"],
                  eval_dataset=task_0_data_encoded["test"])


### Setup 1 train 
* model_name = "distilbert-base-uncased"
* num_labels = 7
* batch_size = 16
* results = "results/{}".format(model_name)
* lr = 2e-5
* num_epochs = 2 # [8,5,2]

In [15]:
trainer.train()

 39%|███▉      | 501/1282 [00:42<01:06, 11.82it/s]

{'loss': 1.287, 'learning_rate': 1.2199687987519501e-05, 'epoch': 0.78}


                                                  
 50%|█████     | 641/1282 [00:57<00:50, 12.79it/s]

{'eval_loss': 1.1452741622924805, 'eval_accuracy': 0.6205158612511118, 'eval_f1': 0.5928145360130473, 'eval_runtime': 3.5332, 'eval_samples_per_second': 954.653, 'eval_steps_per_second': 59.719, 'epoch': 1.0}


 78%|███████▊  | 1002/1282 [01:35<00:23, 11.75it/s]

{'loss': 1.0571, 'learning_rate': 4.399375975039002e-06, 'epoch': 1.56}


                                                   
100%|██████████| 1282/1282 [02:02<00:00, 12.71it/s]

{'eval_loss': 1.1321868896484375, 'eval_accuracy': 0.6305959086866291, 'eval_f1': 0.601624597952231, 'eval_runtime': 3.5516, 'eval_samples_per_second': 949.701, 'eval_steps_per_second': 59.409, 'epoch': 2.0}


100%|██████████| 1282/1282 [02:13<00:00,  9.63it/s]

{'train_runtime': 133.1271, 'train_samples_per_second': 153.928, 'train_steps_per_second': 9.63, 'train_loss': 1.1455190483009945, 'epoch': 2.0}





TrainOutput(global_step=1282, training_loss=1.1455190483009945, metrics={'train_runtime': 133.1271, 'train_samples_per_second': 153.928, 'train_steps_per_second': 9.63, 'train_loss': 1.1455190483009945, 'epoch': 2.0})

### Setup 2 train 

* model_name = "bert-base-uncased"
* num_labels = 7
* batch_size = 16
* results = "results/{}".format(model_name)
* lr = 2e-5
* num_epochs = 2

In [66]:
trainer.train()

                                                  
 19%|█▉        | 371/1923 [04:13<04:22,  5.91it/s]

{'loss': 1.2581, 'learning_rate': 1.4799791991679668e-05, 'epoch': 0.78}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                  
[A                                               
 19%|█▉        | 371/1923 [04:44<04:22,  5.91it/s]
[A

{'eval_loss': 1.1263704299926758, 'eval_accuracy': 0.6095463978654018, 'eval_f1_weighted': 0.586506661768034, 'eval_runtime': 7.2059, 'eval_samples_per_second': 468.087, 'eval_steps_per_second': 29.281, 'epoch': 1.0}


                                                  
 19%|█▉        | 371/1923 [05:58<04:22,  5.91it/s] 

{'loss': 0.9777, 'learning_rate': 9.599583983359335e-06, 'epoch': 1.56}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 19%|█▉        | 371/1923 [06:53<04:22,  5.91it/s]
[A

{'eval_loss': 1.1270931959152222, 'eval_accuracy': 0.6335606285206048, 'eval_f1_weighted': 0.6097289428985562, 'eval_runtime': 7.2058, 'eval_samples_per_second': 468.094, 'eval_steps_per_second': 29.282, 'epoch': 2.0}


                                                  
 19%|█▉        | 371/1923 [07:42<04:22,  5.91it/s] 

{'loss': 0.8851, 'learning_rate': 4.399375975039002e-06, 'epoch': 2.34}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 19%|█▉        | 371/1923 [09:02<04:22,  5.91it/s]
[A

{'eval_loss': 1.168025255203247, 'eval_accuracy': 0.6222946931514972, 'eval_f1_weighted': 0.6076702744186467, 'eval_runtime': 7.2352, 'eval_samples_per_second': 466.192, 'eval_steps_per_second': 29.163, 'epoch': 3.0}


                                                  
100%|██████████| 1923/1923 [06:29<00:00,  4.94it/s]

{'train_runtime': 389.4732, 'train_samples_per_second': 78.922, 'train_steps_per_second': 4.937, 'train_loss': 0.9831831005173801, 'epoch': 3.0}





TrainOutput(global_step=1923, training_loss=0.9831831005173801, metrics={'train_runtime': 389.4732, 'train_samples_per_second': 78.922, 'train_steps_per_second': 4.937, 'train_loss': 0.9831831005173801, 'epoch': 3.0})

### Setup 3 train 

* model_name = "roberta-base"
* num_labels = 7
* batch_size = 16
* results = "results/{}".format(model_name)
* lr = 2e-5
* num_epochs = 5 # [2,5]

In [45]:
trainer.train()

  0%|          | 0/3205 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 16%|█▌        | 501/3205 [01:24<07:37,  5.91it/s]

{'loss': 1.2872, 'learning_rate': 1.6879875195007804e-05, 'epoch': 0.78}


 20%|█▉        | 640/3205 [01:47<07:13,  5.92it/s]
 20%|██        | 641/3205 [01:54<07:12,  5.92it/s]

{'eval_loss': 1.1466127634048462, 'eval_accuracy': 0.614882893566558, 'eval_f1': 0.5895575401927462, 'eval_runtime': 6.8337, 'eval_samples_per_second': 493.584, 'eval_steps_per_second': 30.876, 'epoch': 1.0}


 31%|███       | 1001/3205 [03:09<06:16,  5.85it/s] 

{'loss': 1.0492, 'learning_rate': 1.3759750390015602e-05, 'epoch': 1.56}


 40%|███▉      | 1281/3205 [03:56<05:25,  5.92it/s]
 40%|████      | 1282/3205 [04:03<05:24,  5.92it/s]

{'eval_loss': 1.1270663738250732, 'eval_accuracy': 0.6228876371182923, 'eval_f1': 0.6061412655820544, 'eval_runtime': 6.8621, 'eval_samples_per_second': 491.538, 'eval_steps_per_second': 30.748, 'epoch': 2.0}


 47%|████▋     | 1501/3205 [04:54<04:48,  5.90it/s]  

{'loss': 0.9618, 'learning_rate': 1.06396255850234e-05, 'epoch': 2.34}


 60%|█████▉    | 1922/3205 [06:06<03:37,  5.90it/s]
 60%|██████    | 1923/3205 [06:13<03:37,  5.90it/s]

{'eval_loss': 1.1755762100219727, 'eval_accuracy': 0.6208123332345094, 'eval_f1': 0.6076601256607577, 'eval_runtime': 6.8461, 'eval_samples_per_second': 492.688, 'eval_steps_per_second': 30.82, 'epoch': 3.0}


 62%|██████▏   | 2001/3205 [06:40<03:23,  5.90it/s]  

{'loss': 0.8411, 'learning_rate': 7.519500780031202e-06, 'epoch': 3.12}


 78%|███████▊  | 2501/3205 [08:05<02:00,  5.86it/s]

{'loss': 0.7137, 'learning_rate': 4.399375975039002e-06, 'epoch': 3.9}


 80%|███████▉  | 2563/3205 [08:15<01:48,  5.89it/s]
 80%|████████  | 2564/3205 [08:22<01:48,  5.89it/s]

{'eval_loss': 1.2405508756637573, 'eval_accuracy': 0.6181440853839312, 'eval_f1': 0.6059779485302749, 'eval_runtime': 6.9013, 'eval_samples_per_second': 488.747, 'eval_steps_per_second': 30.574, 'epoch': 4.0}


 94%|█████████▎| 3001/3205 [09:50<00:34,  5.87it/s]

{'loss': 0.612, 'learning_rate': 1.2792511700468018e-06, 'epoch': 4.68}


100%|█████████▉| 3204/3205 [10:25<00:00,  5.84it/s]
100%|██████████| 3205/3205 [10:32<00:00,  5.84it/s]

{'eval_loss': 1.2853273153305054, 'eval_accuracy': 0.6154758375333531, 'eval_f1': 0.6069171669696962, 'eval_runtime': 6.9024, 'eval_samples_per_second': 488.672, 'eval_steps_per_second': 30.569, 'epoch': 5.0}


100%|██████████| 3205/3205 [10:50<00:00,  4.93it/s]

{'train_runtime': 650.7125, 'train_samples_per_second': 78.729, 'train_steps_per_second': 4.925, 'train_loss': 0.8916479411251645, 'epoch': 5.0}





TrainOutput(global_step=3205, training_loss=0.8916479411251645, metrics={'train_runtime': 650.7125, 'train_samples_per_second': 78.729, 'train_steps_per_second': 4.925, 'train_loss': 0.8916479411251645, 'epoch': 5.0})

### Setup 4 train

* model_name = "bert-base-multilingual-uncased"
* num_labels = 7
* batch_size = 16
* results = "results/{}".format(model_name)
* lr = 2e-5
* num_epochs = 3 # [2,5]

In [51]:
trainer.train()

  0%|          | 0/1923 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 26%|██▌       | 501/1923 [01:29<04:16,  5.54it/s]

{'loss': 1.3289, 'learning_rate': 1.4799791991679668e-05, 'epoch': 0.78}


 33%|███▎      | 640/1923 [01:54<03:49,  5.58it/s]
 33%|███▎      | 641/1923 [02:01<03:49,  5.58it/s]

{'eval_loss': 1.1996480226516724, 'eval_accuracy': 0.597687518529499, 'eval_f1': 0.5629430787891094, 'eval_runtime': 6.9969, 'eval_samples_per_second': 482.07, 'eval_steps_per_second': 30.156, 'epoch': 1.0}


 52%|█████▏    | 1001/1923 [03:29<02:46,  5.55it/s] 

{'loss': 1.1063, 'learning_rate': 9.599583983359335e-06, 'epoch': 1.56}


 67%|██████▋   | 1281/1923 [04:20<01:55,  5.57it/s]
 67%|██████▋   | 1282/1923 [04:27<01:55,  5.57it/s]

{'eval_loss': 1.175890326499939, 'eval_accuracy': 0.6122146457159798, 'eval_f1': 0.5848778544402612, 'eval_runtime': 7.0298, 'eval_samples_per_second': 479.815, 'eval_steps_per_second': 30.015, 'epoch': 2.0}


 78%|███████▊  | 1501/1923 [05:29<01:16,  5.55it/s]  

{'loss': 1.0236, 'learning_rate': 4.399375975039002e-06, 'epoch': 2.34}


100%|█████████▉| 1922/1923 [06:45<00:00,  5.55it/s]
100%|██████████| 1923/1923 [06:52<00:00,  5.55it/s]

{'eval_loss': 1.2047953605651855, 'eval_accuracy': 0.6015416543136674, 'eval_f1': 0.5809296717149575, 'eval_runtime': 7.0141, 'eval_samples_per_second': 480.889, 'eval_steps_per_second': 30.082, 'epoch': 3.0}


100%|██████████| 1923/1923 [07:21<00:00,  4.35it/s]

{'train_runtime': 441.5898, 'train_samples_per_second': 69.608, 'train_steps_per_second': 4.355, 'train_loss': 1.1028149902851583, 'epoch': 3.0}





TrainOutput(global_step=1923, training_loss=1.1028149902851583, metrics={'train_runtime': 441.5898, 'train_samples_per_second': 69.608, 'train_steps_per_second': 4.355, 'train_loss': 1.1028149902851583, 'epoch': 3.0})

##### Setup 5 train 

* model_name = "xlnet/xlnet-base-cased"
* num_labels = 7
* batch_size = 16
* results = "results/{}".format(model_name)
* lr = 2e-5
* num_epochs = 3 # [2,5]

In [58]:
trainer.train()

  0%|          | 0/1923 [00:00<?, ?it/s]You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 26%|██▌       | 500/1923 [01:50<05:16,  4.50it/s]

{'loss': 1.354, 'learning_rate': 1.4799791991679668e-05, 'epoch': 0.78}


 33%|███▎      | 641/1923 [02:22<04:00,  5.32it/s]
 33%|███▎      | 641/1923 [02:31<04:00,  5.32it/s]

{'eval_loss': 1.198628544807434, 'eval_accuracy': 0.6089534538986066, 'eval_f1': 0.5794785447548367, 'eval_runtime': 9.8245, 'eval_samples_per_second': 343.326, 'eval_steps_per_second': 21.477, 'epoch': 1.0}


 52%|█████▏    | 1001/1923 [04:05<03:25,  4.50it/s] 

{'loss': 1.1263, 'learning_rate': 9.599583983359335e-06, 'epoch': 1.56}


 67%|██████▋   | 1282/1923 [05:07<01:59,  5.34it/s]
 67%|██████▋   | 1282/1923 [05:17<01:59,  5.34it/s]

{'eval_loss': 1.1720722913742065, 'eval_accuracy': 0.6119181737325823, 'eval_f1': 0.589517715230408, 'eval_runtime': 9.7961, 'eval_samples_per_second': 344.319, 'eval_steps_per_second': 21.539, 'epoch': 2.0}


 78%|███████▊  | 1501/1923 [06:18<01:34,  4.48it/s]  

{'loss': 1.049, 'learning_rate': 4.399375975039002e-06, 'epoch': 2.34}


100%|██████████| 1923/1923 [07:52<00:00,  5.36it/s]
100%|██████████| 1923/1923 [08:02<00:00,  5.36it/s]

{'eval_loss': 1.1730469465255737, 'eval_accuracy': 0.6101393418321969, 'eval_f1': 0.5898905375341711, 'eval_runtime': 9.8638, 'eval_samples_per_second': 341.957, 'eval_steps_per_second': 21.391, 'epoch': 3.0}


100%|██████████| 1923/1923 [08:19<00:00,  3.85it/s]

{'train_runtime': 499.5051, 'train_samples_per_second': 61.537, 'train_steps_per_second': 3.85, 'train_loss': 1.1300495244907454, 'epoch': 3.0}





TrainOutput(global_step=1923, training_loss=1.1300495244907454, metrics={'train_runtime': 499.5051, 'train_samples_per_second': 61.537, 'train_steps_per_second': 3.85, 'train_loss': 1.1300495244907454, 'epoch': 3.0})

### Evaluate model 

In [55]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch 

model_name = "results/bert-base-uncased_sim_inp/checkpoint-1282"
num_labels = 7

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
    ).to(device)


eval_loader = DataLoader(task_0_data_encoded["test"], batch_size=16)

model.eval()

true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in eval_loader:
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Store true and predicted labels
        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(torch.argmax(logits, axis=1).cpu().numpy())


from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(true_labels, pred_labels)
report = classification_report(true_labels, pred_labels)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.6228876371182923
              precision    recall  f1-score   support

           0       0.41      0.56      0.47       374
           1       0.50      0.09      0.15       111
           2       0.35      0.07      0.11       106
           3       0.61      0.58      0.59       594
           4       0.41      0.33      0.37       276
           5       0.69      0.63      0.65       447
           6       0.71      0.79      0.75      1465

    accuracy                           0.62      3373
   macro avg       0.53      0.43      0.44      3373
weighted avg       0.61      0.62      0.61      3373

