In [1]:
import torch 
from datasets import load_dataset
from transformers import AutoTokenizer
import json 
import itertools

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_all_poss_utt_pairs(utt_list):
    """
    Create a list with all possible utterance combinations
      for each conversation

    Returns:
        List[List[int]]

    Example:
        For a conversation that is made by 3 utterances we
        will have 9 utterance combinations.
        [(1,1),(1,2),(1,3),(2,1),(2,2),(2,3),(3,1),(3,2),(3,3)]
    """
    
    ut_id_comb = list(itertools.combinations(range(1,len(utt_list)+1),2))
    ut_id_comb_reverse = [combo[::-1] for combo in ut_id_comb]
    ut_id_comb_same = [(x,x) for x in range(1,len(utt_list)+1)]

    return ut_id_comb + ut_id_comb_reverse + ut_id_comb_same

#### Load data and count nr of labels for each class 

In [3]:
json_path = "full_dataset.json"

with open(json_path, 'r') as file:
    dataset = json.load(file)

all_data = {
    "sen_a" : [],
    "sen_b" : [],
    "label" : []

}

max_distance = 2
n_0 = 0
n_1 = 0

for val in dataset:
    all_pairs = get_all_poss_utt_pairs(
        val["conversation"]
    )

    emot_cause_pair = [
        (int(pair[1][0]),int(pair[0][0]))
        for pair in val["emotion-cause_pairs"]
        ]

    for p in all_pairs:
        if p in emot_cause_pair:
            all_data["label"].append(1)
            n_1 += 1
            all_data["sen_a"].append(val["conversation"][p[0]-1]["text"])
            all_data["sen_b"].append(val["conversation"][p[1]-1]["text"])
        elif abs(p[0]-p[1]) < max_distance:
            all_data["label"].append(0)
            all_data["sen_a"].append(val["conversation"][p[0]-1]["text"])
            all_data["sen_b"].append(val["conversation"][p[1]-1]["text"])
            n_0 += 1

print("Number of positive samples {}".format(n_1))
print("Number of negative samples {}".format(n_0))


Number of positive samples 7040
Number of negative samples 32375


We can see that the data is way too unbalanced for train so we will use max distance of 2 between utterances from a conversation with label 0 that will be selected for train.

In [4]:
from datasets import Dataset

created_dataset = Dataset.from_dict(all_data)
created_dataset

Dataset({
    features: ['sen_a', 'sen_b', 'label'],
    num_rows: 39415
})

#### Train args

In [5]:
model_name = "bert-base-uncased"
num_labels = 2
batch_size = 16
results = "results/{}".format(model_name+"_task_1")
lr = 2e-5
num_epochs = 3 # [8,5,2]

In [6]:
model_name = "bert-large-uncased"
num_labels = 2
batch_size = 8
results = "results/{}".format(model_name+"_task_1")
lr = 2e-5
num_epochs = 3 # [8,5,2]

#### Tokenize data

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["input"],
        padding=True,
        truncation=True
        )

def process_dataset(example):
    # apply preprocessing and tokenization here ??
    return {
        "input": example["sen_a"]+ tokenizer.sep_token + example["sen_b"]
    }

created_dataset = created_dataset.map(
    process_dataset
    )
created_dataset_encoded = created_dataset.map(
    tokenize,
    batched=True,
    batch_size=None)

created_dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

created_dataset_encoded

Map: 100%|██████████| 39415/39415 [00:01<00:00, 21764.89 examples/s]
Map: 100%|██████████| 39415/39415 [00:04<00:00, 8698.28 examples/s]


Dataset({
    features: ['sen_a', 'sen_b', 'label', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 39415
})

#### Split train test 

In [8]:
train_test_data = created_dataset_encoded.train_test_split(
    test_size=0.2,
    shuffle = False,
    seed = 1337
    )

In [9]:
train_test_data

DatasetDict({
    train: Dataset({
        features: ['sen_a', 'sen_b', 'label', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 31532
    })
    test: Dataset({
        features: ['sen_a', 'sen_b', 'label', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7883
    })
})

#### Train env 

In [10]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
    ).to(device)


logging_steps = len(train_test_data["train"]) // batch_size

training_args = TrainingArguments(output_dir=results,
                                  num_train_epochs=num_epochs,
                                  learning_rate=lr,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False)

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_test_data["train"],
                  eval_dataset=train_test_data["test"])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* model_name = "bert-base-uncased"
* num_labels = 2
* batch_size = 16
* results = "results/{}".format(model_name+"_task_1")
* lr = 2e-5
* num_epochs = 3 # [8,5,2]

In [12]:
trainer.train()

  8%|▊         | 500/5913 [02:27<26:46,  3.37it/s]

{'loss': 0.4499, 'learning_rate': 1.8308811094199225e-05, 'epoch': 0.25}


 17%|█▋        | 1000/5913 [04:56<24:24,  3.35it/s]

{'loss': 0.4191, 'learning_rate': 1.6617622188398446e-05, 'epoch': 0.51}


 25%|██▌       | 1500/5913 [07:25<21:55,  3.36it/s]

{'loss': 0.4109, 'learning_rate': 1.4926433282597667e-05, 'epoch': 0.76}


                                                   
 33%|███▎      | 1971/5913 [10:33<18:09,  3.62it/s]

{'eval_loss': 0.40926602482795715, 'eval_accuracy': 0.8231637701382722, 'eval_f1': 0.801014017560477, 'eval_runtime': 48.1306, 'eval_samples_per_second': 163.784, 'eval_steps_per_second': 10.243, 'epoch': 1.0}


 34%|███▍      | 2000/5913 [10:54<19:23,  3.36it/s]   

{'loss': 0.3938, 'learning_rate': 1.323524437679689e-05, 'epoch': 1.01}


 42%|████▏     | 2500/5913 [13:23<16:56,  3.36it/s]

{'loss': 0.3443, 'learning_rate': 1.1544055470996111e-05, 'epoch': 1.27}


 51%|█████     | 3000/5913 [15:52<14:28,  3.35it/s]

{'loss': 0.3363, 'learning_rate': 9.852866565195333e-06, 'epoch': 1.52}


 59%|█████▉    | 3500/5913 [18:22<11:58,  3.36it/s]

{'loss': 0.3332, 'learning_rate': 8.161677659394555e-06, 'epoch': 1.78}


                                                   
 67%|██████▋   | 3942/5913 [21:21<09:04,  3.62it/s]

{'eval_loss': 0.425961971282959, 'eval_accuracy': 0.8208803754915641, 'eval_f1': 0.7998168088283532, 'eval_runtime': 47.9172, 'eval_samples_per_second': 164.513, 'eval_steps_per_second': 10.289, 'epoch': 2.0}


 68%|██████▊   | 4000/5913 [21:51<09:29,  3.36it/s]   

{'loss': 0.3159, 'learning_rate': 6.470488753593776e-06, 'epoch': 2.03}


 76%|███████▌  | 4500/5913 [24:20<07:01,  3.36it/s]

{'loss': 0.242, 'learning_rate': 4.779299847792998e-06, 'epoch': 2.28}


 85%|████████▍ | 5000/5913 [26:49<04:31,  3.36it/s]

{'loss': 0.2534, 'learning_rate': 3.0881109419922206e-06, 'epoch': 2.54}


 93%|█████████▎| 5500/5913 [29:18<02:02,  3.36it/s]

{'loss': 0.2408, 'learning_rate': 1.3969220361914426e-06, 'epoch': 2.79}


                                                   
100%|██████████| 5913/5913 [32:09<00:00,  3.61it/s]

{'eval_loss': 0.5551738739013672, 'eval_accuracy': 0.8111125206139794, 'eval_f1': 0.7979120001730833, 'eval_runtime': 48.0787, 'eval_samples_per_second': 163.96, 'eval_steps_per_second': 10.254, 'epoch': 3.0}


100%|██████████| 5913/5913 [32:25<00:00,  3.04it/s]

{'train_runtime': 1945.5141, 'train_samples_per_second': 48.623, 'train_steps_per_second': 3.039, 'train_loss': 0.3331042889227787, 'epoch': 3.0}





TrainOutput(global_step=5913, training_loss=0.3331042889227787, metrics={'train_runtime': 1945.5141, 'train_samples_per_second': 48.623, 'train_steps_per_second': 3.039, 'train_loss': 0.3331042889227787, 'epoch': 3.0})

In [11]:
trainer.train()

  0%|          | 0/5913 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 46.00 MiB. GPU 0 has a total capacty of 7.79 GiB of which 24.12 MiB is free. Process 2041 has 89.99 MiB memory in use. Including non-PyTorch memory, this process has 7.66 GiB memory in use. Of the allocated memory 7.50 GiB is allocated by PyTorch, and 45.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Evaluate best model

In [14]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch 

model_name = "results/bert-base-uncased_task_1/checkpoint-1971"
num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
    ).to(device)


eval_loader = DataLoader(train_test_data["test"], batch_size=16)

model.eval()

true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in eval_loader:
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["label"]
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Store true and predicted labels
        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(torch.argmax(logits, axis=1).cpu().numpy())


from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(true_labels, pred_labels)
report = classification_report(true_labels, pred_labels)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.8231637701382722
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      6450
           1       0.53      0.28      0.37      1433

    accuracy                           0.82      7883
   macro avg       0.69      0.61      0.63      7883
weighted avg       0.80      0.82      0.80      7883

