# Notebook: Train Model

## Packages

In [1]:
from ACD import aspect_category_labels_to_one_hot, CustomDatasetACD, preprocess_data_ACD, create_model_ACD, compute_metrics_ACD, get_trainer_ACD
from OTE import create_model_OTE, get_preprocessed_data_OTE, compute_metrics_OTE, get_trainer_OTE, divide
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, precision_score, recall_score
from sklearn.metrics import multilabel_confusion_matrix
from torch.utils.data import Dataset as TorchDataset
from helper import format_seconds_to_time_string
from transformers import DataCollatorWithPadding
from transformers import set_seed
from scipy.special import expit
import pandas as pd
import numpy as np
import datetime
import warnings
import random
import shutil
import torch
import json
import time

In [2]:
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.optimization")
torch.device("mps")

device(type='mps')

## Parameters

In [3]:
## Delete Later:
TEST_FOLDS = 1

In [4]:
LLM_NAME = "Llama13B"
N_REAL = 500
N_SYNTH = 0
TARGET = "aspect_term" # "aspect_term", "aspect_category"
LLM_SAMPLING = "fixed"

## Settings (do not change!)

In [5]:
N_FOLDS = 5

In [6]:
SPLIT_LOOP = [0, 1, 2, 3, 4, 0, 1, 2, 3]

In [7]:
RANDOM_SEED = 43
random.seed(RANDOM_SEED)

In [8]:
ASPECT_CATEGORIES  = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

In [9]:
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)

In [10]:
set_seed(RANDOM_SEED)

## Code

### Load Dataset

In [11]:
# Load Real Dataset
splits_real = []
for i in range(N_FOLDS):
    with open(f'../03 dataset split/real/real_{i}.json', 'r') as json_datei:
        real_split = json.load(json_datei)[:N_REAL]
        splits_real.append(real_split)

In [12]:
# Load Synth Dataset
splits_synth = []
for i in range(N_FOLDS):
    with open(f'../04 llm synthesis/synth/{LLM_NAME}/{LLM_SAMPLING}/split_{i}.json', 'r') as json_datei:
        synth_split = json.load(json_datei)[:N_SYNTH]
        splits_synth.append(synth_split)

In [13]:
n_splits_map = {
    500: 1,
    1000: 2,
    2000: 4
}
n_splits_required_real = n_splits_map.get(N_REAL, 0)
n_splits_required_synth = n_splits_map.get(N_SYNTH, 0)
n_splits_required_real, n_splits_required_synth

(1, 0)

In [14]:
# Five indexes, each for one cross valdiation run
train_dataset = []
test_dataset = []

for i in range(N_FOLDS):
    test_data = splits_real[i]
    train_data = []
    
    if N_REAL > 0:
        for split_idx in SPLIT_LOOP[i+1: i+1+n_splits_required_real]:
            for example in splits_real[split_idx]:
                train_data.append(example)
 
    if N_SYNTH > 0:
        for split_idx in SPLIT_LOOP[i+1: i+1+n_splits_required_synth]:
            for example in splits_synth[split_idx]:
                train_data.append(example)
                
    random.shuffle(train_data)
    
    train_dataset.append(train_data)
    test_dataset.append(test_data)

### ACD Model

In [15]:
def train_ACD_model():
    results = {
        "LLM_NAME": LLM_NAME,
        "N_REAL": N_REAL,
        "N_SYNTH": N_SYNTH,
        "TARGET": TARGET,
        "LLM_SAMPLING": LLM_SAMPLING,
    }

    f1_micro_scores = []
    f1_macro_scores = []
    f1_weighted_scores = []
    accuracy_scores = []
    class_f1_scores = []
    loss = []
    hamming = []

    start_time = time.time()

    tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-large")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    for cross_idx in range(N_FOLDS):
        # Load Data
        train_data = preprocess_data_ACD(train_dataset[cross_idx], tokenizer)
        test_data = preprocess_data_ACD(test_dataset[cross_idx], tokenizer)

        # Load Model
        model_ACD = create_model_ACD()

        # Train Model
        trainer = get_trainer_ACD(model_ACD, train_data, test_data, tokenizer)
        trainer.train()

        # Save Evaluation of Test Data
        eval_metrics = trainer.evaluate()

        # Save Metrics for fold
        f1_micro_scores.append(eval_metrics["eval_f1_micro"])
        f1_macro_scores.append(eval_metrics["eval_f1_macro"])
        f1_weighted_scores.append(eval_metrics["eval_f1_weighted"])
        accuracy_scores.append(eval_metrics["eval_accuracy"])
        class_f1_scores.append(eval_metrics["eval_class_f1_scores"])
        loss.append(eval_metrics["eval_loss"])
        hamming.append(eval_metrics["eval_hamming_loss"])

    runtime = time.time() - start_time

    results["loss"] = np.mean(loss)
    results["hamming"] = np.mean(hamming)
    results["accuracy"] = np.mean(accuracy_scores)
    results["f1_micro"] = np.mean(f1_micro_scores)
    results["f1_macro"] = np.mean(f1_macro_scores)
    results["f1_weighted"] = np.mean(f1_weighted_scores)
    results["runtime"] = runtime
    results["runtime_formatted"] = format_seconds_to_time_string(runtime)
    return results

if TARGET == "aspect_category":
   results = train_ACD_model()

### tf

In [16]:
MODEL_NAME = "deepset/gbert-base"
MAX_TOKENS = 256
RANDOM_SEED = 43
BATCH_SIZE = 16
N_EPOCHS = 1
LEARNING_RATE = 5e-6

label2id = {
    'O': 0,
    'B': 1,
    'I': 2,
}
id2label = {
    0: 'O',
    1: 'B',
    2: 'I',
}

n_labels = len(id2label)

In [17]:
# Source: https://github.com/huggingface/transformers/issues/17971
class TrainingArgumentsWithMPSSupport(TrainingArguments):

    @property
    def device(self) -> torch.device:
        if torch.cuda.is_available():
            return torch.device("cuda")
        elif torch.backends.mps.is_available():
            return torch.device("mps")
        else:
            return torch.device("cpu")

def get_trainer_OTE(train_data, test_data, tokenizer):
    training_args = TrainingArgumentsWithMPSSupport(
        output_dir="output2",
        evaluation_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=N_EPOCHS,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir="logs",
        logging_steps=100,
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=RANDOM_SEED,
    )

    compute_metrics_OTE = prepare_compute_metrics(test_data)

    trainer = Trainer(
        model_init=create_model_OTE,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_OTE
    )

    return trainer


### OTE Model

In [18]:
def one_hot_to_label(one_hot):
    return next(id2label[idx] for idx in range(len(one_hot)) if one_hot[idx] == 1)   
def find_bio_phrases(bio_list):
    phrases = []
    phrase_start = None

    for i in range(len(bio_list)):
        if bio_list[i] == 'B':
            if phrase_start is not None:
                phrase_end = i - 1
                phrases.append({"start": phrase_start, "end": phrase_end})
            phrase_start = i
        elif bio_list[i] == 'O':
            if phrase_start is not None:
                phrase_end = i - 1
                phrases.append({"start": phrase_start, "end": phrase_end})
                phrase_start = None

    if phrase_start is not None:
        phrases.append({"start": phrase_start, "end": len(bio_list) - 1})

    return phrases

def calculate_tp_tn_fp_fn_spans(pred, label):
    """
    Calculate true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN) based on the provided
    lists of predicted and actual label ranges.

    Args:
        pred (list of dict): A list containing dictionaries representing predicted ranges with 'start' and 'end' values.
        label (list of dict): A list containing dictionaries representing actual label ranges with 'start' and 'end' values.

    Returns:
        tuple: A tuple containing four values - TP (true positives), FP (false positives), and FN (false negatives).
    """
    # Convert ranges to string representations and create sets.
    pred_set = set(f"{range['start']}_{range['end']}" for range in pred)
    label_set = set(f"{range['start']}_{range['end']}" for range in label)

    # Calculate true positives by finding the intersection of the sets.
    tp_set = pred_set & label_set
    tp = len(tp_set)

    # Calculate false positives by subtracting the intersection from the predicted set.
    fp_set = pred_set - tp_set
    fp = len(fp_set)

    # Calculate false negatives by subtracting the intersection from the label set.
    fn_set = label_set - tp_set
    fn = len(fn_set)

    # Calculate true negatives by considering all possible pairs and subtracting TP, FP, and FN.
    total_possible_pairs = len(pred) * len(label)

    return tp, 0, fp, fn

In [19]:
def prepare_compute_metrics(test_data):
    
    def compute_metrics_OTE(p):
        aspect_categories = test_data.aspect_categories
        predictions, true_labels = p

        #np.save("metrics_data.npy", { "aspect_categories": aspect_categories, "predictions": predictions, "true_labels": true_labels})

        loaded_data = np.load("metrics_data.npy", allow_pickle=True)
        aspect_categories = loaded_data.item()["aspect_categories"]
        predictions = loaded_data.item()["predictions"]
        true_labels = loaded_data.item()["true_labels"]

        metrics = {}

        for ac in ASPECT_CATEGORIES:
            examples_predictions = np.array([predictions[i] for i in range(len(predictions)) if aspect_categories[i] == ac])
            examples_labels = np.array([true_labels[i] for i in range(len(true_labels)) if aspect_categories[i] == ac])

            examples_predictions = (examples_predictions == examples_predictions.max(axis=2)[:,:,np.newaxis]).astype(int)

            examples_predictions = find_bio_phrases([one_hot_to_label(p) for p in examples_predictions[i]])
            examples_labels = find_bio_phrases([one_hot_to_label(p) for p in examples_labels[i]])
            
            tp, tn, fp, fn = calculate_tp_tn_fp_fn_spans(examples_predictions, examples_labels)
            precision = divide(tp, tp + fp)
            recall = divide(tp, tp + fn)
            f1 = divide(2 * precision * recall, precision + recall)
            metrics[f"f1_{ac}"] = f1
    
        metrics["f1_micro"] = sum(metrics[key] for key in [f"f1_{ac}" for ac in ASPECT_CATEGORIES]) / (len(ASPECT_CATEGORIES) - 1)

        return metrics

    return compute_metrics_OTE

In [20]:
def train_OTE_model():
    model_name = "deepset/gbert-base"
    results = {
       "LLM_NAME": LLM_NAME,
       "N_REAL": N_REAL,
       "N_SYNTH": N_SYNTH,
       "TARGET": TARGET,
       "LLM_SAMPLING": LLM_SAMPLING,
    }

    f1_micro_scores = []
    eval_loss = []

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    start_time = time.time()

    for cross_idx in range(N_FOLDS)[0:TEST_FOLDS]:
        # Load Data
        train_data = train_dataset[cross_idx]
        test_data = test_dataset[cross_idx]
        train_data, test_data = get_preprocessed_data_OTE(train_data, test_data, tokenizer)
        trainer = get_trainer_OTE(train_data, test_data, tokenizer)
        trainer.train()
        

        # Save Evaluation Metrics
        eval_metrics = trainer.evaluate()
        f1_micro_scores.append(eval_metrics["eval_f1_micro"])
        eval_loss.append(eval_metrics["eval_loss"])

    runtime = time.time() - start_time
    results["runtime"] = runtime
    results["runtime_formatted"] = format_seconds_to_time_string(runtime)
    results["eval_loss"] = np.mean(eval_loss)
    results["f1_micro"] = np.mean(f1_micro_scores)
    return results

In [21]:
if TARGET == "aspect_term":
   results = train_OTE_model()

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSpanCategorizationOTE: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSpanCategorizationOTE from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSpanCategorizationOTE from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSpanCategorizationOTE were not initialized from the model checkpoint at de

Epoch,Training Loss,Validation Loss,F1 General-impression,F1 Food,F1 Service,F1 Ambience,F1 Price,F1 Micro
1,473.2563,473.127045,0.117647,0,0,0,0,0.058824


### Save Results

In [22]:
with open(f'results_json/results_{LLM_NAME}_real{N_REAL}_synth{N_SYNTH}_{TARGET}_{LLM_SAMPLING}.json', 'w') as json_file:
    json.dump(results, json_file)

In [23]:
df = pd.DataFrame([results])
df.to_csv(f'results_csv/results_{LLM_NAME}_real{N_REAL}_synth{N_SYNTH}_{TARGET}_{LLM_SAMPLING}.csv', index=False)

In [24]:
results

{'LLM_NAME': 'Llama13B',
 'N_REAL': 500,
 'N_SYNTH': 0,
 'TARGET': 'aspect_term',
 'LLM_SAMPLING': 'fixed',
 'runtime': 58.23281717300415,
 'runtime_formatted': '58s',
 'eval_loss': 473.1270446777344,
 'f1_micro': 0.058823529411764705}

### Remove useless folders

In [25]:
folders_to_delete = ['output', 'output2', 'token_classifier']
for folder in folders_to_delete:
   try:
       shutil.rmtree(folder)
   except:
       pass