# Notebook: Train Model

## Packages

In [1]:
import datetime
import json
import os
import random
import shutil
import time
import warnings

import numpy as np
import pandas as pd
import torch
from scipy.special import expit
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, precision_score, recall_score
from sklearn.metrics import multilabel_confusion_matrix
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import set_seed

from ACD import aspect_category_labels_to_one_hot, CustomDatasetACD, preprocess_data_ACD, create_model_ACD, compute_metrics_ACD, get_trainer_ACD
from helper import format_seconds_to_time_string
from OTE import create_model_OTE, get_preprocessed_data_OTE, compute_metrics_OTE, get_trainer_OTE, divide

In [2]:
# Silence FutureWarning messages attributed to the transformers.optimization module.
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.optimization")
# NOTE(review): this only builds a device object and shows it as the cell output; the
# result is not assigned or passed to anything in this notebook — confirm it is needed.
torch.device("mps")

device(type='mps')

## Parameters

In [3]:
## Delete Later:
# Temporary cap on how many cross-validation folds train_OTE_model() runs
# (it iterates over range(N_FOLDS)[0:TEST_FOLDS]); train_ACD_model() ignores it.
TEST_FOLDS = 1

In [4]:
# Experiment configuration. Every value below is copied into the results dict and
# into the names of the result files.
LLM_NAME = "Llama13B"  # first sub-folder level of the synthetic split path
N_REAL = 500  # examples kept from each real split file; also the key into n_splits_map
N_SYNTH = 0  # examples kept from each synthetic split file; also the key into n_splits_map
TARGET = "aspect_term" # "aspect_term", "aspect_category"
LLM_SAMPLING = "fixed"  # second sub-folder level of the synthetic split path

## Settings (do not change!)

In [5]:
# Number of cross-validation folds; also the number of split files loaded per dataset.
N_FOLDS = 5

In [6]:
# Split indices repeated past the end, so that the slice SPLIT_LOOP[i+1 : i+1+k] used
# when building the folds wraps around to the first splits for the later folds.
SPLIT_LOOP = [0, 1, 2, 3, 4, 0, 1, 2, 3]

In [7]:
# Seed Python's built-in random module (random.shuffle is applied to the training data).
RANDOM_SEED = 43
random.seed(RANDOM_SEED)

In [8]:
# Label vocabularies.
# NOTE(review): neither list is referenced by any other cell visible in this notebook.
ASPECT_CATEGORIES  = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

In [9]:
# Seed PyTorch and NumPy with the shared seed and set the cuDNN flags
# (deterministic on, benchmark off).
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)

In [10]:
# Apply the same seed through the transformers helper as well.
set_seed(RANDOM_SEED)

## Code

### Load Dataset

In [11]:
# Load Real Dataset
# One list of examples per fold file, each truncated to its first N_REAL entries.
# NOTE(review): these truncated splits are also used as the test sets when the folds
# are built, so N_REAL = 0 would leave every test set empty — confirm this is intended.
splits_real = []
for i in range(N_FOLDS):
    # Pass the encoding explicitly so that reading the JSON text does not depend on
    # the platform's default locale encoding.
    with open(f'../03 dataset split/real/real_{i}.json', 'r', encoding='utf-8') as json_file:
        real_split = json.load(json_file)[:N_REAL]
        splits_real.append(real_split)

In [12]:
# Load Synth Dataset
# One list of synthetic examples per fold file, each truncated to its first N_SYNTH
# entries. The files are opened even when N_SYNTH is 0, so they must exist.
splits_synth = []
for i in range(N_FOLDS):
    # Pass the encoding explicitly so that reading the JSON text does not depend on
    # the platform's default locale encoding.
    with open(f'../04 llm synthesis/synth/{LLM_NAME}/{LLM_SAMPLING}/split_{i}.json', 'r', encoding='utf-8') as json_file:
        synth_split = json.load(json_file)[:N_SYNTH]
        splits_synth.append(synth_split)

In [13]:
# Number of splits needed to supply a given number of training examples
# (the entries imply 500 examples per split).
n_splits_map = {
    500: 1,
    1000: 2,
    2000: 4
}


def _required_splits(n_examples, setting_name):
    """Return how many splits supply `n_examples` training examples.

    Zero examples need zero splits. Any other size must be a key of
    n_splits_map. An unsupported size raises ValueError instead of silently
    falling back to zero splits, which would train without that data while the
    results still report the configured size.

    Args:
        n_examples: configured number of examples (N_REAL or N_SYNTH).
        setting_name: name of the setting, used only in the error message.

    Returns:
        int: number of splits to draw training data from.

    Raises:
        ValueError: if `n_examples` is neither 0 nor a key of n_splits_map.
    """
    if n_examples == 0:
        return 0
    if n_examples not in n_splits_map:
        supported = sorted(n_splits_map)
        raise ValueError(
            f"{setting_name}={n_examples} is not supported; choose 0 or one of {supported}"
        )
    return n_splits_map[n_examples]


n_splits_required_real = _required_splits(N_REAL, "N_REAL")
n_splits_required_synth = _required_splits(N_SYNTH, "N_SYNTH")
# Last expression of the cell: displayed as the cell output.
n_splits_required_real, n_splits_required_synth

(1, 0)

In [14]:
# One entry per cross-validation run: train_dataset[i] / test_dataset[i] belong to fold i.
train_dataset = []
test_dataset = []

for i in range(N_FOLDS):
    # Fold i is evaluated on real split i and trained on the splits that follow it
    # in SPLIT_LOOP (which wraps around to the first splits).
    test_data = splits_real[i]
    train_data = []

    if N_REAL > 0:
        for split_idx in SPLIT_LOOP[i + 1:i + 1 + n_splits_required_real]:
            train_data.extend(splits_real[split_idx])

    if N_SYNTH > 0:
        for split_idx in SPLIT_LOOP[i + 1:i + 1 + n_splits_required_synth]:
            train_data.extend(splits_synth[split_idx])

    # Mix real and synthetic examples; uses the seeded global random state.
    random.shuffle(train_data)

    train_dataset.append(train_data)
    test_dataset.append(test_data)

### ACD Model

In [15]:
def train_ACD_model():
    """Cross-validate the aspect-category (ACD) model and return averaged metrics.

    For each of the N_FOLDS folds the train and test examples are preprocessed,
    a fresh model is created, trained and evaluated, and the evaluation metrics
    are collected. The fold values are then averaged.

    Returns:
        dict: the experiment settings, the mean over all folds of loss, hamming
        loss, accuracy and the micro/macro/weighted F1, plus the total runtime
        in seconds ("runtime") and as a formatted string ("runtime_formatted").
    """
    results = {
        "LLM_NAME": LLM_NAME,
        "N_REAL": N_REAL,
        "N_SYNTH": N_SYNTH,
        "TARGET": TARGET,
        "LLM_SAMPLING": LLM_SAMPLING,
    }

    # Keys read from trainer.evaluate() for every fold.
    # NOTE(review): "eval_class_f1_scores" is collected but never written to results.
    collected_keys = (
        "eval_f1_micro",
        "eval_f1_macro",
        "eval_f1_weighted",
        "eval_accuracy",
        "eval_class_f1_scores",
        "eval_loss",
        "eval_hamming_loss",
    )
    per_fold = {key: [] for key in collected_keys}

    # Result key -> evaluation key, in the order the means are stored.
    averaged = (
        ("loss", "eval_loss"),
        ("hamming", "eval_hamming_loss"),
        ("accuracy", "eval_accuracy"),
        ("f1_micro", "eval_f1_micro"),
        ("f1_macro", "eval_f1_macro"),
        ("f1_weighted", "eval_f1_weighted"),
    )

    start_time = time.time()

    # NOTE(review): the OTE path uses "deepset/gbert-base"; confirm that the large
    # checkpoint matches the model built by create_model_ACD().
    tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-large")
    # NOTE(review): this collator is created but not used anywhere in the function.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    for cross_idx in range(N_FOLDS):
        # Prepare the data of this fold
        train_data = preprocess_data_ACD(train_dataset[cross_idx], tokenizer)
        test_data = preprocess_data_ACD(test_dataset[cross_idx], tokenizer)

        # Fresh model per fold, then train it
        trainer = get_trainer_ACD(create_model_ACD(), train_data, test_data, tokenizer)
        trainer.train()

        # Evaluate on the test data and keep the metrics of this fold
        eval_metrics = trainer.evaluate()
        for key in collected_keys:
            per_fold[key].append(eval_metrics[key])

    runtime = time.time() - start_time

    for result_key, eval_key in averaged:
        results[result_key] = np.mean(per_fold[eval_key])
    results["runtime"] = runtime
    results["runtime_formatted"] = format_seconds_to_time_string(runtime)
    return results

# Run the ACD cross-validation only when aspect categories are the configured target.
if TARGET == "aspect_category":
   results = train_ACD_model()

### OTE Training Configuration and Trainer

In [16]:
# Hyperparameters for the OTE (aspect term) training.
# NOTE(review): MODEL_NAME and MAX_TOKENS are not referenced by any cell visible in this
# notebook (train_OTE_model hard-codes the same checkpoint name), and RANDOM_SEED repeats
# the value already set in the settings section.
MODEL_NAME = "deepset/gbert-base"
MAX_TOKENS = 256
RANDOM_SEED = 43
BATCH_SIZE = 16
N_EPOCHS = 3
LEARNING_RATE = 5e-6

# Tag <-> index maps; id2label is derived from label2id so the two cannot disagree.
label2id = {tag: idx for idx, tag in enumerate(('O', 'B', 'I'))}
id2label = {idx: tag for tag, idx in label2id.items()}

n_labels = len(id2label)

In [17]:
# Source: https://github.com/huggingface/transformers/issues/17971
class TrainingArgumentsWithMPSSupport(TrainingArguments):
    """TrainingArguments whose `device` prefers CUDA, then MPS, then CPU."""

    @property
    def device(self) -> torch.device:
        """Return the first available backend in the order cuda, mps, cpu."""
        if torch.cuda.is_available():
            backend = "cuda"
        elif torch.backends.mps.is_available():
            backend = "mps"
        else:
            backend = "cpu"
        return torch.device(backend)

def get_trainer_OTE(train_data, test_data, tokenizer):
    """Build the Trainer for one OTE cross-validation fold.

    NOTE(review): this definition replaces the `get_trainer_OTE` imported from the
    OTE module at the top of the notebook, and its local metrics callback is
    created here rather than taken from the imported `compute_metrics_OTE`.

    Args:
        train_data: preprocessed training dataset of the fold.
        test_data: preprocessed evaluation dataset of the fold; also handed to
            prepare_compute_metrics().
        tokenizer: tokenizer passed to the Trainer and to the padding collator.

    Returns:
        Trainer: configured to evaluate and save once per epoch and to reload
        the checkpoint with the best "f1_micro" at the end of training.
    """
    fold_args = TrainingArgumentsWithMPSSupport(
        output_dir="output2",
        evaluation_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=N_EPOCHS,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir="logs",
        # NOTE(review): a step interval is given although logging_strategy is "epoch".
        logging_steps=100,
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        # Mixed precision only when CUDA is available.
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=RANDOM_SEED,
    )

    metrics_callback = prepare_compute_metrics(test_data)
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    return Trainer(
        model_init=create_model_OTE,
        args=fold_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        data_collator=padding_collator,
        tokenizer=tokenizer,
        compute_metrics=metrics_callback
    )


### OTE Model

In [18]:
def prepare_compute_metrics(test_data):
    """Create the metrics callback used when evaluating the OTE model.

    Args:
        test_data: evaluation dataset; only its `aspect_categories` attribute is
            read, and only for the debugging dump described below.

    Returns:
        A function taking a `(predictions, true_labels)` pair and returning a
        dict with one F1 value per label other than "O" ("f1_B", "f1_I") and
        their mean under the key "f1_micro".
    """

    def compute_metrics_OTE(p):
        aspect_categories = test_data.aspect_categories
        predictions, true_labels = p

        # NOTE(review): looks like a debugging leftover — every evaluation overwrites
        # metrics_data.npy with the raw inputs; confirm it is still needed.
        np.save("metrics_data.npy", { "aspect_categories": aspect_categories, "predictions": predictions, "true_labels": true_labels})

        # A label counts as predicted when its score is positive
        # (presumably raw logits — TODO confirm).
        predicted_labels = (predictions > 0).astype(np.float64)

        # One 2x2 confusion matrix per label, computed over the flattened label rows.
        cm = multilabel_confusion_matrix(
            true_labels.reshape(-1, n_labels), predicted_labels.reshape(-1, n_labels))

        metrics = {}
        # Start at index 1: label 0 ("O") is not scored.
        for label_idx, matrix in enumerate(cm[1:], start=1):
            tp = matrix[1, 1]
            fp = matrix[0, 1]
            fn = matrix[1, 0]
            precision = divide(tp, tp + fp)
            recall = divide(tp, tp + fn)
            metrics[f"f1_{id2label[label_idx]}"] = divide(
                2 * precision * recall, precision + recall)

        # NOTE(review): despite the key name this is the unweighted mean of the
        # per-label F1 values, which is usually called macro-F1.
        metrics["f1_micro"] = sum(metrics.values()) / (n_labels - 1)

        return metrics

    return compute_metrics_OTE

In [19]:
def train_OTE_model():
    """Cross-validate the aspect-term (OTE) model and return averaged metrics.

    NOTE(review): only the first TEST_FOLDS folds are run, so the reported means
    cover fewer than N_FOLDS folds while TEST_FOLDS is smaller than N_FOLDS.

    Returns:
        dict: the experiment settings, the total runtime in seconds ("runtime")
        and as a formatted string ("runtime_formatted"), and the mean
        evaluation loss ("eval_loss") and mean "f1_micro" over the folds run.
    """
    # NOTE(review): same checkpoint name as the module-level MODEL_NAME constant.
    model_name = "deepset/gbert-base"
    results = {
       "LLM_NAME": LLM_NAME,
       "N_REAL": N_REAL,
       "N_SYNTH": N_SYNTH,
       "TARGET": TARGET,
       "LLM_SAMPLING": LLM_SAMPLING,
    }

    fold_f1 = []
    fold_loss = []

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    start_time = time.time()

    for cross_idx in range(N_FOLDS)[:TEST_FOLDS]:
        # Preprocess the raw examples of this fold
        train_data, test_data = get_preprocessed_data_OTE(
            train_dataset[cross_idx], test_dataset[cross_idx], tokenizer)

        # Train a fresh model on the fold
        trainer = get_trainer_OTE(train_data, test_data, tokenizer)
        trainer.train()

        # Keep the evaluation metrics of this fold
        eval_metrics = trainer.evaluate()
        fold_f1.append(eval_metrics["eval_f1_micro"])
        fold_loss.append(eval_metrics["eval_loss"])

    runtime = time.time() - start_time
    results.update({
        "runtime": runtime,
        "runtime_formatted": format_seconds_to_time_string(runtime),
        "eval_loss": np.mean(fold_loss),
        "f1_micro": np.mean(fold_f1),
    })
    return results

In [20]:
# Run the OTE cross-validation only when aspect terms are the configured target.
# NOTE(review): in the recorded output below the validation loss stays near 473 and the
# F1 values do not change across the three epochs — the model does not appear to learn.
if TARGET == "aspect_term":
   results = train_OTE_model()

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSpanCategorizationOTE: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSpanCategorizationOTE from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSpanCategorizationOTE from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSpanCategorizationOTE were not initialized from the model checkpoint at de

Epoch,Training Loss,Validation Loss,F1 B,F1 I,F1 Micro
1,473.2204,473.025055,0.008294,0.006246,0.00727
2,472.9817,472.882416,0.008294,0.006246,0.00727
3,472.9025,472.851501,0.008294,0.006246,0.00727


### Save Results

In [21]:
# Write the results dict as JSON. The target folder is created first so that a
# missing directory cannot discard the results of a finished training run.
os.makedirs('results_json', exist_ok=True)
with open(f'results_json/results_{LLM_NAME}_real{N_REAL}_synth{N_SYNTH}_{TARGET}_{LLM_SAMPLING}.json', 'w') as json_file:
    json.dump(results, json_file)

In [22]:
# Write the results as a one-row CSV. The target folder is created first so that a
# missing directory cannot discard the results of a finished training run.
df = pd.DataFrame([results])
os.makedirs('results_csv', exist_ok=True)
df.to_csv(f'results_csv/results_{LLM_NAME}_real{N_REAL}_synth{N_SYNTH}_{TARGET}_{LLM_SAMPLING}.csv', index=False)

In [23]:
# Show the results dict as the cell output.
results

{'LLM_NAME': 'Llama13B',
 'N_REAL': 500,
 'N_SYNTH': 0,
 'TARGET': 'aspect_term',
 'LLM_SAMPLING': 'fixed',
 'runtime': 148.95559811592102,
 'runtime_formatted': '2m 28s',
 'eval_loss': 473.0250549316406,
 'f1_micro': 0.007269823623350144}

### Remove useless folders

In [24]:
# Best-effort cleanup of working folders. 'output2' is the Trainer output_dir used for
# the OTE model; the other two are presumably created by the imported helpers.
folders_to_delete = ['output', 'output2', 'token_classifier']
for folder in folders_to_delete:
    try:
        shutil.rmtree(folder)
    except FileNotFoundError:
        # The folder was never created in this run: nothing to clean up.
        pass
    except OSError as err:
        # Stay best-effort (no crash), but do not hide real failures such as
        # permission problems; KeyboardInterrupt is no longer swallowed either.
        warnings.warn(f"Could not remove folder '{folder}': {err}")