# Notebook: Train Model for a given Condition

## Packages

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, precision_score, recall_score
from torch.utils.data import Dataset as TorchDataset
from transformers import DataCollatorWithPadding
from scipy.special import expit
import random
import torch
import json
import time

In [2]:
import torch
torch.device("mps")

device(type='mps')

## Parameters

In [3]:
LLM_NAME = "Llama13B"
N_REAL = 500
N_SYNTH = 0
TARGET = "aspect_category"
LLM_SAMPLING = "fixed"

## Settings (do not change!)

In [4]:
N_FOLDS = 5

In [5]:
SPLIT_LOOP = [0, 1, 2, 3, 4, 0, 1, 2, 3]

In [6]:
RANDOM_SEED = 43
random.seed(RANDOM_SEED)

In [7]:
ASPECT_CATEGORIES  = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

## Code

### Load Dataset

In [8]:
# Load Real Dataset
splits_real = []
for i in range(N_FOLDS):
    with open(f'../03 dataset split/real/real_{i}.json', 'r') as json_datei:
        real_split = json.load(json_datei)[:N_REAL]
        splits_real.append(real_split)

In [9]:
# Load Synth Dataset
splits_synth = []
for i in range(N_FOLDS):
    with open(f'../04 llm synthesis/synth/{LLM_NAME}/{LLM_SAMPLING}/split_{i}.json', 'r') as json_datei:
        synth_split = json.load(json_datei)[:N_SYNTH]
        splits_synth.append(synth_split)

In [10]:
n_splits_map = {
    500: 1,
    1000: 2,
    2000: 4
}
n_splits_required_real = n_splits_map.get(N_REAL, 0)
n_splits_required_synth = n_splits_map.get(N_SYNTH, 0)
n_splits_required_real, n_splits_required_synth

(1, 0)

In [11]:
# Five indexes, each for one cross valdiation run
train_dataset = []
test_dataset = []

for i in range(N_FOLDS):
    test_data = splits_real[i]
    train_data = []
    
    if N_REAL > 0:
        for split_idx in SPLIT_LOOP[i+1: i+1+n_splits_required_real]:
            for example in splits_real[split_idx]:
                train_data.append(example)
 
    if N_SYNTH > 0:
        for split_idx in SPLIT_LOOP[i+1: i+1+n_splits_required_synth]:
            for example in splits_synth[split_idx]:
                train_data.append(example)
                
    random.shuffle(train_data)
    
    train_dataset.append(train_data)
    test_dataset.append(test_data)

### Preprocessing

### Load Model

In [12]:
model_name_ACD = "deepset/gbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name_ACD)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
def aspect_category_labels_to_one_hot(labels):
    one_hot = []
    for label in ASPECT_CATEGORIES:
        if label in labels:
            one_hot.append(1)
        else:
            one_hot.append(0)
    return one_hot

In [14]:
class CustomDatasetACD(TorchDataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["label"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [15]:
def preprocess_data_ACD(dataset, tokenizer):
    texts = [example["text"] for example in dataset]

    #list(set(original_list))
    
    labels = [list(set([tag["label"] for tag in example["tags"]])) for example in dataset]
    labels = [aspect_category_labels_to_one_hot(label) for label in labels]
    labels = torch.tensor(labels, dtype=torch.float32)
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
    return CustomDatasetACD(encodings, labels)

In [16]:
def create_model_ACD():
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_ACD,
        num_labels=len(ASPECT_CATEGORIES),
        problem_type="multi_label_classification"
    )
    return model

In [17]:
def compute_metrics_ACD(eval_pred):
    predictions, lab = eval_pred

    predictions = (expit(predictions) > 0.5)
    labels = [l==1 for l in lab]

    print(labels[0], predictions[0])

    accuracy = accuracy_score(labels, predictions)

    f1_macro = f1_score(labels, predictions, average="macro")
    f1_micro = f1_score(labels, predictions, average="micro")
    f1_weighted = f1_score(labels, predictions, average="weighted")

    class_f1_scores = f1_score(labels, predictions, average=None)

    hamming = hamming_loss(labels, predictions)

    metrics = {
        "hamming_loss": hamming,
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "f1_weighted": f1_weighted,
        "class_f1_scores": class_f1_scores.tolist(),
    }

    return metrics

In [18]:
def get_trainer_ACD(model, train_data, test_data, tokenizer):
    batch_size = 16
    epochs = 5
    learning_rate = 5e-06
    training_args = TrainingArguments(
        output_dir="output",
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="logs",
        logging_steps=5,
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
      # ONLY CUDA:  fp16=True,
        report_to="none"
    )

    trainer = Trainer(
        model=model.to(torch.device("mps")),
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_ACD
    )
    
    return trainer

In [19]:
for cross_idx in range(5):
    # Load Data
    train_data = preprocess_data_ACD(train_dataset[cross_idx], tokenizer)
    test_data = preprocess_data_ACD(test_dataset[cross_idx], tokenizer)

    # Load Model
    model_ACD = create_model_ACD()

    trainer = get_trainer_ACD(model_ACD, train_data, test_data, tokenizer)
    trainer.train()
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Epoch,Training Loss,Validation Loss,Loss,Accuracy,F1 Macro,F1 Micro,F1 Weighted,Class F1 Scores,Runtime,Samples Per Second,Steps Per Second
1,0.5683,0.1196,0.443236,0.47,0.459431,0.691434,0.601071,"[0.0, 0.8970099667774087, 0.9076086956521738, 0.4925373134328358, 0.0]",6.1452,81.364,5.207
2,0.4021,0.0268,0.319253,0.866,0.759524,0.944213,0.915709,"[1.0, 1.0, 1.0, 0.7976190476190476, 0.0]",6.1315,81.545,5.219
3,0.3104,0.0,0.247975,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0]",6.1191,81.711,5.23


[False False  True False False] [False False  True False False]


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["label"] = torch.tensor(self.labels[idx])


[False False  True False False] [False False  True False False]


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["label"] = torch.tensor(self.labels[idx])


[False False  True False False] [False False  True False False]


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["label"] = torch.tensor(self.labels[idx])

KeyboardInterrupt



In [None]:
eval_metrics