In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForSeq2Seq
from peft import PeftModel, PeftConfig

from utils import (
    enable_tf32,
    set_seed,
)
from metrics import compute_metrics
from process_data import load_data
from model import load_tokenizer, load_model
from config import SPLITS

# Project helper from utils; presumably switches on TF32 matmul kernels — confirm in utils.py.
enable_tf32()

# Single root for every on-disk location used by this notebook. Changing this
# one line relocates all data/model paths below.
PROJECT_ROOT = "/home/nub/Bachelor/bachelor-thesis"
_PROCESSED_DIR = f"{PROJECT_ROOT}/data/processed"

DATA_DOCUMENTS = f"{_PROCESSED_DIR}/documents.csv"
DATA_DOCUMENTS_AUG = f"{_PROCESSED_DIR}/documents_aug.csv"
DATA_TRAIN_PROC = f"{_PROCESSED_DIR}/train.csv"
DATA_EVAL_PROC = f"{_PROCESSED_DIR}/eval.csv"
DATA_TEST_PROC = f"{_PROCESSED_DIR}/test.csv"
MODELS_DIR = f"{PROJECT_ROOT}/models"

# Number of DataLoader worker processes used when DEBUG is off.
num_cpus = 16

# Enable debug to drastically reduce values
DEBUG = False
DEBUG_SIZE = 4

# Forwarded to compute_metrics (USE_COT) and load_data (USE_AUG) respectively.
USE_COT = False
USE_AUG = False

SEED = 42
set_seed(SEED)

# Name of the fine-tuned adapter checkpoint to evaluate. SAVE_DIR is derived
# from MODELS_DIR so the two can never point at different model roots.
RUN_NAME = "finqa_full_base_10"
SAVE_DIR = f"{MODELS_DIR}/{RUN_NAME}"

# The adapter config records which base model the adapter was trained on.
peft_config = PeftConfig.from_pretrained(SAVE_DIR)
model_name = peft_config.base_model_name_or_path

tokenizer = load_tokenizer(model_name, MODELS_DIR)
model = load_model(model_name, MODELS_DIR)

# Wrap the base model with the adapter weights stored in SAVE_DIR.
model = PeftModel.from_pretrained(model, SAVE_DIR)

# load_data returns a pair; only the second element is used here and it is
# later indexed by split name (data[split]).
_, data = load_data(USE_AUG, DEBUG, DEBUG_SIZE)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def evaluate_model(model, dataset, batch_size=32, max_new_tokens=128, num_beams=10):
    """Run beam-search generation over ``dataset`` and score the predictions.

    Relies on the notebook-level names ``data_collator``, ``DEBUG``,
    ``num_cpus`` and ``USE_COT`` being defined before it is called.

    Args:
        model: Model exposing ``eval()``, ``device``, ``generate()`` and
            ``base_model.decode_tokens()``.
        dataset: Dataset whose collated batches contain ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        batch_size: Number of examples per evaluation batch.
        max_new_tokens: Upper bound on tokens generated per example.
        num_beams: Beam width used for generation.

    Returns:
        The first element of the pair returned by ``compute_metrics`` for all
        decoded predictions and labels.
    """
    model.eval()

    # Worker processes, prefetching and pinned memory are only used outside
    # DEBUG; prefetch_factor must stay None when num_workers is 0.
    use_workers = not DEBUG
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        shuffle=False,
        drop_last=False,
        num_workers=num_cpus if use_workers else 0,
        prefetch_factor=2 if use_workers else None,
        pin_memory=use_workers,
    )

    all_preds = []
    all_labels = []

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        labels = batch["labels"].to(model.device)

        with torch.no_grad():
            # NOTE(review): labels is forwarded to generate() exactly as
            # before. Generation normally needs only input_ids and
            # attention_mask — confirm whether the project-specific model
            # actually consumes it, and drop it if not.
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                early_stopping=True,
            )

        # decode_tokens is a project-specific method; presumably it returns one
        # decoded item per row — verify against the model class.
        decoded_preds = model.base_model.decode_tokens(generated_ids.detach().cpu().numpy())
        decoded_labels = model.base_model.decode_tokens(labels.detach().cpu().numpy())

        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    # compute_metrics returns a pair; the second element is not needed here.
    metrics, _ = compute_metrics(all_preds, all_labels, USE_COT)

    return metrics

# Per-split metrics, keyed by split name in the order given by SPLITS.
results = dict()

for split in SPLITS:
    print(f"Evaluating split: {split}")
    split_metrics = evaluate_model(model, data[split])
    # Store a shallow copy so later edits to `results` cannot touch the
    # mapping returned by evaluate_model.
    results[split] = dict(split_metrics)

print(results)

Evaluating split: train


100%|██████████| 196/196 [06:06<00:00,  1.87s/it]


Evaluating split: eval


100%|██████████| 28/28 [00:51<00:00,  1.82s/it]


Evaluating split: test


100%|██████████| 36/36 [01:06<00:00,  1.84s/it]

{'train': {'penalty_scaled': np.float64(1.1730827122363106), 'penalty_capped': np.float64(1.1730827122363106), 'penalty_uncapped': np.float64(1.290326881032368), 'missing': 0.0, 'exact_match_accuracy': 0.8149096144616861, 'part_match_accuracy': 0.8858049378766015, 'set_match_accuracy': 0.8946355249826652, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}, 'eval': {'penalty_scaled': np.float64(1.4055336802607197), 'penalty_capped': np.float64(1.4055336802607197), 'penalty_uncapped': np.float64(1.6835787089467724), 'missing': 0.0, 'exact_match_accuracy': 0.5719139297848245, 'part_match_accuracy': 0.731974329935826, 'set_match_accuracy': 0.7477538693846743, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}, 'test': {'penalty_scaled': np.float64(1.377799639262459), 'penalty_capped': np.float64(1.377799639262459), 'penalty_uncapped': np.float64(1.6360360360360362), 'missing': 0.0, 'exact_match_accuracy': 0.600697471665213




/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_pseudo

{'train': {'penalty_scaled': np.float64(1.0127699568069108), 'penalty_capped': np.float64(1.0127699568069108), 'penalty_uncapped': np.float64(1.0127699568069108), 'missing': 0.0, 'exact_match_accuracy': 0.9705647096464566, 'part_match_accuracy': 0.9849357436143541, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.2865515288788223), 'penalty_capped': np.float64(1.2865515288788223), 'penalty_uncapped': np.float64(1.2865515288788223), 'missing': 0.0, 'exact_match_accuracy': 0.45300113250283125, 'part_match_accuracy': 0.6543978859947137, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.2886878814298168), 'penalty_capped': np.float64(1.2886878814298168), 'penalty_uncapped': np.float64(1.2886878814298168), 'missing': 0.0, 'exact_match_accuracy': 0.44638186573670446, 'part_match_accuracy': 0.6519907003778002, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_prompt

{'train': {'penalty_scaled': np.float64(1.1170916653335468), 'penalty_capped': np.float64(1.1170916653335468), 'penalty_uncapped': np.float64(1.1170916653335468), 'missing': 0.0, 'exact_match_accuracy': 0.8341065429531275, 'part_match_accuracy': 0.8994827494267531, 'set_match_accuracy': 0.908041380045855, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.3450887127217819), 'penalty_capped': np.float64(1.3450887127217819), 'penalty_uncapped': np.float64(1.3450887127217819), 'missing': 0.0, 'exact_match_accuracy': 0.5209513023782559, 'part_match_accuracy': 0.7095130237825605, 'set_match_accuracy': 0.7258588146470372, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.3277361232199942), 'penalty_capped': np.float64(1.3277361232199942), 'penalty_uncapped': np.float64(1.3277361232199942), 'missing': 0.0, 'exact_match_accuracy': 0.5527462946817786, 'part_match_accuracy': 0.7213019471084022, 'set_match_accuracy': 0.740860215053766, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_low

{'train': {'penalty_scaled': np.float64(1.0816701327787555), 'penalty_capped': np.float64(1.0816701327787555), 'penalty_uncapped': np.float64(1.0857974724044153), 'missing': 0.0, 'exact_match_accuracy': 0.9054551271796513, 'part_match_accuracy': 0.9445155441795946, 'set_match_accuracy': 0.9493840985442297, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4816459041147605), 'penalty_capped': np.float64(1.4816459041147605), 'penalty_uncapped': np.float64(1.5104643261608155), 'missing': 0.0, 'exact_match_accuracy': 0.4824462061155153, 'part_match_accuracy': 0.6753491883729695, 'set_match_accuracy': 0.6946772366930908, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.4557221737866899), 'penalty_capped': np.float64(1.4557221737866899), 'penalty_uncapped': np.float64(1.4821505376344086), 'missing': 0.0, 'exact_match_accuracy': 0.5047951176983435, 'part_match_accuracy': 0.6887532693984343, 'set_match_accuracy': 0.7145015983725687, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_5

{'train': {'penalty_scaled': np.float64(1.0872346950946008), 'penalty_capped': np.float64(1.0872346950946008), 'penalty_uncapped': np.float64(1.145038127232976), 'missing': 0.0, 'exact_match_accuracy': 0.9046552551591746, 'part_match_accuracy': 0.9430224497413715, 'set_match_accuracy': 0.9473684210526293, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4494812625132452), 'penalty_capped': np.float64(1.4494812625132452), 'penalty_uncapped': np.float64(1.7561721404303514), 'missing': 0.0, 'exact_match_accuracy': 0.5232163080407701, 'part_match_accuracy': 0.7025292563231402, 'set_match_accuracy': 0.7252170630426575, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.4678271314769105), 'penalty_capped': np.float64(1.4678271314769105), 'penalty_uncapped': np.float64(1.7880557977332177), 'missing': 0.0, 'exact_match_accuracy': 0.5074106364428945, 'part_match_accuracy': 0.6896251089799512, 'set_match_accuracy': 0.7146469049694882, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_10_fail

{'train': {'penalty_scaled': np.float64(1.143484172315532), 'penalty_capped': np.float64(1.143484172315532), 'penalty_uncapped': np.float64(1.2391697328427453), 'missing': 0.0, 'exact_match_accuracy': 0.8435450327947528, 'part_match_accuracy': 0.9059083879912494, 'set_match_accuracy': 0.9136724790700117, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.3992709022117895), 'penalty_capped': np.float64(1.3992709022117895), 'penalty_uncapped': np.float64(1.6751038127595321), 'missing': 0.0, 'exact_match_accuracy': 0.5832389580973952, 'part_match_accuracy': 0.7349943374858449, 'set_match_accuracy': 0.7514911287278228, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.4000266012448492), 'penalty_capped': np.float64(1.4000266012448492), 'penalty_uncapped': np.float64(1.6749055507120025), 'missing': 0.0, 'exact_match_accuracy': 0.5806451612903226, 'part_match_accuracy': 0.7340889276373177, 'set_match_accuracy': 0.7557686718977067, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_10

{'train': {'penalty_scaled': np.float64(1.1730827122363106), 'penalty_capped': np.float64(1.1730827122363106), 'penalty_uncapped': np.float64(1.290326881032368), 'missing': 0.0, 'exact_match_accuracy': 0.8149096144616861, 'part_match_accuracy': 0.8858049378766015, 'set_match_accuracy': 0.8946355249826652, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4055336802607197), 'penalty_capped': np.float64(1.4055336802607197), 'penalty_uncapped': np.float64(1.6835787089467724), 'missing': 0.0, 'exact_match_accuracy': 0.5719139297848245, 'part_match_accuracy': 0.731974329935826, 'set_match_accuracy': 0.7477538693846743, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.377799639262459), 'penalty_capped': np.float64(1.377799639262459), 'penalty_uncapped': np.float64(1.6360360360360362), 'missing': 0.0, 'exact_match_accuracy': 0.6006974716652136, 'part_match_accuracy': 0.7502179598953824, 'set_match_accuracy': 0.7669863411798921, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}