In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForSeq2Seq
from peft import PeftModel, PeftConfig

from utils import (
    enable_tf32,
    set_seed,
)
from metrics import compute_metrics
from process_data import load_data, DynamicDataset
from model import load_tokenizer, load_model
from config import SPLITS

%cd ..

# Project helper imported from utils; judging by its name it switches on TF32 math.
enable_tf32()

# Number of DataLoader worker processes used by evaluate_model when DEBUG is off.
num_cpus = 16

# Debug switches:
#   DEBUG / DEBUG_SIZE are forwarded to load_data (the original note says enabling
#   debug "drastically reduces values"); DEBUG also makes evaluate_model use a
#   single-process DataLoader.
#   DEBUG_INPUTS is forwarded to DynamicDataset as its `debug` flag.
DEBUG = False
DEBUG_INPUTS = False
DEBUG_SIZE = 4

# Run variants: USE_COT is handed to load_model, DynamicDataset and compute_metrics;
# USE_AUG is handed to load_data and, negated, to DynamicDataset's `indexing`.
USE_COT = False
USE_AUG = True

# Evaluation batch size and the `n_examples` value given to every DynamicDataset.
BATCH_SIZE = 16
N_EXAMPLES = 2

# Seed everything before any model or data object is created.
SEED = 42
set_seed(SEED)

# Directory holding the PEFT adapter under evaluation.
SAVE_DIR = "/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_cot"

# The adapter config records which base checkpoint it was trained on.
peft_config = PeftConfig.from_pretrained(SAVE_DIR)
model_name = peft_config.base_model_name_or_path

tokenizer = load_tokenizer(model_name)

# Load the base model and wrap it with the saved adapter weights in one step.
model = PeftModel.from_pretrained(
    load_model(model_name, tokenizer, USE_COT),
    SAVE_DIR,
)

# load_data returns a pair; only its second element is used here.
_, data = load_data(USE_AUG, DEBUG, DEBUG_SIZE)

# Every split is wrapped with the same DynamicDataset settings.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    documents_ds=None,
    format_id=0,
    n_examples=N_EXAMPLES,
    seed=SEED,
    indexing=not USE_AUG,
    use_cot=USE_COT,
    debug=DEBUG_INPUTS,
)

eval_data = {}
for split in SPLITS:
    eval_data[split] = DynamicDataset(data[split], **dataset_kwargs)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

/home/nub/Bachelor/bachelor-thesis


In [2]:
def evaluate_model(model, dataset):
    """Generate predictions for every example in ``dataset`` and score them.

    The model is switched to eval mode, the dataset is batched with the
    notebook-level ``data_collator`` and ``BATCH_SIZE``, and each batch is
    decoded with beam search (10 beams, early stopping). Generated ids and
    label ids are turned into text via ``model.base_model.decode_tokens`` and
    handed to ``compute_metrics`` together with ``USE_COT``.

    Relies on notebook globals: ``BATCH_SIZE``, ``data_collator``, ``DEBUG``,
    ``num_cpus``, ``tokenizer`` and ``USE_COT``.

    Args:
        model: Model exposing ``eval()``, ``device``, ``generate(...)`` and
            ``base_model.decode_tokens(...)``.
        dataset: Dataset whose collated batches contain ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The first element of the pair returned by ``compute_metrics``.
    """
    model.eval()

    # In DEBUG mode everything stays in the main process (no workers,
    # no prefetching, no pinned memory).
    loader_options = dict(
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        shuffle=False,
        drop_last=False,
        num_workers=0 if DEBUG else num_cpus,
        prefetch_factor=None if DEBUG else 2,
        pin_memory=not DEBUG,
    )
    loader = DataLoader(dataset, **loader_options)

    predictions = []
    references = []

    for batch in tqdm(loader):
        target_device = model.device
        tensors = {
            key: batch[key].to(target_device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        # NOTE(review): `labels` is forwarded to generate() as well; whether the
        # model actually consumes it during generation is not visible from this
        # notebook -- confirm against model.py.
        with torch.no_grad():
            generated_ids = model.generate(
                **tensors,
                max_new_tokens=tokenizer.model_max_length,
                num_beams=10,
                early_stopping=True,
            )

        batch_predictions = model.base_model.decode_tokens(
            generated_ids.detach().cpu().numpy()
        )
        batch_references = model.base_model.decode_tokens(
            tensors["labels"].detach().cpu().numpy()
        )

        predictions.extend(batch_predictions)
        references.extend(batch_references)

    # compute_metrics returns a pair; only the first element is reported.
    scores, _ = compute_metrics(predictions, references, USE_COT)

    return scores

# Evaluate the loaded model on every split, in the order given by SPLITS.
results = {}

for split in SPLITS:
    print(f"Evaluating split: {split}")
    split_metrics = evaluate_model(model, eval_data[split])
    # Store a shallow copy of the metrics returned for this split.
    results[split] = {**split_metrics}

print(results)

Evaluating split: train


  2%|▏         | 3/196 [00:24<26:09,  8.13s/it]


KeyboardInterrupt: 

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_pseudo

{'train': {'penalty_scaled': np.float64(1.0127699568069108), 'penalty_capped': np.float64(1.0127699568069108), 'penalty_uncapped': np.float64(1.0127699568069108), 'missing': 0.0, 'exact_match_accuracy': 0.9705647096464566, 'part_match_accuracy': 0.9849357436143541, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.2865515288788223), 'penalty_capped': np.float64(1.2865515288788223), 'penalty_uncapped': np.float64(1.2865515288788223), 'missing': 0.0, 'exact_match_accuracy': 0.45300113250283125, 'part_match_accuracy': 0.6543978859947137, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.2886878814298168), 'penalty_capped': np.float64(1.2886878814298168), 'penalty_uncapped': np.float64(1.2886878814298168), 'missing': 0.0, 'exact_match_accuracy': 0.44638186573670446, 'part_match_accuracy': 0.6519907003778002, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_prompt

{'train': {'penalty_scaled': np.float64(1.1170916653335468), 'penalty_capped': np.float64(1.1170916653335468), 'penalty_uncapped': np.float64(1.1170916653335468), 'missing': 0.0, 'exact_match_accuracy': 0.8341065429531275, 'part_match_accuracy': 0.8994827494267531, 'set_match_accuracy': 0.908041380045855, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.3450887127217819), 'penalty_capped': np.float64(1.3450887127217819), 'penalty_uncapped': np.float64(1.3450887127217819), 'missing': 0.0, 'exact_match_accuracy': 0.5209513023782559, 'part_match_accuracy': 0.7095130237825605, 'set_match_accuracy': 0.7258588146470372, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.3277361232199942), 'penalty_capped': np.float64(1.3277361232199942), 'penalty_uncapped': np.float64(1.3277361232199942), 'missing': 0.0, 'exact_match_accuracy': 0.5527462946817786, 'part_match_accuracy': 0.7213019471084022, 'set_match_accuracy': 0.740860215053766, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_low

{'train': {'penalty_scaled': np.float64(1.0816701327787555), 'penalty_capped': np.float64(1.0816701327787555), 'penalty_uncapped': np.float64(1.0857974724044153), 'missing': 0.0, 'exact_match_accuracy': 0.9054551271796513, 'part_match_accuracy': 0.9445155441795946, 'set_match_accuracy': 0.9493840985442297, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4816459041147605), 'penalty_capped': np.float64(1.4816459041147605), 'penalty_uncapped': np.float64(1.5104643261608155), 'missing': 0.0, 'exact_match_accuracy': 0.4824462061155153, 'part_match_accuracy': 0.6753491883729695, 'set_match_accuracy': 0.6946772366930908, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.4557221737866899), 'penalty_capped': np.float64(1.4557221737866899), 'penalty_uncapped': np.float64(1.4821505376344086), 'missing': 0.0, 'exact_match_accuracy': 0.5047951176983435, 'part_match_accuracy': 0.6887532693984343, 'set_match_accuracy': 0.7145015983725687, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_5

{'train': {'penalty_scaled': np.float64(1.0872346950946008), 'penalty_capped': np.float64(1.0872346950946008), 'penalty_uncapped': np.float64(1.145038127232976), 'missing': 0.0, 'exact_match_accuracy': 0.9046552551591746, 'part_match_accuracy': 0.9430224497413715, 'set_match_accuracy': 0.9473684210526293, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4494812625132452), 'penalty_capped': np.float64(1.4494812625132452), 'penalty_uncapped': np.float64(1.7561721404303514), 'missing': 0.0, 'exact_match_accuracy': 0.5232163080407701, 'part_match_accuracy': 0.7025292563231402, 'set_match_accuracy': 0.7252170630426575, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.4678271314769105), 'penalty_capped': np.float64(1.4678271314769105), 'penalty_uncapped': np.float64(1.7880557977332177), 'missing': 0.0, 'exact_match_accuracy': 0.5074106364428945, 'part_match_accuracy': 0.6896251089799512, 'set_match_accuracy': 0.7146469049694882, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_10_fail

{'train': {'penalty_scaled': np.float64(1.143484172315532), 'penalty_capped': np.float64(1.143484172315532), 'penalty_uncapped': np.float64(1.2391697328427453), 'missing': 0.0, 'exact_match_accuracy': 0.8435450327947528, 'part_match_accuracy': 0.9059083879912494, 'set_match_accuracy': 0.9136724790700117, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.3992709022117895), 'penalty_capped': np.float64(1.3992709022117895), 'penalty_uncapped': np.float64(1.6751038127595321), 'missing': 0.0, 'exact_match_accuracy': 0.5832389580973952, 'part_match_accuracy': 0.7349943374858449, 'set_match_accuracy': 0.7514911287278228, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.4000266012448492), 'penalty_capped': np.float64(1.4000266012448492), 'penalty_uncapped': np.float64(1.6749055507120025), 'missing': 0.0, 'exact_match_accuracy': 0.5806451612903226, 'part_match_accuracy': 0.7340889276373177, 'set_match_accuracy': 0.7557686718977067, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10 - local run

{'train': {'penalty_scaled': np.float64(1.1730827122363106), 'penalty_capped': np.float64(1.1730827122363106), 'penalty_uncapped': np.float64(1.290326881032368), 'missing': 0.0, 'exact_match_accuracy': 0.8149096144616861, 'part_match_accuracy': 0.8858049378766015, 'set_match_accuracy': 0.8946355249826652, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4055336802607197), 'penalty_capped': np.float64(1.4055336802607197), 'penalty_uncapped': np.float64(1.6835787089467724), 'missing': 0.0, 'exact_match_accuracy': 0.5719139297848245, 'part_match_accuracy': 0.731974329935826, 'set_match_accuracy': 0.7477538693846743, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.377799639262459), 'penalty_capped': np.float64(1.377799639262459), 'penalty_uncapped': np.float64(1.6360360360360362), 'missing': 0.0, 'exact_match_accuracy': 0.6006974716652136, 'part_match_accuracy': 0.7502179598953824, 'set_match_accuracy': 0.7669863411798921, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_2 - eval_12656897

{'train': {'penalty_scaled': np.float64(1.1727041476025999), 'penalty_capped': np.float64(1.1727041476025999), 'penalty_uncapped': np.float64(1.289710979576601), 'missing': 0.0, 'exact_match_accuracy': 0.8153895376739722, 'part_match_accuracy': 0.8860448994827441, 'set_match_accuracy': 0.8948594891483985, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4104767536068494), 'penalty_capped': np.float64(1.4104767536068494), 'penalty_uncapped': np.float64(1.691751604379011), 'missing': 0.0, 'exact_match_accuracy': 0.5662514156285391, 'part_match_accuracy': 0.7287655719139311, 'set_match_accuracy': 0.7448093620234059, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.3828317894746809), 'penalty_capped': np.float64(1.3828317894746809), 'penalty_uncapped': np.float64(1.6445945945945948), 'missing': 0.0, 'exact_match_accuracy': 0.5954664341761116, 'part_match_accuracy': 0.7468759081662338, 'set_match_accuracy': 0.7637605347282791, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_cot - eval_12377400

{'train': {'penalty_scaled': np.float64(1.1948870535669478), 'penalty_capped': np.float64(1.1948870535669478), 'penalty_uncapped': np.float64(1.3254092678504776), 'missing': 0.00015997440409534473, 'exact_match_accuracy': 0.7885138377859543, 'part_match_accuracy': 0.871913827120989, 'set_match_accuracy': 0.882722764357699, 'structure_score_norm': -0.00015997440409534473, 'structure_score_pos': 0.0, 'structure_score_neg': 0.00015997440409534473},

'eval': {'penalty_scaled': np.float64(1.3764056100308362), 'penalty_capped': np.float64(1.3764056100308362), 'penalty_uncapped': np.float64(1.6384107210268026), 'missing': 0.0011325028312570782, 'exact_match_accuracy': 0.608154020385051, 'part_match_accuracy': 0.7497168742921869, 'set_match_accuracy': 0.7638354095885249, 'structure_score_norm': -0.0011325028312570782, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0011325028312570782},

'test': {'penalty_scaled': np.float64(1.3694422018641628), 'penalty_capped': np.float64(1.3694422018641628), 'penalty_uncapped': np.float64(1.6226242371403663), 'missing': 0.0, 'exact_match_accuracy': 0.6102877070619006, 'part_match_accuracy': 0.7548677709968074, 'set_match_accuracy': 0.7742516710258677, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_ex - eval_12380506

{'train': {'penalty_scaled': np.float64(1.118395289878825), 'penalty_capped': np.float64(1.118395289878825), 'penalty_uncapped': np.float64(1.197946995147443), 'missing': 0.0, 'exact_match_accuracy': 0.8725003999360103, 'part_match_accuracy': 0.9220658028048813, 'set_match_accuracy': 0.9285927584919719, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.385036678563092), 'penalty_capped': np.float64(1.385036678563092), 'penalty_uncapped': np.float64(1.650906002265006), 'missing': 0.0, 'exact_match_accuracy': 0.5968289920724802, 'part_match_accuracy': 0.7448093620234066, 'set_match_accuracy': 0.7592676481691215, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.3388400301269252), 'penalty_capped': np.float64(1.3388400301269252), 'penalty_uncapped': np.float64(1.5724498692240627), 'missing': 0.0, 'exact_match_accuracy': 0.6460331299040977, 'part_match_accuracy': 0.7746294681778592, 'set_match_accuracy': 0.7919790758500463, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_no_ex - eval_12579165

{'train': {'penalty_scaled': np.float64(1.1082237537323392), 'penalty_capped': np.float64(1.1082237537323392), 'penalty_uncapped': np.float64(1.1811976750386606), 'missing': 0.0, 'exact_match_accuracy': 0.8840185570308751, 'part_match_accuracy': 0.928651415773473, 'set_match_accuracy': 0.934623793526367, 'structure_score_norm': -0.00015997440409534473, 'structure_score_pos': 0.0, 'structure_score_neg': 2.6662400682557456e-05},

'eval': {'penalty_scaled': np.float64(1.3755304144493903), 'penalty_capped': np.float64(1.3755304144493903), 'penalty_uncapped': np.float64(1.6348999622499059), 'missing': 0.0, 'exact_match_accuracy': 0.6070215175537939, 'part_match_accuracy': 0.7512268780671965, 'set_match_accuracy': 0.7645904114760296, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.3582187531308252), 'penalty_capped': np.float64(1.3582187531308252), 'penalty_uncapped': np.float64(1.6057686718977042), 'missing': 0.0, 'exact_match_accuracy': 0.6268526591107236, 'part_match_accuracy': 0.7616971810520236, 'set_match_accuracy': 0.7789886660854435, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_no_cot - eval_12579313

{'train': {'penalty_scaled': np.float64(1.1110066563142513), 'penalty_capped': np.float64(1.1110066563142513), 'penalty_uncapped': np.float64(1.1855089852290301), 'missing': 0.0, 'exact_match_accuracy': 0.8800191969284914, 'part_match_accuracy': 0.9269983469311539, 'set_match_accuracy': 0.9329867221244574, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'eval': {'penalty_scaled': np.float64(1.4095822559774305), 'penalty_capped': np.float64(1.4095822559774305), 'penalty_uncapped': np.float64(1.6915817289543225), 'missing': 0.0, 'exact_match_accuracy': 0.5707814269535674, 'part_match_accuracy': 0.7280105700264253, 'set_match_accuracy': 0.7477161192902986, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0},

'test': {'penalty_scaled': np.float64(1.369272273927545), 'penalty_capped': np.float64(1.369272273927545), 'penalty_uncapped': np.float64(1.622682359779134), 'missing': 0.0, 'exact_match_accuracy': 0.6120313862249346, 'part_match_accuracy': 0.7548677709968068, 'set_match_accuracy': 0.7739610578320285, 'structure_score_norm': 0.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.0}}

/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_full - eval_12657074

{'train': {'penalty_scaled': np.float64(2.0), 'penalty_capped': np.float64(2.0), 'penalty_uncapped': np.float64(3.675317282568123), 'missing': 0.9492881139017757, 'exact_match_accuracy': 0.0, 'part_match_accuracy': 0.00978510105049858, 'set_match_accuracy': 0.010225030661760778, 'structure_score_norm': -1.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.9830960379672619},

'eval': {'penalty_scaled': np.float64(2.0), 'penalty_capped': np.float64(2.0), 'penalty_uncapped': np.float64(3.6755756889392233), 'missing': 0.9490373725934315, 'exact_match_accuracy': 0.0, 'part_match_accuracy': 0.009626274065685165, 'set_match_accuracy': 0.010343525858814649, 'structure_score_norm': -1.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.9826349565873908},

'test': {'penalty_scaled': np.float64(2.0), 'penalty_capped': np.float64(2.0), 'penalty_uncapped': np.float64(3.6746294681778546), 'missing': 0.948561464690497, 'exact_match_accuracy': 0.0, 'part_match_accuracy': 0.01002615518744551, 'set_match_accuracy': 0.010520197616971812, 'structure_score_norm': -1.0, 'structure_score_pos': 0.0, 'structure_score_neg': 0.9821272885789013}}


/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_full_ex - eval_



/home/nub/Bachelor/bachelor-thesis/models/finqa_base_10_full_cot - eval_

