In [None]:
import os
import sys
import logging
import traceback
import multiprocessing
import json
import re
import numpy as np
import torch
import torch.distributed as dist
from torch.optim.lr_scheduler import ReduceLROnPlateau
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from peft.optimizers import create_loraplus_optimizer
import bitsandbytes as bnb

from config import (
    DATA_DOCUMENTS,
    DATA_TRAIN_PROC,
    DATA_EVAL_PROC,
    DATA_TEST_PROC,
    MODELS_DIR
)

DATA_DOCUMENTS = "/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv"
DATA_TRAIN_PROC = "/home/nub/Bachelor/bachelor-thesis/data/processed/train.csv"
DATA_EVAL_PROC = "/home/nub/Bachelor/bachelor-thesis/data/processed/eval.csv"
DATA_TEST_PROC = "/home/nub/Bachelor/bachelor-thesis/data/processed/test.csv"
MODELS_DIR = "/home/nub/Bachelor/bachelor-thesis/models"

# ENVIRONMENT SETUP

# Set logging
logging.basicConfig(
    format="[%(asctime)s] [%(levelname)s] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# from transformers.utils import logging as hf_logging
# hf_logging.set_verbosity_info()

logger.info("Script started...")

# Enable debug to drastically reduce values
DEBUG = False
DEBUG_SIZE = 4
SPLITS = ["train", "eval", "test"]

COT = False

SEED = 42
BATCH_SIZE = min(4, DEBUG_SIZE) if DEBUG else 16
ACCUMULATION_STEPS = 1 if DEBUG else 2
LEARNING_RATE = 4e-4
EPOCHS = 100

MODEL_NAME = "google/flan-t5-base"
logger.info(f"Using model: {MODEL_NAME}")

OUTPUT_DIR = os.path.join(MODELS_DIR, "finqa_full_cot")
logger.info(f"Output location: {OUTPUT_DIR}")

# Detect number of CPUs and GPUs
num_cpus = 1 # int(os.getenv("SLURM_JOB_CPUS_PER_NODE", multiprocessing.cpu_count()))
logger.info(f"Using {num_cpus} CPU core(s)")

num_gpus = torch.cuda.device_count()
logger.info(f"Using {num_gpus} CUDA device(s)")


# DATA PREPROCESSING

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=MODELS_DIR,
    use_fast=True
)


def tokenize(prompt, target):
    model_inputs = tokenizer(
        prompt,
        truncation=True,
        max_length=512,  # Model will silently truncate above 512
    )
    labels = tokenizer(
        text_target=target,
        truncation=True,
        max_length=512,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def build_process_fn(indexing: bool, use_cot: bool):
    if indexing:
        if use_cot:
            prefix = "Retrieve the document id by reasoning step-by-step: Document: "
        else:
            prefix = "Retrieve the document id: Document: "

        input_key = "document"
    else:
        if use_cot:
            prefix = "Answer the question with a document id by reasoning step-by-step: Question: "
        else:
            prefix = "Answer the question with a document id: Question: "

        input_key = "question"

    def process_examples(examples):
        prompts = []
        answers = []
        for input_text, docid in zip(examples[input_key], examples["document_id"]):
            company, year, page = docid.split("/")

            prompt = prefix + f"{company} in {year}, {input_text}"
            prompts.append(prompt)

            if use_cot:
                answer = (
                    f"This is about {company} in the year {year} and it is on page {page}. "
                    f"Therefore, the final answer is {docid}."
                )
            else:
                answer = docid
            answers.append(answer)

        tokenized = tokenize(prompts, answers)
        return tokenized
    return process_examples


# Process documents for indexing
raw_documents_ds = load_dataset("csv", data_files=DATA_DOCUMENTS, split="train")
documents_ds = raw_documents_ds.map(
    build_process_fn(True, COT),
    remove_columns=raw_documents_ds.column_names,
    num_proc=num_cpus,
    batched=True,
    batch_size=min(1, len(raw_documents_ds) // num_cpus)
)

# Process data for retrieval (train, valid, test)
file_mapping = {
    "train": DATA_TRAIN_PROC,
    "eval": DATA_EVAL_PROC,
    "test": DATA_TEST_PROC,
}

# Process queries for retrieval
raw_data_ds = load_dataset("csv", data_files=file_mapping)
tokenized_ds = raw_data_ds.map(
    build_process_fn(False, COT),
    remove_columns=raw_data_ds["train"].column_names,
    num_proc=num_cpus,
    batched=True,
    batch_size=min(1, len(raw_data_ds["eval"]) // num_cpus)
)

# Merge the indexing stage into the train split
tokenized_ds["train"] = concatenate_datasets([tokenized_ds["train"], documents_ds])

# Reduce data size for all splits
if DEBUG:
    for split in SPLITS:
        tokenized_ds[split] = tokenized_ds[split].select(range(DEBUG_SIZE))
        raw_data_ds[split] = raw_data_ds[split].select(range(DEBUG_SIZE))


# MODEL SETUP

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    cache_dir=MODELS_DIR,
    torch_dtype="auto",
    local_files_only=True,  # Change for first time downloads
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
)

# Fix for gradient checkpoints
model.config.use_cache = False
model.enable_input_require_grads()

model = prepare_model_for_kbit_training(model)

# LoRA config (QLoRA + OLoRA)
lora_config = LoraConfig(
    init_lora_weights="olora",
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)

# Print model statistics
# Code from model.print_trainable_parameters()
trainable_params, all_param = model.get_nb_trainable_parameters()

logger.info(
    f"trainable params: {trainable_params:,d} || "
    f"all params: {all_param:,d} || "
    f"trainable%: {100 * trainable_params / all_param:.4f}"
)

logger.info(f"Memory footprint: {model.get_memory_footprint():,}")


[2025-05-30 20:04:12,225] [INFO] [307873862.py:58:<module>] Script started...
[2025-05-30 20:04:12,226] [INFO] [307873862.py:74:<module>] Using model: google/flan-t5-base
[2025-05-30 20:04:12,227] [INFO] [307873862.py:77:<module>] Output location: /home/nub/Bachelor/bachelor-thesis/models/finqa_full_cot
[2025-05-30 20:04:12,227] [INFO] [307873862.py:81:<module>] Using 1 CPU core(s)
[2025-05-30 20:04:12,227] [INFO] [307873862.py:84:<module>] Using 1 CUDA device(s)


Map:   0%|          | 0/2789 [00:00<?, ? examples/s]

True ['Retrieve the document id: Document: ADI in 2009, interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

False ['Answer the question with a document id: Question: ADI in 2009, what is the the interest expense in 2009?'] ['ADI/2009/49']


Map:   0%|          | 0/883 [00:00<?, ? examples/s]

Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

[2025-05-30 20:04:27,764] [INFO] [307873862.py:236:<module>] trainable params: 6,782,976 || all params: 254,360,832 || trainable%: 2.6667
[2025-05-30 20:04:27,770] [INFO] [307873862.py:242:<module>] Memory footprint: 455,930,880


In [21]:
model.config

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
 

In [30]:
import pandas as pd

count = []

for item in documents_ds["input_ids"]:
    count.append(len(item))

count_pd = pd.Series(count)
print(count_pd.describe())

count    2789.000000
mean      956.324131
std       346.305001
min       108.000000
25%       756.000000
50%       935.000000
75%      1138.000000
max      3477.000000
dtype: float64


In [None]:
import pandas as pd

count = {split: [] for split in SPLITS}
count_pd = {}

for split in SPLITS:
    for item in tokenized_ds[split]["input_ids"]:
        count[split].append(len(item))
    
    count_pd[split] = pd.Series(count[split])
    print(count_pd[split].describe())

# mask = count_pd > 512


count    9040.000000
mean      319.019801
std       467.195520
min        19.000000
25%        31.000000
50%        38.000000
75%       688.000000
max      3477.000000
dtype: float64
count    883.000000
mean      34.375991
std        7.738144
min       20.000000
25%       29.000000
50%       32.000000
75%       38.000000
max       74.000000
dtype: float64
count    1147.000000
mean       34.732345
std         7.629443
min        21.000000
25%        29.000000
50%        33.000000
75%        39.000000
max        75.000000
dtype: float64


In [22]:
import re

def contains_whole_word(word, text):
    # Strip whitespace from both
    word = word.strip()
    text = text.strip()
    
    # Build a regex pattern that matches the exact word as a whole word
    pattern = rf'(?i)\b{re.escape(word)}\b'  # (?i) for case-insensitive, \b for word boundaries

    # Search using regex
    return bool(re.search(pattern, text))

decoded_preds = ["hi/12/34", "the answer is Cd/34/56", "fe/78/90"]
decoded_labels = ["ab/12/34", "cD/34/56", "FE/78/90"]
matches = [
        contains_whole_word(label, pred)
        for pred, label in zip(decoded_preds, decoded_labels)
    ]
accuracy = sum(matches) / len(matches)
print(accuracy)

0.6666666666666666


In [33]:
total = []
for i in range(len(tokenized_ds)):
    total.append(0 + torch.count_nonzero(tokenized_ds[i]["labels"]).numpy())
pd.Series(total).describe()


count    8281.000000
mean       11.812342
std         0.672473
min         9.000000
25%        11.000000
50%        12.000000
75%        12.000000
max        14.000000
dtype: float64

In [4]:
table = "year | 2020 | 2021 \n movies | 12 | 23 \n games | 67 | 54"

# document = data["table"][0]
# input_text = f"You are a highly intelligent bot. How many games were there in 2020? Here is the table: \n {table}"
# input_text = f"You are a highly intelligent bot. The following text is made with HTML. What is second sentence:\n <h1>I love walking on the beach.</h1><p>My dog is fat.</p><h2>He was yelling at a tree.</h2>"
# input_text = f"Let's think step by step. Generate the document ID for the document:\nI think we should go to the beach. Afterwards, we can go the cinema to watch a movie."
input_text = f"{train_df['id'][0]}"
print(input_text)

ADI/2009/page_49.pdf-1


In [5]:
model.eval()

inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=8192
)

generated_ids = model.generate(
    inputs.input_ids.to(model.device),
    attention_mask=inputs.attention_mask.to(model.device),
    max_length=512
)

output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)


ADI/2009/page_49.pdf-1


In [None]:
def build_process_fn(indexing: bool, use_cot: bool):
    if indexing:
        if use_cot:
            prefix = "Retrieve the document id by reasoning step-by-step: Document: "
        else:
            prefix = "Retrieve the document id: Document: "

        input_key = "document"
    else:
        if use_cot:
            prefix = "Answer the question with a document id by reasoning step-by-step: Question: "
        else:
            prefix = "Answer the question with a document id: Question: "

        input_key = "question"

    def process_examples(examples):
        prompts = []
        answers = []
        for example in examples:
            input_value = example[input_key]
            docid = example["document_id"]
            company, year, page = docid.split("/")

            prompt = prefix + f"{company} in {year}, {input_value}"
            prompts.append(prompt)

            if use_cot:
                answer = (
                    f"This is about {company} in the year {year} and it is on page {page}. "
                    f"Therefore, the final answer is {docid}."
                )
            else:
                answer = docid
            answers.append(answer)

        # tokenized = tokenize(prompts, answers)
        print(prompts)
        print(answers)
        return
    return process_examples

cot = True

doc_func = build_process_fn(True, cot)
query_func = build_process_fn(False, cot)

doc = ["DOCUMENT BLA bla DOC"]
query = ["QUESTION quest?"]
docid = ["COMP/2000/12"]

example = {"document": doc, "question": query, "document_id": docid}

doc_func(example)
query_func(example)

TypeError: string indices must be integers, not 'str'