In [None]:
import os
import sys
import logging
import traceback
import multiprocessing
import json
import re
import numpy as np
import torch
import torch.distributed as dist
from torch.optim.lr_scheduler import ReduceLROnPlateau
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from peft.optimizers import create_loraplus_optimizer
import bitsandbytes as bnb

from config import (
    DATA_DOCUMENTS,
    DATA_TRAIN_PROC,
    DATA_EVAL_PROC,
    DATA_TEST_PROC,
    MODELS_DIR
)

DATA_DOCUMENTS = "/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv"
DATA_TRAIN_PROC = "/home/nub/Bachelor/bachelor-thesis/data/processed/train.csv"
DATA_EVAL_PROC = "/home/nub/Bachelor/bachelor-thesis/data/processed/eval.csv"
DATA_TEST_PROC = "/home/nub/Bachelor/bachelor-thesis/data/processed/test.csv"
MODELS_DIR = "/home/nub/Bachelor/bachelor-thesis/models"

# ENVIRONMENT SETUP

# Set logging
logging.basicConfig(
    format="[%(asctime)s] [%(levelname)s] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# from transformers.utils import logging as hf_logging
# hf_logging.set_verbosity_info()

logger.info("Script started...")

# Enable debug to drastically reduce values
DEBUG = False
DEBUG_SIZE = 4
SPLITS = ["train", "eval", "test"]

COT = False

SEED = 42
BATCH_SIZE = min(4, DEBUG_SIZE) if DEBUG else 16
ACCUMULATION_STEPS = 1 if DEBUG else 2
LEARNING_RATE = 4e-4
EPOCHS = 100

MODEL_NAME = "google/flan-t5-base"
logger.info(f"Using model: {MODEL_NAME}")

OUTPUT_DIR = os.path.join(MODELS_DIR, "finqa_full_cot")
logger.info(f"Output location: {OUTPUT_DIR}")

# Detect number of CPUs and GPUs
num_cpus = 1 # int(os.getenv("SLURM_JOB_CPUS_PER_NODE", multiprocessing.cpu_count()))
logger.info(f"Using {num_cpus} CPU core(s)")

num_gpus = torch.cuda.device_count()
logger.info(f"Using {num_gpus} CUDA device(s)")


# DATA PREPROCESSING

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=MODELS_DIR,
    use_fast=True
)


def tokenize(prompt, target):
    model_inputs = tokenizer(
        prompt,
        truncation=True,
        max_length=512,  # Model will silently truncate above 512
    )
    labels = tokenizer(
        text_target=target,
        truncation=True,
        max_length=512,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def build_process_fn(indexing: bool, use_cot: bool):
    if indexing:
        if use_cot:
            prefix = "Retrieve the document id by reasoning step-by-step: Document: "
        else:
            prefix = "Retrieve the document id: Document: "

        input_key = "document"
    else:
        if use_cot:
            prefix = "Answer the question with a document id by reasoning step-by-step: Question: "
        else:
            prefix = "Answer the question with a document id: Question: "

        input_key = "question"

    def process_examples(examples):
        prompts = []
        answers = []
        for input_text, docid in zip(examples[input_key], examples["document_id"]):
            company, year, page = docid.split("/")

            prompt = prefix + f"{company} in {year}, {input_text}"
            prompts.append(prompt)

            if use_cot:
                answer = (
                    f"This is about {company} in the year {year} and it is on page {page}. "
                    f"Therefore, the final answer is {docid}."
                )
            else:
                answer = docid
            answers.append(answer)

        tokenized = tokenize(prompts, answers)
        return tokenized
    return process_examples


# Process documents for indexing
raw_documents_ds = load_dataset("csv", data_files=DATA_DOCUMENTS, split="train")
documents_ds = raw_documents_ds.map(
    build_process_fn(True, COT),
    remove_columns=raw_documents_ds.column_names,
    num_proc=num_cpus,
    batched=True,
    batch_size=min(1, len(raw_documents_ds) // num_cpus)
)

# Process data for retrieval (train, valid, test)
file_mapping = {
    "train": DATA_TRAIN_PROC,
    "eval": DATA_EVAL_PROC,
    "test": DATA_TEST_PROC,
}

# Process queries for retrieval
raw_data_ds = load_dataset("csv", data_files=file_mapping)
tokenized_ds = raw_data_ds.map(
    build_process_fn(False, COT),
    remove_columns=raw_data_ds["train"].column_names,
    num_proc=num_cpus,
    batched=True,
    batch_size=min(1, len(raw_data_ds["eval"]) // num_cpus)
)

# Merge the indexing stage into the train split
tokenized_ds["train"] = concatenate_datasets([tokenized_ds["train"], documents_ds])

# Reduce data size for all splits
if DEBUG:
    for split in SPLITS:
        tokenized_ds[split] = tokenized_ds[split].select(range(DEBUG_SIZE))
        raw_data_ds[split] = raw_data_ds[split].select(range(DEBUG_SIZE))


# MODEL SETUP

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    cache_dir=MODELS_DIR,
    torch_dtype="auto",
    local_files_only=True,  # Change for first time downloads
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
)

# Fix for gradient checkpoints
model.config.use_cache = False
model.enable_input_require_grads()

model = prepare_model_for_kbit_training(model)

# LoRA config (QLoRA + OLoRA)
lora_config = LoraConfig(
    init_lora_weights="olora",
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)

# Print model statistics
# Code from model.print_trainable_parameters()
trainable_params, all_param = model.get_nb_trainable_parameters()

logger.info(
    f"trainable params: {trainable_params:,d} || "
    f"all params: {all_param:,d} || "
    f"trainable%: {100 * trainable_params / all_param:.4f}"
)

logger.info(f"Memory footprint: {model.get_memory_footprint():,}")


[2025-05-30 20:04:12,225] [INFO] [307873862.py:58:<module>] Script started...
[2025-05-30 20:04:12,226] [INFO] [307873862.py:74:<module>] Using model: google/flan-t5-base
[2025-05-30 20:04:12,227] [INFO] [307873862.py:77:<module>] Output location: /home/nub/Bachelor/bachelor-thesis/models/finqa_full_cot
[2025-05-30 20:04:12,227] [INFO] [307873862.py:81:<module>] Using 1 CPU core(s)
[2025-05-30 20:04:12,227] [INFO] [307873862.py:84:<module>] Using 1 CUDA device(s)


Map:   0%|          | 0/2789 [00:00<?, ? examples/s]

True ['Retrieve the document id: Document: ADI in 2009, interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

False ['Answer the question with a document id: Question: ADI in 2009, what is the the interest expense in 2009?'] ['ADI/2009/49']


Map:   0%|          | 0/883 [00:00<?, ? examples/s]

Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

[2025-05-30 20:04:27,764] [INFO] [307873862.py:236:<module>] trainable params: 6,782,976 || all params: 254,360,832 || trainable%: 2.6667
[2025-05-30 20:04:27,770] [INFO] [307873862.py:242:<module>] Memory footprint: 455,930,880


In [21]:
model.config

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
 

In [30]:
import pandas as pd

count = []

for item in documents_ds["input_ids"]:
    count.append(len(item))

count_pd = pd.Series(count)
print(count_pd.describe())

count    2789.000000
mean      956.324131
std       346.305001
min       108.000000
25%       756.000000
50%       935.000000
75%      1138.000000
max      3477.000000
dtype: float64


In [None]:
import pandas as pd

count = {split: [] for split in SPLITS}
count_pd = {}

for split in SPLITS:
    for item in tokenized_ds[split]["input_ids"]:
        count[split].append(len(item))
    
    count_pd[split] = pd.Series(count[split])
    print(count_pd[split].describe())

# mask = count_pd > 512


count    9040.000000
mean      319.019801
std       467.195520
min        19.000000
25%        31.000000
50%        38.000000
75%       688.000000
max      3477.000000
dtype: float64
count    883.000000
mean      34.375991
std        7.738144
min       20.000000
25%       29.000000
50%       32.000000
75%       38.000000
max       74.000000
dtype: float64
count    1147.000000
mean       34.732345
std         7.629443
min        21.000000
25%        29.000000
50%        33.000000
75%        39.000000
max        75.000000
dtype: float64


In [None]:
import re
import pandas as pd

DATA_TRAIN_RAW = "/home/nub/Bachelor/bachelor-thesis/data/raw/train.json"
DATA_EVAL_RAW = "/home/nub/Bachelor/bachelor-thesis/data/raw/dev.json"
DATA_TEST_RAW = "/home/nub/Bachelor/bachelor-thesis/data/raw/test.json"

USED_COLUMNS = [
    "document",
    "document_id",
    "question",
    "answer",
    "exe_ans",
    "steps",
    "program",
    "program_re",
]

pattern = re.compile(r'(.*)/page_(\d+)\.pdf')


def convert_filename(path: str):
    """Convert filename into id (ABC/2010/page_12.pdf) -> (ABC/2010/12)"""
    match = pattern.search(path)
    if match:
        return f"{match.group(1)}/{match.group(2)}"
    return path


def convert_table(table: list[list[str]]):
    """Convert nested table structure to csv."""
    header, *rows = table
    df = pd.DataFrame(rows, columns=header)
    return df.to_csv(index=False)

def reformat_data(input_file_path: str, output_file_path: str):
    """Reformat the FinQA dataset."""
    raw_df = pd.read_json(input_file_path)

    # Unnest the question data
    qa_df = pd.DataFrame(raw_df["qa"].to_dict()).T
    raw_df = pd.concat([raw_df, qa_df], axis="columns")

    # Format into plain text
    raw_df.loc[:, "pre_text"] = raw_df["pre_text"].map(" ".join)
    raw_df.loc[:, "post_text"] = raw_df["post_text"].map(" ".join)
    raw_df.loc[:, "table"] = raw_df["table"].map(convert_table)

    raw_df.loc[:, "document"] = (
        (raw_df["pre_text"] + " ")
        + raw_df["post_text"]
        + ("\nTable:\n" + raw_df["table"])
    )

    # Drop the unused columns
    raw_df["filename"] = raw_df["filename"].apply(convert_filename)
    raw_df.rename(columns={"filename": "document_id"}, inplace=True)
    df = raw_df[USED_COLUMNS]

    # df.to_csv(output_file_path, index=False)
    return df

def create_documents_data(
    train_df: pd.DataFrame,
    eval_df: pd.DataFrame,
    test_df: pd.DataFrame,
    output_file_path: str,
):
    """Create document and docid data."""
    document_columns = ["document", "document_id"]
    documents_df = pd.concat(
        [
            train_df[document_columns],
            eval_df[document_columns],
            test_df[document_columns],
        ],
        axis="index",
    )
    # documents_df["gold_inds"] = documents_df["gold_inds"].apply(len)
    documents_df.drop_duplicates(inplace=True)

    # documents_df.to_csv(output_file_path, index=False)
    return documents_df

train_df = reformat_data(DATA_TRAIN_RAW, "DATA_TRAIN_PROC")
eval_df = reformat_data(DATA_EVAL_RAW, "DATA_EVAL_PROC")
test_df = reformat_data(DATA_TEST_RAW, "DATA_TEST_PROC")

documents_df = create_documents_data(train_df, eval_df, test_df, "DATA_DOCUMENTS")

In [None]:
data_ds["train"] = concatenate_datasets([data_ds["train"], documents_ds])

In [22]:
import re

def contains_whole_word(word, text):
    # Strip whitespace from both
    word = word.strip()
    text = text.strip()
    
    # Build a regex pattern that matches the exact word as a whole word
    pattern = rf'(?i)\b{re.escape(word)}\b'  # (?i) for case-insensitive, \b for word boundaries

    # Search using regex
    return bool(re.search(pattern, text))

decoded_preds = ["hi/12/34", "the answer is Cd/34/56", "fe/78/90"]
decoded_labels = ["ab/12/34", "cD/34/56", "FE/78/90"]
matches = [
        contains_whole_word(label, pred)
        for pred, label in zip(decoded_preds, decoded_labels)
    ]
accuracy = sum(matches) / len(matches)
print(accuracy)

0.6666666666666666


In [30]:
import traceback
try:
    raise Exception("aaaaa")
except Exception as e:
    a = traceback.format_exc()
    print(a)

Traceback (most recent call last):
  File "/tmp/ipykernel_25272/546137021.py", line 3, in <module>
    raise Exception("aaaaa")
Exception: aaaaa



In [29]:
data_ds["train"] = con

In [30]:
data_ds["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11829
})

In [None]:
import os
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import multiprocessing
from config import *

num_cpus = int(os.getenv("SLURM_CPUS_PER_TASK", multiprocessing.cpu_count()))

MODEL_NAME = "google/flan-t5-xl"
OUTPUT_DIR = os.path.join(MODELS_DIR, "finqa_indexer")
DS_CONFIG = "./ds_config.json"

train_batch_size = 2
gradient_accumulation_steps = 8

# duplicate questions

# sinfo -p gpu_h100 -N -o "%N %t %C"
# scontrol show node gcn150



#SBATCH --mail-type=ALL
#SBATCH --mail-user=steven.dong@student.uva.nl

        # "offload_optimizer": {
        #     "device": "cpu",
        #     "pin_memory": true
        # },
        # "allgather_partitions": true,
        # "allgather_bucket_size": 2e8,
        # "overlap_comm": true,
        # "reduce_scatter": true,
        # "reduce_bucket_size": 2e8,
        # "contiguous_gradients": true,
        # "round_robin_gradients": true

[2025-05-17 00:12:19,352] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/nub/miniconda3/envs/venv_glen/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/nub/miniconda3/envs/venv_glen/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /home/nub/miniconda3/envs/venv_glen/lib/libcufile.so: undefined reference to `pthread_rwlock_trywrlock@GLIBC_2.2.5'
/home/nub/miniconda3/envs/venv_glen/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /home/nub/miniconda3/envs/venv_glen/lib/libcufile.so: undefined reference to `pthread_getspecific@GLIBC_2.2.5'
/home/nub/miniconda3/envs/venv_glen/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /home/nub/miniconda3/envs/venv_glen/lib/libcufile.so: undefined reference to `pthread_rwlock_timedrdlock@GLIBC_2.2.5'
/home/nub/miniconda3/envs/venv_glen/bin/../lib/gcc/x86_64-conda-linux-g

In [2]:
model_name = "google/flan-t5-large"
tokenizer  = T5Tokenizer.from_pretrained(model_name, cache_dir="/home/nub/Bachelor/bachelor-thesis/models", use_fast=True)
model      = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir="/home/nub/Bachelor/bachelor-thesis/models", device_map="auto", local_files_only=True,
    low_cpu_mem_usage=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
tokenizer.unk_token_id

2

In [13]:
labels = torch.tensor([1, 2, -100, 3, 4, -100, -100, 5, 0])
output = torch.where(labels != -100, labels, tokenizer.pad_token_id)
output

tensor([1, 2, 0, 3, 4, 0, 0, 5, 0])

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="adamw_torch",
    learning_rate=1e-5,
    num_train_epochs=2,
    bf16=True,
    deepspeed=DS_CONFIG,
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="no",
    save_total_limit=2,
    load_best_model_at_end=False,
    predict_with_generate=False,
)

In [3]:
raw_ds = load_dataset("csv", data_files="/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv", split="train")

In [4]:
def preprocess_fn(example):
    inputs = tokenizer(
        example["full_text"],
        truncation=True,
        max_length=4096,
    )
    targets = tokenizer(
        example["id"],
        truncation=True,
        max_length=32,
    )
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids,
    }


# Map & set format for PyTorch
tokenized_ds = raw_ds.map(
    preprocess_fn,
    remove_columns=raw_ds.column_names,
    num_proc=num_cpus
)
tokenized_ds.set_format(type="torch")

Map (num_proc=16):   0%|          | 0/8281 [00:00<?, ? examples/s]

In [6]:
token = tokenizer(
        "I am walking to the store",
        truncation=True,
        max_length=16,
    )
token

{'input_ids': [27, 183, 3214, 12, 8, 1078, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokenized_ds["input_ids"]

[tensor([ 1046,  1080,    12,     3,     9,  7660,  1046,  1080,     3,   390,
            30,     8,   386,    18,  7393,     3,  6856,   127,   303,  6864,
          2712,    41,  1682,  3076,     3,  1454,     3,    61,    41,     3,
         18561,  5988,    41,  1682,  3710,     3,  1454,     3,    61,    38,
            13,     3,    32,    75,   235,  1152,  2664,     3,     6,  2464,
             3,    61,     3,     5,     3,    99,     3,  6856,   127,  1112,
            57,   910,  1873,   979,     3,     6,    69,  2041,  1046,  8225,
           133,   483,    57,  1514,     3, 26195,   770,     3,     5,  2959,
          7481,  4773,    38,    72,  1540,  3028,    16,  2232,   204,    23,
             3,     5,    16,     8,  3358,    12,     3, 27356,   981,  6643,
          6966,    16,  2118,   505,    13,    48,  2041,   934,    30,   607,
          9445,   157,     3,     6,    62,  3842, 18179,    69,   529,    18,
            76,     5,     7,     3,     5,  6816,  

In [33]:
total = []
for i in range(len(tokenized_ds)):
    total.append(0 + torch.count_nonzero(tokenized_ds[i]["labels"]).numpy())
pd.Series(total).describe()


count    8281.000000
mean       11.812342
std         0.672473
min         9.000000
25%        11.000000
50%        12.000000
75%        12.000000
max        14.000000
dtype: float64

In [13]:
DATA_DIR_RAW = "data/raw"
DATA_DIR_PROC = "data/processed"

MAX_INPUT_LENGTH = 4096
MAX_TARGET_LENGTH = 64
USED_COLUMNS = ["full_text", "table", "id", "question", "answer", "exe_ans", "steps", "program", "program_re"]


def convert_table(table: list[list[str]]):
    """Convert nested table structure to csv."""
    header, *rows = table
    df = pd.DataFrame(rows, columns=header)
    return df.to_csv(index=False)


def reformat_data(file_name: str):
    """Reformat the FinQA dataset."""
    raw_df = pd.read_json(os.path.join(DATA_DIR_RAW, file_name))
    
    # Unnest the question data
    qa_df = pd.DataFrame(raw_df["qa"].to_dict()).T
    raw_df = pd.concat([raw_df, qa_df], axis="columns")
    
    
    raw_df.loc[:, "pre_text"] = raw_df["pre_text"].map(" ".join)
    raw_df.loc[:, "post_text"] = raw_df["post_text"].map(" ".join)
    raw_df.loc[:, "table"] = raw_df["table"].map(convert_table)

    raw_df.loc[:, "full_text"] = raw_df["pre_text"] + raw_df["post_text"] + "\nThis is a table:\n" + raw_df["table"]
    
    # Drop the unused columns
    # df = raw_df[USED_COLUMNS]
    # df.to_csv(os.path.join(DATA_DIR_PROC, file_name))
    return raw_df


def create_documents_data(train_df: pd.DataFrame, valid_df: pd.DataFrame, test_df: pd.DataFrame):
    document_columns = ["full_text", "id"]
    documents_df = pd.concat([train_df[document_columns], valid_df[document_columns], test_df[document_columns]], axis="index")
    return documents_df


def prepare_sample(data: pd.Series, tokenizer: T5Tokenizer):
    prompt = f"Generate the document ID for this text:\n{data['full_text']}"
    
    inputs = tokenizer(
        prompt,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )
    targets = tokenizer(
        data["id"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )


train_df = reformat_data("/home/nub/Bachelor/bachelor-thesis/data/raw/train.json")
valid_df = reformat_data("/home/nub/Bachelor/bachelor-thesis/data/raw/dev.json")
test_df = reformat_data("//home/nub/Bachelor/bachelor-thesis/data/raw/test.json")
documents_df = create_documents_data(train_df, valid_df, test_df)

In [17]:
train_df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/train.csv")
valid_df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/valid.csv")
test_df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/test.csv")
documents_df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv")

In [18]:
lengths = []

df = documents_df

for i in df.index:
    lengths.append(len(df["full_text"][i].split()))

pd.Series(lengths).describe()

count    8281.000000
mean      673.093225
std       255.771629
min        24.000000
25%       535.000000
50%       667.000000
75%       811.000000
max      2674.000000
dtype: float64

In [288]:
print(data["full_text"][0])

interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorable movement in foreign cur

In [8]:
def preprocess(example):
    inputs  = tokenizer(example["text"], truncation=True, max_length=8192, padding="max_length")
    targets = tokenizer(example["summary"], truncation=True, max_length=512, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

In [23]:
print(data["pre_text"][0])

interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorable movement in foreign cur

In [4]:
table = "year | 2020 | 2021 \n movies | 12 | 23 \n games | 67 | 54"

# document = data["table"][0]
# input_text = f"You are a highly intelligent bot. How many games were there in 2020? Here is the table: \n {table}"
# input_text = f"You are a highly intelligent bot. The following text is made with HTML. What is second sentence:\n <h1>I love walking on the beach.</h1><p>My dog is fat.</p><h2>He was yelling at a tree.</h2>"
# input_text = f"Let's think step by step. Generate the document ID for the document:\nI think we should go to the beach. Afterwards, we can go the cinema to watch a movie."
input_text = f"{train_df['id'][0]}"
print(input_text)

ADI/2009/page_49.pdf-1


In [5]:
model.eval()

inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=8192
)

generated_ids = model.generate(
    inputs.input_ids.to(model.device),
    attention_mask=inputs.attention_mask.to(model.device),
    max_length=512
)

output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)


ADI/2009/page_49.pdf-1


In [None]:
def build_process_fn(indexing: bool, use_cot: bool):
    if indexing:
        if use_cot:
            prefix = "Retrieve the document id by reasoning step-by-step: Document: "
        else:
            prefix = "Retrieve the document id: Document: "

        input_key = "document"
    else:
        if use_cot:
            prefix = "Answer the question with a document id by reasoning step-by-step: Question: "
        else:
            prefix = "Answer the question with a document id: Question: "

        input_key = "question"

    def process_examples(examples):
        prompts = []
        answers = []
        for example in examples:
            input_value = example[input_key]
            docid = example["document_id"]
            company, year, page = docid.split("/")

            prompt = prefix + f"{company} in {year}, {input_value}"
            prompts.append(prompt)

            if use_cot:
                answer = (
                    f"This is about {company} in the year {year} and it is on page {page}. "
                    f"Therefore, the final answer is {docid}."
                )
            else:
                answer = docid
            answers.append(answer)

        # tokenized = tokenize(prompts, answers)
        print(prompts)
        print(answers)
        return
    return process_examples

cot = True

doc_func = build_process_fn(True, cot)
query_func = build_process_fn(False, cot)

doc = ["DOCUMENT BLA bla DOC"]
query = ["QUESTION quest?"]
docid = ["COMP/2000/12"]

example = {"document": doc, "question": query, "document_id": docid}

doc_func(example)
query_func(example)

TypeError: string indices must be integers, not 'str'