In [None]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer

In [2]:
# Checkpoint identifier and the local directory used as the download cache.
model_name = "google/flan-t5-large"
_hf_cache_dir = "/home/nub/Bachelor/bachelor-thesis/models"

# Tokenizer and seq2seq model are loaded from the same checkpoint and cache.
tokenizer = T5Tokenizer.from_pretrained(model_name, cache_dir=_hf_cache_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=_hf_cache_dir)
# Move the loaded model to the CUDA device.
model = model.to(device="cuda")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[2025-05-10 21:50:50,963] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# Directories (relative paths) for the raw and the processed dataset files.
DATA_DIR_RAW = "data/raw"
DATA_DIR_PROC = "data/processed"

# Maximum token counts passed to the tokenizer for prompts and for targets.
MAX_INPUT_LENGTH = 4096
MAX_TARGET_LENGTH = 64

# Columns kept in the processed data frame, in output order.
USED_COLUMNS = [
    "full_text",
    "table",
    "id",
    "question",
    "answer",
    "exe_ans",
    "steps",
    "program",
    "program_re",
]


def convert_table(table: list[list[str]]) -> str:
    """Convert a nested table structure to CSV text.

    Args:
        table: Table rows as lists of cell values. The first row supplies
            the column names, every following row is a data row.

    Returns:
        The table serialised as CSV without an index column. An empty
        table yields an empty string.
    """
    # Without a first row there is no header to unpack, so return early
    # instead of failing on the unpacking below.
    if not table:
        return ""

    header, *rows = table
    return pd.DataFrame(rows, columns=header).to_csv(index=False)


def reformat_data(file_name: str) -> pd.DataFrame:
    """Reformat one FinQA split, store it as CSV and return it.

    The JSON file is read from ``os.path.join(DATA_DIR_RAW, file_name)``;
    because of how ``os.path.join`` works, an absolute ``file_name`` is read
    as-is. The nested ``qa`` dicts are expanded into columns, ``pre_text``
    and ``post_text`` are joined into single strings, ``table`` is converted
    to CSV text, and everything is combined into a ``full_text`` column.

    Args:
        file_name: Name (or absolute path) of the JSON file to read.

    Returns:
        The reformatted frame restricted to ``USED_COLUMNS``. The same frame
        is written to ``DATA_DIR_PROC`` under the input's base name with a
        ``.csv`` extension.
    """
    raw_df = pd.read_json(os.path.join(DATA_DIR_RAW, file_name))

    # Unnest the question data: every key of the "qa" dicts becomes a column.
    qa_df = pd.DataFrame(raw_df["qa"].to_dict()).T
    raw_df = pd.concat([raw_df, qa_df], axis="columns")

    raw_df = raw_df.assign(
        pre_text=raw_df["pre_text"].map(" ".join),
        post_text=raw_df["post_text"].map(" ".join),
        table=raw_df["table"].map(convert_table),
    )

    # Separate pre_text and post_text with a space so the last word of one
    # is not fused with the first word of the other.
    raw_df["full_text"] = (
        raw_df["pre_text"] + " " + raw_df["post_text"] + "\nThis is a table:\n" + raw_df["table"]
    )

    # Drop the unused columns
    df = raw_df[USED_COLUMNS]

    # Only the base name is used for the output: joining an absolute
    # file_name onto DATA_DIR_PROC would resolve to the input path itself
    # and overwrite the source JSON with CSV content.
    os.makedirs(DATA_DIR_PROC, exist_ok=True)
    out_name = os.path.splitext(os.path.basename(file_name))[0] + ".csv"
    df.to_csv(os.path.join(DATA_DIR_PROC, out_name))

    # Callers assign the result, so hand the processed frame back.
    return df


def create_documents_data(train_df: pd.DataFrame, valid_df: pd.DataFrame, test_df: pd.DataFrame):
    """Stack the document columns of all three splits into one frame.

    Only ``full_text`` and ``id`` are kept. Rows appear in train, valid,
    test order and keep the index labels of their source frame.
    """
    keep = ["full_text", "id"]
    parts = [split[keep] for split in (train_df, valid_df, test_df)]
    return pd.concat(parts, axis=0)


def prepare_sample(data: pd.Series, tokenizer: T5Tokenizer):
    """Tokenize one document row into model inputs with labels.

    Args:
        data: Row providing ``full_text`` (the document) and ``id`` (the
            target document ID).
        tokenizer: Tokenizer applied to both the prompt and the target.

    Returns:
        The tokenizer output for the prompt, padded/truncated to
        ``MAX_INPUT_LENGTH``, with an added ``labels`` entry holding the
        token ids of the ID padded/truncated to ``MAX_TARGET_LENGTH``.
    """
    prompt = f"Generate the document ID for this text:\n{data['full_text']}"

    inputs = tokenizer(
        prompt,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )
    targets = tokenizer(
        data["id"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Attach the target ids and return the encoding; previously both
    # encodings were built and then discarded.
    # NOTE(review): padded label positions keep the pad token id. If these
    # samples feed a training loss, confirm whether they should be masked
    # (e.g. with -100).
    inputs["labels"] = targets["input_ids"]
    return inputs


# All three split files live in the same dataset directory.
_finqa_dir = "/home/nub/Bachelor/bachelor-thesis/FinQA-main/dataset"

train_df = reformat_data(_finqa_dir + "/train.json")
valid_df = reformat_data(_finqa_dir + "/dev.json")
test_df = reformat_data(_finqa_dir + "/test.json")

# Pool the document columns of every split into a single frame.
documents_df = create_documents_data(train_df, valid_df, test_df)

In [21]:
# Whitespace-separated word count per row: pre_text + post_text + all table cells.
lengths = [
    len(data["pre_text"][i].split())
    + len(data["post_text"][i].split())
    + len(" ".join(cell for row in data["table"][i] for cell in row).split())
    for i in data.index
]

pd.Series(lengths).describe()

count    6251.000000
mean      894.160934
std       292.695415
min       151.000000
25%       718.000000
50%       870.000000
75%      1050.000000
max      3713.000000
dtype: float64

In [288]:
# Print the `full_text` value stored under key 0 of `data`.
# NOTE(review): `data` is not defined in the visible cells — confirm which frame it refers to.
print(data["full_text"][0])

interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorable movement in foreign cur

In [8]:
def preprocess(example):
    """Tokenize a text/summary example into padded inputs with labels.

    Uses the module-level ``tokenizer``. ``example["text"]`` is encoded with
    a limit of 8192 tokens and ``example["summary"]`` with a limit of 512;
    both are truncated and padded to their limit. The summary's token ids
    are stored under ``labels`` in the returned text encoding.
    """
    shared_options = {"truncation": True, "padding": "max_length"}
    model_inputs = tokenizer(example["text"], max_length=8192, **shared_options)
    label_encoding = tokenizer(example["summary"], max_length=512, **shared_options)
    model_inputs["labels"] = label_encoding["input_ids"]
    return model_inputs

In [23]:
# Print the `pre_text` value stored under key 0 of `data`.
# NOTE(review): `data` is not defined in the visible cells — confirm which frame it refers to.
print(data["pre_text"][0])

interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorable movement in foreign cur

In [25]:
# Small hand-written table in a pipe-separated layout; within this cell it is
# only referenced by one of the commented-out prompts below.
table = "year | 2020 | 2021 \n movies | 12 | 23 \n games | 67 | 54"

# Earlier prompt experiments, kept for reference (all disabled):
# document = data["table"][0]
# input_text = f"You are a highly intelligent bot. How many games were there in 2020? Here is the table: \n {table}"
# input_text = f"You are a highly intelligent bot. The following text is made with HTML. What is second sentence:\n <h1>I love walking on the beach.</h1><p>My dog is fat.</p><h2>He was yelling at a tree.</h2>"
# input_text = f"Let's think step by step. Generate the document ID for the document:\nI think we should go to the beach. Afterwards, we can go the cinema to watch a movie."
# Active prompt: just the `id` value stored under key 0 of `train_df`, formatted as a string.
input_text = f"{train_df['id'][0]}"
print(input_text)

ADI/2009/page_49.pdf-1


In [26]:
# Put the model into evaluation mode before generating.
model.eval()

# Encode the prompt as PyTorch tensors, truncated to at most 8192 tokens.
inputs = tokenizer(input_text, max_length=8192, truncation=True, return_tensors="pt")

# Run generation on the model's own device with max_length=512.
target_device = model.device
generated_ids = model.generate(
    inputs["input_ids"].to(target_device),
    attention_mask=inputs["attention_mask"].to(target_device),
    max_length=512,
)

# Decode the first generated sequence, dropping special tokens.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)


ADI/2009/page_49.pdf-1


In [42]:
# Number of whitespace-separated words in the decoded model output.
len(output.split())

343