In [27]:
import os
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import multiprocessing
from config import *

# Worker count for dataset preprocessing: use SLURM_CPUS_PER_TASK when the
# variable is set, otherwise every CPU reported by multiprocessing.
num_cpus = int(os.environ.get("SLURM_CPUS_PER_TASK", multiprocessing.cpu_count()))

# Checkpoint name, DeepSpeed config file and checkpoint output directory.
# MODELS_DIR is not defined in this notebook; presumably it is provided by
# `from config import *` — TODO confirm.
MODEL_NAME = "google/flan-t5-xl"
DS_CONFIG = "./ds_config.json"
OUTPUT_DIR = os.path.join(MODELS_DIR, "finqa_indexer")

# Per-device batch size and number of gradient-accumulation steps.
train_batch_size, gradient_accumulation_steps = 2, 8

#SBATCH --mail-type=ALL
#SBATCH --mail-user=steven.dong@student.uva.nl

In [2]:
# Load tokenizer and seq2seq model from the local Hugging Face cache directory.
# NOTE(review): this cell loads "google/flan-t5-large" while MODEL_NAME in the
# configuration cell is "google/flan-t5-xl" — confirm which checkpoint is meant.
model_name = "google/flan-t5-large"

# NOTE(review): `use_fast` is passed to the T5Tokenizer class directly rather
# than to AutoTokenizer — verify that it has the intended effect here.
tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    cache_dir="/home/nub/Bachelor/bachelor-thesis/models",
    use_fast=True,
)

# local_files_only=True: the weights are expected to be in cache_dir already.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    cache_dir="/home/nub/Bachelor/bachelor-thesis/models",
    device_map="auto",
    local_files_only=True,
    low_cpu_mem_usage=True,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [13]:
# Trainer configuration; the keyword arguments are grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Checkpointing: one checkpoint per epoch, at most two kept on disk.
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation.
    optim="adamw_torch",
    learning_rate=1e-5,
    num_train_epochs=2,
    per_device_train_batch_size=train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Numerics / distributed backend.
    bf16=True,
    deepspeed=DS_CONFIG,
    # Logging every 50 steps.
    logging_strategy="steps",
    logging_steps=50,
    # No evaluation during training, so there is no "best" model to reload.
    eval_strategy="no",
    load_best_model_at_end=False,
    predict_with_generate=False,
)

In [3]:
# Load the processed documents CSV as a `datasets` object ("train" split).
raw_ds = load_dataset(
    "csv",
    data_files="/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv",
    split="train",
)

In [40]:
def preprocess_fn(example):
    """Tokenize one dataset row into encoder inputs and target labels.

    Uses the module-level ``tokenizer``. ``example["full_text"]`` is encoded
    as the source (truncated to 4096 tokens) and ``example["id"]`` as the
    target (truncated to 32 tokens). No padding is applied.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; each value
        is a tensor whose leading batch dimension has been squeezed away.
    """

    def encode(text, limit):
        # Single-example encoding returned as PyTorch tensors.
        return tokenizer(
            text,
            truncation=True,
            max_length=limit,
            return_tensors="pt",
        )

    source = encode(example["full_text"], 4096)
    target = encode(example["id"], 32)

    return {
        "input_ids": source["input_ids"].squeeze(0),
        "attention_mask": source["attention_mask"].squeeze(0),
        "labels": target["input_ids"].squeeze(0),
    }


# Tokenize every row with `num_cpus` worker processes. All original columns
# are removed, leaving only the fields returned by preprocess_fn.
tokenized_ds = raw_ds.map(
    preprocess_fn,
    num_proc=num_cpus,
    remove_columns=raw_ds.column_names,
)

# Return PyTorch tensors when the dataset is indexed.
tokenized_ds.set_format(type="torch")

Map (num_proc=16):   0%|          | 0/8281 [00:00<?, ? examples/s]

In [37]:
# Sanity check: encode a short sentence and display the resulting encoding.
token = tokenizer(
    "I am walking to the store",
    return_tensors="pt",
    max_length=16,
    truncation=True,
)
token

{'input_ids': tensor([[  27,  183, 3214,   12,    8, 1078,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [41]:
# Inspect the label token ids produced for the first tokenized example.
tokenized_ds[0]["labels"]

tensor([    3, 27775,    87, 16660,    87,  6492,   834,  3647,     5, 17388,
         2292,     1])

In [33]:
# Number of non-zero label ids per example, then summary statistics.
total = [
    0 + torch.count_nonzero(row["labels"]).numpy()
    for row in tokenized_ds
]
pd.Series(total).describe()


count    8281.000000
mean       11.812342
std         0.672473
min         9.000000
25%        11.000000
50%        12.000000
75%        12.000000
max        14.000000
dtype: float64

In [13]:
# Relative locations of the raw FinQA JSON files and of the processed exports.
DATA_DIR_RAW, DATA_DIR_PROC = "data/raw", "data/processed"

# Token limits used when tokenizing prompts and targets.
MAX_INPUT_LENGTH = 4096
MAX_TARGET_LENGTH = 64

# Columns intended to be kept in the processed data.
USED_COLUMNS = [
    "full_text",
    "table",
    "id",
    "question",
    "answer",
    "exe_ans",
    "steps",
    "program",
    "program_re",
]


def convert_table(table: list[list[str]]):
    """Serialize a nested-list table to CSV text.

    The first inner list supplies the column names; every following inner
    list is one data row. The result has no index column.
    """
    column_names, *body = table
    frame = pd.DataFrame(data=body, columns=column_names)
    return frame.to_csv(index=False)


def reformat_data(file_name: str):
    """Load one FinQA split and flatten it into a single DataFrame.

    Steps:
      1. Read the JSON file at ``os.path.join(DATA_DIR_RAW, file_name)``.
         If ``file_name`` is an absolute path, ``os.path.join`` discards
         ``DATA_DIR_RAW`` and the absolute path is used as is.
      2. Expand the per-row ``qa`` dict into separate columns.
      3. Join the ``pre_text`` / ``post_text`` sentence lists with spaces and
         convert ``table`` to CSV text via ``convert_table``.
      4. Build ``full_text`` from pre-text, post-text and the table.

    Args:
        file_name: JSON file name (or absolute path) of the split.

    Returns:
        The DataFrame with all original columns, the unnested ``qa`` columns
        and the new ``full_text`` column.
    """
    raw_df = pd.read_json(os.path.join(DATA_DIR_RAW, file_name))

    # Unnest the question data: each key of the "qa" dicts becomes a column.
    qa_df = pd.DataFrame(raw_df["qa"].to_dict()).T
    raw_df = pd.concat([raw_df, qa_df], axis="columns")

    raw_df.loc[:, "pre_text"] = raw_df["pre_text"].map(" ".join)
    raw_df.loc[:, "post_text"] = raw_df["post_text"].map(" ".join)
    raw_df.loc[:, "table"] = raw_df["table"].map(convert_table)

    # A space separates pre- and post-text; without it the last token of the
    # pre-text is fused with the first token of the post-text.
    raw_df.loc[:, "full_text"] = (
        raw_df["pre_text"]
        + " "
        + raw_df["post_text"]
        + "\nThis is a table:\n"
        + raw_df["table"]
    )

    # NOTE: no column filtering (USED_COLUMNS) and no CSV export happen here;
    # the complete frame is returned.
    return raw_df


def create_documents_data(train_df: pd.DataFrame, valid_df: pd.DataFrame, test_df: pd.DataFrame):
    """Stack the ``full_text`` and ``id`` columns of the three splits.

    Rows are concatenated in train, valid, test order. Each split keeps its
    own index labels, so the result's index may contain duplicates.
    """
    document_columns = ["full_text", "id"]
    splits = (train_df, valid_df, test_df)
    return pd.concat([split[document_columns] for split in splits], axis=0)


def prepare_sample(data: pd.Series, tokenizer: T5Tokenizer):
    """Turn one document row into a padded training example.

    The source is a fixed instruction followed by ``data["full_text"]``; the
    target is ``data["id"]``. Both are truncated and padded to
    ``MAX_INPUT_LENGTH`` / ``MAX_TARGET_LENGTH``.

    Args:
        data: Row providing the ``full_text`` and ``id`` fields.
        tokenizer: Tokenizer used for both the prompt and the target.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` (lists of
        token ids). Padding positions in ``labels`` are set to -100 so that
        they are ignored by the loss.
    """
    prompt = f"Generate the document ID for this text:\n{data['full_text']}"

    inputs = tokenizer(
        prompt,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )
    targets = tokenizer(
        data["id"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # The targets are padded to a fixed length; mask the pad ids so padding
    # does not contribute to the training loss.
    pad_id = tokenizer.pad_token_id
    labels = [
        -100 if token_id == pad_id else token_id
        for token_id in targets["input_ids"]
    ]

    # Previously the encodings were computed and then discarded (the function
    # implicitly returned None).
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }


# Build the three FinQA splits and the combined document table.
# Absolute paths are passed, so the os.path.join call inside reformat_data
# ignores DATA_DIR_RAW for these calls.
train_df = reformat_data("/home/nub/Bachelor/bachelor-thesis/data/raw/train.json")
valid_df = reformat_data("/home/nub/Bachelor/bachelor-thesis/data/raw/dev.json")
# Fixed: the test path previously started with a stray double slash ("//home").
test_df = reformat_data("/home/nub/Bachelor/bachelor-thesis/data/raw/test.json")
documents_df = create_documents_data(train_df, valid_df, test_df)

In [17]:
# Reload the processed splits and the document table from their CSV exports.
train_df, valid_df, test_df, documents_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/nub/Bachelor/bachelor-thesis/data/processed/train.csv",
        "/home/nub/Bachelor/bachelor-thesis/data/processed/valid.csv",
        "/home/nub/Bachelor/bachelor-thesis/data/processed/test.csv",
        "/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv",
    )
)

In [18]:
df = documents_df

# Whitespace-delimited word count of every document's full text.
lengths = [len(text.split()) for text in df["full_text"]]

pd.Series(lengths).describe()

count    8281.000000
mean      673.093225
std       255.771629
min        24.000000
25%       535.000000
50%       667.000000
75%       811.000000
max      2674.000000
dtype: float64

In [288]:
# Print the combined text of the first record.
# NOTE(review): `data` is not assigned in any visible cell — presumably left
# over from an earlier kernel session; confirm before re-running.
print(data["full_text"][0])

interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorable movement in foreign cur

In [8]:
def preprocess(example):
    """Tokenize a text/summary example (single or batched) with fixed-length padding.

    Uses the module-level ``tokenizer``. ``example["text"]`` is padded or
    truncated to 8192 tokens and ``example["summary"]`` to 512 tokens.

    NOTE(review): the ``text`` / ``summary`` keys differ from the
    ``full_text`` / ``id`` columns used elsewhere in this notebook — confirm
    which dataset this function is meant for.

    Returns:
        The input encoding with an added ``labels`` entry. Padding positions
        in the labels are replaced by -100 so that they are ignored by the
        loss.
    """
    inputs = tokenizer(example["text"], truncation=True, max_length=8192, padding="max_length")
    targets = tokenizer(example["summary"], truncation=True, max_length=512, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # Replace pad ids so padded positions do not contribute to the loss.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs["labels"] = [mask_padding(sequence) for sequence in label_ids]
    else:
        inputs["labels"] = mask_padding(label_ids)
    return inputs

In [23]:
# Print the pre-table text of the first record.
# NOTE(review): `data` is not assigned in any visible cell — presumably left
# over from an earlier kernel session; confirm before re-running.
print(data["pre_text"][0])

interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorable movement in foreign cur

In [4]:
# Toy pipe-separated table; only the disabled prompt experiments below use it.
table = "year | 2020 | 2021 \n movies | 12 | 23 \n games | 67 | 54"

# Disabled prompt experiments, kept for reference:
# document = data["table"][0]
# input_text = f"You are a highly intelligent bot. How many games were there in 2020? Here is the table: \n {table}"
# input_text = f"You are a highly intelligent bot. The following text is made with HTML. What is second sentence:\n <h1>I love walking on the beach.</h1><p>My dog is fat.</p><h2>He was yelling at a tree.</h2>"
# input_text = f"Let's think step by step. Generate the document ID for the document:\nI think we should go to the beach. Afterwards, we can go the cinema to watch a movie."

# Active input: the id of the first training row, used verbatim as the prompt.
input_text = str(train_df["id"][0])
print(input_text)

ADI/2009/page_49.pdf-1


In [5]:
# Put the model in evaluation mode before generating.
model.eval()

# Encode the prompt as PyTorch tensors (truncated to at most 8192 tokens).
inputs = tokenizer(
    input_text,
    max_length=8192,
    truncation=True,
    return_tensors="pt",
)

# Generate on the model's device; the output is capped at 512 tokens.
generated_ids = model.generate(
    input_ids=inputs["input_ids"].to(model.device),
    attention_mask=inputs["attention_mask"].to(model.device),
    max_length=512,
)

# Decode the first (only) generated sequence without special tokens.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)


ADI/2009/page_49.pdf-1


In [6]:
# Number of whitespace-separated pieces in the generated text.
len(output.split())

1