In [1]:
!pip install -q transformers datasets peft accelerate bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set a padding token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from datasets import load_dataset

ds = load_dataset("AfterQuery/FinanceQA")

README.md: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating test split:   0%|          | 0/148 [00:00<?, ? examples/s]

In [12]:
def tokenize_function(examples):
    # Adjust this based on the actual column names in your dataset
    # Assuming the dataset contains 'question' and 'answer' fields for QA tasks
    # You might need to inspect ds.column_names to confirm
    return tokenizer(examples['question'], examples['answer'], truncation=True, padding='max_length', max_length=512)

In [14]:
tokenized_ds = ds.map(tokenize_function, batched=True)
print(f"Available dataset splits: {tokenized_ds.keys()}")
print("Tokenization complete. Displaying first tokenized example from the first available split:")
# Assuming there's at least one split and we want to access the first one
first_split_name = list(tokenized_ds.keys())[0]
print(tokenized_ds[first_split_name][0])

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Available dataset splits: dict_keys(['test'])
Tokenization complete. Displaying first tokenized example from the first available split:
{'context': "COSTCO WHOLESALE CORPORATION\nCONSOLIDATED STATEMENTS OF INCOME\n(amounts in millions, except per share data)\n52 Weeks Ended 53 Weeks Ended 52 Weeks Ended\nSeptember 1,\n2024\nSeptember 3,\n2023\nAugust 28,\n2022\nREVENUE\nNet sales $ 249,625 $ 237,710 $ 222,730\nMembership fees 4,828 4,580 4,224\nTotal revenue 254,453 242,290 226,954\nOPERATING EXPENSES\nMerchandise costs 222,358 212,586 199,382\nSelling, general and administrative 22,810 21,590 19,779\nOperating income 9,285 8,114 7,793\nOTHER INCOME (EXPENSE)\nInterest expense (169) (160) (158)\nInterest income and other, net 624 533 205\nINCOME BEFORE INCOME TAXES 9,740 8,487 7,840\nProvision for income taxes 2,373 2,195 1,925\nNet income including noncontrolling interests 7,367 6,292 5,915\nNet income attributable to noncontrolling interests —\n—\n(71)\nNET INCOME ATTRIBUTABLE TO COS

In [15]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

print("LoRA configuration defined.")

LoRA configuration defined.


In [16]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA applied to the model.")

trainable params: 5,242,880 || all params: 2,784,926,720 || trainable%: 0.1883
LoRA applied to the model.


In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    logging_steps=10,
)

print("TrainingArguments defined.")

TrainingArguments defined.


In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['test'],
    tokenizer=tokenizer
)

print("Trainer successfully initialized.")

Trainer successfully initialized.


  trainer = Trainer(


In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['test'],
    processing_class=tokenizer
)

print("Trainer successfully initialized.")

Trainer successfully initialized.


In [20]:
trainer.train()
print("Model training started.")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [21]:
def tokenize_function(examples):
    # Combine question and answer into a single text string
    # For causal LM, the model will predict the next token based on previous tokens.
    # So, the labels for training are essentially the input_ids themselves.
    # This format is typical for fine-tuning a causal language model for text generation.
    text = [q + " " + a for q, a in zip(examples['question'], examples['answer'])]
    tokenized_inputs = tokenizer(text, truncation=True, padding='max_length', max_length=512)
    # For causal language modeling, the labels are typically the input_ids themselves.
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

In [22]:
tokenized_ds = ds.map(tokenize_function, batched=True)
print(f"Available dataset splits: {tokenized_ds.keys()}")
print("Tokenization complete. Displaying first tokenized example from the first available split:")
# Assuming there's at least one split and we want to access the first one
first_split_name = list(tokenized_ds.keys())[0]
print(tokenized_ds[first_split_name][0])

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Available dataset splits: dict_keys(['test'])
Tokenization complete. Displaying first tokenized example from the first available split:
{'context': "COSTCO WHOLESALE CORPORATION\nCONSOLIDATED STATEMENTS OF INCOME\n(amounts in millions, except per share data)\n52 Weeks Ended 53 Weeks Ended 52 Weeks Ended\nSeptember 1,\n2024\nSeptember 3,\n2023\nAugust 28,\n2022\nREVENUE\nNet sales $ 249,625 $ 237,710 $ 222,730\nMembership fees 4,828 4,580 4,224\nTotal revenue 254,453 242,290 226,954\nOPERATING EXPENSES\nMerchandise costs 222,358 212,586 199,382\nSelling, general and administrative 22,810 21,590 19,779\nOperating income 9,285 8,114 7,793\nOTHER INCOME (EXPENSE)\nInterest expense (169) (160) (158)\nInterest income and other, net 624 533 205\nINCOME BEFORE INCOME TAXES 9,740 8,487 7,840\nProvision for income taxes 2,373 2,195 1,925\nNet income including noncontrolling interests 7,367 6,292 5,915\nNet income attributable to noncontrolling interests —\n—\n(71)\nNET INCOME ATTRIBUTABLE TO COS

In [23]:
trainer.train()
print("Model training started.")

OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 34.12 MiB is free. Process 10737 has 14.71 GiB memory in use. Of the allocated memory 14.50 GiB is allocated by PyTorch, and 80.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size
    logging_steps=10,
    gradient_checkpointing=True, # Enable gradient checkpointing to save memory
)

print("TrainingArguments defined with reduced batch size and gradient checkpointing.")

TrainingArguments defined with reduced batch size and gradient checkpointing.


In [25]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['test'],
    processing_class=tokenizer
)

print("Trainer re-initialized with updated TrainingArguments.")

trainer.train()
print("Model training started.")

Trainer re-initialized with updated TrainingArguments.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [26]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head"] # Add lm_head to modules_to_save
)

print("LoRA configuration defined.")

LoRA configuration defined.


In [27]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA applied to the model.")



trainable params: 136,366,080 || all params: 2,916,049,920 || trainable%: 4.6764
LoRA applied to the model.


In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set a padding token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA applied to the model.")



trainable params: 136,366,080 || all params: 2,916,049,920 || trainable%: 4.6764
LoRA applied to the model.


In [30]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['test'],
    processing_class=tokenizer
)

print("Trainer re-initialized with updated TrainingArguments and model.")

trainer.train()
print("Model training started.")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Trainer re-initialized with updated TrainingArguments and model.




Step,Training Loss
10,11.7645
20,23.0371
30,19.479
40,15.5819
50,20.8687
60,21.8573
70,14.1219
80,32.2716
90,15.0747
100,27.7396


Model training started.


In [31]:
print("Model training completed successfully.")

Model training completed successfully.


In [40]:
first_example = tokenized_ds['test'][0]
print(f"Keys in tokenized_ds['test'] examples: {first_example.keys()}")
print("\nFirst example from tokenized_ds['test']:")
print(f"Question: {first_example['question']}")
print(f"Answer: {first_example['answer']}")
print(f"Input IDs (first 10): {first_example['input_ids'][:10]}")
print(f"Attention Mask (first 10): {first_example['attention_mask'][:10]}")
print(f"Labels (first 10): {first_example['labels'][:10]}")


Keys in tokenized_ds['test'] examples: dict_keys(['context', 'question', 'chain_of_thought', 'answer', 'file_link', 'file_name', 'company', 'question_type', 'input_ids', 'attention_mask', 'labels'])

First example from tokenized_ds['test']:
Question: What is Gross Profit in the year ending 2024?
Answer: $32,095 (in millions)
Input IDs (first 10): [2061, 318, 21796, 42886, 287, 262, 614, 7464, 48609, 30]
Attention Mask (first 10): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Labels (first 10): [2061, 318, 21796, 42886, 287, 262, 614, 7464, 48609, 30]


In [41]:
def generate_response(question, model, tokenizer, max_new_tokens=50):
    # Prepare the input for the model
    input_text = question
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as the model
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Generate output from the model
    # The model generates tokens based on the input_ids. We need to specify `max_new_tokens`
    # to control the length of the generated response.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True, # Use sampling for more diverse responses
        temperature=0.7, # Control randomness
        top_k=50, # Consider only the top 50 most likely tokens
        top_p=0.95, # Nucleus sampling
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode the generated tokens
    # We typically want to skip the input tokens when decoding the answer.
    response_tokens = outputs[0][len(input_ids[0]):]
    response = tokenizer.decode(response_tokens, skip_special_tokens=True)

    return response.strip()

print("Response generation function defined.")

Response generation function defined.


In [42]:
num_examples = 5

print(f"Generating predictions for the first {num_examples} examples from the test set:")
for i in range(num_examples):
    example = tokenized_ds['test'][i]
    question = example['question']
    true_answer = example['answer']

    # Generate a response using the fine-tuned model
    generated_answer = generate_response(question, model, tokenizer)

    print(f"\n--- Example {i+1} ---")
    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Generated Answer: {generated_answer}")

Generating predictions for the first 5 examples from the test set:





--- Example 1 ---
Question: What is Gross Profit in the year ending 2024?
True Answer: $32,095 (in millions)
Generated Answer: $458. Over four years ending in 2024. Over four years ending in 2024. Over 600, assuming a firm X soda to 200 in 2024. Overculateed in 2024, adjusted E soda for 2024, adjusted E soda for 2024, 200

--- Example 2 ---
Question: What is unadjusted EBITDA for the year ending in 2024?
True Answer: $11,522 (in millions)
Generated Answer: $255, assuming a merger, 200, adjusted Eimate the 200, assuming a merger in 200, adjusted E soda for 2024. Over 600, assuming a firm X is a coffee on 200, adjusted Eimate the 200, adjusted Eimate the

--- Example 3 ---
Question: What is adjusted EBITDA for the year ending in 2024?
True Answer: $11,969 (in millions)
Generated Answer: $255, assuming a firm X soda for 2024. Over 600, assuming a merger in 2024. Over four years ending in 2024. Over 600, adjusted E soda to 200, adjusted E soda to 200 members churned to 200, adjusted E sod

In [47]:
num_examples = 20

print(f"Generating predictions for the first {num_examples} examples from the test set:")
for i in range(num_examples):
    example = tokenized_ds['test'][i]
    question = example['question']
    true_answer = example['answer']

    # Generate a response using the fine-tuned model
    generated_answer = generate_response(question, model, tokenizer)

    print(f"\n--- Example {i+1} ---")
    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Generated Answer: {generated_answer}")

Generating predictions for the first 20 examples from the test set:





--- Example 1 ---
Question: What is Gross Profit in the year ending 2024?
True Answer: $32,095 (in millions)
Generated Answer: $255, adjusted E soda to 200, 200 million in 2024. Over four years ending in 2024, adjusted E soda to 200, adjusted Eimate the 200, adjusted E soda to 200, adjusted E soda to 200, assuming a merger in 2024

--- Example 2 ---
Question: What is unadjusted EBITDA for the year ending in 2024?
True Answer: $11,522 (in millions)
Generated Answer: $458, adjusted Eimate the 200, assuming a coffee on 200, assuming a coffee on 200, assuming a coffee on 200, assuming a coffee on 200, adjusted E soda for 2024, 200, assuming a coffee on 200, assuming a coffee

--- Example 3 ---
Question: What is adjusted EBITDA for the year ending in 2024?
True Answer: $11,969 (in millions)
Generated Answer: 2 gadgets for 2024, adjusted E is a merger, assuming a merger, adjusted Eimate the liabilities in 2024. Over 600, adjusted E soda to 200 million in 2024. Over 600, assuming a firm X so