In [15]:
pip install -r requirements.txt

In [16]:
import  pandas as pd

# Preprocessing
- Load datatset
- Remove duplicates
- Strip leading and trailing whitespace
- Save dataset

In [17]:
data = pd.read_csv('data/train_data.csv')
data = data.drop_duplicates()

data['question'] = data['question'].str.strip()
data['answer'] = data['answer'].str.strip()

data.to_csv("data/cleaned_data.csv", index=False)

In [18]:
data = pd.read_csv('data/cleaned_data.csv')

We define a function called apply_chat_template(), which takes one row from the dataset at a time.
We create a structured conversation format:

- The user's question (role: user).
- The FinBot’s response (role: assistant).

In [19]:
def apply_chat_template(row):
    messages = [
        {"role": "user", "content": row["question"]},
        {"role": "assistant", "content": row["answer"]}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

We're working with Meta’s instruction-tuned version of Llama-3.2 We pass the authentication token so Hugging Face grants us access to the model without needing manual login. The tokenizer is responsible for structuring text properly before feeding it into the AI model.

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

In [21]:
model_id = "meta-llama/Llama-3.2-1B-Instruct"
token = "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"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto", use_auth_token=token)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
tokenizer.pad_token = tokenizer.eos_token



This ensures the correct format for fine-tuning Llama-3.2.
Llama-3.2 expects conversations to follow a structured pattern (user prompt => assistant response).

In [22]:
data["formatted_prompt"] = data.apply(apply_chat_template, axis=1)

Since Llama was fine-tuned using this format, structuring conversations like this helps the model understand where user input starts and ends.

In [23]:
print(data['formatted_prompt'][1])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 22 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Cheapest way to wire or withdraw money from US account while living in Europe<|eot_id|><|start_header_id|>assistant<|end_header_id|>

There is a number of cheaper online options that you could use. TranferWise was already mentioned here. Other options i know are Paysera or TransferGo. They state that international transfers are processed on the next day and they are substantially cheaper than those of banks. Currency exchange rate is usually not bad.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [24]:
data[["formatted_prompt"]].to_csv("data/finbot_data.csv", index=False)


# Teach the model to specialize in financial literacy conversations.

In [25]:
dataset = load_dataset("csv", data_files="data/finbot_data.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [26]:
def tokenize_function(example):
    tokens = tokenizer(example['formatted_prompt'], padding="max_length", truncation=True, max_length=128)
    tokens['labels'] = [-100 if token == tokenizer.pad_token_id else token for token in tokens['input_ids']]
    return tokens

In [27]:
tokenized_dataset = dataset.map(tokenize_function)

Map:   0%|          | 0/12047 [00:00<?, ? examples/s]

In [28]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.05)

In [29]:
training_args = TrainingArguments(
    output_dir="./finbot_model",
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    save_steps=200,
    per_device_train_batch_size=2,  # Adjust based on GPU power
    per_device_eval_batch_size=2,
    num_train_epochs=3,  # Number of training loops
    learning_rate=2e-5,
    max_grad_norm=1,  # Gradient clipping to stabilize training
    fp16=False,  # Set False for MacBooks
)



In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

In [31]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshirleymalefane0019[0m ([33mshirleymalefane0019-shaper[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 30.12 MiB is free. Process 35842 has 14.71 GiB memory in use. Of the allocated memory 14.56 GiB is allocated by PyTorch, and 23.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.save_model("./finbot_model")
tokenizer.save_pretrained("./finbot_model")