In [None]:
pip install transformers datasets peft bitsandbytes accelerate


In [52]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "openai-community/gpt2-medium"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
).to(device)

In [53]:
import pandas as pd
from datasets import load_dataset


data = pd.read_csv("data/train_data.csv").drop_duplicates()


def format_chat(row):
    return f"<|user|>{row['question']}\n<|assistant|>{row['answer']}"

data["formatted_prompt"] = data.apply(format_chat, axis=1)
data.to_csv("data/finbot_data.csv", index=False)


dataset = load_dataset("csv", data_files="data/finbot_data.csv", split="train")


Generating train split: 0 examples [00:00, ? examples/s]

In [54]:
def tokenize_function(example):
    tokens = tokenizer(example['formatted_prompt'], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize_function)

Map:   0%|          | 0/12047 [00:00<?, ? examples/s]

In [55]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,162,688 || all params: 356,985,856 || trainable%: 0.6058




In [56]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./finbot_model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=3e-5,
    bf16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model("data/finbot_model")
tokenizer.save_pretrained("data/finbot_model")


  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


('data/finbot_model/tokenizer_config.json',
 'data/finbot_model/special_tokens_map.json',
 'data/finbot_model/vocab.json',
 'data/finbot_model/merges.txt',
 'data/finbot_model/added_tokens.json',
 'data/finbot_model/tokenizer.json')

**Test**

In [59]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Set device (GPU if available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer & model
model_path = "data/finbot_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Set pad token
tokenizer.pad_token = tokenizer.eos_token


In [87]:
input_text = "NEW YORK STOCK EXCHANGE"

# Tokenize input and move to correct device
input_tokens = tokenizer(input_text, return_tensors="pt").to(device)

# Generate response
output_tokens = model.generate(**input_tokens, max_new_tokens=50)
response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print("FinBot Response:", response)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


FinBot Response: NEW YORK STOCK EXCHANGE

The NYSE is a stock exchange, and it is a stock exchange. It is not a stock broker. It is not a stock broker's office. It is not a stock broker's office. It is not a stock broker's office
