In [1]:
%%capture
# Installs Unsloth, Xformers, and other necessary tools
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Supports long documents
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Add LoRA adapters (This is what makes the model "trainable")
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Unsloth 2025.11.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
from datasets import load_dataset

# 1. Load the Financial Dataset from Hugging Face
dataset = load_dataset("virattt/financial-qa-10K", split="train")

# 2. Define the "Financial Analyst" Persona
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a senior financial analyst. Answer the question based on the context provided. Be professional and concise.

### Input:
{question}

### Context:
{context}

### Response:
{answer}"""

EOS_TOKEN = tokenizer.eos_token

# 3. Format the data so the model can read it
def formatting_prompts_func(examples):
    inputs       = examples["question"]
    contexts     = examples["context"]
    outputs      = examples["answer"]
    texts = []
    for input, context, output in zip(inputs, contexts, outputs):
        text = alpaca_prompt.format(question=input, context=context, answer=output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True)

README.md:   0%|          | 0.00/419 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [4]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # We run 60 steps for speed. (For a real product, you might use 300+)
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

# Start the training!
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/7000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshehabhegab20[0m ([33mshehabhegab20-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.3368
2,2.061
3,2.1604
4,2.0941
5,1.6642
6,1.555
7,1.2887
8,1.2643
9,1.2096
10,1.0404


0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▃▂▃▃▆▃▃█▄▂▂▃▂▂▂▂▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▅▇█▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁
train/loss,█▇▇▅▄▃▃▃▃▂▂▂▃▂▂▂▂▁▂▂▂▂▁▁▂▁▁▁▁▃▂▂▂▂▂▂▂▂▂▂

0,1
total_flos,3678373510299648.0
train/epoch,0.06857
train/global_step,60.0
train/grad_norm,0.478
train/learning_rate,0.0
train/loss,0.85
train_loss,0.97056
train_runtime,690.5408
train_samples_per_second,0.695
train_steps_per_second,0.087


In [5]:
# 1. Prepare model for inference (makes it faster)
FastLanguageModel.for_inference(model)

# 2. Ask a test question with context
inputs = tokenizer(
[
    alpaca_prompt.format(
        question = "What was the primary reason for revenue growth?",
        context = "In 2023, the company saw a 20% increase in revenue, primarily driven by strong demand in the cloud computing sector and a one-time tax benefit.",
        answer = "", # We leave this blank so the AI fills it in
    )
], return_tensors = "pt").to("cuda")

# 3. Generate the answer
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a senior financial analyst. Answer the question based on the context provided. Be professional and concise.

### Input:
What was the primary reason for revenue growth?

### Context:
In 2023, the company saw a 20% increase in revenue, primarily driven by strong demand in the cloud computing sector and a one-time tax benefit.

### Response:
The primary reason for revenue growth in 2023 was strong demand in the cloud computing sector and a one-time tax benefit.<|end_of_text|>


In [19]:
# 1. Install libraries
!pip install -q langchain langchain-community sentence-transformers pypdf faiss-cpu

import torch
# Updated imports for new libraries
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- 2. Load the PDF ---
try:
    # It will load YOUR real file now
    loader = PyPDFLoader("doc.pdf")
    pages = loader.load()
    print(f"✅ Successfully loaded {len(pages)} pages.")
except Exception as e:
    print("❌ Error loading PDF. Did you upload 'doc.pdf'?")
    print(e)

# Split text (100 pages needs more chunks!)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(pages)

# --- 3. Create Search Engine ---
print(f"Indexing {len(docs)} chunks... this might take 1-2 minutes for a large PDF...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embeddings)

# --- 4. Chat Function ---
def ask_financial_bot(question):
    # Search PDF for top 3 matches (k=3 gives more context for big docs)
    relevant_docs = db.similarity_search(question, k=3)
    context_text = "\n".join([d.page_content for d in relevant_docs])

    prompt = f"""### Instruction:
You are a senior financial analyst. Answer the question based on the context provided.

### Input:
{question}

### Context:
{context_text}

### Response:
"""
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=200, use_cache=True)
    response = tokenizer.batch_decode(outputs)[0]
    return response.split("### Response:")[-1].strip()

# --- 5. Test It ---
print("\n" + "="*30)
# Ask a specific question about your real PDF
print(ask_financial_bot("Summarize the financial risks mentioned in this document."))
print("="*30)

✅ Successfully loaded 130 pages.
Indexing 533 chunks... this might take 1-2 minutes for a large PDF...

Financial risks include potential losses from litigation, cybersecurity breaches, and natural disasters.<|end_of_text|>
