In [1]:
# Environment setup for Llama 3.1 fine-tuning
import torch

if not torch.cuda.is_available():
    raise SystemExit("CUDA device not detected. Enable a GPU runtime.")

device_props = torch.cuda.get_device_properties(0)
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = device_props.total_memory / 1e9
print(f"Detected GPU: {gpu_name} ({gpu_memory:.1f} GB)")

%pip install -q transformers>=4.40.0 datasets>=2.14.0 accelerate>=0.24.0
%pip install -q peft>=0.6.0 bitsandbytes>=0.41.0
%pip install -q scipy scikit-learn pandas numpy
%pip install -q wandb

Detected GPU: NVIDIA A100-SXM4-80GB (85.2 GB)


In [2]:
# Authentication & Drive setup
from google.colab import drive
from getpass import getpass
from huggingface_hub import login
from transformers import AutoTokenizer
import os

drive.mount('/content/drive')
model_save_path = "/content/drive/MyDrive/financial_llama_models"
os.makedirs(model_save_path, exist_ok=True)
print(f"Models will be stored in {model_save_path}")

hf_token = getpass("Enter your Hugging Face token: ")
login(token=hf_token)
# Quick access check
AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
print("Hugging Face authentication complete.")

Mounted at /content/drive
Models will be stored in /content/drive/MyDrive/financial_llama_models
Enter your Hugging Face token: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Hugging Face authentication complete.


In [3]:
# Import libraries
import torch
import pandas as pd
import numpy as np
import random
import json
import warnings
from datetime import datetime

from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Check GPU
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


GPU Available: True
GPU Name: NVIDIA A100-SXM4-80GB
GPU Memory: 85.2 GB


In [4]:
# Training configuration

gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9 if torch.cuda.is_available() else 0

train_batch_size = 16
gradient_accumulation_steps = 4
max_length = 2048

CONFIG = {
    "model_name": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "dataset_name": "Josephgflowers/Finance-Instruct-500k",
    "dataset_config": "default",
    "max_length": max_length,
    "train_batch_size": train_batch_size,
    "eval_batch_size": 8,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "learning_rate": 1e-4,
    "num_epochs": 5,
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.1,
    "warmup_ratio": 0.05,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "save_steps": 100,
    "eval_steps": 50,
    "logging_steps": 25,
    "output_dir": f"{model_save_path}/checkpoints",
    "save_dir": f"{model_save_path}/final_model",
    "fp16": True,
    "gradient_checkpointing": True,
    "dataloader_num_workers": 4,
    "quantization": None,
    "precision": "fp16",
    "length_bucket_boundaries": [512, 1024, 1536, max_length],
    "length_stats_sample_size": 4000,
    "length_stats_percentile": 0.98,
    "align_save_with_eval": True,
    "optim": "paged_adamw_8bit",
    "max_train_samples": 40_000,
    "max_val_samples": 4_000,
}

print("Configuration:")
print(f"  GPU: {gpu_name} ({gpu_memory:.1f} GB)")
print(f"  Effective batch size: {CONFIG['train_batch_size'] * CONFIG['gradient_accumulation_steps']}")
print(f"  Max sequence length: {CONFIG['max_length']}")
print(f"  Training epochs: {CONFIG['num_epochs']}")
print(f"  Saving checkpoints to: {CONFIG['output_dir']}")

os.makedirs(CONFIG['output_dir'], exist_ok=True)
os.makedirs(CONFIG['save_dir'], exist_ok=True)

Configuration:
  GPU: NVIDIA A100-SXM4-80GB (85.2 GB)
  Effective batch size: 64
  Max sequence length: 2048
  Training epochs: 5
  Saving checkpoints to: /content/drive/MyDrive/financial_llama_models/checkpoints


In [5]:
# Quick sanity checks
from transformers import AutoTokenizer

try:
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
    print(f"Tokenizer loaded ({len(tokenizer)} tokens)")
except Exception as exc:
    raise RuntimeError("Unable to load tokenizer. Verify HF auth and license.") from exc

if torch.cuda.is_available():
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory available: {gpu_memory:.1f} GB")
else:
    raise SystemError("GPU disappeared after setup.")

Tokenizer loaded (128256 tokens)
GPU memory available: 85.2 GB


In [6]:
# Dataset loading
from datasets import DatasetDict, load_dataset

VAL_SPLIT_FRACTION = 0.01
MAX_VAL_SAMPLES = 5_000


def load_finance_instruct_dataset(dataset_name: str) -> DatasetDict:
    """Load the Hugging Face dataset without mutating text fields."""
    dataset = load_dataset(dataset_name)
    if not isinstance(dataset, DatasetDict):
        raise ValueError("Expected a DatasetDict with at least a 'train' split")

    if "validation" not in dataset:
        if "train" not in dataset:
            raise ValueError("Dataset must expose a 'train' split to create validation data")
        val_size = min(
            MAX_VAL_SAMPLES,
            max(1, int(len(dataset["train"]) * VAL_SPLIT_FRACTION)),
        )
        split = dataset["train"].train_test_split(test_size=val_size, seed=42)
        dataset = DatasetDict({"train": split["train"], "validation": split["test"]})

    return dataset


print("Loading Finance-Instruct-500k dataset")
raw_dataset = load_finance_instruct_dataset(CONFIG["dataset_name"])
print("‚úÖ Dataset loaded successfully")

print("\nüìä Dataset Structure:")
print(f"   Splits: {list(raw_dataset.keys())}")
print(f"   Training samples: {len(raw_dataset['train']):,}")
print(f"   Validation samples: {len(raw_dataset['validation']):,}")

processed_dataset = raw_dataset
print("\n‚úÖ Original dataset ready for Llama 3.1 fine-tuning")


# Load Finance-Instruct-500k
print("Loading Finance-Instruct-500k dataset...")
raw_dataset = load_finance_instruct_dataset(CONFIG["dataset_name"])
print(f"‚úÖ Dataset loaded successfully!")

# Display dataset information
print(f"\nüìä Dataset Structure:")
print(f"   Dataset: {raw_dataset}")
print(f"   Splits: {list(raw_dataset.keys())}")
if 'train' in raw_dataset:
    print(f"   Training samples: {len(raw_dataset['train']):,}")
if 'validation' in raw_dataset:
    print(f"   Validation samples: {len(raw_dataset['validation']):,}")

# Show a sample of the raw data
print(f"\nüìÑ Sample Raw Conversation:")
sample = raw_dataset['train'][0]
print(f"   System: {(sample.get('system') or 'N/A')[:200]}")
print(f"   User: {(sample.get('user') or 'N/A')[:200]}")
print(f"   Assistant: {(sample.get('assistant') or 'N/A')[:200]}...")

# Preprocess: the loader already formatted text, so no extra SEC-specific processing
processed_dataset = raw_dataset

# Final statistics
print(f"\nüìä Final Dataset Statistics:")
print(f"   ‚úÖ Training samples: {len(processed_dataset['train']):,}")
print(f"   ‚úÖ Validation samples: {len(processed_dataset['validation']):,}")
print(f"   ‚úÖ Processed dataset ready for Llama 3.1 fine-tuning! üöÄ")

Loading Finance-Instruct-500k dataset


README.md: 0.00B [00:00, ?B/s]

train.json:   0%|          | 0.00/580M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/518185 [00:00<?, ? examples/s]

‚úÖ Dataset loaded successfully

üìä Dataset Structure:
   Splits: ['train', 'validation']
   Training samples: 513,185
   Validation samples: 5,000

‚úÖ Original dataset ready for Llama 3.1 fine-tuning
Loading Finance-Instruct-500k dataset...
‚úÖ Dataset loaded successfully!

üìä Dataset Structure:
   Dataset: DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 513185
    })
    validation: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 5000
    })
})
   Splits: ['train', 'validation']
   Training samples: 513,185
   Validation samples: 5,000

üìÑ Sample Raw Conversation:
   System: As a finance expert, your role is to provide clear, concise, and informative responses to finance-related questions. When presented with a question, draw upon your extensive knowledge and expertise to
   User: Question:
Choose online stock trading companies
   Assistant: Lower fees are always better, everything else equa

In [8]:
# Convert structured conversations to chat-formatted `text`
from transformers import AutoTokenizer
import torch
import subprocess

print("Formatting Finance-Instruct-500k conversations with chat template...")
_chat_tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])


def log_gpu_memory(prefix: str = "GPU status"):
    if not torch.cuda.is_available():
        print(f"{prefix}: CUDA not available")
        return
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    try:
        smi = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=memory.used,memory.free",
                "--format=csv,noheader",
            ],
            capture_output=True,
            text=True,
        )
        gpu_used, gpu_free = [x.strip() for x in smi.stdout.strip().split("\n")[0].split(",")]
        print(
            f"{prefix}: allocated={allocated:.2f} GB, reserved={reserved:.2f} GB, total={total:.2f} GB | nvidia-smi used={gpu_used}, free={gpu_free}"
        )
    except Exception:
        print(
            f"{prefix}: allocated={allocated:.2f} GB, reserved={reserved:.2f} GB, total={total:.2f} GB (nvidia-smi not available)"
        )


MIN_CHAT_TOKENS = 30
MAX_CHAT_TOKENS = int(CONFIG.get("max_length", 2048))
LENGTH_SAMPLE_SIZE = int(CONFIG.get("length_stats_sample_size", 4000))
LENGTH_PERCENTILE_TARGET = float(CONFIG.get("length_stats_percentile", 0.98))


def _format_to_text(example):
    user = example.get("user")
    assistant = example.get("assistant")
    if not user or not assistant:
        return {"text": None}

    messages = []
    sys_msg = example.get("system")
    if sys_msg:
        messages.append({"role": "system", "content": sys_msg})
    messages.append({"role": "user", "content": user})
    messages.append({"role": "assistant", "content": assistant})

    text = _chat_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": text}


def _truncate_to_max_tokens(example):
    text = example.get("text")
    if not isinstance(text, str) or not text.strip():
        return {"text": text}

    tokenized = _chat_tokenizer(
        text,
        truncation=False,
        add_special_tokens=True,
    )
    input_ids = tokenized["input_ids"]

    if len(input_ids) <= MAX_CHAT_TOKENS:
        return {"text": text}

    truncated_ids = input_ids[:MAX_CHAT_TOKENS]
    truncated_text = _chat_tokenizer.decode(truncated_ids, skip_special_tokens=False)
    return {"text": truncated_text}


def _min_length_filter(example):
    text = example.get("text")
    if not isinstance(text, str) or not text.strip():
        return False

    tokenized = _chat_tokenizer(
        text,
        truncation=False,
        add_special_tokens=True,
    )
    token_count = len(tokenized["input_ids"])
    return token_count >= MIN_CHAT_TOKENS


def _sample_token_lengths_from_text(dataset_split, tokenizer, sample_size):
    total = len(dataset_split)
    if total == 0:
        return []
    if sample_size >= total:
        indices = list(range(total))
    else:
        import random

        indices = random.sample(range(total), sample_size)
    lengths = []
    for idx in indices:
        text = dataset_split[idx].get("text")
        if not isinstance(text, str) or not text.strip():
            continue
        tokens = tokenizer(
            text,
            add_special_tokens=True,
            truncation=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )["input_ids"]
        lengths.append(len(tokens))
    return lengths


def _log_length_stats(lengths, prefix):
    if not lengths:
        print(f"{prefix}: unavailable (no valid samples)")
        return None
    import numpy as np

    array = np.array(lengths)
    percentiles = [50, 75, 90, 95, 98, 99, 100]
    stats = {f"p{p}": int(np.percentile(array, p)) for p in percentiles}
    print(prefix)
    for key in ["p50", "p75", "p90", "p95", "p98", "p99", "p100"]:
        value = stats.get(key)
        if value is not None:
            print(f"  {key.upper()}: {value} tokens")
    print(f"  Mean: {array.mean():.1f} tokens")
    print(f"  Std: {array.std():.1f} tokens")
    print(f"  Max (sample): {int(array.max())} tokens")
    return stats


from pathlib import Path

processed_dataset_path = Path(model_save_path) / "processed_finance_instruct_ds_v2"
log_gpu_memory("Before dataset processing")

if processed_dataset_path.exists():
    print(f"üìÅ Processed dataset already exists at {processed_dataset_path}, loading from disk...")
    from datasets import load_from_disk
    processed_dataset = load_from_disk(str(processed_dataset_path))
else:
    templated_dataset = processed_dataset.map(
        _format_to_text,
        desc="Applying chat template",
    )
    log_gpu_memory("After template application")
    templated_dataset = templated_dataset.filter(
        lambda x: isinstance(x["text"], str) and len(x["text"]) > 0,
        desc="Dropping incomplete rows",
    )
    log_gpu_memory("After dropping incomplete rows")

    length_samples = _sample_token_lengths_from_text(
        templated_dataset["train"],
        _chat_tokenizer,
        LENGTH_SAMPLE_SIZE,
    )
    stats = _log_length_stats(
        length_samples,
        "Token-length statistics (pre-truncation sample):",
    )
    if stats:
        import numpy as np

        percentile_target_value = int(
            np.percentile(length_samples, LENGTH_PERCENTILE_TARGET * 100)
        )
        print(
            f"  Target percentile ({LENGTH_PERCENTILE_TARGET*100:.0f}%): {percentile_target_value} tokens"
        )
        adaptive_candidates = [1024, 1536, CONFIG["max_length"]]
        adaptive_max_length = adaptive_candidates[-1]
        for candidate in adaptive_candidates:
            if percentile_target_value <= candidate:
                adaptive_max_length = candidate
                break
        CONFIG["max_length"] = min(CONFIG["max_length"], adaptive_max_length)
        CONFIG.setdefault("length_stats_pre_trunc", {}).update(
            {"target_percentile_tokens": percentile_target_value, **stats}
        )
        raw_boundaries = CONFIG.get("length_bucket_boundaries") or []
        computed_boundaries = sorted(
            set(boundary for boundary in raw_boundaries if boundary)
            | {256, 512, 768, 1024, 1280, CONFIG["max_length"]}
        )
        CONFIG["length_bucket_boundaries"] = [
            b for b in computed_boundaries if b <= CONFIG["max_length"]
        ]
        print(f"Adaptive max_length selected: {CONFIG['max_length']} tokens")
        print(f"Length bucket boundaries: {CONFIG['length_bucket_boundaries']}")
    else:
        print("Token-length statistics unavailable; keeping original max_length.")

    MAX_CHAT_TOKENS = CONFIG["max_length"]

    processed_dataset = templated_dataset.map(
        _truncate_to_max_tokens,
        desc=f"Truncating chats to <= {MAX_CHAT_TOKENS} tokens",
    )
    log_gpu_memory("After truncation")
    processed_dataset = processed_dataset.filter(
        _min_length_filter,
        desc=f"Dropping chats under {MIN_CHAT_TOKENS} tokens",
    )
    log_gpu_memory("After min-length filter")

    print("\nüìä Dataset Structure (chat-formatted):")
    print(f"   Training samples: {len(processed_dataset['train']):,}")
    print(f"   Validation samples: {len(processed_dataset['validation']):,}")
    print(f"   Token filter range: {MIN_CHAT_TOKENS}+ tokens (truncated at {MAX_CHAT_TOKENS})")
    print("\nüìÑ Sample Chat Text:")
    print(processed_dataset['train'][0]['text'][:500] + "...")

    print(f"üíæ Saving processed dataset to: {processed_dataset_path}")
    processed_dataset_path.mkdir(parents=True, exist_ok=True)
    processed_dataset.save_to_disk(str(processed_dataset_path))
    log_gpu_memory("After saving processed dataset")
    print("‚úÖ Dataset saved! Use datasets.load_from_disk(...) to reload later.")

Formatting Finance-Instruct-500k conversations with chat template...
Before dataset processing: allocated=0.00 GB, reserved=0.00 GB, total=85.17 GB | nvidia-smi used=5 MiB, free=81216 MiB
üìÅ Processed dataset already exists at /content/drive/MyDrive/financial_llama_models/processed_finance_instruct_ds_v2, loading from disk...


In [9]:
# Length-statistics summary (already computed pre-truncation during processing)
pre_trunc_stats = CONFIG.get("length_stats_pre_trunc")
if pre_trunc_stats:
    print("Token-length statistics (pre-truncation snapshot):")
    for key in ["p50", "p75", "p90", "p95", "p98", "p99", "p100"]:
        if key in pre_trunc_stats:
            print(f"  {key.upper()}: {pre_trunc_stats[key]} tokens")
    target_tokens = pre_trunc_stats.get("target_percentile_tokens")
    if target_tokens:
        print(f"  Target percentile tokens: {target_tokens}")
    print(f"Adaptive max_length in use: {CONFIG['max_length']} tokens")
    print(f"Length bucket boundaries: {CONFIG.get('length_bucket_boundaries')}")
else:
    print("No pre-truncation statistics recorded in CONFIG. Re-run preprocessing to populate them.")


No pre-truncation statistics recorded in CONFIG. Re-run preprocessing to populate them.


In [10]:
# Save processed dataset to Drive (HF datasets format)
from pathlib import Path

processed_dataset_path = Path(model_save_path) / "processed_finance_instruct_ds"
processed_dataset_path.mkdir(parents=True, exist_ok=True)

print(f"üíæ Saving processed dataset to: {processed_dataset_path}")
processed_dataset.save_to_disk(str(processed_dataset_path))
print("‚úÖ Dataset saved! Use datasets.load_from_disk(...) to reload later.")

üíæ Saving processed dataset to: /content/drive/MyDrive/financial_llama_models/processed_finance_instruct_ds


Saving the dataset (0/3 shards):   0%|          | 0/513185 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

‚úÖ Dataset saved! Use datasets.load_from_disk(...) to reload later.


In [11]:
# Attention backend configuration (stick with PyTorch SDPA/TF32)
import os
import torch
print(f"Torch: {torch.__version__} | CUDA: {torch.version.cuda}")

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

os.environ.pop("USE_FLASH_ATTENTION", None)
chosen = "default"
print(f"Attention impl to use: {chosen}")

Torch: 2.9.0+cu126 | CUDA: 12.6
Attention impl to use: default


In [12]:
import gc
import torch
def clear_gpu_memory(prefix: str = "Pre-training cleanup"):
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        print(f"{prefix}: cleared cache. allocated={allocated:.2f} GB, reserved={reserved:.2f} GB")
    else:
        print(f"{prefix}: CUDA not available")


clear_gpu_memory()

Pre-training cleanup: cleared cache. allocated=0.00 GB, reserved=0.00 GB


In [13]:
# Run complete training pipeline
from train_model import run_training
from transformers import DataCollatorForLanguageModeling, AutoTokenizer

# Start training
print("üöÄ Starting fine-tuning process...")

# Define the tokenizer and set the padding token
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
tokenizer.pad_token = tokenizer.eos_token # Set padding token

# Define the data collator (if needed by run_training, otherwise it might be created internally)
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Optionally limit training set size for Colab runs
if "max_train_samples" in CONFIG:
    if CONFIG["max_train_samples"]:
        processed_dataset["train"] = processed_dataset["train"].select(range(min(CONFIG["max_train_samples"], len(processed_dataset["train"]))))
    if CONFIG.get("max_val_samples"):
        processed_dataset["validation"] = processed_dataset["validation"].select(range(min(CONFIG["max_val_samples"], len(processed_dataset["validation"]))))

# Start training - attempting to pass the configured tokenizer
# Assuming run_training accepts a tokenizer object
model, tokenizer, trainer = run_training(CONFIG, processed_dataset)


print("‚úÖ Training completed!")
print(f"Model saved to: {CONFIG['save_dir']}")

üöÄ Starting fine-tuning process...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465
GPU memory status:
  [0] NVIDIA A100-SXM4-80GB: free 62.48 GiB / total 79.32 GiB | allocated 15.58 GiB | reserved 16.35 GiB
Estimated training memory breakdown (per device):
  Parameters (trainable): 0.62 GiB across 167,772,160 params
  Gradients: 0.62 GiB (mirrors trainable params)
  Optimizer state: 0.62 GiB (heuristic)
  Activations (~batch 16 √ó seq 2048): 12.00 GiB
  ----> Estimated total training footprint: 13.88 GiB


Filtering non-English texts (ratio>=0.85):   0%|          | 0/40000 [00:00<?, ? examples/s]

Filtering non-English texts (ratio>=0.85):   0%|          | 0/4000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/38291 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/3823 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Token length stats (processed train split):
  P50: 164 tokens
  P75: 287 tokens
  P90: 567 tokens
  P95: 832 tokens
  P98: 1142 tokens
  P99: 1537 tokens
  P100: 1540 tokens
  Mean: 252.0 tokens | Std: 263.7 tokens


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgsmcq[0m ([33mgsmcq-university-of-washington[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,0.9684,1.186706
100,0.8707,1.126723
150,0.8995,1.096485
200,0.8683,1.080774
250,0.8762,1.074667
300,0.8203,1.071634
350,0.8519,1.064655
400,0.8158,1.059253
450,0.7996,1.051618
500,0.8661,1.052241


‚úÖ Training completed!
Model saved to: /content/drive/MyDrive/financial_llama_models/final_model


In [None]:
# Inference with trained LoRA adapter
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr

def load_lora_model(base_model_name: str, lora_path: str):
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        dtype=dtype,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(base_model, lora_path)
    model.eval()
    return tokenizer, model

LOADED_TOKENIZER, LOADED_MODEL = load_lora_model(
    CONFIG["model_name"],
    "/content/drive/MyDrive/financial_llama_models/final_model",
)

def build_prompt(history, system_prompt):
    messages = [{"role": "system", "content": system_prompt}]
    for user_text, assistant_text in history:
        messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    return LOADED_TOKENIZER.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )


def respond(message, history, system_prompt):
    history = history or []
    history_as_tuples = [tuple(pair) for pair in history]
    prompt_text = build_prompt(history_as_tuples + [(message, "")], system_prompt)
    inputs = LOADED_TOKENIZER(
        prompt_text,
        return_tensors="pt",
        padding=True,
    ).to(LOADED_MODEL.device)

    generation_config = dict(
        max_new_tokens=512,
        temperature=0.2,
        top_p=0.9,
        do_sample=True,
        eos_token_id=LOADED_TOKENIZER.eos_token_id,
    )

    with torch.inference_mode():
        output_ids = LOADED_MODEL.generate(**inputs, **generation_config)

    generated_text = LOADED_TOKENIZER.decode(
        output_ids[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    ).strip()

    return generated_text

system_prompt_default = "You are FinAssist, a helpful financial analyst."
demo = gr.ChatInterface(
    fn=lambda msg, hist: respond(msg, hist, system_prompt_default),
    title="Chat Bot",
    description="Ask finance questions and chat with the fine-tuned model.",
)

demo.launch(share=False,debug=True)



In [None]:
# Sample 50 QAs, run finetuned model, judge with GPT-5.1
import os, json, random
import torch
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from openai import OpenAI

# Paths and config
adapter_path = "/content/drive/MyDrive/financial_llama_models/final_model"
base_model_name = CONFIG["model_name"]
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
max_gen_tokens = 256

# Reuse loaded model/tokenizer if present; otherwise load
if "LOADED_MODEL" in globals() and "LOADED_TOKENIZER" in globals():
    model = LOADED_MODEL
    tokenizer = LOADED_TOKENIZER
else:
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        dtype=dtype,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(base_model, adapter_path)
    model.eval()
    LOADED_MODEL = model
    LOADED_TOKENIZER = tokenizer

def build_question(row: dict) -> str:
    sys_part = (row.get("system") or "").strip()
    user_part = (row.get("user") or "").strip()
    if sys_part:
        return f"{sys_part}\n\nUser: {user_part}".strip()
    return user_part


def show_model_input(row: dict) -> str:
    """Return the exact prompt sent into the finetuned model."""
    question = build_question(row)
    print(question)
    return question


def show_model_output(question: str, model_answer: str) -> None:
    """Log the model input and output for inspection."""
    print("=== Model input ===")
    print(question)
    print("=== Model output ===")
    print(model_answer)
    print("====================")


@torch.no_grad()
def generate_answer(question: str) -> str:
    # Single-turn only: truncate input and disable KV reuse
    inputs = tokenizer(
        question.strip(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_gen_tokens,
        temperature=0.3,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=False,
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove the prompt if echoed
    if decoded.startswith(question):
        decoded = decoded[len(question):].strip()
    return decoded.strip()

# Load dataset
raw_ds = load_dataset("Josephgflowers/Finance-Instruct-500k", split="train")

# (Optional) filtering helpers retained but NOT used for this run
FINANCE_KEYWORDS = [
    "stock", "bond", "equity", "loan", "mortgage", "rate", "yield", "treasury",
    "bank", "credit", "debit", "finance", "financial", "inflation", "gdp",
    "econom", "market", "currency", "forex", "exchange", "tax", "revenue",
    "budget", "investment", "investor", "portfolio", "fund", "derivative",
    "option", "futures", "hedge", "insurance", "pension", "ipo", "earnings",
]
SPORTS_TRIVIA = ["team", "league", "squad", "goal", "score", "player", "coach", "world cup", "nba", "mlb", "nhl", "fifa", "uefa", "match", "tournament", "medal", "olympic", "olympics"]
MAX_REF_LEN = 1200
MIN_REF_LEN = 8

def is_finance_like(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in FINANCE_KEYWORDS)

def is_sports_trivia(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SPORTS_TRIVIA)

def keep_row(row):
    user = (row.get("user") or "")
    ref = (row.get("assistant") or "")
    if len(ref) < MIN_REF_LEN or len(ref) > MAX_REF_LEN:
        return False
    if is_sports_trivia(user) or is_sports_trivia(ref):
        return False
    return is_finance_like(user) or is_finance_like(ref)

# For this evaluation, use the first 50 entries from the RAW dataset (no preprocessing, no shuffling)
sampled = raw_ds.select(range(min(50, len(raw_ds))))
print("Sampling first", len(sampled), "rows from RAW dataset (no filtering / no preprocessing)")

# OpenAI judge setup (pull from Colab secrets: OPENAI_API_KEY)
from google.colab import userdata
openai_api_key = userdata.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)
judge_model = "gpt-5.1"

def judge_answer(question: str, reference: str, candidate: str):
    prompt = (
        "You are a strict financial QA judge. Compare the candidate answer to the reference.\n"
        "Return JSON with keys 'score' (0-10, lower if unsure) and 'justification' (short).\n\n"
        f"Question:\n{question}\n\nReference answer (ground truth):\n{reference}\n\nModel answer (candidate):\n{candidate}\n"
    )
    resp = client.responses.create(
        model=judge_model,
        input=prompt,
        max_output_tokens=200,
    )
    text = resp.output_text
    try:
        data = json.loads(text)
        score = data.get("score")
        justification = data.get("justification", text)
    except Exception:
        score = None
        justification = text
    return score, justification

records = []
for row in tqdm(sampled, desc="Evaluating", total=len(sampled)):
    # Single-turn evaluation only; no prior context is reused
    question = build_question(row)
    show_model_input(row)  # log the exact prompt sent to the model

    reference_raw = (row.get("assistant") or "").strip()
    # Light formatting: collapse excessive whitespace while preserving content
    reference = " ".join(reference_raw.split())

    model_answer = generate_answer(question)
    show_model_output(question, model_answer)

    score, justification = judge_answer(question, reference, model_answer)
    records.append(
        {
            "user": row.get("user"),
            "reference_answer": reference,
            "model_answer": model_answer,
            "judge_score": score,
            "judge_feedback": justification,
        }
    )

# Save and preview
out_path = "/content/drive/MyDrive/financial_llama_models/eval_50_gpt_judged.jsonl"
pd.DataFrame(records).to_json(out_path, orient="records", lines=True, force_ascii=False)
print(f"Saved judged results to {out_path}")
pd.DataFrame(records).head()

In [None]:
# Quick raw-dataset inference run (no preprocessing) with comparison to base Llama 3.1 8B
# Reload raw dataset directly from HF to avoid any preprocessed artifacts
raw_eval = load_dataset("Josephgflowers/Finance-Instruct-500k", split="train")
sampled = raw_eval.select(range(min(50, len(raw_eval))))

# Load original (base) model once for comparison
if "BASE_MODEL" not in globals():
    base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    base_tokenizer.pad_token = base_tokenizer.eos_token
    base_tokenizer.padding_side = "left"
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=dtype,
        device_map="auto",
    )
    base_model.eval()
    BASE_MODEL = base_model
    BASE_TOKENIZER = base_tokenizer
else:
    base_model = BASE_MODEL
    base_tokenizer = BASE_TOKENIZER


@torch.no_grad()
def generate_answer_base(question: str) -> str:
    inputs = base_tokenizer(
        question.strip(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=base_tokenizer.model_max_length,
    ).to(base_model.device)
    outputs = base_model.generate(
        **inputs,
        max_new_tokens=max_gen_tokens,
        temperature=0.3,
        top_p=0.9,
        do_sample=True,
        pad_token_id=base_tokenizer.eos_token_id,
        use_cache=False,
    )
    decoded = base_tokenizer.decode(outputs[0], skip_special_tokens=True)
    if decoded.startswith(question):
        decoded = decoded[len(question):].strip()
    return decoded.strip()


records = []
for row in tqdm(sampled, desc="Evaluating (raw 50)", total=len(sampled)):
    question = build_question(row)
    show_model_input(row)  # log prompt

    reference_raw = (row.get("assistant") or "").strip()
    reference = " ".join(reference_raw.split())  # light whitespace cleanup only

    # Finetuned answer
    ft_answer = generate_answer(question)
    show_model_output(question, ft_answer)
    ft_score, ft_just = judge_answer(question, reference, ft_answer)

    # Base answer
    base_answer = generate_answer_base(question)
    base_score, base_just = judge_answer(question, reference, base_answer)

    records.append(
        {
            "user": row.get("user"),
            "reference_answer": reference,
            "ft_model_answer": ft_answer,
            "ft_judge_score": ft_score,
            "ft_judge_feedback": ft_just,
            "base_model_answer": base_answer,
            "base_judge_score": base_score,
            "base_judge_feedback": base_just,
        }
    )

# Compute comparison statistics
_df = pd.DataFrame(records)
ft_mean = _df["ft_judge_score"].mean()
base_mean = _df["base_judge_score"].mean()
delta_mean = ft_mean - base_mean
wins = (_df["ft_judge_score"] > _df["base_judge_score"]).mean()
ties = (_df["ft_judge_score"] == _df["base_judge_score"]).mean()
losses = (_df["ft_judge_score"] < _df["base_judge_score"]).mean()

stats = {
    "ft_mean_score": ft_mean,
    "base_mean_score": base_mean,
    "mean_score_delta_ft_minus_base": delta_mean,
    "ft_win_rate": wins,
    "tie_rate": ties,
    "ft_loss_rate": losses,
}
print("\nComparison stats (finetuned vs base):")
print(stats)

out_path = "/content/drive/MyDrive/financial_llama_models/eval_50_gpt_judged_raw.jsonl"
pd.DataFrame(records).to_json(out_path, orient="records", lines=True, force_ascii=False)
print(f"Saved judged results to {out_path}")
pd.DataFrame(records).head()