In [12]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
from colorama import Fore, Style
from tqdm.auto import tqdm
import json

from colorama import init, Fore, Style
init(autoreset=True)

print(Fore.GREEN + "All libraries imported successfully!" + Style.RESET_ALL)

All libraries imported successfully!


In [13]:
TRAINING_CONFIG = {
    "base_model": "meta-llama/Llama-3.2-1B",
    "dataset_path": "../data/synthetic_QA_pairs/synthetic_output_assessment_results.json",
    "output_dir": "../checkpoints/bloombergTerminal-lora-llama3-1b",
}

print(Fore.CYAN + "Training Configuration:" + Style.RESET_ALL)
for key, value in TRAINING_CONFIG.items():
    print(f"  {key}: {value}")

Training Configuration:
  base_model: meta-llama/Llama-3.2-1B
  dataset_path: ../data/synthetic_QA_pairs/synthetic_output_assessment_results.json
  output_dir: ../checkpoints/bloombergTerminal-lora-llama3-1b


In [14]:
print(Fore.YELLOW + f"Loading dataset from: {TRAINING_CONFIG['dataset_path']}" + Style.RESET_ALL)

dataset = load_dataset('json', data_files=TRAINING_CONFIG['dataset_path'], split='train')

##Preview dataset
print(Fore.MAGENTA + "Dataset Preview:" + Style.RESET_ALL)
#dataset columns
print(f"Dataset columns: {dataset.column_names}")
print(f"Number of samples in dataset: {len(dataset)}")



print(Fore.CYAN + "\n🔍 Sample from dataset:" + Style.RESET_ALL)
sample = dataset[0]
for key, value in sample.items():
    print(f"  {key}: {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")

Loading dataset from: ../data/synthetic_QA_pairs/synthetic_output_assessment_results.json
Dataset Preview:
Dataset columns: ['question', 'answer', 'quality']
Number of samples in dataset: 3082

🔍 Sample from dataset:
  question: What is a Credit Default Swap (CDS) and why is its valuation complex?
  answer: A Credit Default Swap (CDS) is a financial instrument designed to hedge against credit risk, and the...
  quality: {'accuracy': {'explanation': 'The answer is mostly accurate but could be more precise or detailed. T...


In [None]:
## Tokenizer
hf_token = "ADD_YOUR_HUGGINGFACE_TOKEN_HERE"  # Replace with your actual token or set to None if not needed

tokenizer = AutoTokenizer.from_pretrained(
    TRAINING_CONFIG['base_model'], 
    token=hf_token, 
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(Fore.GREEN + "✅ Tokenizer loaded and configured!" + Style.RESET_ALL)
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Padding token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")

# Test tokenization
test_text = "Hello, how are you?"
tokens = tokenizer.tokenize(test_text)
print(f"Test tokenization of '{test_text}':")
print(f"Tokens: {tokens}")
print(f"Token IDs: {tokenizer.convert_tokens_to_ids(tokens)}")

NameError: name 'os' is not defined

In [18]:
def format_chat_template(batch, tokenizer):
    """
    Converts question-answer pairs into proper chat format for instruction following
    """
    system_prompt = "You are a helpful assistant specializing in the Bloomberg Terminal. Provide clear, accurate answers to questions about its functions and usage."
    
    questions = batch["question"]
    answers = batch["answer"]
    
    texts = []
    for question, answer in zip(questions, answers):
        # Create the conversation structure
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
        
        # Check if tokenizer has a chat template
        if tokenizer.chat_template is not None:
            # Use the tokenizer's built-in chat template
            formatted_text = tokenizer.apply_chat_template(messages, tokenize=False)
        else:
            # Manually format using Llama-3 style template
            formatted_text = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
            formatted_text += f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|>"
            formatted_text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{answer}<|eot_id|>"
        
        texts.append(formatted_text)
        
    return {"text": texts}

# Test the function with one example
test_batch = {
    "question": [dataset[0]["question"]], 
    "answer": [dataset[0]["answer"]]
}
test_result = format_chat_template(test_batch, tokenizer)
print("📝 Chat template example:")
print(test_result["text"][0])

📝 Chat template example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant specializing in the Bloomberg Terminal. Provide clear, accurate answers to questions about its functions and usage.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is a Credit Default Swap (CDS) and why is its valuation complex?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A Credit Default Swap (CDS) is a financial instrument designed to hedge against credit risk, and the valuation of CDS is complex due to various factors including default probability, loss amount, recovery rate, and timing of default.<|eot_id|>


In [None]:
dataset[0]

{'question': 'What is a Credit Default Swap (CDS) and why is its valuation complex?',
 'answer': 'A Credit Default Swap (CDS) is a financial instrument designed to hedge against credit risk, and the valuation of CDS is complex due to various factors including default probability, loss amount, recovery rate, and timing of default.',
 'quality': {'accuracy': {'explanation': 'The answer is mostly accurate but could be more precise or detailed. The key factors affecting CDS valuation are correctly mentioned, but the explanation lacks depth and does not provide a clear example or scenario to illustrate its complexity.',
   'score': 9},
  'style': {'explanation': 'The style is excellent, professional, and perfectly clear. The answer provides a concise yet accurate description of CDS valuation complexity, making it easy for readers to understand the topic.',
   'score': 10}}}

In [19]:
print("Applying chat template to entire dataset...")
train_dataset = dataset.map(
    lambda batch: format_chat_template(batch, tokenizer), 
    batched=True, 
    desc="Formatting dataset with chat template"
)

print(f"   Number of examples: {len(train_dataset)}")
print(f"   New features: {train_dataset.features}")

Applying chat template to entire dataset...


Formatting dataset with chat template: 100%|██████████| 3082/3082 [00:00<00:00, 167591.63 examples/s]


   Number of examples: 3082
   New features: {'question': Value('string'), 'answer': Value('string'), 'quality': {'accuracy': {'explanation': Value('string'), 'score': Value('int64')}, 'style': {'explanation': Value('string'), 'score': Value('int64')}}, 'text': Value('string')}


In [20]:
text_lengths = [len(text) for text in train_dataset["text"]]
print(f"   Average length: {sum(text_lengths)/len(text_lengths):.0f} characters")
print(f"   Min length: {min(text_lengths)} characters")
print(f"   Max length: {max(text_lengths)} characters")

   Average length: 525 characters
   Min length: 365 characters
   Max length: 866 characters


In [22]:
print(f"\nExample of formatted training text:")
print("="*50)
print(train_dataset[0]['text'])
print("="*50)


Example of formatted training text:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant specializing in the Bloomberg Terminal. Provide clear, accurate answers to questions about its functions and usage.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is a Credit Default Swap (CDS) and why is its valuation complex?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A Credit Default Swap (CDS) is a financial instrument designed to hedge against credit risk, and the valuation of CDS is complex due to various factors including default probability, loss amount, recovery rate, and timing of default.<|eot_id|>


In [24]:
## Configure 4 bit quantization

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_type=torch.float16
)

print(f"   4-bit loading: {quant_config.load_in_4bit}")
print(f"   Double quantization: {quant_config.bnb_4bit_use_double_quant}")
print(f"   Quantization type: {quant_config.bnb_4bit_quant_type}")
print(f"   Compute dtype: {quant_config.bnb_4bit_compute_dtype}")

   4-bit loading: True
   Double quantization: True
   Quantization type: nf4
   Compute dtype: torch.float32


In [25]:
## Load the base model
model = AutoModelForCausalLM.from_pretrained(
    TRAINING_CONFIG['base_model'],
    device_map="auto",
    quantization_config=quant_config,
    token=hf_token,
    cache_dir="./model_cache",
)

print(f"   Model type: {type(model).__name__}")
print(f"   Device map: {model.hf_device_map if hasattr(model, 'hf_device_map') else 'Not available'}")

The 8-bit optimizer is not available on your device, only available on CUDA for now.


   Model type: LlamaForCausalLM
   Device map: {'': 'cpu'}


In [26]:
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")

   Total parameters: 749,275,136
   Trainable parameters: 262,735,872


In [27]:
## Enable Gradient Checkpointing
model.gradient_checkpointing_enable()

## Enable kbit training
model = prepare_model_for_kbit_training(model)

# Check what changed
trainable_params_after = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"After preparation:")
print(f"Trainable parameters: {trainable_params_after:,}")

After preparation:
Trainable parameters: 0
