In [None]:
# Cell 1: Upload your prepared dataset files
#
# All three splits are required downstream. The upload prompt is skipped only
# when every one of them is already present in the working directory; a
# partial set triggers the upload dialog so the missing files can be supplied.
from google.colab import files
import os

REQUIRED_DATASET_FILES = [
    'sft_finance_train.json',
    'sft_finance_validation.json',
    'sft_finance_test.json',
]

print("üì§ Please upload your prepared SFT dataset files:")
print("1. sft_finance_train.json")
print("2. sft_finance_validation.json")
print("3. sft_finance_test.json")

# Check which files already exist and which are still needed
existing_files = [fname for fname in REQUIRED_DATASET_FILES if os.path.exists(fname)]
missing_files = [fname for fname in REQUIRED_DATASET_FILES if fname not in existing_files]

if existing_files:
    print(f"‚úÖ Found existing files: {existing_files}")

if not missing_files:
    print("Skipping upload...")
else:
    if existing_files:
        print(f"Still needed: {missing_files}")
    uploaded = files.upload()
    for filename, content in uploaded.items():
        print(f'‚úÖ Uploaded {filename} ({len(content)} bytes)')

    # Report anything that is still absent so the problem is visible here
    # rather than surfacing later as a silent switch to the fallback dataset.
    still_missing = [fname for fname in REQUIRED_DATASET_FILES if not os.path.exists(fname)]
    if still_missing:
        print(f"Warning: files still missing after upload: {still_missing}")

üì§ Please upload your prepared SFT dataset files:
1. sft_finance_train.json
2. sft_finance_validation.json
3. sft_finance_test.json


Saving sft_finance_validation.json to sft_finance_validation.json
Saving sft_finance_train.json to sft_finance_train.json
Saving sft_finance_test.json to sft_finance_test.json
‚úÖ Uploaded sft_finance_validation.json (62457 bytes)
‚úÖ Uploaded sft_finance_train.json (571737 bytes)
‚úÖ Uploaded sft_finance_test.json (83295 bytes)


In [None]:
# conceptual_sft_training.ipynb

# First, install required packages with correct versions
# NOTE(review): none of these installs pins a version, so a fresh runtime may
# resolve different releases than the ones this notebook was last run with.
# NOTE(review): trl is installed but not imported anywhere in this cell.
!pip install -q transformers accelerate
!pip install -q bitsandbytes
!pip install -q datasets
!pip install -q torch
!pip install -q peft
!pip install -q trl

# Imports used by the ConceptualSFTTrainer class and main() defined below.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import pandas as pd
import json
from google.colab import drive
import os
import numpy as np
from tqdm import tqdm

# Mount Google Drive
# The trainer below reads from and writes to /content/drive/MyDrive/financial_llm,
# so the mount has to happen before the class is instantiated.
drive.mount('/content/drive')

class ConceptualSFTTrainer:
    def __init__(self, base_model_name="meta-llama/Llama-2-7b-chat-hf"):
        self.base_model_name = base_model_name
        self.tokenizer = None
        self.model = None
        self.drive_path = "/content/drive/MyDrive/financial_llm"

        # Create directories in Drive
        os.makedirs(f"{self.drive_path}/models", exist_ok=True)
        os.makedirs(f"{self.drive_path}/results", exist_ok=True)
        os.makedirs(f"{self.drive_path}/data", exist_ok=True)

    def load_prepared_datasets(self):
        """Load the prepared datasets from your data preparation notebook"""
        print("üì• Loading prepared SFT datasets...")

        try:
            # Check if files exist in current directory first
            if os.path.exists('sft_finance_train.json'):
                with open('sft_finance_train.json', 'r') as f:
                    train_data = json.load(f)
                with open('sft_finance_validation.json', 'r') as f:
                    val_data = json.load(f)
                with open('sft_finance_test.json', 'r') as f:
                    test_data = json.load(f)
            else:
                # Try loading from Google Drive
                drive_data_path = f"{self.drive_path}/data"
                if os.path.exists(f"{drive_data_path}/sft_finance_train.json"):
                    with open(f"{drive_data_path}/sft_finance_train.json", 'r') as f:
                        train_data = json.load(f)
                    with open(f"{drive_data_path}/sft_finance_validation.json", 'r') as f:
                        val_data = json.load(f)
                    with open(f"{drive_data_path}/sft_finance_test.json", 'r') as f:
                        test_data = json.load(f)
                else:
                    print("‚ùå Prepared dataset files not found in local or Drive.")
                    print("üí° Using fallback dataset...")
                    return self._create_fallback_dataset()

            print(f"‚úÖ Loaded {len(train_data)} training, {len(val_data)} validation, {len(test_data)} test examples")

            return train_data, val_data, test_data

        except Exception as e:
            print(f"‚ùå Error loading datasets: {e}")
            return self._create_fallback_dataset()

    def _create_fallback_dataset(self):
        """Create a fallback dataset if prepared files aren't available"""
        print("üîÑ Creating fallback dataset from curated examples...")

        # Use comprehensive financial conceptual examples
        fallback_data = [
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "What is the difference between systematic risk and unsystematic risk?",
                "output": "Systematic risk, also known as market risk, affects the entire market and cannot be diversified away. Examples include interest rate changes, inflation, and political instability. Unsystematic risk, also known as specific risk, affects individual companies or industries and can be reduced through diversification. Examples include management changes, product recalls, or labor strikes.",
                "conceptual_score": 9
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "Explain the concept of time value of money and its importance in finance.",
                "output": "The time value of money (TVM) is the fundamental financial concept that money available today is worth more than the same amount in the future due to its potential earning capacity. This core principle forms the basis for discounted cash flow analysis, bond pricing, capital budgeting, and investment valuation. TVM accounts for interest rates, inflation, and opportunity costs in financial decision-making.",
                "conceptual_score": 9
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "What are the three main financial statements and how are they interconnected?",
                "output": "The three main financial statements are: 1) Income Statement - shows revenue, expenses, and profit over a period; 2) Balance Sheet - shows assets, liabilities, and equity at a point in time; 3) Cash Flow Statement - shows cash inflows and outflows. They interconnect: Net income from income statement flows to retained earnings on balance sheet and operating activities on cash flow statement. Balance sheet changes are reflected in cash flow statement.",
                "conceptual_score": 8
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "Describe the efficient market hypothesis and its three forms.",
                "output": "The Efficient Market Hypothesis (EMH) states that asset prices fully reflect all available information. The three forms are: 1) Weak form - prices reflect all historical market data, technical analysis is ineffective; 2) Semi-strong form - prices reflect all public information, fundamental analysis is ineffective; 3) Strong form - prices reflect all public and private information, even insider information cannot generate excess returns.",
                "conceptual_score": 8
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "What is the capital asset pricing model (CAPM) and its formula?",
                "output": "The Capital Asset Pricing Model (CAPM) calculates the expected return of an asset based on its systematic risk. The formula is: Expected Return = Risk-Free Rate + Beta √ó (Market Return - Risk-Free Rate). Where Beta measures the asset's volatility relative to the market. CAPM is used to determine appropriate required rates of return for assets given their risk levels.",
                "conceptual_score": 8
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "Explain the difference between active and passive portfolio management.",
                "output": "Active portfolio management involves making specific investments with the goal of outperforming a benchmark index through security selection, market timing, and tactical asset allocation. Passive management involves replicating a market index with minimal trading, focusing on diversification and low costs. Active management seeks alpha (excess returns) while passive management accepts market returns with lower fees.",
                "conceptual_score": 7
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "What are the key principles of corporate governance?",
                "output": "Key corporate governance principles include: 1) Rights of shareholders - protecting shareholder interests; 2) Equitable treatment - fair treatment of all shareholders; 3) Stakeholder interests - recognizing legal rights of stakeholders; 4) Disclosure and transparency - accurate and timely information; 5) Board responsibilities - strategic guidance and monitoring management; 6) Ethical behavior - maintaining integrity and ethical conduct.",
                "conceptual_score": 7
            },
            {
                "instruction": "Answer the following financial concept question clearly and comprehensively.",
                "input": "Describe the discounted cash flow (DCF) valuation method.",
                "output": "Discounted Cash Flow (DCF) valuation estimates an investment's value based on its expected future cash flows, discounted to their present value using an appropriate discount rate. The formula is: Value = Œ£ [CFt / (1 + r)^t] where CFt is cash flow in period t, r is discount rate, and t is time period. DCF accounts for the time value of money and risk through the discount rate.",
                "conceptual_score": 8
            }
        ]

        # Split into train/val/test (70/15/15)
        train_size = int(0.7 * len(fallback_data))
        val_size = int(0.15 * len(fallback_data))

        train_data = fallback_data[:train_size]
        val_data = fallback_data[train_size:train_size + val_size]
        test_data = fallback_data[train_size + val_size:]

        print(f"üîÑ Created fallback dataset: {len(train_data)} train, {len(val_data)} val, {len(test_data)} test")

        return train_data, val_data, test_data

    def format_for_training(self, data):
        """Format data for SFT training with proper instruction formatting"""
        formatted_data = []

        for item in data:
            # Create the training text in instruction-following format
            if "instruction" in item and "input" in item and "output" in item:
                text = f"### Instruction:\n{item['instruction']}\n\n### Input:\n{item['input']}\n\n### Response:\n{item['output']}"
            elif "question" in item and "answer" in item:
                text = f"### Instruction:\nAnswer the following financial concept question clearly and comprehensively.\n\n### Input:\n{item['question']}\n\n### Response:\n{item['answer']}"
            else:
                # Skip malformed items
                continue

            formatted_data.append({
                "text": text,
                "conceptual_score": item.get('conceptual_score', 5)
            })

        return formatted_data

    def load_model_and_tokenizer(self):
        """Load the base model in 4-bit, wrap it with LoRA adapters, and load the tokenizer.

        The results are stored on ``self.model`` and ``self.tokenizer``.

        Returns:
            tuple: ``(self.model, self.tokenizer)``.
        """
        print("üîÑ Loading LLaMA 2 model and tokenizer with LoRA...")

        # Configure 4-bit quantization for memory efficiency
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.base_model_name,
            trust_remote_code=True
        )

        # Add padding token if it doesn't exist
        # NOTE(review): reusing EOS as PAD means any step that masks PAD
        # positions in the labels also masks genuine EOS tokens.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model with quantization
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )

        # The KV cache is incompatible with gradient checkpointing; turning it
        # off here avoids relying on the Trainer to override it with a warning
        # on the first training step.
        self.model.config.use_cache = False

        # Prepare model for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)

        # Configure LoRA
        lora_config = LoraConfig(
            r=16,  # Rank
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        # Apply LoRA to model
        self.model = get_peft_model(self.model, lora_config)

        # Enable gradient checkpointing to save memory
        self.model.gradient_checkpointing_enable()

        # Print trainable parameters
        self.model.print_trainable_parameters()

        print(f"‚úÖ Model loaded with 4-bit quantization + LoRA on device: {self.model.device}")
        return self.model, self.tokenizer

    def tokenize_dataset(self, dataset):
        """Tokenize the dataset for training with proper padding"""

        def tokenize_function(examples):
            # Tokenize with padding and truncation
            tokenized = self.tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",  # Pad to max_length
                max_length=512,  # Fixed length for all sequences
                return_tensors=None,
            )

            # For causal LM, labels are the same as input_ids
            tokenized["labels"] = tokenized["input_ids"].copy()

            return tokenized

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset",
        )

        return tokenized_dataset

    def train_conceptual_sft(self, output_dir="conceptual_sft_model"):
        """Train the conceptual SFT model using prepared datasets with PEFT"""

        # Load prepared datasets
        train_data, val_data, test_data = self.load_prepared_datasets()

        if len(train_data) == 0:
            raise ValueError("No training data available!")

        print(f"üìä Dataset sizes: Train={len(train_data)}, Val={len(val_data)}, Test={len(test_data)}")

        # Format datasets
        train_formatted = self.format_for_training(train_data)
        val_formatted = self.format_for_training(val_data) if val_data else None

        # Convert to Hugging Face datasets
        train_dataset = Dataset.from_list(train_formatted)
        val_dataset = Dataset.from_list(val_formatted) if val_formatted else None

        print("üîß Tokenizing datasets...")
        tokenized_train = self.tokenize_dataset(train_dataset)
        tokenized_val = self.tokenize_dataset(val_dataset) if val_dataset else None

        print(f"üìà Tokenized dataset sizes: Train={len(tokenized_train)}, Val={len(tokenized_val) if tokenized_val else 0}")

        # Check sequence lengths
        if len(tokenized_train) > 0:
            lengths = [len(item['input_ids']) for item in tokenized_train]
            print(f"üìè Sequence lengths - Min: {min(lengths)}, Max: {max(lengths)}, Avg: {np.mean(lengths):.1f}")

        # Training arguments - optimized for PEFT
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_steps=50,
            logging_steps=10,
            save_steps=100,
            eval_steps=50 if tokenized_val else None,
            eval_strategy="steps" if tokenized_val else "no",
            save_total_limit=2,
            learning_rate=1e-4,
            fp16=True,
            remove_unused_columns=False,
            report_to="none",  # Disable wandb
            dataloader_pin_memory=False,
            load_best_model_at_end=True if tokenized_val else False,
            metric_for_best_model="eval_loss" if tokenized_val else None,
            greater_is_better=False,
        )

        # Use standard data collator with padding
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
        )

        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            data_collator=data_collator,
            tokenizer=self.tokenizer,  # Use tokenizer directly
        )

        # Train
        print("üöÄ Starting Conceptual SFT training with LoRA...")
        print(f"üìà Training for {len(tokenized_train)} examples over {training_args.num_train_epochs} epochs")

        train_result = trainer.train()

        # Save training metrics
        metrics = train_result.metrics
        print(f"üìä Training completed with loss: {metrics.get('train_loss', 'N/A')}")

        # Save model (only LoRA adapters)
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        # Save to Google Drive
        drive_output_dir = f"{self.drive_path}/models/{output_dir}"
        self.model.save_pretrained(drive_output_dir)
        self.tokenizer.save_pretrained(drive_output_dir)

        print(f"‚úÖ SFT model (LoRA adapters) saved to {drive_output_dir}")

        # Evaluate and save results
        evaluation_results = self.evaluate_sft_model(test_data, output_dir)

        return drive_output_dir, evaluation_results

    def evaluate_sft_model(self, test_data, model_path):
        """Evaluate the SFT model on test data"""
        print("üß™ Evaluating SFT model...")

        if not test_data:
            print("‚ùå No test data available for evaluation")
            return []

        # Load the base model and then apply the trained LoRA adapters
        try:
            from peft import PeftModel

            # Load base model
            base_model = AutoModelForCausalLM.from_pretrained(
                self.base_model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )

            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_path)

            # Load and apply LoRA adapters
            model = PeftModel.from_pretrained(base_model, model_path)
            model = model.merge_and_unload()  # Merge adapters with base model for inference

        except Exception as e:
            print(f"‚ùå Error loading model for evaluation: {e}")
            return []

        evaluation_results = []

        # Use a subset for quick evaluation
        test_subset = test_data[:min(5, len(test_data))]

        for i, test_item in enumerate(tqdm(test_subset, desc="Evaluating")):
            question = test_item.get('input', test_item.get('question', ''))
            ground_truth = test_item.get('output', test_item.get('answer', ''))

            if not question:
                continue

            # Create prompt
            prompt = f"### Instruction:\nAnswer the following financial concept question clearly and comprehensively.\n\n### Input:\n{question}\n\n### Response:\n"

            # Generate response
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id,
                    repetition_penalty=1.1,
                    early_stopping=True
                )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract just the generated part (after the prompt)
            generated_response = response[len(prompt):].strip()

            evaluation_results.append({
                'question': question,
                'ground_truth': ground_truth[:200] + "..." if len(ground_truth) > 200 else ground_truth,
                'generated_response': generated_response,
                'conceptual_score': test_item.get('conceptual_score', 5),
                'response_length': len(generated_response)
            })

        # Save evaluation results
        results_df = pd.DataFrame(evaluation_results)
        results_path = f"{self.drive_path}/results/sft_evaluation_results.csv"
        results_df.to_csv(results_path, index=False)

        print(f"‚úÖ Evaluation results saved to {results_path}")

        # Generate summary statistics
        self._generate_evaluation_summary(evaluation_results, results_path)

        return evaluation_results

    def _generate_evaluation_summary(self, results, results_path):
        """Generate summary statistics for evaluation"""
        if not results:
            return

        avg_length = np.mean([r['response_length'] for r in results])
        avg_score = np.mean([r['conceptual_score'] for r in results])

        summary = {
            'total_evaluated': len(results),
            'average_response_length': avg_length,
            'average_conceptual_score': avg_score,
            'evaluation_timestamp': pd.Timestamp.now().isoformat()
        }

        # Save summary
        summary_path = f"{self.drive_path}/results/sft_evaluation_summary.json"
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)

        print(f"üìä Evaluation Summary:")
        print(f"   - Examples evaluated: {summary['total_evaluated']}")
        print(f"   - Avg response length: {summary['average_response_length']:.1f} chars")
        print(f"   - Avg conceptual score: {summary['average_conceptual_score']:.1f}/10")
        print(f"   - Summary saved to: {summary_path}")

def main():
    """Run the full pipeline: load the model, fine-tune it, and report the outcome.

    Any exception raised during training or reporting is printed and then
    re-raised.
    """
    print("üéØ FINANCIAL CONCEPTUAL SFT TRAINING WITH LoRA")
    print("=" * 50)

    # Build the trainer and load the quantized model with its LoRA adapters
    pipeline = ConceptualSFTTrainer()
    pipeline.load_model_and_tokenizer()

    try:
        # Fine-tune, save and evaluate
        adapter_dir, eval_rows = pipeline.train_conceptual_sft()

        print(f"\n‚úÖ SFT TRAINING COMPLETED SUCCESSFULLY!")
        print(f"üìÅ LoRA adapters saved to: {adapter_dir}")
        print(f"üìä Evaluation completed on {len(eval_rows)} examples")

        # Preview the first generated answer, if there is one
        if eval_rows:
            first_row = eval_rows[0]
            print(f"\nüìù Sample Generated Response:")
            print(f"Question: {first_row['question']}")
            print(f"Generated: {first_row['generated_response'][:200]}...")

    except Exception as e:
        print(f"‚ùå Training failed: {e}")
        raise

# Execute if run directly
# In a notebook cell __name__ is "__main__", so running this cell starts the
# whole load/train/evaluate pipeline immediately.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üéØ FINANCIAL CONCEPTUAL SFT TRAINING WITH LoRA
üîÑ Loading LLaMA 2 model and tokenizer with LoRA...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898
‚úÖ Model loaded with 4-bit quantization + LoRA on device: cuda:0
üì• Loading prepared SFT datasets...
‚úÖ Loaded 99 training, 12 validation, 13 test examples
üìä Dataset sizes: Train=99, Val=12, Test=13
üîß Tokenizing datasets...


Tokenizing dataset:   0%|          | 0/99 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/12 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


üìà Tokenized dataset sizes: Train=99, Val=12
üìè Sequence lengths - Min: 512, Max: 512, Avg: 512.0
üöÄ Starting Conceptual SFT training with LoRA...
üìà Training for 99 examples over 5 epochs


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,0.6503,0.682429


üìä Training completed with loss: 0.7926281965695895
‚úÖ SFT model (LoRA adapters) saved to /content/drive/MyDrive/financial_llm/models/conceptual_sft_model
üß™ Evaluating SFT model...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:48<00:00,  9.64s/it]

‚úÖ Evaluation results saved to /content/drive/MyDrive/financial_llm/results/sft_evaluation_results.csv
üìä Evaluation Summary:
   - Examples evaluated: 5
   - Avg response length: 979.0 chars
   - Avg conceptual score: 8.4/10
   - Summary saved to: /content/drive/MyDrive/financial_llm/results/sft_evaluation_summary.json

‚úÖ SFT TRAINING COMPLETED SUCCESSFULLY!
üìÅ LoRA adapters saved to: /content/drive/MyDrive/financial_llm/models/conceptual_sft_model
üìä Evaluation completed on 5 examples

üìù Sample Generated Response:
Question: A 21-year-old student wants to pass the CFA Level 1 exam, which is a significant milestone in their career. This enthusiast is seeking help on studying for this finance exam. What are some general tips to get started with the 2024 CFA Level 1 studying?

1.  Create a study schedule and stick to it: Plan out your study sessions and commit to them. A consistent study schedule will help you stay on track and make the most of your study time. A well-planned 


