# HouseBrain Model Fine-Tuning on Google Colab (A100 Optimized)

This notebook provides the definitive workflow for fine-tuning the HouseBrain model using a Google Colab Pro+ A100 environment. It includes a parallelized data generation step to maximize resource utilization and speed.


In [None]:
# @title Step 1: Set Up the Environment
import os
GITHUB_TOKEN = "" # PASTE YOUR GITHUB TOKEN HERE
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN

# Clone the repository using your token
!git clone https://$GITHUB_TOKEN@github.com/Vinay-O/HouseBrainLLM.git housebrain_v1_1
%cd housebrain_v1_1

# Install necessary libraries
!pip install --upgrade transformers peft trl accelerate datasets bitsandbytes sentencepiece jsonschema pydantic


In [None]:
# @title Step 2: Authenticate with Hugging Face
import huggingface_hub

# Start the Hugging Face login flow for this session.
huggingface_hub.login()


In [None]:
# @title Step 3: Generate "Silver Standard" Dataset in Parallel (A100 Optimized)

# Install Ollama if not present
!if ! command -v ollama &> /dev/null; then curl -fsSL https://ollama.com/install.sh | sh; fi

import subprocess
import time
import requests
import glob

# Start Ollama server in the background
with open("ollama_server.log", "w") as log_file:
    ollama_process = subprocess.Popen(["ollama", "serve"], stdout=log_file, stderr=subprocess.STDOUT)

print("🚀 Starting Ollama server...")
time.sleep(5)

# Health check loop
print("... Waiting for Ollama server to become available...")
server_ready = False
for _ in range(36): # Wait up to 3 minutes
    try:
        response = requests.get("http://localhost:11434")
        if response.status_code == 200:
            print("✅ Ollama server is up and running!")
            server_ready = True
            break
    except requests.exceptions.ConnectionError:
        time.sleep(5)

if server_ready:
    print("\\n⏳ Downloading deepseek-coder model...")
    !ollama pull deepseek-coder:6.7b-instruct
    print("✅ Model download complete.")
    !ollama list

    print("\\n⏳ Starting Silver Standard data generation (8 parallel workers)...")
    processes = []
    num_workers = 8
    num_examples = 100
    # Ensure the output directory exists
    !mkdir -p data/training/silver_standard

    for i in range(num_workers):
        command = f"python scripts/generate_silver_standard_data.py --num-examples {num_examples} --num-workers {num_workers} --worker-id {i}"
        log_file = open(f"worker_{i}.log", "w")
        proc = subprocess.Popen(command, shell=True, stdout=log_file, stderr=subprocess.STDOUT)
        processes.append((proc, log_file))

    print("\\n-- Monitoring data generation progress --")
    total_examples_to_generate = num_examples

    # Loop until all worker processes have completed
    while any(p.poll() is None for p, _ in processes):
        # Count the number of .json files in the output directory
        generated_files = glob.glob("data/training/silver_standard/*.json")
        
        # Calculate and display progress
        progress_percentage = (len(generated_files) / total_examples_to_generate) * 100
        progress_bar = f"[{'#' * int(progress_percentage / 4)}{'.' * (25 - int(progress_percentage / 4))}]"
        print(f"Progress: {progress_bar} {len(generated_files)}/{total_examples_to_generate} files generated ({progress_percentage:.2f}%)", end='\\r')
        
        # Wait for 30 seconds before the next update
        time.sleep(30)
    
    # Final check to ensure we print 100% completion
    generated_files = glob.glob("data/training/silver_standard/*.json")
    progress_percentage = (len(generated_files) / total_examples_to_generate) * 100
    progress_bar = f"[{'#' * int(progress_percentage / 4)}{'.' * (25 - int(progress_percentage / 4))}]"
    print(f"Progress: {progress_bar} {len(generated_files)}/{total_examples_to_generate} files generated ({progress_percentage:.2f}%)")

    # Final wait to ensure all processes are truly finished and logs are written and closed
    for proc, log_file in processes:
        proc.wait()
        log_file.close()

    print("\\n\\n✅ All data generation workers have finished.")

else:
    print("🔴 Ollama server failed to start. Cannot proceed.")
    !cat ollama_server.log


In [None]:
# @title Step 4: Prepare All Datasets for Fine-Tuning
# Prepare the Gold Standard dataset
!python scripts/prepare_data_for_finetuning.py \
    --input-dir data/training/gold_standard \
    --output-dir data/training/gold_standard_finetune_ready

# Prepare the newly generated Silver Standard dataset
!python scripts/prepare_data_for_finetuning.py \
    --input-dir data/training/silver_standard \
    --output-dir data/training/silver_standard_finetune_ready


In [None]:
# @title Step 5: Run Fine-Tuning (A100 Optimized)
!python scripts/run_finetuning.py \
    --model_id deepseek-ai/deepseek-coder:6.7b-instruct \
    --dataset_path data/training/gold_standard_finetune_ready data/training/silver_standard_finetune_ready \
    --output_dir models/housebrain-v1.0-silver \
    --epochs 15 \
    --batch_size 4 \
    --learning_rate 0.0002 \
    --use_4bit


In [None]:
# @title Step 6: (Optional) Download the Trained Model
!zip -r housebrain-v1.0-silver-adapter.zip models/housebrain-v1.0-silver

from google.colab import files
files.download('housebrain-v1.0-silver-adapter.zip')
