# HouseBrain Model Fine-Tuning on Google Colab (A100 Optimized)

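This notebook walks through fine-tuning the HouseBrain model on a Google Colab Pro+ A100 runtime. It includes a parallelized raw-draft generation step with a live dashboard for monitoring progress and worker status. Note that Step 3 only writes unvalidated drafts to `data/training/silver_standard_raw`; these are meant to be refined offline with `scripts/refine_drafts.py`, and Steps 4–5 read `data/training/silver_standard`, which this notebook does not create.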


In [None]:
# @title Step 1: Set Up the Environment
import os
# IMPORTANT: PASTE YOUR GITHUB PERSONAL ACCESS TOKEN HERE
GITHUB_TOKEN = ""
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN

# Clone the repository using your token
!git clone https://$GITHUB_TOKEN@github.com/Vinay-O/HouseBrainLLM.git housebrain_v1_1
%cd housebrain_v1_1

# Install necessary libraries
!pip install --upgrade transformers peft trl accelerate datasets bitsandbytes sentencepiece jsonschema pydantic


In [None]:
# @title Step 2: Authenticate with Hugging Face
import huggingface_hub

# Interactive login: the call below prompts for a Hugging Face token.
huggingface_hub.login()


In [None]:
# @title Step 3: Generate Raw Drafts for Human Refinement (A100 Optimized)
# This step now focuses on generating a large volume of unvalidated drafts as quickly as possible.
# The actual data refinement and validation will be done offline using the `refine_drafts.py` script.

# Install Ollama if not present
!if ! command -v ollama &> /dev/null; then curl -fsSL https://ollama.com/install.sh | sh; fi

import subprocess
import time
import requests
import glob
from IPython.display import clear_output
import os

# Start Ollama server in the background
with open("ollama_server.log", "w") as log_file:
    ollama_process = subprocess.Popen(["ollama", "serve"], stdout=log_file, stderr=subprocess.STDOUT)

print("üöÄ Starting Ollama server...")
time.sleep(5)

# Health check loop
print("... Waiting for Ollama server to become available...")
server_ready = False
for _ in range(36):
    try:
        response = requests.get("http://localhost:11434")
        if response.status_code == 200:
            print("‚úÖ Ollama server is up and running!")
            server_ready = True
            break
    except requests.exceptions.ConnectionError:
        time.sleep(5)

if server_ready:
    print("\\n‚è≥ Downloading deepseek-coder model...")
    !ollama pull deepseek-coder:6.7b-instruct
    print("‚úÖ Model download complete.")
    !ollama list

    print("\\n‚è≥ Starting Raw Draft generation (8 parallel workers)...")
    processes = []
    num_workers = 8
    num_examples = 200 # Let's generate a larger pool of raw drafts
    output_dir = "data/training/silver_standard_raw"

    # Ensure the output directory exists and is empty
    if os.path.exists(output_dir):
        get_ipython().system(f'rm -rf {output_dir}')
    os.makedirs(output_dir)

    for i in range(num_workers):
        command = f"python scripts/generate_raw_drafts.py --num-examples {num_examples} --num-workers {num_workers} --worker-id {i}"
        log_file = open(f"worker_{i}.log", "w")
        proc = subprocess.Popen(command, shell=True, stdout=log_file, stderr=subprocess.STDOUT)
        processes.append((proc, log_file))

    total_examples_to_generate = num_examples
    
    while any(p.poll() is None for p, _ in processes):
        clear_output(wait=True)
        generated_files = glob.glob(f"{output_dir}/*.json")
        progress_percentage = (len(generated_files) / total_examples_to_generate) * 100
        progress_bar = f"[{'#' * int(progress_percentage / 4)}{'.' * (25 - int(progress_percentage / 4))}]"
        
        print("--- Generating Raw Drafts ---")
        print(f"Progress: {progress_bar} {len(generated_files)}/{total_examples_to_generate} raw drafts generated ({progress_percentage:.2f}%)\\n")
        print("--- Live Worker Status (last 3 lines of logs) ---")
        get_ipython().system('tail -n 3 worker_*.log')
        
        time.sleep(20)
    
    # Final update
    clear_output(wait=True)
    generated_files = glob.glob(f"{output_dir}/*.json")
    print(f"--- Final Count ---")
    print(f"‚úÖ Generated a total of {len(generated_files)} raw drafts.")
    
    for proc, log_file in processes:
        proc.wait()
        log_file.close()

    print("\\n\\n‚úÖ All raw draft generation workers have finished.")
    print("NEXT STEP: Download the 'data/training/silver_standard_raw' directory and use 'scripts/refine_drafts.py' locally to create the final dataset.")

else:
    print("üî¥ Ollama server failed to start. Cannot proceed.")
    get_ipython().system('cat ollama_server.log')


In [None]:
# @title Step 4: Prepare All Datasets for Fine-Tuning
!python scripts/prepare_data_for_finetuning.py \
    --input-dir data/training/gold_standard \
    --output-dir data/training/gold_standard_finetune_ready

!python scripts/prepare_data_for_finetuning.py \
    --input-dir data/training/silver_standard \
    --output-dir data/training/silver_standard_finetune_ready


In [None]:
# @title Step 5: Run Fine-Tuning (A100 Optimized)
!python scripts/run_finetuning.py \
    --model_id "deepseek-ai/deepseek-coder-6.7b-instruct" \
    --dataset_path "data/training/gold_standard_finetune_ready" "data/training/silver_standard_finetune_ready" \
    --output_dir "models/housebrain-v1.0-silver" \
    --epochs 15 \
    --batch_size 4 \
    --learning_rate 2e-4 \
    --use_4bit


In [None]:
# @title Step 6: (Optional) Download the Trained Model Adapter
!zip -r housebrain-v1.0-silver-adapter.zip models/housebrain-v1.0-silver

from google.colab import files
files.download('housebrain-v1.0-silver-adapter.zip')
