# HouseBrain Architect - Fine-Tuning on Colab

This notebook fine-tunes the `Qwen/Qwen2.5-3B-Instruct` model on our curated house plan dataset.

**Instructions:**

1.  **Set up the GPU:** Go to `Runtime` > `Change runtime type` and select `T4 GPU` or another available GPU.
2.  **Run each cell in order:** Execute the cells one by one from top to bottom.
3.  **Hugging Face Token (Optional but Recommended):** The third code cell ("Imports & Login") reads a Hugging Face token from Colab Secrets and logs in with it; it does not prompt you. While a token is not always required for public models, it's good practice. You can create a token in your Hugging Face account settings and add it as a "Secret" in Colab (click the 🔑 icon on the left). Name the secret `HF_TOKEN`.
4.  **Download the result:** After the final cell runs, a file named `housebrain_v1_adapters.zip` will appear inside the `HouseBrainLLM` folder in the file browser on the left. Right-click it and select `Download` to save your trained model adapters.


In [None]:
# --- 1. Clone the Repository ---
# This will download your project files, including the necessary datasets.

!git clone https://github.com/Vinay-O/HouseBrainLLM.git
%cd HouseBrainLLM

print("✅ Repository cloned and directory changed.")


In [None]:
# --- 2. Install Dependencies ---
# This multi-step process ensures that all libraries, especially torch, bitsandbytes,
# and triton, are installed and compiled correctly for the Colab A100 GPU environment.

# Step A: Install the main application libraries
!pip install -U "transformers==4.41.2" "peft==0.10.0" "accelerate==0.30.1" "datasets==2.19.1"

# Step B: Force a re-install of torch and a compatible triton from the PyTorch CUDA 12.1 index
# This is the critical step for A100 GPUs.
!pip install -U --force-reinstall "torch==2.3.1" "triton==2.3.1" --index-url https://download.pytorch.org/whl/cu121

# Step C: Install the correct version of bitsandbytes
!pip install -U "bitsandbytes==0.43.1"


In [None]:
# --- 3. Imports & Login ---

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig
import os

# Optional: Login to Hugging Face
from huggingface_hub import login
from google.colab import userdata

# Best-effort Hugging Face login: read the token from the Colab secret named
# HF_TOKEN and authenticate with it. Any failure is reported and the notebook
# continues without being logged in.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("✅ Successfully logged into Hugging Face.")
except Exception as e:
    # Show why the login was skipped instead of discarding the exception, so a
    # rejected token can be told apart from a missing secret.
    print(f"⚠️ Hugging Face login skipped ({type(e).__name__}: {e})")
    print("Proceeding without Hugging Face login. Add a secret named HF_TOKEN if you encounter download issues.")


In [None]:
# --- 4. Load and Prepare Datasets ---

# Base checkpoint that is fine-tuned; the same name is used to load the model later.
model_name = "Qwen/Qwen2.5-3B-Instruct"

# Locations of the train / validation JSONL files inside the cloned repository.
train_file_path = "/content/HouseBrainLLM/finetune_dataset_train.jsonl"
val_file_path = "/content/HouseBrainLLM/finetune_dataset_val.jsonl"

# The tokenizer is created before the data is processed because the
# formatting step needs its chat template.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def _load_jsonl(path):
    """Load one JSONL file as a single dataset (requested as the "train" split)."""
    return load_dataset("json", data_files=path, split="train")


train_dataset = _load_jsonl(train_file_path)
val_dataset = _load_jsonl(val_file_path)

# System prompt prepended to every training conversation by default.
SYSTEM_PROMPT = "You are a world-class AI architect. Generate a detailed and accurate JSON representation of a house floor plan based on the user's request."

# Default maximum number of tokens kept per formatted conversation; longer ones are truncated.
MAX_SEQ_LENGTH = 1024


def formatting_and_tokenizing_func(examples, system_prompt=SYSTEM_PROMPT, max_length=MAX_SEQ_LENGTH):
    """Format a batch of conversations with the chat template and tokenize them.

    Args:
        examples: A batch with a "messages" column; each entry is a list of
            dicts carrying "role" and "content" keys.
        system_prompt: System message placed before each user prompt.
        max_length: Token limit passed to the tokenizer (truncation is enabled).

    Returns:
        The tokenizer output for the kept conversations (unpadded). Conversations
        lacking a non-empty user prompt or a non-empty assistant response are
        skipped, so the output may contain fewer rows than the input batch.

    Notes:
        Only the last "user" and the last "assistant" message of each
        conversation are used; any other roles in the source data are ignored.
        Relies on the module-level `tokenizer`.
    """
    texts = []
    for msg_list in examples["messages"]:
        prompt = ""
        response = ""
        for msg in msg_list:
            # Later messages overwrite earlier ones, so the last of each role wins.
            if msg["role"] == "user":
                prompt = msg["content"]
            elif msg["role"] == "assistant":
                response = msg["content"]

        if not (prompt and response):
            continue

        # Use the model's chat template for correct formatting
        formatted_chat = tokenizer.apply_chat_template(
            [{"role": "system", "content": system_prompt},
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": response}],
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(formatted_chat)

    if not texts:
        # Tokenizers may fail when handed an empty batch; return empty columns
        # instead so a batch without any usable conversation is simply dropped.
        return {name: [] for name in tokenizer.model_input_names}

    # Tokenize the formatted text
    return tokenizer(texts, truncation=True, max_length=max_length, padding=False)

def _tokenize_split(split):
    """Run the formatting/tokenizing function over one split, dropping its original columns."""
    return split.map(
        formatting_and_tokenizing_func,
        batched=True,
        remove_columns=split.column_names,
    )


# Replace both raw splits with their tokenized counterparts.
train_dataset = _tokenize_split(train_dataset)
val_dataset = _tokenize_split(val_dataset)

print("✅ Datasets loaded and prepared.")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")


In [None]:
# --- 5. Configure and Load Model ---

# peft is already a dependency of this notebook; imported here so this cell
# stays self-contained.
from peft import prepare_model_for_kbit_training

# Choose the 4-bit compute dtype from the GPU that is actually attached:
# bfloat16 on compute capability >= 8 (Ampere and newer), float16 otherwise.
# A T4 (the GPU suggested in the instructions) has no native bfloat16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# Configure 4-bit quantization to save memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with our quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)

# PEFT's documented preparation step for training on a quantized model.
# By default it also turns on gradient checkpointing, trading some compute
# for lower activation memory.
model = prepare_model_for_kbit_training(model)

# Configure LoRA for efficient fine-tuning: rank-16 adapters on the attention
# projection layers only; bias terms are left untouched.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

print("✅ Model configured with 4-bit quantization and LoRA.")
model.print_trainable_parameters()


In [None]:
# --- 6. Start Fine-Tuning ---

# Directory (relative to the current working directory) that receives
# checkpoints and the final adapters.
output_dir = "housebrain_v1_adapters"

# Mixed-precision mode follows the hardware: bf16 on GPUs with compute
# capability >= 8, fp16 otherwise (e.g. T4). Exactly one of the two is enabled.
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    # Keep evaluation at the training batch size instead of the library
    # default, so evaluation does not need more memory per step than training.
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # effective train batch size per device: 2 * 4 = 8
    learning_rate=2e-4,
    logging_steps=10,
    num_train_epochs=1,
    save_strategy="epoch",
    evaluation_strategy="epoch",  # must match save_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    bf16=bf16_supported,
    fp16=not bf16_supported,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Causal-LM collation (mlm=False): pads each batch dynamically and builds
    # the labels from the input ids.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

print("🚀 Starting training...")
trainer.train()

print("✅ Fine-tuning complete.")
trainer.save_model(output_dir)
print(f"💾 Final model adapters saved to '{output_dir}'.")


In [None]:
# --- 7. Package and Download ---
import shutil

# Bundle the contents of the adapters folder into a zip archive of the same
# name, created in the current working directory.
archive_path = shutil.make_archive(
    base_name="housebrain_v1_adapters",
    format="zip",
    root_dir="housebrain_v1_adapters",
)

print(f"✅ Model adapters saved and zipped at: {archive_path}")
print("You can now download 'housebrain_v1_adapters.zip' from the Files panel on the left.")
