In [None]:
# @title ## 1. Setup Environment
# @markdown Mount Google Drive and clone the repository using a secure token.
from google.colab import drive
import os
import getpass
import subprocess

# Mount Google Drive so later cells can read the prompt list and persist plans.
drive.mount('/content/drive')
print("✅ Google Drive mounted.")

# --- GitHub Setup ---
#@markdown Enter your GitHub Personal Access Token (PAT) with repo access.
GITHUB_TOKEN = getpass.getpass('Enter your GitHub PAT: ')
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/Vinay-O/HouseBrainLLM.git"
REPO_DIR = "/content/HouseBrainLLM"


def _run_git(args, secret=""):
    """Run ``git <args>`` without a shell and fail loudly without echoing ``secret``.

    Args:
        args: list of git arguments, e.g. ``["clone", url, target_dir]``.
        secret: text (the PAT) that must be masked in anything that is printed.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero. The command stored on
            the exception is reduced to the sub-command name, because the default
            message would otherwise print the clone URL and the PAT inside it.
    """
    result = subprocess.run(["git", *args], capture_output=True, text=True)
    if result.returncode != 0:
        stderr = result.stderr.replace(secret, "***") if secret else result.stderr
        print(stderr.strip())
        raise subprocess.CalledProcessError(result.returncode, ["git", args[0], "..."])


# Clone the repository, or update the existing checkout.
if os.path.exists(REPO_DIR):
    print("Repository already exists. Pulling latest changes...")
    # `git -C` replaces the former `cd ... &&` shell chain.
    _run_git(["-C", REPO_DIR, "pull"], secret=GITHUB_TOKEN)
else:
    print("Cloning repository...")
    _run_git(["clone", REPO_URL, REPO_DIR], secret=GITHUB_TOKEN)

print("✅ Repository is ready.")

# --- Install Dependencies ---
#@markdown Install necessary Python packages from the new requirements file.
import sys


def _pip_install(*pip_args):
    """Install packages into the interpreter that runs this kernel.

    pip is invoked as ``sys.executable -m pip`` so the packages land in the
    kernel's own environment, and the exit status is checked so a failed
    install is never reported as a success.

    Raises:
        RuntimeError: if pip exits with a non-zero status.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", *pip_args],
        capture_output=True,
        text=True,
    )
    for stream in (result.stdout, result.stderr):
        if stream.strip():
            print(stream.strip())
    if result.returncode != 0:
        raise RuntimeError(f"pip install failed with exit status {result.returncode}")


requirements_path = os.path.join(REPO_DIR, "requirements.txt")
if os.path.exists(requirements_path):
    print("Installing dependencies from requirements.txt...")
    _pip_install("-r", requirements_path)
    print("✅ Dependencies installed.")
else:
    print("⚠️ requirements.txt not found. Installing default packages.")
    _pip_install("pydantic")

print("✅ Environment setup complete.")


In [None]:
# @title ## 2. Configure and Start Ollama Server
# @markdown This cell will download and start the Ollama server, then pull the specified model.
# @markdown **NOTE:** A powerful model like `deepseek-r1:32b` is now recommended for higher quality results. It will be slower but more reliable.

# Model tag handed to `ollama pull` further down in this cell. The trailing
# `@param` annotation must stay on the same line as the assignment.
MODEL_NAME = "deepseek-r1:32b" # @param ["deepseek-r1:32b", "llama3:70b-instruct", "qwen2:72b-instruct", "mixtral:instruct"]

# Download and install Ollama (the server itself is started further down in this cell).
# The installer script is fetched from ollama.com and piped straight into `sh`;
# curl's -f flag makes an HTTP error fail instead of piping an error page to the shell.
!curl -fsSL https://ollama.com/install.sh | sh
import threading
import subprocess
import time

def run_ollama():
    """Run ``ollama serve`` in the foreground of the calling thread.

    Blocks until the server process exits. A non-zero exit is reported by
    printing the captured stderr; nothing is raised and nothing is returned.
    """
    serve_options = dict(shell=True, check=True, capture_output=True, text=True)
    try:
        subprocess.run("ollama serve", **serve_options)
    except subprocess.CalledProcessError as serve_error:
        captured_stderr = serve_error.stderr
        print(f"Ollama server failed: {captured_stderr}")

import urllib.error
import urllib.request

# Endpoint polled to detect that the server accepts requests, and how long to wait for it.
OLLAMA_HEALTH_URL = "http://localhost:11434/api/version"
OLLAMA_STARTUP_TIMEOUT_S = 60

print("🚀 Starting Ollama server in the background...")
ollama_thread = threading.Thread(target=run_ollama, daemon=True)
ollama_thread.start()

# Wait for the server to be ready. Polling replaces a fixed sleep: the model pull
# starts as soon as the server answers, and a server that never comes up is
# reported here instead of as a confusing failure in the pull step.
print("⏳ Waiting for Ollama server to initialize...")
ollama_ready = False
startup_deadline = time.monotonic() + OLLAMA_STARTUP_TIMEOUT_S
while time.monotonic() < startup_deadline:
    try:
        with urllib.request.urlopen(OLLAMA_HEALTH_URL, timeout=2) as health_response:
            if health_response.status == 200:
                ollama_ready = True
                break
    except (urllib.error.URLError, OSError):
        # Expected while the server is still booting; keep polling.
        pass
    if not ollama_thread.is_alive():
        # `ollama serve` already exited and nothing answers on the port: stop waiting.
        break
    time.sleep(1)

if ollama_ready:
    print("✅ Ollama server is responding.")
else:
    print(f"⚠️ Ollama server did not respond within {OLLAMA_STARTUP_TIMEOUT_S}s; the model pull below will likely fail.")

# Pull the model
print(f"📦 Pulling model: {MODEL_NAME}. This may take a while...")
try:
    process = subprocess.run(
        f"ollama pull {MODEL_NAME}",
        shell=True, check=True, capture_output=True, text=True, timeout=900
    )
    print(f"✅ Model {MODEL_NAME} is ready.")
except subprocess.CalledProcessError as e:
    print(f"Error pulling model: {e.stderr}")
    print("This might happen if the model name is incorrect or the Ollama server is not ready.")
except subprocess.TimeoutExpired:
    print("Timed out while pulling the model. The model might be very large or the connection slow.")


# Verify Ollama is running
!ollama list


In [None]:
# @title ## 3. Run the Data Factory (V4 - Self-Healing Pipeline)
# @markdown This cell now uses a single-shot generation approach coupled with a powerful Python-based "Schema Healer" to fix errors deterministically.
import sys
import os
import random
import json
import time
import re
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
from pydantic import ValidationError
import hashlib

# Add the cloned repo's src directory to the Python path
REPO_DIR = "/content/HouseBrainLLM"
_housebrain_src = os.path.join(REPO_DIR, 'src')
# Guarded so re-running this cell does not keep appending the same entry.
if _housebrain_src not in sys.path:
    sys.path.append(_housebrain_src)
try:
    from housebrain.schema import HouseOutput, RoomType
    print("✅ Successfully imported HouseBrain schema.")
except ImportError as e:
    print(f"❌ Failed to import HouseBrain schema: {e}")

# --- Configuration ---
BATCH_SIZE = 10 # @param {type:"integer"}
MASTER_PROMPT_LIST_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt" #@param {type:"string"}
OUTPUT_DIR = "/content/drive/MyDrive/housebrain_platinum_dataset" # @param {type:"string"}
# Reuse the model selected in Cell 2 when that cell has run in this kernel, so the
# generator never asks Ollama for a model that was not pulled. The literal is only
# the fallback for a kernel where Cell 2 was skipped.
MODEL_NAME = globals().get("MODEL_NAME", "deepseek-r1:32b")

os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate_file_hash(prompt_text):
    """Return a short, stable identifier for a prompt.

    The prompt is UTF-8 encoded and hashed with SHA-256; the first 16 hex
    characters of the digest are returned.
    """
    hasher = hashlib.sha256()
    hasher.update(prompt_text.encode('utf-8'))
    full_digest = hasher.hexdigest()
    return full_digest[:16]

# --- Master Prompt (Single-Shot) ---
# This text is rendered with str.format(user_prompt=...), so every literal brace
# must be doubled ({{ / }}). `{user_prompt}` is the only real placeholder.
# Instruction 3 previously used single braces, which made .format() raise KeyError.
MASTER_PROMPT_TEMPLATE = """
You are a world-class architectural AI assistant. Your task is to generate a complete, valid house plan in a single JSON object based on the user's request.

**CRITICAL INSTRUCTIONS:**
1.  **Generate EVERYTHING at once:** You must output the entire house plan, including all levels, rooms, doors, and windows, in a single JSON response.
2.  **Follow the Schema Loosely:** It is okay to use creative but descriptive field names (e.g., "name" for "room_type", or "coordinates" for "bounds"). Another script will fix them.
3.  **Bounding Box Flexibility:** You can define room bounds using `{{"x", "y", "width", "height"}}` OR `{{"x1", "y1", "x2", "y2"}}`. The script will convert it.
4.  **Output ONLY JSON:** Your entire response must be a single, raw JSON object. Do not include any text, markdown, or explanations before or after the JSON.

**User Prompt:**
{user_prompt}

**Example JSON Structure (for guidance):**
```json
{{
  "levels": [
    {{
      "level_number": 0,
      "name": "Ground Floor",
      "rooms": [
        {{
          "id": "living_room_0",
          "name": "Living Room",
          "bounds": {{"x1": 0, "y1": 0, "x2": 20, "y2": 15}},
          "doors": [ ... ],
          "windows": [ ... ]
        }}
      ]
    }}
  ]
}}
```
"""

# --- Helper Functions ---

def call_ollama_colab(model, prompt, max_retries=3, delay=5):
    """Send one non-streaming generation request to the local Ollama server.

    Args:
        model: model tag to generate with.
        prompt: full prompt text.
        max_retries: number of attempts before giving up.
        delay: seconds to sleep between failed attempts.

    Returns:
        The stripped ``"response"`` field of the reply (an empty string when the
        field is absent), or ``None`` when every attempt failed.
    """
    endpoint_url = "http://localhost:11434/api/generate"
    request_headers = {"Content-Type": "application/json"}
    payload = {"model": model, "prompt": prompt, "stream": False}

    attempt_number = 0
    while attempt_number < max_retries:
        attempt_number += 1
        try:
            encoded_payload = json.dumps(payload).encode("utf-8")
            req = Request(endpoint_url, data=encoded_payload, headers=request_headers, method="POST")
            with urlopen(req) as response:
                reply = json.loads(response.read().decode("utf-8"))
                generated_text = reply.get("response", "")
                return generated_text.strip()
        except Exception as e:
            print(f"ERROR calling Ollama on attempt {attempt_number}: {e}")
            # Pause only when another attempt is still to come.
            if attempt_number < max_retries:
                time.sleep(delay)
    return None

def repair_json(text):
    """Find and parse the first JSON object embedded in a model response.

    The response is scanned left to right. At every ``{`` a real JSON decode is
    attempted, and the first position that yields a complete object wins. Unlike
    a greedy "first ``{`` to last ``}``" match, this tolerates stray braces in
    prose before or after the object and trailing markdown fences.

    Args:
        text: raw model output.

    Returns:
        The parsed object as a dict, or ``None`` when no position in the text
        starts a valid JSON object.
    """
    print("Attempting to extract JSON from model output...")

    # Reasoning models may wrap their chain of thought in <think>...</think>.
    # Drop it first so an object sketched while "thinking" is not mistaken for the answer.
    candidate_text = re.sub(r'<think>[\s\S]*?</think>', '', text)

    decoder = json.JSONDecoder()
    last_error = None
    start = candidate_text.find('{')
    while start != -1:
        try:
            # raw_decode parses one value starting at `start` and ignores whatever follows it.
            parsed, _end = decoder.raw_decode(candidate_text, start)
        except json.JSONDecodeError as e:
            last_error = e
            start = candidate_text.find('{', start + 1)
            continue
        print("✅ Successfully extracted and parsed JSON.")
        return parsed

    if last_error is None:
        print("❌ No JSON object found in the text.")
    else:
        print(f"❌ Failed to parse the extracted JSON blob: {last_error}")
    return None

def _normalize_room_label(label):
    """Reduce a free-form room label to the lower-case style used as room-type map keys."""
    text = re.sub(r"[_\-]+", " ", str(label).lower())
    text = re.sub(r"\s+", " ", text).strip()
    # Drop a trailing ordinal so "Bedroom 2", "bath #1" and "living_room_0" still match.
    return re.sub(r"\s*#?\s*\d+$", "", text).strip()


def _lookup_room_type(raw_room, room_type_map, default):
    """Return the first mapped room type found under 'name', 'type' or 'room_type'."""
    for key in ("name", "type", "room_type"):
        if key in raw_room:
            mapped = room_type_map.get(_normalize_room_label(raw_room[key]))
            if mapped is not None:
                return mapped
    return default


def _heal_bounds(raw_bounds):
    """Convert corner-style bounds (x1/y1/x2/y2) to x/y/width/height.

    Anything that is not a dict holding all four numeric corners is returned
    unchanged, leaving it to schema validation to accept or reject.
    """
    corner_keys = ("x1", "y1", "x2", "y2")
    if not isinstance(raw_bounds, dict) or not all(k in raw_bounds for k in corner_keys):
        return raw_bounds
    x1, y1, x2, y2 = (raw_bounds[k] for k in corner_keys)
    if not all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in (x1, y1, x2, y2)):
        return raw_bounds
    return {
        # Width and height are absolute, so the origin must be the smaller corner
        # even when the model listed the corners in reverse order.
        "x": min(x1, x2),
        "y": min(y1, y2),
        "width": abs(x2 - x1),
        "height": abs(y2 - y1),
    }


def heal_and_convert_plan(raw_plan):
    """
    This is the "Schema Healer". It takes a raw, potentially malformed JSON
    object from the LLM and converts it into the structure the schema expects.

    Args:
        raw_plan: parsed model output; expected to be a dict with a "levels" list.

    Returns:
        ``{"levels": [...]}`` where every level has "level_number" and "rooms",
        and every room has "id", "type", "bounds", "doors" and "windows".
        Levels without a list of rooms, and level or room entries that are not
        dicts, are dropped. When ``raw_plan`` has no usable "levels" list the
        result is ``{"levels": []}``.
    """
    healed_plan = {"levels": []}
    if not isinstance(raw_plan, dict) or not isinstance(raw_plan.get("levels"), list):
        return healed_plan # Cannot proceed without a list of levels

    # --- Room Type Mapping ---
    # Maps common, creative names to the strict schema Enum
    ROOM_TYPE_MAP = {
        "living room": RoomType.LIVING_ROOM, "livingroom": RoomType.LIVING_ROOM,
        "dining room": RoomType.DINING_ROOM, "dining area": RoomType.DINING_ROOM,
        "kitchen": RoomType.KITCHEN,
        "master bedroom": RoomType.MASTER_BEDROOM, "master bdrm": RoomType.MASTER_BEDROOM,
        "bedroom": RoomType.BEDROOM, "bdrm": RoomType.BEDROOM,
        "bathroom": RoomType.BATHROOM, "bath": RoomType.BATHROOM,
        "half bath": RoomType.HALF_BATH, "powder room": RoomType.HALF_BATH,
        "study": RoomType.STUDY, "office": RoomType.STUDY, "home office": RoomType.STUDY,
        "garage": RoomType.GARAGE,
        "utility": RoomType.UTILITY, "laundry": RoomType.UTILITY,
        "storage": RoomType.STORAGE, "closet": RoomType.STORAGE,
        "corridor": RoomType.CORRIDOR, "hallway": RoomType.CORRIDOR,
        "entrance": RoomType.ENTRANCE, "entry": RoomType.ENTRANCE, "foyer": RoomType.ENTRANCE,
        "balcony": RoomType.BALCONY,
        "verandah": RoomType.VERANDAH,
    }

    for i, raw_level in enumerate(raw_plan["levels"]):
        # A level that is not an object, or has no list of rooms, cannot be healed.
        if not isinstance(raw_level, dict) or not isinstance(raw_level.get("rooms"), list):
            continue

        healed_level = {
            # Heal: Invent 'level_number' if missing, or find it in 'level' or 'floor'
            "level_number": raw_level.get("level_number", raw_level.get("level", raw_level.get("floor", i))),
            "rooms": []
        }

        for raw_room in raw_level["rooms"]:
            if not isinstance(raw_room, dict):
                continue

            healed_room = {
                # Heal: keep the model's id, otherwise derive one from the position.
                "id": raw_room.get("id", f"room_{i}_{len(healed_level['rooms'])}"),
                # Heal: map a creative label to the enum; default to storage if unknown.
                "type": _lookup_room_type(raw_room, ROOM_TYPE_MAP, RoomType.STORAGE),
                # Heal: Convert bounds from x1/y1/x2/y2 to x/y/width/height if necessary
                "bounds": _heal_bounds(raw_room.get("bounds", raw_room.get("coordinates", {}))),
                "doors": raw_room.get("doors", []),
                "windows": raw_room.get("windows", []),
            }
            healed_level["rooms"].append(healed_room)
        healed_plan["levels"].append(healed_level)

    print(f"✅ Schema healing complete. Processed {len(healed_plan['levels'])} levels.")
    return healed_plan

# --- Execution ---
# For each sampled prompt: generate once, extract the JSON, heal it, validate it
# against HouseOutput, and save it under a name derived from the prompt's hash.
# A failure in any stage skips that prompt only; the batch keeps going.

print("--- Starting Data Factory Run (V4 - Self-Healing Pipeline) ---")
try:
    # Read explicitly as UTF-8 so the result does not depend on the locale default.
    with open(MASTER_PROMPT_LIST_PATH, 'r', encoding='utf-8') as f:
        # Blank lines are not prompts. Lines are otherwise kept verbatim so the
        # hash-based file names of earlier runs stay valid.
        master_prompt_list = [line for line in f.read().splitlines() if line.strip()]
    print(f"✅ Found {len(master_prompt_list)} total prompts.")
except FileNotFoundError:
    print(f"❌ ERROR: Master prompt list not found at '{MASTER_PROMPT_LIST_PATH}'")
    master_prompt_list = []

if master_prompt_list:
    prompts_to_process = random.sample(master_prompt_list, min(BATCH_SIZE, len(master_prompt_list)))
    print(f"✅ This run will process a random batch of {len(prompts_to_process)} prompts.")

    for i, prompt_text in enumerate(prompts_to_process):
        print(f"\n================== PROMPT {i+1}/{len(prompts_to_process)} ==================")
        file_hash = generate_file_hash(prompt_text)
        output_path = os.path.join(OUTPUT_DIR, f"plan_{file_hash}.json")

        # Skip if this prompt has already been processed to avoid duplicate work
        if os.path.exists(output_path):
            print(f"⏭️ Skipping prompt, plan already exists: {output_path}")
            continue

        print(prompt_text[:120] + "..." if len(prompt_text) > 120 else prompt_text)
        print("--------------------------------------------------")

        # --- STAGE 1: Single-Shot Generation ---
        print("Running Stage 1: Single-Shot Generation...")
        master_prompt = MASTER_PROMPT_TEMPLATE.format(user_prompt=prompt_text)
        llm_output = call_ollama_colab(MODEL_NAME, master_prompt)
        if not llm_output:
            print("❌ Stage 1 Failed: No response from model.")
            continue

        raw_plan_data = repair_json(llm_output)
        if not raw_plan_data:
            print("❌ Stage 1 Failed: Could not extract a valid JSON object.")
            continue

        # --- STAGE 2: Python Schema Healer ---
        print("Running Stage 2: Healing and Converting Schema...")
        try:
            healed_plan = heal_and_convert_plan(raw_plan_data)
        except Exception as e:
            # The healer works on arbitrary model output; one odd plan must not end the batch.
            print(f"❌ Stage 2 Failed: Healer raised {type(e).__name__}: {e}")
            continue

        if not any(level.get('rooms') for level in healed_plan.get('levels', [])):
            # A plan without a single room is useless as a training sample, so do not save it.
            print("❌ Stage 2 Failed: Healed plan contains no rooms.")
            continue

        # --- STAGE 3: Final Assembly & Validation ---
        print("Running Stage 3: Assembling and Validating...")
        try:
            # Calculate metadata from the healed plan
            calculated_area, bedroom_count, bathroom_count = 0.0, 0, 0
            for level in healed_plan.get('levels', []):
                for room in level.get('rooms', []):
                    bounds = room.get('bounds', {})
                    w, h = bounds.get('width', 0), bounds.get('height', 0)
                    calculated_area += w * h
                    if room.get('type') in [RoomType.BEDROOM, RoomType.MASTER_BEDROOM]:
                        bedroom_count += 1
                    if room.get('type') in [RoomType.BATHROOM, RoomType.HALF_BATH]:
                        bathroom_count += 1

            # Construct the final object for Pydantic validation
            final_plan_for_validation = {
                "input": {
                    "basicDetails": {
                        "prompt": prompt_text, "totalArea": calculated_area, "unit": "sqft",
                        "floors": len(healed_plan.get('levels', [])), "bedrooms": bedroom_count,
                        "bathrooms": bathroom_count, "style": "unknown", "budget": 0
                    }, "plot": {}, "roomBreakdown": []
                },
                "levels": healed_plan.get("levels", []),
                "total_area": calculated_area, "construction_cost": 0.0
            }

            # The final gate: Pydantic validation
            validated_plan = HouseOutput.model_validate(final_plan_for_validation)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(validated_plan.model_dump_json(indent=2))
            print(f"✅ SUCCESS! Saved validated plan to {output_path}")

        except ValidationError as e:
            print(f"❌ Stage 3 Failed: Pydantic validation error AFTER healing:\n{e}")
        except Exception as e:
            print(f"❌ Stage 3 Failed: An unexpected error occurred - {type(e).__name__}: {e}")

print("\n🎉 Data Factory run complete!")

In [None]:
# @title ## 4. (One-Time Setup) Generate Master Prompt File
# @markdown This cell uses the `generate_prompts.py` script to create your master prompt file in Google Drive.
# @markdown **You only need to run this cell once.**
# @markdown Once the file is created, Cell 3 will be able to read from it for all future runs.

import os
from pathlib import Path

# --- Configuration ---
#@markdown The desired location in your Google Drive for the master prompt file. This MUST match the path in Cell 3.
DRIVE_PROMPT_FILE = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt" #@param {type:"string"}

#@markdown The total number of prompts to generate for your master list.
NUM_PROMPTS_TO_GENERATE = 40000 #@param {type:"integer"}
# --- End Configuration ---

# --- Execution ---
import itertools
import subprocess
import sys

# REPO_DIR is inherited from Cell 1
script_path = os.path.join(REPO_DIR, "scripts/generate_prompts.py")

# Run from the repository root.
os.chdir(REPO_DIR)

# Ensure the target directory in Drive exists
Path(DRIVE_PROMPT_FILE).parent.mkdir(parents=True, exist_ok=True)

print(f"Running prompt generation script to create {NUM_PROMPTS_TO_GENERATE} prompts...")
# An argv list (no shell) passes paths containing spaces or quotes as single
# literal arguments, and the exit status can be inspected afterwards.
generation = subprocess.run(
    [
        sys.executable, script_path,
        "--num-prompts", str(NUM_PROMPTS_TO_GENERATE),
        "--output-file", DRIVE_PROMPT_FILE,
    ],
    capture_output=True,
    text=True,
)
if generation.stdout:
    print(generation.stdout, end="")
if generation.stderr:
    print(generation.stderr, end="")

print("\n--- Verification ---")
if generation.returncode != 0:
    # Checked before the file test: a file left over from an earlier run must not
    # make a failed run look successful.
    print(f"❌ ERROR: Prompt generation script exited with status {generation.returncode}. Please check for errors above.")
elif Path(DRIVE_PROMPT_FILE).exists():
    print(f"✅ Master prompt file successfully created at: {DRIVE_PROMPT_FILE}")
    print("First 5 prompts in the file:")
    with open(DRIVE_PROMPT_FILE, 'r', encoding='utf-8') as prompt_file:
        for preview_line in itertools.islice(prompt_file, 5):
            print(preview_line.rstrip("\n"))
else:
    print("❌ ERROR: Master prompt file was not created. Please check for errors above.")


In [None]:
# @title ## 5. (Optional) Download Generated Dataset
# @markdown Run this cell after the data generation is complete to compress and download the entire output folder.

import shutil
import os
from google.colab import files
from datetime import datetime

# Folder in Google Drive that holds the generated plans. Kept as a literal here
# (it should mirror OUTPUT_DIR from Cell 3) so this cell can run on its own.
source_dir = "/content/drive/MyDrive/housebrain_platinum_dataset"

# Archive name carries a timestamp so repeated downloads do not collide.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"housebrain_dataset_{timestamp}.zip"
zip_filepath = f"/content/{zip_filename}"

dataset_available = os.path.exists(source_dir) and bool(os.listdir(source_dir))

if not dataset_available:
    print(f"❌ ERROR: The source directory '{source_dir}' was not found or is empty. Please ensure the Data Factory ran correctly.")
else:
    # make_archive appends the ".zip" suffix itself, so it gets the path without it.
    print(f"Compressing '{source_dir}' into '{zip_filepath}'...")
    archive_base = os.path.splitext(zip_filepath)[0]
    shutil.make_archive(archive_base, 'zip', source_dir)
    print("✅ Compression complete.")

    # Hand the archive to the browser.
    print(f"\nDownloading '{zip_filename}'...")
    files.download(zip_filepath)
