In [None]:
# Cell 1: Setup, Dependencies, and Google Drive

# Mount Google Drive to persist data
from google.colab import drive
drive.mount('/content/drive')

# Install required Python libraries
!pip install pydantic --quiet

# Configure Git and clone the project repository
# This ensures we have the latest schema and prompts
import os

GIT_USERNAME = "Vinay-O"
GIT_REPOSITORY = "HouseBrainLLM"
GIT_TOKEN = "" # @param {type:"string"}

# Set the working directory to your project folder in Google Drive
# IMPORTANT: Make sure this path is correct for your Drive setup
PROJECT_DIR = "/content/drive/MyDrive/housebrain_v1_1"
os.makedirs(PROJECT_DIR, exist_ok=True)
os.chdir(PROJECT_DIR)

# Clone the repository if it doesn't exist, otherwise pull the latest changes
if not os.path.exists(os.path.join(PROJECT_DIR, ".git")):
    print("Cloning repository...")
    # Construct the Git URL with the token for authentication
    GIT_URL = f"https://{GIT_TOKEN}@github.com/{GIT_USERNAME}/{GIT_REPOSITORY}.git"
    !git clone {GIT_URL} .
else:
    print("Repository already exists. Pulling latest changes...")
    !git pull

# Add the project's source to the Python path
# This allows us to import our custom schema
import sys
sys.path.append(os.path.join(PROJECT_DIR, 'src'))

print("\n✅ Setup complete. Environment is ready.")



In [None]:
# Cell 2: Install and Run Ollama
import os
import asyncio

# @title Select LLM Model and Run Ollama Server
# @markdown Choose the model to use for data generation. The selected model will be pulled and served by Ollama.
# MODEL_NAME is passed to pull_model() by main() in this cell and to call_ollama_colab() by the Data Factory cell.
# The trailing "@param" annotation is a Colab form directive listing the selectable models; keep it on the assignment line.
MODEL_NAME = 'deepseek-coder-v2:16b-lite-instruct-q4_0'  # @param ["deepseek-coder-v2:16b-lite-instruct-q4_0", "llama3:8b-instruct-q4_0", "mixtral:8x7b-instruct-v0.1-q4_0", "codegemma:7b-instruct-q4_0"]

def install_ollama():
    """Downloads and installs the Ollama server.

    Fetches the install script from https://ollama.com/install.sh and pipes it
    into `sh` through an IPython shell escape, so this function only works when
    the cell is executed by a notebook kernel.
    """
    print("Installing Ollama...")
    # NOTE(review): the shell exit status is not inspected, so the completion
    # message below is printed even if the download or the installer failed.
    !curl -fsSL https://ollama.com/install.sh | sh
    print("Ollama installation complete.")

async def run_ollama():
    """Starts the Ollama server as a background process."""
    print("Starting Ollama server...")
    # Stop any existing Ollama process
    !pkill -f ollama
    
    # Start the server in the background
    process = await asyncio.create_subprocess_shell(
        'ollama serve',
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    
    # Wait for the server to be ready.
    # We can't just wait for stdout because it might hang.
    # Instead, we'll just give it a few seconds to initialize.
    await asyncio.sleep(5)
    
    print("Ollama server should be running.")
    return process

def pull_model(model_name):
    """Pulls the selected model from Ollama.

    Args:
        model_name: Ollama model tag, e.g. "llama3:8b-instruct-q4_0".

    Returns:
        True when `ollama pull` exited with status 0, False otherwise. A failure
        is reported on stdout instead of being announced as a success.
    """
    import shlex

    print(f"Pulling model: {model_name}...")
    # Quote the tag so that a value containing shell metacharacters cannot be
    # interpreted as additional commands.
    exit_status = os.system(f'ollama pull {shlex.quote(model_name)}')
    if exit_status != 0:
        print(f"❌ Failed to pull model {model_name} (exit status {exit_status}).")
        return False
    print(f"Model {model_name} pulled successfully.")
    return True

async def main():
    """Runs the full Ollama setup: install, start the server, pull MODEL_NAME."""
    install_ollama()
    # The handle is kept only for the duration of this coroutine; the server
    # itself keeps running in the background.
    ollama_server = await run_ollama()
    pull_model(MODEL_NAME)
    print("\n✅ Ollama setup complete. The server is running with the selected model.")

# Run the setup
# Top-level `await` is not valid in a plain Python script; this line relies on the
# notebook kernel executing the cell inside a running event loop.
await main()



In [None]:
# Cell 3: Import HouseBrain Schema

# HouseOutput is required by the Data Factory cell to validate assembled plans.
# The guidance messages are kept, but the error is re-raised afterwards: letting
# the notebook continue would only surface the problem later as a NameError,
# after the model has already been called for every stage of a prompt.
try:
    from housebrain.schema import HouseOutput
except ImportError as e:
    print(f"❌ Failed to import HouseBrain schema: {e}")
    print("Please ensure the path in Cell 1 is correct and the 'src' directory contains 'housebrain/schema.py'.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print("✅ Successfully imported HouseBrain schema.")



In [None]:
# Cell 4: The Data Factory (V3 - Assembly Line)

import random
import json
import time
import re
import os
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
from pydantic import ValidationError
import hashlib

# --- Configuration ---
# The "@param" annotations are Colab form directives; keep them on the assignment lines.
# BATCH_SIZE: upper bound on how many prompts are randomly sampled for one run.
BATCH_SIZE = 10 # @param {type:"integer"}
# MASTER_PROMPT_LIST_PATH: text file read line by line, one prompt per line.
MASTER_PROMPT_LIST_PATH = "/content/drive/MyDrive/housebrain_v1_1/data/prompts/master_prompt_list.txt" # @param {type:"string"}
# OUTPUT_DIR: destination for validated plan_<hash>.json files (also zipped by Cell 5).
OUTPUT_DIR = "/content/drive/MyDrive/housebrain_platinum_dataset" # @param {type:"string"}

# Create the output directory up front so saving a plan never fails on a missing folder.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate_file_hash(prompt_text):
    """Derives a stable 16-character hex identifier from a prompt.

    The identifier is the leading 16 hex digits of the SHA-256 digest of the
    UTF-8 encoded prompt, so the same prompt always maps to the same value.
    """
    hasher = hashlib.sha256()
    hasher.update(prompt_text.encode('utf-8'))
    full_digest = hasher.hexdigest()
    return full_digest[:16]

# --- Prompts ---

# Stage 1 template: asks the model for the levels/rooms layout only (no doors or windows).
# It is filled with str.format(user_prompt=...), which is why every literal JSON brace
# in the example is doubled ({{ and }}).
STAGE_1_LAYOUT_PROMPT = """
You are an expert architectural AI. Your task is to generate the foundational layout for a house based on a user's prompt.
**Instructions:**
1. **Analyze the Request:** Carefully read the user's prompt to understand the constraints (e.g., plot size, number of floors, total area, number of rooms).
2. **Design the Layout:** Create a logical and functional floor plan. Ensure rooms are reasonably sized and placed.
3. **Define Levels and Rooms:** Structure your output with levels (e.g., ground floor, first floor) and the rooms within each level.
4. **Specify Room Bounds:** For each room, define its `bounds` as a rectangle with `x`, `y`, `width`, and `height`. The origin (0,0) is the top-left corner of the plot.
5. **Adhere to the Schema:** The output MUST be a single JSON object that validates against the `HouseOutput` schema, but **ONLY include the `levels` and `rooms`**. Do NOT include `doors` or `windows` at this stage.
6. **Use Unique IDs:** Assign a unique string `id` to every level and every room (e.g., "level_0", "room_0", "kitchen_0").
**User Prompt:**
{user_prompt}
**Output Format (JSON Object only):**
```json
{{
  "levels": [
    {{
      "id": "level_0",
      "level_number": 0,
      "rooms": [
        {{ "id": "living_room_0", "room_type": "living_room", "bounds": {{"x": 0, "y": 0, "width": 20, "height": 15}} }},
        {{ "id": "kitchen_0", "room_type": "kitchen", "bounds": {{"x": 20, "y": 0, "width": 10, "height": 15}} }}
      ]
    }}
  ]
}}
```
"""

# Stage 2 template: given the Stage 1 layout, asks the model for a JSON list of doors.
# It is filled with str.format(layout_json=...), which is why every literal JSON brace
# in the examples is doubled ({{ and }}).
STAGE_2_DOORS_PROMPT = """
You are an expert architectural AI. You will be given a JSON object describing a house layout. Your task is to add doors to this layout.
**Instructions:**
1. **Analyze the Layout:** Review the provided `levels` and `rooms` JSON. Understand the room adjacencies.
2. **Place Doors:** Add doors logically. Every room should be accessible. Add an exterior door for the main entrance. Connect adjacent rooms where appropriate.
3. **Define Door Properties:** For each door, specify its `position` (`x`, `y`), `width`, `type` ('interior', 'exterior', 'sliding', 'pocket'), and the two room IDs it connects (`room1`, `room2`). For exterior doors, `room2` should be a descriptive string like "exterior_front" or "exterior_backyard".
4. **Adhere to the Schema:** The output MUST be a single JSON list of `Door` objects. Do NOT include any other text, conversation, or markdown.
**House Layout:**
{layout_json}
**Golden Example:**
A door connecting `living_room_0` and `kitchen_0` might look like this:
`{{ "position": {{"x": 19.5, "y": 7}}, "width": 3, "type": "interior", "room1": "living_room_0", "room2": "kitchen_0" }}`
**Output Format (JSON List only):**
```json
[
  {{ "position": {{"x": 0, "y": 7}}, "width": 3.5, "type": "exterior", "room1": "entrance_0", "room2": "exterior_front" }},
  {{ "position": {{"x": 10, "y": 7}}, "width": 3, "type": "interior", "room1": "entrance_0", "room2": "living_room_0" }}
]
```
"""

# Stage 3 template: given the Stage 1 layout, asks the model for a JSON list of windows,
# each tagged with the `room_id` it belongs to.
# It is filled with str.format(layout_json=...), which is why every literal JSON brace
# in the example is doubled ({{ and }}).
STAGE_3_WINDOWS_PROMPT = """
You are an expert architectural AI. You will be given a JSON object describing a house layout. Your task is to add windows to this layout.
**Instructions:**
1. **Analyze the Layout:** Review the provided `levels` and `rooms` JSON.
2. **Place Windows:** Add windows logically to exterior walls. Rooms like living rooms and bedrooms should have ample windows. Bathrooms might have smaller or fewer windows.
3. **Define Window Properties:** For each window, specify its `position` (`x`, `y`), `width`, `height`, `type` ('fixed', 'casement', 'sliding', 'bay'), and the `room_id` it belongs to.
4. **Adhere to the Schema:** The output MUST be a single JSON list of `Window` objects. Do NOT include any other text, conversation, or markdown.
**House Layout:**
{layout_json}
**Output Format (JSON List only):**
```json
[
  {{ "position": {{"x": 5, "y": 0}}, "width": 8, "height": 5, "type": "bay", "room_id": "living_room_0" }},
  {{ "position": {{"x": 25, "y": 0}}, "width": 4, "height": 3, "type": "sliding", "room_id": "kitchen_0" }}
]
```
"""

# --- Helper Functions ---

def call_ollama_colab(model, prompt, max_retries=3, delay=5, timeout=None):
    """Sends a non-streaming generate request to the local Ollama server.

    Args:
        model: Name of the Ollama model to query.
        prompt: Full prompt text.
        max_retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.
        timeout: Optional per-request timeout in seconds. None (the default)
            keeps the previous behaviour of waiting indefinitely.

    Returns:
        The stripped "response" text, or None when every attempt failed, when an
        unexpected error occurred, or when max_retries is not positive.
    """
    from http.client import HTTPException

    OLLAMA_ENDPOINT = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    # The payload never changes between attempts, so encode it once.
    payload = json.dumps({"model": model, "prompt": prompt, "stream": False}).encode("utf-8")
    open_kwargs = {} if timeout is None else {"timeout": timeout}

    for attempt in range(max_retries):
        try:
            req = Request(OLLAMA_ENDPOINT, data=payload, headers=headers, method="POST")
            with urlopen(req, **open_kwargs) as response:
                response_body = response.read().decode("utf-8")
            response_json = json.loads(response_body)
            return response_json.get("response", "").strip()
        except (OSError, HTTPException) as e:
            # OSError covers URLError/HTTPError as well as timeouts and resets
            # raised while the body is being read; HTTPException covers
            # truncated responses. All of them are worth another attempt.
            print(f"ERROR: Ollama connection error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                time.sleep(delay)
        except Exception as e:
            print(f"ERROR: Unexpected error calling Ollama: {e}")
            return None
    return None

def repair_json(text, target_type=dict):
    """Finds and parses a JSON object or list embedded in model output.

    Strategies are tried in order and the first one producing a value of
    `target_type` wins:
      1. Parse the whole text.
      2. Parse the span from the first opening to the last closing bracket.
      3. For list targets only: if the whole text is a JSON object, return the
         first top-level value that is a list (e.g. {"doors": [...]}).
      4. Decode a single JSON value starting at the first opening bracket,
         ignoring whatever follows it (handles trailing prose with brackets).

    Args:
        text: Raw model response; converted with str().
        target_type: `dict` (default) or `list`.

    Returns:
        The parsed dict/list, or None when nothing suitable could be extracted.
    """
    text = str(text)
    start_char, end_char = ('[', ']') if target_type == list else ('{', '}')
    type_name = target_type.__name__

    # Strategy 1: the response is already clean JSON.
    direct = None
    try:
        direct = json.loads(text)
    except json.JSONDecodeError:
        pass
    if isinstance(direct, target_type):
        print("✅ Initial parse successful.")
        return direct
    print(f"Initial parse for type {type_name} failed. Attempting aggressive extraction...")

    # Strategy 2: greedy span between the outermost brackets.
    match = re.search(f'\\{start_char}[\\s\\S]*\\{end_char}', text)
    if match:
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, target_type):
            print(f"✅ Successfully extracted and parsed JSON of type {type_name}.")
            return parsed

    # Strategy 3: a list was requested but the model wrapped it in an object.
    if target_type == list and isinstance(direct, dict):
        for key, value in direct.items():
            if isinstance(value, list):
                print(f"✅ Repaired JSON by extracting list from key '{key}'.")
                return value

    # Strategy 4: decode one value from the first opening bracket onwards.
    start = text.find(start_char)
    if start == -1:
        print(f"❌ No JSON structure of type {type_name} found.")
        return None
    try:
        parsed, _ = json.JSONDecoder().raw_decode(text[start:])
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse the extracted JSON blob: {e}")
        return None
    if isinstance(parsed, target_type):
        print(f"✅ Successfully extracted and parsed JSON of type {type_name}.")
        return parsed
    print(f"❌ Extracted JSON is not of the target type {type_name}.")
    return None

def assemble_plan(layout_dict, doors_list, windows_list):
    """Attaches doors and windows to the rooms of a layout, tolerating malformed data.

    Works on a deep copy of the layout's levels, so `layout_dict` is left
    untouched and calling the function again with the same inputs gives the same
    result instead of appending the doors and windows a second time.

    Args:
        layout_dict: Parsed Stage 1 output; expected to hold a "levels" list.
        doors_list: Door dicts; each is attached to the room named by "room1".
        windows_list: Window dicts; each is attached to the room named by "room_id".

    Returns:
        {"levels": [...]} where every well-formed room (a dict with a string
        "id") has "doors" and "windows" lists. Entries that are not dicts, or
        that reference unknown rooms, are ignored. {"levels": []} is returned
        when the layout itself is unusable.
    """
    import copy

    result = {"levels": []}
    try:
        if not isinstance(layout_dict, dict):
            return result
        levels = layout_dict.get("levels", [])
        if not isinstance(levels, list):
            return result
        # Copy before normalising so the caller's data is never modified.
        levels = copy.deepcopy(levels)
        result["levels"] = levels

        rooms_by_id = {}
        for level in levels:
            if not isinstance(level, dict):
                continue
            if not isinstance(level.get("rooms"), list):
                level["rooms"] = []
            for room in level["rooms"]:
                if not isinstance(room, dict):
                    continue
                room_id = room.get("id")
                if not isinstance(room_id, str):
                    continue
                # Guarantee list containers, replacing anything malformed.
                for key in ("doors", "windows"):
                    if not isinstance(room.get(key), list):
                        room[key] = []
                rooms_by_id[room_id] = room

        if isinstance(windows_list, list):
            for window in windows_list:
                if not isinstance(window, dict):
                    continue
                room_id = window.get("room_id")
                if isinstance(room_id, str) and room_id in rooms_by_id:
                    rooms_by_id[room_id]["windows"].append(window)

        if isinstance(doors_list, list):
            for door in doors_list:
                if not isinstance(door, dict):
                    continue
                # A door is stored on its first room only.
                room1_id = door.get("room1")
                if isinstance(room1_id, str) and room1_id in rooms_by_id:
                    rooms_by_id[room1_id]["doors"].append(door)
    except Exception as e:
        print(f"⚠️ Assembly Error: {e}")
    return result

# --- Execution ---
# For every sampled prompt: Stage 1 (layout) -> Stage 2 (doors) -> Stage 3 (windows)
# -> Stage 4 (assemble, validate against HouseOutput, save as plan_<hash>.json).
# A prompt is skipped as soon as one stage fails.

def _summarize_levels(levels):
    """Returns (total_area, bedroom_count, bathroom_count) for assembled levels.

    Levels or rooms that are not dicts and bounds that are not numeric are
    skipped instead of raising.
    """
    total_area, bedrooms, bathrooms = 0.0, 0, 0
    if not isinstance(levels, list):
        return total_area, bedrooms, bathrooms
    for level in levels:
        if not isinstance(level, dict) or not isinstance(level.get('rooms'), list):
            continue
        for room in level['rooms']:
            if not isinstance(room, dict):
                continue
            bounds = room.get('bounds', {})
            if isinstance(bounds, dict):
                w, h = bounds.get('width', 0), bounds.get('height', 0)
                if isinstance(w, (int, float)) and isinstance(h, (int, float)):
                    total_area += w * h
            # The Stage 1 prompt's example labels rooms with "room_type", so fall
            # back to it when "type" is absent; otherwise the counts stay at 0.
            room_type = room.get('type') or room.get('room_type')
            if room_type in ["bedroom", "master_bedroom"]:
                bedrooms += 1
            if room_type in ["bathroom", "half_bath"]:
                bathrooms += 1
    return total_area, bedrooms, bathrooms


print("--- Starting Data Factory Run (V3: Assembly Line) ---")
try:
    with open(MASTER_PROMPT_LIST_PATH, 'r', encoding='utf-8') as f:
        # Blank lines are separators, not prompts; sending them to the model
        # would waste three generations each.
        master_prompt_list = [line for line in f.read().splitlines() if line.strip()]
    print(f"✅ Found {len(master_prompt_list)} total prompts.")
except FileNotFoundError:
    print(f"❌ ERROR: Master prompt list not found at '{MASTER_PROMPT_LIST_PATH}'")
    master_prompt_list = []

if master_prompt_list and 'HouseOutput' not in globals():
    # Without the schema every prompt would fail at Stage 4, after all model calls.
    print("❌ ERROR: HouseOutput is not defined. Please run the schema import cell (Cell 3) successfully first.")
    master_prompt_list = []

if master_prompt_list:
    prompts_to_process = random.sample(master_prompt_list, min(BATCH_SIZE, len(master_prompt_list)))
    print(f"✅ This run will process a random batch of {len(prompts_to_process)} prompts.")
    saved_count = 0

    for i, prompt_text in enumerate(prompts_to_process):
        print(f"\n================== PROMPT {i+1}/{len(prompts_to_process)} ==================")
        print(prompt_text[:120] + "..." if len(prompt_text) > 120 else prompt_text)
        print("--------------------------------------------------")

        # STAGE 1
        print("Running Stage 1: Layout Generation...")
        stage_1_response = call_ollama_colab(MODEL_NAME, STAGE_1_LAYOUT_PROMPT.format(user_prompt=prompt_text))
        if not stage_1_response:
            print("❌ Stage 1 Failed: No response from model.")
            continue
        layout_data = repair_json(stage_1_response, target_type=dict)
        if not layout_data:
            print("❌ Stage 1 Failed: Could not produce a valid layout JSON.")
            continue

        # Both later stages receive the same serialized layout.
        layout_json = json.dumps(layout_data, indent=2)

        # STAGE 2
        print("Running Stage 2: Door Generation...")
        stage_2_response = call_ollama_colab(MODEL_NAME, STAGE_2_DOORS_PROMPT.format(layout_json=layout_json))
        if not stage_2_response:
            print("❌ Stage 2 Failed: No response from model.")
            continue
        doors_data = repair_json(stage_2_response, target_type=list)
        if doors_data is None:
            print("❌ Stage 2 Failed: Could not produce a valid list of doors.")
            continue

        # STAGE 3
        print("Running Stage 3: Window Generation...")
        stage_3_response = call_ollama_colab(MODEL_NAME, STAGE_3_WINDOWS_PROMPT.format(layout_json=layout_json))
        if not stage_3_response:
            print("❌ Stage 3 Failed: No response from model.")
            continue
        windows_data = repair_json(stage_3_response, target_type=list)
        if windows_data is None:
            print("❌ Stage 3 Failed: Could not produce a valid list of windows.")
            continue

        # STAGE 4
        print("Running Stage 4: Assembling and Validating...")
        try:
            assembled_layout = assemble_plan(layout_data, doors_data, windows_data)
            calculated_area, bedroom_count, bathroom_count = _summarize_levels(assembled_layout.get('levels'))

            final_plan_dict = {
                "input": {
                    "basicDetails": {
                        "prompt": prompt_text, "totalArea": calculated_area, "unit": "sqft",
                        "floors": len(assembled_layout.get('levels', [])), "bedrooms": bedroom_count,
                        "bathrooms": bathroom_count, "style": "unknown", "budget": 0
                    },
                    "plot": {}, "roomBreakdown": []
                },
                "levels": assembled_layout.get("levels", []),
                "total_area": calculated_area, "construction_cost": 0.0
            }
            validated_plan = HouseOutput.model_validate(final_plan_dict)
            file_hash = generate_file_hash(prompt_text)
            output_path = os.path.join(OUTPUT_DIR, f"plan_{file_hash}.json")
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(validated_plan.model_dump_json(indent=2))
            saved_count += 1
            print(f"✅ SUCCESS! Saved validated plan to {output_path}")
        except ValidationError as e:
            print(f"❌ Stage 4 Failed: Pydantic validation error - {e}")
        except Exception as e:
            print(f"❌ Stage 4 Failed: An unexpected error occurred - {type(e).__name__}: {e}")

    print(f"\nSaved {saved_count} of {len(prompts_to_process)} plans.")
    print("\n🎉 Data Factory run complete!")



In [None]:
# Cell 5: Zip and Download Results

import os
import shutil
from google.colab import files

# @title Zip and Download the Dataset
# @markdown This cell will create a zip archive of the generated dataset directory and offer it for download.

# Use the same OUTPUT_DIR variable from the Data Factory cell
# Ensure it has been defined by running Cell 4 first.
try:
    if 'OUTPUT_DIR' not in locals():
        print("❌ ERROR: OUTPUT_DIR is not defined. Please run the Data Factory cell (Cell 4) first.")
    elif not os.path.isdir(OUTPUT_DIR) or not os.listdir(OUTPUT_DIR):
        # The emptiness check makes this message truthful: without it an empty
        # folder would be zipped and offered for download.
        print(f"❌ ERROR: The output directory '{OUTPUT_DIR}' does not exist or is empty.")
        print("Please run the Data Factory cell (Cell 4) to generate data first.")
    else:
        print(f"Directory to be zipped: {OUTPUT_DIR}")

        # Get the base name of the directory (e.g., 'housebrain_platinum_dataset')
        dir_to_zip_basename = os.path.basename(os.path.normpath(OUTPUT_DIR))

        # Define the name for the output zip file
        zip_filename = f"{dir_to_zip_basename}.zip"

        print(f"Creating zip file: {zip_filename}...")

        # Create the zip archive (written to the current working directory).
        # make_archive returns the path of the file it created; downloading that
        # path avoids re-deriving the name by hand.
        archive_path = shutil.make_archive(dir_to_zip_basename, 'zip', OUTPUT_DIR)

        print(f"✅ Zip file created successfully.")

        # Offer the file for download
        print("Offering zip file for download...")
        files.download(archive_path)

except Exception as e:
    print(f"An unexpected error occurred: {e}")

