In [None]:
# @title ## 1. Setup Environment
# @markdown Mount Google Drive and clone the repository using a secure token.
from google.colab import drive
import os
import getpass
import subprocess

# Mount Google Drive
drive.mount('/content/drive')
print("‚úÖ Google Drive mounted.")

# --- GitHub Setup ---
#@markdown Enter your GitHub Personal Access Token (PAT) with repo access.
GITHUB_TOKEN = getpass.getpass('Enter your GitHub PAT: ')
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/Vinay-O/HouseBrainLLM.git"
REPO_DIR = "/content/HouseBrainLLM"

# Clone the repository on first run, otherwise update the existing checkout.
# The git commands are passed as argument lists (no shell), and a failure is
# re-raised as a sanitized error: the default CalledProcessError message
# contains the full command line, which would print the token embedded in
# REPO_URL into the notebook output.
try:
    if os.path.exists(REPO_DIR):
        print("Repository already exists. Pulling latest changes...")
        subprocess.run(["git", "-C", REPO_DIR, "pull"], check=True)
    else:
        print("Cloning repository...")
        subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
except subprocess.CalledProcessError as git_error:
    # `from None` hides the original exception (and its token-bearing command).
    raise RuntimeError(
        f"git exited with status {git_error.returncode}. "
        "Check the token's repo access and your network connection."
    ) from None

print("‚úÖ Repository is ready.")

# --- Install Dependencies ---
#@markdown Install necessary Python packages from the new requirements file.
requirements_path = os.path.join(REPO_DIR, "requirements.txt")
if os.path.exists(requirements_path):
    print("Installing dependencies from requirements.txt...")
    !pip install -q -r {requirements_path}
    print("‚úÖ Dependencies installed.")
else:
    print("‚ö†Ô∏è requirements.txt not found. Installing default packages.")
    !pip install -q pydantic

print("‚úÖ Environment setup complete.")


In [None]:
# @title ## 2. Configure and Start Ollama Server
# @markdown This cell will download and start the Ollama server, then pull the specified model.
# @markdown **NOTE:** A powerful model like `deepseek-r1:32b` is now recommended for higher quality results. It will be slower but more reliable.

# Ollama model tag used for the pull below and for every generation call in
# the Data Factory cell. The trailing `# @param` list renders as a Colab dropdown.
MODEL_NAME = "deepseek-r1:32b" # @param ["deepseek-r1:32b", "llama3:70b-instruct", "qwen2:72b-instruct", "mixtral:instruct"]

# Download the install script from ollama.com and pipe it to `sh`.
# This only installs Ollama; the server is started further down in this cell.
!curl -fsSL https://ollama.com/install.sh | sh
import threading
import subprocess
import time

def run_ollama():
    """Run `ollama serve` and block until the server process exits.

    Intended as a background-thread target. Output of the server is captured
    rather than streamed; if the process exits with a non-zero status, its
    captured stderr is printed.
    """
    serve_options = dict(shell=True, check=True, capture_output=True, text=True)
    try:
        subprocess.run("ollama serve", **serve_options)
    except subprocess.CalledProcessError as failure:
        print(f"Ollama server failed: {failure.stderr}")

print("üöÄ Starting Ollama server in the background...")
ollama_thread = threading.Thread(target=run_ollama)
ollama_thread.daemon = True
ollama_thread.start()

# Wait for the server to be ready
print("‚è≥ Waiting for Ollama server to initialize...")
time.sleep(15) # Increased wait time for stability

# Pull the model
print(f"üì¶ Pulling model: {MODEL_NAME}. This may take a while...")
try:
    process = subprocess.run(
        f"ollama pull {MODEL_NAME}",
        shell=True, check=True, capture_output=True, text=True, timeout=900
    )
    print(f"‚úÖ Model {MODEL_NAME} is ready.")
except subprocess.CalledProcessError as e:
    print(f"Error pulling model: {e.stderr}")
    print("This might happen if the model name is incorrect or the Ollama server is not ready.")
except subprocess.TimeoutExpired:
    print("Timed out while pulling the model. The model might be very large or the connection slow.")


# Verify Ollama is running
!ollama list


In [None]:
# @title ## 3. Run the Data Factory
# @markdown This cell is the core of the data generation process. It reads from the master prompt list and runs the "Assembly Line" process for each prompt.
import sys
import os
import random
import json
import time
import re
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
from pydantic import ValidationError
import hashlib

# Add the cloned repo's src directory to the Python path to import the schema
# REPO_DIR is inherited from Cell 1
housebrain_src = os.path.join(REPO_DIR, 'src')
if housebrain_src not in sys.path:
    # Guarded so re-running this cell does not keep appending duplicates.
    sys.path.append(housebrain_src)
try:
    from housebrain.schema import HouseOutput
    print("‚úÖ Successfully imported HouseBrain schema.")
except ImportError as e:
    print(f"‚ùå Failed to import HouseBrain schema: {e}")
    # Fail fast: without HouseOutput every prompt would run all three model
    # stages and then fail at validation with a NameError.
    raise

# --- Configuration ---
# Upper bound on how many prompts are randomly sampled from the master list in one run.
BATCH_SIZE = 10 # @param {type:"integer"}
# Text file read line by line; each line is treated as one prompt.
MASTER_PROMPT_LIST_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt" #@param {type:"string"}
# Folder that receives one `plan_<hash>.json` file per successfully validated plan.
OUTPUT_DIR = "/content/drive/MyDrive/housebrain_platinum_dataset" # @param {type:"string"}

# Create the output folder up front; harmless if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate_file_hash(prompt_text):
    """Return a short, stable identifier for a prompt.

    The prompt is UTF-8 encoded and hashed with SHA-256; the first 16
    hexadecimal characters of the digest are returned, so the same prompt
    text always maps to the same identifier.
    """
    digest = hashlib.sha256()
    digest.update(prompt_text.encode('utf-8'))
    hex_digest = digest.hexdigest()
    return hex_digest[:16]

# --- Prompts ---

# Stage 1 template: asks the model for levels and rooms only.
# It is filled with `str.format(user_prompt=...)`, so `{user_prompt}` is the
# only live placeholder; every literal JSON brace is doubled (`{{` / `}}`) so
# that `format` emits a single brace instead of treating it as a field.
STAGE_1_LAYOUT_PROMPT = """
You are an expert architectural AI. Your task is to generate the foundational layout for a house based on a user's prompt.
**Instructions:**
1.  **Analyze the Request:** Carefully read the user's prompt to understand the constraints (e.g., plot size, number of floors, total area, number of rooms).
2.  **Design the Layout:** Create a logical and functional floor plan. Ensure rooms are reasonably sized and placed.
3.  **Define Levels and Rooms:** Structure your output with levels (e.g., ground floor, first floor) and the rooms within each level.
4.  **Specify Room Bounds:** For each room, define its `bounds` as a rectangle with `x`, `y`, `width`, and `height`. The origin (0,0) is the top-left corner of the plot.
5.  **Adhere to the Schema:** The output MUST be a single JSON object that validates against the `HouseOutput` schema, but **ONLY include the `levels` and `rooms`**. Do NOT include `doors` or `windows` at this stage.
6.  **Use Unique IDs:** Assign a unique string `id` to every level and every room (e.g., "level_0", "room_0", "kitchen_0").
**User Prompt:**
{user_prompt}
**Output Format (JSON Object only):**
```json
{{
  "levels": [
    {{
      "id": "level_0",
      "level_number": 0,
      "rooms": [
        {{ "id": "living_room_0", "room_type": "living_room", "bounds": {{"x": 0, "y": 0, "width": 20, "height": 15}} }},
        {{ "id": "kitchen_0", "room_type": "kitchen", "bounds": {{"x": 20, "y": 0, "width": 10, "height": 15}} }}
      ]
    }}
  ]
}}
```
"""

# Stage 2 template: asks the model for a JSON list of door objects.
# Filled with `str.format(layout_json=...)`, where the value is the Stage 1
# layout serialized as JSON; literal braces in the examples are doubled so
# `format` leaves them as single braces.
STAGE_2_DOORS_PROMPT = """
You are an expert architectural AI. You will be given a JSON object describing a house layout. Your task is to add doors to this layout.

**CRITICAL INSTRUCTIONS:**
1.  **Your output MUST be ONLY a valid JSON list of Door objects.**
2.  **DO NOT write ANY text, explanation, or markdown before or after the JSON list.** Your response will be parsed by a program and will fail if any extra text is present.
3.  **Analyze the Layout:** Review the provided rooms and their adjacencies.
4.  **Place Doors Logically:** Every room must be accessible. Add an exterior door for the main entrance. Connect adjacent rooms.
5.  **Adhere to the Schema:** Each door object must have `position`, `width`, `type`, `room1`, and `room2`.
6.  **Avoid Common JSON Errors:** Do not use trailing commas. Ensure all strings are enclosed in double quotes.

**House Layout:**
{layout_json}

**MULTIPLE EXAMPLES of correct door objects:**
- An exterior front door: `{{"position": {{"x": 0, "y": 7}}, "width": 3.5, "type": "exterior", "room1": "entrance_0", "room2": "exterior_front"}}`
- A standard interior door: `{{"position": {{"x": 10, "y": 7}}, "width": 3, "type": "interior", "room1": "entrance_0", "room2": "living_room_0"}}`
- A sliding door to a balcony: `{{"position": {{"x": 15, "y": 0}}, "width": 6, "type": "sliding", "room1": "master_bedroom_0", "room2": "balcony_0"}}`

**Final Output Format (A raw JSON list ONLY):**
```json
[
  {{ ... door object 1 ... }},
  {{ ... door object 2 ... }}
]
```
"""

# Stage 3 template: asks the model for a JSON list of window objects.
# Filled with `str.format(layout_json=...)` using the same Stage 1 layout that
# Stage 2 receives; literal braces in the examples are doubled so `format`
# leaves them as single braces.
STAGE_3_WINDOWS_PROMPT = """
You are an expert architectural AI. You will be given a JSON object describing a house layout. Your task is to add windows to this layout.

**CRITICAL INSTRUCTIONS:**
1.  **Your output MUST be ONLY a valid JSON list of Window objects.**
2.  **DO NOT write ANY text, explanation, or markdown before or after the JSON list.** Your response will be parsed by a program and will fail if any extra text is present.
3.  **Analyze the Layout:** Review the provided rooms.
4.  **Place Windows Logically:** Add windows to exterior walls. Living rooms and bedrooms should have good lighting.
5.  **Adhere to the Schema:** Each window object must have `position`, `width`, `height`, `type`, and `room_id`.
6.  **Avoid Common JSON Errors:** Do not use trailing commas. Ensure all strings are enclosed in double quotes.

**House Layout:**
{layout_json}

**MULTIPLE EXAMPLES of correct window objects:**
- A large bay window: `{{"position": {{"x": 5, "y": 0}}, "width": 8, "height": 5, "type": "bay", "room_id": "living_room_0"}}`
- A standard sliding window: `{{"position": {{"x": 25, "y": 0}}, "width": 4, "height": 3, "type": "sliding", "room_id": "kitchen_0"}}`
- A small fixed window for a bathroom: `{{"position": {{"x": 3, "y": 15}}, "width": 2, "height": 2, "type": "fixed", "room_id": "bathroom_0"}}`

**Final Output Format (A raw JSON list ONLY):**
```json
[
  {{ ... window object 1 ... }},
  {{ ... window object 2 ... }}
]
```
"""

# --- Helper Functions ---

def call_ollama_colab(model, prompt, max_retries=3, delay=5, timeout=900):
    """Send one non-streaming generation request to the local Ollama server.

    Args:
        model: Ollama model tag to generate with.
        prompt: Full prompt text.
        max_retries: Maximum number of attempts; values <= 0 make no request.
        delay: Seconds to sleep between failed attempts.
        timeout: Per-attempt socket timeout in seconds. Without it a stalled
            server would block the whole run indefinitely. Pass ``None`` to
            wait forever.

    Returns:
        The stripped text of the ``response`` field, or ``None`` when every
        attempt failed or an unexpected error occurred.
    """
    OLLAMA_ENDPOINT = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {"model": model, "prompt": prompt, "stream": False}
    for attempt in range(max_retries):
        try:
            req = Request(OLLAMA_ENDPOINT, data=json.dumps(data).encode("utf-8"), headers=headers, method="POST")
            with urlopen(req, timeout=timeout) as response:
                response_body = response.read().decode("utf-8")
                response_json = json.loads(response_body)
                return response_json.get("response", "").strip()
        except OSError as e:
            # URLError, HTTPError, connection resets and socket timeouts are
            # all OSError subclasses; these are transient and worth retrying.
            print(f"ERROR: Ollama connection error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                time.sleep(delay)
            else:
                return None
        except Exception as e:
            # Malformed JSON or an unexpected payload shape: retrying the same
            # request is unlikely to help, so give up immediately.
            print(f"ERROR: Unexpected error calling Ollama: {e}")
            return None
    # Reached only when max_retries <= 0 (no attempt was made).
    return None

def repair_json(text, target_type=dict):
    """Find and parse a JSON object or list embedded in model output.

    Strategy, in order:
      1. Parse the whole text. If it is a dict but a list was requested,
         return the first list-valued entry (a model wrapping its list as
         ``{"doors": [...]}``).
      2. Drop any ``<think>...</think>`` section (some reasoning models may
         emit one before the answer), then parse the span from the first
         opening bracket to the last closing bracket.
      3. If that span is invalid (for example because trailing prose contains
         a closing bracket), decode a single JSON value starting at the first
         opening bracket and ignore whatever follows it.

    Args:
        text: Raw model response; coerced with ``str()``.
        target_type: ``dict`` (default) or ``list``.

    Returns:
        The parsed value of ``target_type``, or ``None`` if nothing usable
        was found.
    """
    text = str(text)
    start_char, end_char = ('[', ']') if target_type == list else ('{', '}')

    # Step 1: the response may already be clean JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, target_type):
            print("‚úÖ Initial parse successful.")
            return parsed
        if target_type == list and isinstance(parsed, dict):
            # The model wrapped the requested list in an object.
            for key, value in parsed.items():
                if isinstance(value, list):
                    print(f"‚úÖ Repaired JSON by extracting list from key '{key}'.")
                    return value

    print(f"Initial parse for type {target_type.__name__} failed. Attempting aggressive extraction...")
    # Reasoning text can itself contain brackets that would poison the span.
    cleaned = re.sub(r'<think>[\s\S]*?</think>', '', text)
    first = cleaned.find(start_char)
    last = cleaned.rfind(end_char)
    if first == -1 or last < first:
        print(f"‚ùå No JSON structure of type {target_type.__name__} found.")
        return None

    # Step 2: widest possible span, first opening to last closing bracket.
    try:
        parsed = json.loads(cleaned[first:last + 1])
    except json.JSONDecodeError as e:
        # Step 3: decode exactly one value at the first opening bracket,
        # tolerating any trailing text after it.
        try:
            parsed, _ = json.JSONDecoder().raw_decode(cleaned, first)
        except json.JSONDecodeError:
            print(f"‚ùå Failed to parse the extracted JSON blob: {e}")
            return None

    if isinstance(parsed, target_type):
        print(f"‚úÖ Successfully extracted and parsed JSON of type {target_type.__name__}.")
        return parsed
    print(f"‚ùå Extracted JSON is not of the target type {target_type.__name__}.")
    return None

def assemble_plan(layout_dict, doors_list, windows_list):
    """Attach doors and windows to the rooms of a layout, tolerating bad data.

    The layout is modified in place: every dict level gets a list under
    ``rooms``, and every dict room with a string ``id`` gets lists under
    ``doors`` and ``windows``. Each window is appended to the room named by
    its ``room_id``; each door is appended to the room named by its ``room1``.
    Entries that are not dicts, or that reference an unknown room, are
    skipped. If two rooms share an id, the later one receives the items.

    Args:
        layout_dict: Parsed Stage 1 layout, expected to hold a ``levels`` list.
        doors_list: Parsed Stage 2 output (list of door dicts).
        windows_list: Parsed Stage 3 output (list of window dicts).

    Returns:
        ``{"levels": [...]}`` where the list is the same object as
        ``layout_dict["levels"]``, or ``{"levels": []}`` when the layout is
        unusable. Never raises for ordinary errors; they are printed instead.
    """
    result = {"levels": []}
    try:
        if not isinstance(layout_dict, dict):
            return result
        levels = layout_dict.get("levels", [])
        if not isinstance(levels, list):
            return result
        result["levels"] = levels

        # Index rooms by id, normalizing container fields along the way.
        rooms_by_id = {}
        for level in levels:
            if not isinstance(level, dict):
                continue
            if not isinstance(level.get("rooms"), list):
                level["rooms"] = []
            for room in level["rooms"]:
                if not isinstance(room, dict):
                    continue
                room_id = room.get("id")
                if not isinstance(room_id, str):
                    continue
                for slot in ("doors", "windows"):
                    if not isinstance(room.get(slot), list):
                        room[slot] = []
                rooms_by_id[room_id] = room

        # (items, key holding the owning room id, room field to append to)
        attachments = (
            (windows_list, "room_id", "windows"),
            (doors_list, "room1", "doors"),
        )
        for items, owner_key, slot in attachments:
            if not isinstance(items, list):
                continue
            for item in items:
                if not isinstance(item, dict):
                    continue
                owner_id = item.get(owner_key)
                if isinstance(owner_id, str) and owner_id in rooms_by_id:
                    rooms_by_id[owner_id][slot].append(item)
    except Exception as e:
        # Best effort: report and return whatever was assembled so far.
        print(f"‚ö†Ô∏è Assembly Error: {e}")
    return result

# --- Execution ---

def _summarize_levels(levels):
    """Return (total_area, bedroom_count, bathroom_count) for assembled levels.

    Area is the sum of width * height over rooms whose bounds hold numeric
    values. The room type is read from `room_type` (the key the Stage 1 prompt
    asks the model to emit), falling back to `type`. Non-dict levels and rooms
    are skipped instead of raising.
    """
    total_area, bedrooms, bathrooms = 0.0, 0, 0
    if not isinstance(levels, list):
        return total_area, bedrooms, bathrooms
    for level in levels:
        if not isinstance(level, dict) or not isinstance(level.get('rooms'), list):
            continue
        for room in level['rooms']:
            if not isinstance(room, dict):
                continue
            bounds = room.get('bounds', {})
            if isinstance(bounds, dict):
                w, h = bounds.get('width', 0), bounds.get('height', 0)
                if isinstance(w, (int, float)) and isinstance(h, (int, float)):
                    total_area += w * h
            room_type = room.get('room_type', room.get('type'))
            if room_type in ["bedroom", "master_bedroom"]:
                bedrooms += 1
            if room_type in ["bathroom", "half_bath"]:
                bathrooms += 1
    return total_area, bedrooms, bathrooms


print("--- Starting Data Factory Run (V3: Assembly Line) ---")
try:
    with open(MASTER_PROMPT_LIST_PATH, 'r', encoding='utf-8') as f:
        # Blank lines are not prompts; keep every other line exactly as
        # written so its file hash stays stable between runs.
        master_prompt_list = [line for line in f.read().splitlines() if line.strip()]
    print(f"‚úÖ Found {len(master_prompt_list)} total prompts.")
except FileNotFoundError:
    print(f"‚ùå ERROR: Master prompt list not found at '{MASTER_PROMPT_LIST_PATH}'")
    master_prompt_list = []

if master_prompt_list:
    prompts_to_process = random.sample(master_prompt_list, min(BATCH_SIZE, len(master_prompt_list)))
    print(f"‚úÖ This run will process a random batch of {len(prompts_to_process)} prompts.")

    for i, prompt_text in enumerate(prompts_to_process):
        print(f"\n================== PROMPT {i+1}/{len(prompts_to_process)} ==================")
        print(prompt_text[:120] + "..." if len(prompt_text) > 120 else prompt_text)
        print("--------------------------------------------------")

        # STAGE 1: layout (levels and rooms only)
        print("Running Stage 1: Layout Generation...")
        stage_1_response = call_ollama_colab(MODEL_NAME, STAGE_1_LAYOUT_PROMPT.format(user_prompt=prompt_text))
        if not stage_1_response:
            print("‚ùå Stage 1 Failed: No response from model.")
            continue
        layout_data = repair_json(stage_1_response, target_type=dict)
        if not layout_data:
            print("‚ùå Stage 1 Failed: Could not produce a valid layout JSON.")
            continue

        # Stages 2 and 3 both receive the same serialized layout.
        layout_json = json.dumps(layout_data, indent=2)

        # STAGE 2: doors (an empty list is a valid answer, hence `is None`)
        print("Running Stage 2: Door Generation...")
        stage_2_response = call_ollama_colab(MODEL_NAME, STAGE_2_DOORS_PROMPT.format(layout_json=layout_json))
        if not stage_2_response:
            print("‚ùå Stage 2 Failed: No response from model.")
            continue
        doors_data = repair_json(stage_2_response, target_type=list)
        if doors_data is None:
            print("‚ùå Stage 2 Failed: Could not produce a valid list of doors.")
            continue

        # STAGE 3: windows
        print("Running Stage 3: Window Generation...")
        stage_3_response = call_ollama_colab(MODEL_NAME, STAGE_3_WINDOWS_PROMPT.format(layout_json=layout_json))
        if not stage_3_response:
            print("‚ùå Stage 3 Failed: No response from model.")
            continue
        windows_data = repair_json(stage_3_response, target_type=list)
        if windows_data is None:
            print("‚ùå Stage 3 Failed: Could not produce a valid list of windows.")
            continue

        # STAGE 4: merge the three outputs, derive summary fields, validate, save
        print("Running Stage 4: Assembling and Validating...")
        try:
            assembled_layout = assemble_plan(layout_data, doors_data, windows_data)
            calculated_area, bedroom_count, bathroom_count = _summarize_levels(assembled_layout.get('levels'))

            final_plan_dict = {
                "input": {
                    "basicDetails": {
                        "prompt": prompt_text, "totalArea": calculated_area, "unit": "sqft",
                        "floors": len(assembled_layout.get('levels', [])), "bedrooms": bedroom_count,
                        "bathrooms": bathroom_count, "style": "unknown", "budget": 0
                    },
                    "plot": {}, "roomBreakdown": []
                },
                "levels": assembled_layout.get("levels", []),
                "total_area": calculated_area, "construction_cost": 0.0
            }
            validated_plan = HouseOutput.model_validate(final_plan_dict)
            # The file name is derived from the prompt, so regenerating the
            # same prompt overwrites its earlier plan.
            file_hash = generate_file_hash(prompt_text)
            output_path = os.path.join(OUTPUT_DIR, f"plan_{file_hash}.json")
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(validated_plan.model_dump_json(indent=2))
            print(f"‚úÖ SUCCESS! Saved validated plan to {output_path}")
        except ValidationError as e:
            print(f"‚ùå Stage 4 Failed: Pydantic validation error - {e}")
        except Exception as e:
            print(f"‚ùå Stage 4 Failed: An unexpected error occurred - {type(e).__name__}: {e}")

    print("\nüéâ Data Factory run complete!")

In [None]:
# @title ## 4. (One-Time Setup) Generate Master Prompt File
# @markdown This cell uses the `generate_prompts.py` script to create your master prompt file in Google Drive.
# @markdown **You only need to run this cell once.**
# @markdown Once the file is created, Cell 3 will be able to read from it for all future runs.

import os
from pathlib import Path

# --- Configuration ---
#@markdown The desired location in your Google Drive for the master prompt file. This MUST match the path in Cell 3.
DRIVE_PROMPT_FILE = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt" #@param {type:"string"}

#@markdown The total number of prompts to generate for your master list.
NUM_PROMPTS_TO_GENERATE = 40000 #@param {type:"integer"}
# --- End Configuration ---

# --- Execution ---
# REPO_DIR is inherited from Cell 1
script_path = os.path.join(REPO_DIR, "scripts/generate_prompts.py")

# Run from the repository root. Note that this changes the kernel's working
# directory for every cell executed afterwards, not just this one.
os.chdir(REPO_DIR)

# Ensure the target directory in Drive exists
Path(DRIVE_PROMPT_FILE).parent.mkdir(parents=True, exist_ok=True)

print(f"Running prompt generation script to create {NUM_PROMPTS_TO_GENERATE} prompts...")
# Build the shell command as one string; both paths are wrapped in double
# quotes so that spaces in them survive shell word-splitting.
command = f'python3 "{script_path}" --num-prompts {NUM_PROMPTS_TO_GENERATE} --output-file "{DRIVE_PROMPT_FILE}"'
!{command}

print("\n--- Verification ---")
# NOTE(review): this only checks that the file exists, so a file left over from
# an earlier run passes the check even if the script above failed.
if Path(DRIVE_PROMPT_FILE).exists():
    print(f"‚úÖ Master prompt file successfully created at: {DRIVE_PROMPT_FILE}")
    print("First 5 prompts in the file:")
    !head -n 5 "{DRIVE_PROMPT_FILE}"
else:
    print(f"‚ùå ERROR: Master prompt file was not created. Please check for errors above.")


In [None]:
# @title ## 5. (Optional) Download Generated Dataset
# @markdown Run this cell after the data generation is complete to compress and download the entire output folder.

import shutil
import os
from google.colab import files
from datetime import datetime

# Folder in Google Drive holding the generated plans. Repeated here (rather
# than reusing OUTPUT_DIR from Cell 3) so this cell can run on its own.
source_dir = "/content/drive/MyDrive/housebrain_platinum_dataset"

# Timestamped archive name, so successive downloads never collide.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"housebrain_dataset_{timestamp}.zip"
zip_filepath = f"/content/{zip_filename}"

# The folder must exist and contain at least one entry; `or` short-circuits,
# so the listing is only attempted when the folder exists.
dataset_unavailable = not os.path.exists(source_dir) or not os.listdir(source_dir)

if dataset_unavailable:
    print(f"‚ùå ERROR: The source directory '{source_dir}' was not found or is empty. Please ensure the Data Factory ran correctly.")
else:
    # make_archive appends the ".zip" suffix itself, so hand it the path
    # without the extension.
    archive_base = os.path.splitext(zip_filepath)[0]
    print(f"Compressing '{source_dir}' into '{zip_filepath}'...")
    shutil.make_archive(archive_base, 'zip', source_dir)
    print("‚úÖ Compression complete.")

    # Trigger the browser download of the finished archive.
    print(f"\nDownloading '{zip_filename}'...")
    files.download(zip_filepath)
