In [None]:
# @title ## 1. Setup Environment (Corrected)
# @markdown Mount Google Drive, clone/pull the repo, install dependencies, and set the system path.
from google.colab import drive
import os
import sys
import getpass
import subprocess

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)
print("✅ Google Drive mounted.")

# --- GitHub Setup ---
#@markdown Enter your GitHub Personal Access Token (PAT) with repo access.
GITHUB_TOKEN = getpass.getpass('Enter your GitHub PAT: ')
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/Vinay-O/HouseBrainLLM.git"
REPO_DIR = "/content/HouseBrainLLM"


def _run_git(args, action):
    """Run `git <args>` without a shell and without leaking the PAT.

    Args:
        args: List of git arguments (e.g. ["clone", url, target_dir]).
        action: Short human-readable label used in the error message.

    Raises:
        RuntimeError: If git exits with a non-zero status. The message holds
            only the action and exit status, never the command line, because
            the clone URL embeds the token.
    """
    result = subprocess.run(["git", *args], capture_output=True, text=True)
    log = (result.stdout + result.stderr).strip()
    if GITHUB_TOKEN:
        # Redact the token in case git echoes the remote URL back.
        log = log.replace(GITHUB_TOKEN, "***")
    if log:
        print(log)
    if result.returncode != 0:
        raise RuntimeError(f"git {action} failed with exit status {result.returncode}.")


def _pip_install(*pip_args):
    """Install packages into the kernel's own interpreter; return True on success."""
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", *pip_args],
        capture_output=True,
        text=True,
    )
    log = (result.stdout + result.stderr).strip()
    if log:
        print(log)
    return result.returncode == 0


# Clone or pull the repository
if os.path.exists(REPO_DIR):
    print("Repository already exists. Pulling latest changes...")
    # Uses the remote stored in the existing clone.
    _run_git(["-C", REPO_DIR, "pull"], "pull")
else:
    print("Cloning repository...")
    _run_git(["clone", REPO_URL, REPO_DIR], "clone")
print("✅ Repository is ready.")

# CRITICAL FIX: Add repository to Python's system path
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)
    print(f"✅ Added {REPO_DIR} to system path.")

# --- Install Dependencies ---
requirements_path = os.path.join(REPO_DIR, "requirements.txt")
print("Installing dependencies...")
requirements_ok = _pip_install("-r", requirements_path)
ollama_client_ok = _pip_install("ollama")
if requirements_ok and ollama_client_ok:
    print("✅ Dependencies installed.")
else:
    # Report the failure instead of claiming success; later cells would
    # otherwise fail with harder-to-trace import errors.
    print("❌ Some dependencies failed to install. Review the pip output above.")

print("✅ Environment setup complete.")


In [None]:
# @title ## 2. Configure and Start Ollama Server (Corrected)
# @markdown Run this cell, pick a model, then click the button: it stops any old Ollama server, starts a new one, and pulls the selected model.
import subprocess
import threading
import time
import ipywidgets as widgets
from IPython.display import display

# @markdown Select the model to use for generation.
model_name_widget = widgets.Dropdown(
    description='Model:',
    options=[
        "deepseek-r1:32b",
        "llama3:70b-instruct",
        "qwen2:72b-instruct",
        "mixtral:instruct",
    ],
    value='deepseek-r1:32b',
    disabled=False,
)

# The button triggers the setup; the Output widget is where the click
# handler writes its progress messages.
setup_button = widgets.Button(description="Start Server & Pull Model")
output_area = widgets.Output()

# Render the three widgets in the same order as before: dropdown, button, log area.
display(model_name_widget, setup_button, output_area)

def _wait_for_ollama(server, host="127.0.0.1", port=11434, timeout=60.0, poll_interval=1.0):
    """Wait until the Ollama server accepts TCP connections.

    Args:
        server: The `subprocess.Popen` handle of the `ollama serve` process.
        host: Address to probe. NOTE(review): assumes the default bind
            address; adjust if OLLAMA_HOST is overridden.
        port: Port to probe (11434 is Ollama's documented default).
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds between probes.

    Returns:
        True once a connection succeeds; False if the process exits first or
        the timeout elapses.
    """
    import socket

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if server.poll() is not None:
            # The server process died (e.g. port already in use).
            return False
        try:
            with socket.create_connection((host, port), timeout=poll_interval):
                return True
        except OSError:
            time.sleep(poll_interval)
    return False


def setup_ollama_server(b):
    """Button callback: restart the Ollama server and pull the selected model.

    Steps: stop any running `ollama serve`, start a new one in the background
    (logging to a temp file), wait until it is reachable, pull the model chosen
    in `model_name_widget`, and finally import the `ollama` client library as a
    global so later cells can use it. The function returns early, without
    importing `ollama`, when the server does not start or the pull fails, so
    downstream cells do not treat a broken setup as ready.

    Args:
        b: The clicked button (supplied by ipywidgets; unused).
    """
    # Local imports keep the callback self-contained.
    import os
    import tempfile

    global ollama, ollama_server_process

    with output_area:
        output_area.clear_output()
        MODEL_NAME = model_name_widget.value

        # CRITICAL FIX: Stop any existing Ollama processes to prevent errors
        print("🛑 Stopping any old Ollama server...")
        subprocess.run(["pkill", "-f", "ollama serve"])
        time.sleep(3)  # Give it a moment to shut down

        # Start the server as a detached child process. Its output goes to a
        # log file rather than an in-memory buffer, because the server runs
        # for the whole session.
        print("🚀 Starting new Ollama server in the background...")
        log_path = os.path.join(tempfile.gettempdir(), "ollama_serve.log")
        try:
            with open(log_path, "w") as log_file:
                ollama_server_process = subprocess.Popen(
                    ["ollama", "serve"],
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                )
        except OSError as e:
            # Typically: the `ollama` binary is not installed / not on PATH.
            print(f"Ollama server failed: {e}")
            return

        print("⏳ Waiting for Ollama server to initialize...")
        if not _wait_for_ollama(ollama_server_process):
            print(f"Ollama server failed: it did not become reachable. Log file: {log_path}")
            try:
                with open(log_path, "r", errors="replace") as log_file:
                    print(log_file.read()[-2000:])
            except OSError:
                pass
            return

        # Pull the model
        print(f"📦 Pulling model: {MODEL_NAME}. This may take a while...")
        try:
            # Using Popen for real-time output
            process = subprocess.Popen(
                ["ollama", "pull", MODEL_NAME],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
            )
            for line in process.stdout:
                print(line, end='')
            # The stream is already closed here, so this only bounds how long
            # we wait for the process to finish exiting.
            return_code = process.wait(timeout=900)
        except Exception as e:
            print(f"Error pulling model: {e}")
            return

        if return_code != 0:
            print(f"❌ Error pulling model.")
            return
        print(f"✅ Model {MODEL_NAME} is ready.")

        # Make the ollama library available globally
        import ollama
        print("✅ Ollama server is running and the library is imported.")
        listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        print((listing.stdout + listing.stderr).rstrip())

setup_button.on_click(setup_ollama_server)


In [None]:
# @title ## 3. Run the Data Factory (V4.3 - Self-Healing Pipeline)
# @markdown This cell runs the main data generation loop. It depends on Cell 1 and Cell 2 being run successfully first, and on the master prompt file written by Cell 4.

import json
import uuid
import random
import time
import re
from pydantic import ValidationError
from google.colab import drive
from IPython.display import clear_output, display, HTML

# --- Configuration ---
DATASET_PATH = "/content/drive/MyDrive/housebrain_platinum_dataset"
MASTER_PROMPT_LIST_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt"
NUM_PROMPTS_TO_GENERATE = 10 # @param {type:"integer"}

# --- Prerequisite Checks ---
# Both flags start False and are flipped only by the checks below.
SCHEMA_LOADED, SERVER_RUNNING = False, False

# Check 1: the HouseBrain schema must be importable (needs Cell 1's sys.path entry).
try:
    from src.housebrain.schema import HouseOutput, HouseInput, BasicDetails, RoomType
except ImportError as e:  # also covers ModuleNotFoundError, which is a subclass
    print(f"❌ Could not import HouseBrain schema: {e}")
    print("   Please re-run Cell 1 to ensure the repository is cloned and the path is set.")
else:
    SCHEMA_LOADED = True
    print("✅ Schema loaded successfully.")

# Check 2: Cell 2 must have defined `ollama` and `model_name_widget`.
# A NameError on either name means Cell 2 has not completed.
try:
    if ollama and model_name_widget:
        SERVER_RUNNING = True
        MODEL_NAME = model_name_widget.value
        client = ollama.Client()
        print(f"✅ Ollama client connected. Using model: {MODEL_NAME}")
except NameError:
    print("❌ Ollama server not ready. Please run Cell 2 successfully before this cell.")

# --- Helper Functions ---
def call_ollama_colab(model, prompt, retries=3, delay=5):
    """Send a single-turn chat request to Ollama, retrying on failure.

    Relies on the module-level `SERVER_RUNNING` flag and `client` object set
    up by the prerequisite checks.

    Args:
        model: Name of the Ollama model to query.
        prompt: User message content.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts (not after the last one).

    Returns:
        The assistant message content, or None when the server is not marked
        as running or every attempt failed.
    """
    if not SERVER_RUNNING:
        return None

    for attempt in range(1, retries + 1):
        try:
            response = client.chat(model=model, messages=[{'role': 'user', 'content': prompt}])
            return response['message']['content']
        except Exception as e:
            is_last_attempt = attempt >= retries
            if is_last_attempt:
                # Nothing left to retry: do not announce a retry or waste a sleep.
                print(f"ERROR: Attempt {attempt}/{retries} failed.")
            else:
                print(f"ERROR: Attempt {attempt}/{retries} failed. Retrying in {delay}s...")
            print(f"DETAILS: {e}")
            if not is_last_attempt:
                time.sleep(delay)

    print(f"❌ All {retries} attempts to call the Ollama model failed.")
    return None

def repair_json(text, target_type):
    """Extract and parse a JSON object or list embedded in model output.

    Two passes are made:
      1. Parse the span from the first opening delimiter to the last closing
         delimiter of the raw text.
      2. If that fails, drop any `<think>...</think>` sections and scan every
         opening delimiter with a streaming decoder, keeping the longest value
         that parses. This recovers output where stray braces/brackets appear
         before or after the real JSON (e.g. in reasoning text or notes).

    Args:
        text: Raw model output.
        target_type: 'dict' to look for a JSON object, 'list' for a JSON array.

    Returns:
        The parsed value, or None if `text` is empty, `target_type` is not
        recognised, or no parseable JSON of the requested kind is found.
    """
    print(" -> Running Stage 2: Aggressive JSON Repair...")
    if not text:
        return None

    delimiters = {'dict': ('{', '}'), 'list': ('[', ']')}.get(target_type)
    if delimiters is None:
        return None
    opener, closer = delimiters

    # Pass 1: outermost span of the raw text.
    start, end = text.find(opener), text.rfind(closer)
    if start == -1 or end < start:
        print("    ❌ JSON Repair Failed: No JSON object/list found in the output.")
        return None
    try:
        parsed_json = json.loads(text[start:end + 1])
        print("    ✅ JSON Repair Successful.")
        return parsed_json
    except json.JSONDecodeError as e:
        first_error = e  # kept for the final failure message

    # Pass 2: tolerate stray delimiters around the real payload.
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    decoder = json.JSONDecoder()
    best_value, best_length = None, -1
    cursor = cleaned.find(opener)
    while cursor != -1:
        try:
            candidate, stop = decoder.raw_decode(cleaned, cursor)
        except json.JSONDecodeError:
            cursor = cleaned.find(opener, cursor + 1)
            continue
        if stop - cursor > best_length:
            best_value, best_length = candidate, stop - cursor
        # Skip past the decoded value so nested containers are not re-scanned.
        cursor = cleaned.find(opener, stop)

    if best_length >= 0:
        print("    ✅ JSON Repair Successful.")
        return best_value

    print(f"    ❌ JSON Repair Failed: Could not decode the extracted JSON. Error: {first_error}")
    return None

# Ordered (keywords, canonical room_type) pairs. The first entry with a keyword
# contained in the room label wins, so the order is significant
# (e.g. 'bed' is tested before 'bath').
_ROOM_TYPE_KEYWORDS = (
    (('living',), 'living_room'),
    (('dining',), 'dining_room'),
    (('kitchen',), 'kitchen'),
    (('bed',), 'bedroom'),
    (('bath',), 'bathroom'),
    (('balcony',), 'balcony'),
    (('garage',), 'garage'),
    (('stor',), 'storage'),
    (('study', 'office'), 'study'),
    (('utility',), 'utility'),
    (('entrance', 'foyer'), 'entrance'),
    (('hallway', 'corridor'), 'hallway'),
    (('patio',), 'patio'),
    (('laundry',), 'laundry'),
)


def _canonical_room_type(label):
    """Map a free-text room label to a canonical room_type, or None if no keyword matches."""
    if not isinstance(label, str):
        return None
    normalized = label.lower().replace('_', ' ')
    for keywords, canonical in _ROOM_TYPE_KEYWORDS:
        if any(keyword in normalized for keyword in keywords):
            return canonical
    return None


def _is_number(value):
    """True for real int/float values (bool is excluded even though it subclasses int)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def heal_and_convert_plan(raw_plan: dict):
    """Deterministically fix common schema deviations in an LLM-produced plan.

    Works on a deep copy, so `raw_plan` is never modified. For every room:
      * `room_type` is normalised from the room's `name` (or from `room_type`
        when there is no usable name) via keyword matching;
      * bounds given as corners (`x1`, `y1`, `x2`, `y2`) are converted to
        `x`, `y`, `width`, `height`, using the smaller coordinate as origin
        so the rectangle is correct even if the corners are swapped.
    `levels` / `rooms` that are not lists are replaced by empty lists, entries
    that are not dicts are skipped, and `total_area` is recomputed as the sum
    of width * height over all rooms with numeric bounds.

    Args:
        raw_plan: Parsed JSON object returned by the model.

    Returns:
        The healed copy of the plan.
    """
    import copy  # local import: only this function needs it

    print(" -> Running Stage 3: Applying Schema Healer...")
    healed_plan = copy.deepcopy(raw_plan)
    total_area = 0

    if not isinstance(healed_plan.get('levels'), list):
        healed_plan['levels'] = []

    for level in healed_plan['levels']:
        if not isinstance(level, dict):
            continue  # left as-is; schema validation will report it
        if not isinstance(level.get('rooms'), list):
            level['rooms'] = []

        for room in level['rooms']:
            if not isinstance(room, dict):
                continue

            # The name takes precedence; fall back to room_type when the name
            # is missing or not a string.
            label = room.get('name')
            if not isinstance(label, str):
                label = room.get('room_type')
            canonical = _canonical_room_type(label)
            if canonical is not None:
                room['room_type'] = canonical

            bounds = room.get('bounds')
            if not isinstance(bounds, dict):
                continue

            corner_keys = ('x1', 'y1', 'x2', 'y2')
            if all(_is_number(bounds.get(key)) for key in corner_keys):
                x1, y1, x2, y2 = (bounds[key] for key in corner_keys)
                bounds['x'] = min(x1, x2)
                bounds['y'] = min(y1, y2)
                bounds['width'] = abs(x2 - x1)
                bounds['height'] = abs(y2 - y1)
                for key in corner_keys:
                    del bounds[key]

            if _is_number(bounds.get('width')) and _is_number(bounds.get('height')):
                total_area += bounds['width'] * bounds['height']

    healed_plan['total_area'] = total_area
    print("    ✅ Schema Healer finished.")
    return healed_plan

# --- Master Prompt Template (V4.3 - Safe Formatting) ---
# The template is built only when the schema import succeeded, because the list
# of allowed room types is read from the imported `RoomType` enum.
# The final prompt sent to the model is PROMPT_HEADER + <user prompt> + PROMPT_FOOTER.
if SCHEMA_LOADED:
    # Comma-separated, double-quoted enum values, e.g. "a", "b", "c".
    ROOM_TYPES_LIST_STR = ', '.join([f'"{e.value}"' for e in RoomType])
    # f-string: `{ROOM_TYPES_LIST_STR}` is interpolated; doubled braces ({{ and }})
    # render as literal { and } in the resulting text.
    PROMPT_HEADER = f"""You are an expert architectural AI. Your task is to generate a complete, valid, and detailed house plan in JSON format based on a user's request. You MUST adhere strictly to the provided JSON schema. Do NOT add any extra fields or deviate from the specified structure. The output MUST be a single, raw JSON object, without any surrounding text, explanations, or markdown formatting.

**JSON Schema:**
- `room_type`: Must be one of the following: {ROOM_TYPES_LIST_STR}.
- `Rectangle` (for bounds): {{"x": float, "y": float, "width": float, "height": float}}.

**Constraint Checklist & Rules:**
1.  **Output Raw JSON ONLY:** Start with `{{` and end with `}}`.
2.  **Strict Schema Adherence:** Every field in the schema MUST be present.
3.  **Valid `room_type`:** Use ONLY the provided enum values.
4.  **No Overlapping Rooms:** Ensure `bounds` of rooms on the same level do not overlap.
5.  **Doors Connect Rooms:** Each `Door` must have a valid `room1` and `room2` ID.
6.  **Realistic Dimensions:** Room sizes must be practical.

**User Request:**
"""
    # Appended after the user's request to cue the model to answer with JSON.
    PROMPT_FOOTER = "\n\n**Your JSON Output:**"

# --- Main Execution Block ---
# Pipeline per prompt: generate (Stage 1) -> extract JSON (Stage 2) -> heal
# (Stage 3) -> validate against the schema and save (Stage 4).
if SCHEMA_LOADED and SERVER_RUNNING:
    import os  # local import so this cell does not rely on another cell's imports

    os.makedirs(DATASET_PATH, exist_ok=True)
    print("\n--- Starting Data Factory Run (V4.3 - Self-Healing Pipeline) ---")
    try:
        with open(MASTER_PROMPT_LIST_PATH, 'r', encoding='utf-8') as f:
            # Drop blank lines before sampling so the batch really contains
            # the requested number of prompts.
            all_prompts = [line.strip() for line in f if line.strip()]
        print(f"✅ Found {len(all_prompts)} total prompts.")
    except FileNotFoundError:
        print(f"❌ ERROR: Master prompt file not found at {MASTER_PROMPT_LIST_PATH}")
        all_prompts = []

    if all_prompts:
        batch_size = max(0, min(NUM_PROMPTS_TO_GENERATE, len(all_prompts)))
        prompts_to_process = random.sample(all_prompts, batch_size)
        print(f"✅ This run will process a random batch of {len(prompts_to_process)} prompts.")
        saved_count = 0
        for i, prompt_text in enumerate(prompts_to_process):
            print(f"\n================== PROMPT {i+1}/{len(prompts_to_process)} ==================")
            print(prompt_text[:100] + "...")
            print("--------------------------------------------------")

            print(" -> Running Stage 1: Single-Shot Generation...")
            final_prompt = PROMPT_HEADER + prompt_text + PROMPT_FOOTER
            llm_output = call_ollama_colab(MODEL_NAME, final_prompt)

            if not llm_output:
                print("    ❌ Stage 1 Failed: No response from model.")
                continue

            print(" -> Running Stage 4: Validation and Save...")
            try:
                raw_data = repair_json(llm_output, 'dict')
                if not raw_data: raise ValueError("JSON repair failed.")
                healed_data = heal_and_convert_plan(raw_data)
                plan_id = str(uuid.uuid4())
                healed_data['id'] = plan_id
                healed_data['input'] = HouseInput(basicDetails=BasicDetails(prompt=prompt_text)).model_dump()
                validated_plan = HouseOutput.model_validate(healed_data)
                file_path = f"{DATASET_PATH}/plan_{plan_id.replace('-', '_')}.json"
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(validated_plan.model_dump_json(indent=2))
                saved_count += 1
                print(f"✅ SUCCESS! Saved validated plan to {file_path}")
            except ValidationError as e:
                print(f"    ❌ Stage 4 Failed: Pydantic validation error - {e}")
            except Exception as e:
                # Deliberately broad: one bad plan must not abort the whole batch.
                print(f"    ❌ Stage 4 Failed: An unexpected error occurred - {e}")
        print(f"\n📊 Saved {saved_count} of {len(prompts_to_process)} plans in this run.")
    print("\n🎉 Data Factory run complete!")
else:
    print("\nHALTING: Prerequisites not met. Please run Cell 1 and Cell 2 successfully before running this cell.")


In [None]:
# @title ## 4. (One-Time Setup) Generate Master Prompt File
# @markdown This cell generates a large list of diverse prompts and saves it to your Google Drive.
# @markdown You only need to run this once.
import itertools
import os

# `random` is needed below; importing it here lets this one-time cell run on a
# fresh kernel without depending on another cell's imports.
import random

# --- Configuration ---
PROMPT_OUTPUT_DIR = "/content/drive/MyDrive/housebrain_prompts"
PROMPT_OUTPUT_FILE = os.path.join(PROMPT_OUTPUT_DIR, "platinum_prompts.txt")
NUM_PROMPTS_TO_GENERATE = 40000 # @param {type:"integer"}

# --- Prompt Components ---
styles = ["Modern", "Traditional", "Scandinavian", "Industrial", "Minimalist", "Bohemian", "Farmhouse", "Art Deco", "Brutalist", "Colonial", "Mediterranean", "Japanese Zen", "Traditional Kerala-style 'Nalukettu'"]
structures = ["house", "bungalow", "villa", "apartment", "cottage", "townhouse", "farmhouse", "penthouse"]
stories = ["single-story", "two-story", "G+1", "G+2", "split-level", "duplex", "triplex"]
bhk_options = ["studio apartment", "1BHK", "2BHK", "3BHK", "4BHK", "5BHK", "6BHK"]
plot_sizes = ["30x40 feet plot", "50x80 feet plot", "80x100 feet plot", "100x100 feet plot", "irregular plot"]
total_areas = ["1000 sqft", "1200 sqft", "1800 sqft", "3000 sqft", "4000 sqft", "5000 sqft"]
features = ["with a library", "with a home office", "with a private gym", "with a large garden", "with a rooftop terrace", "with a two-car garage", "with servant's quarters", "with a swimming pool"]
misc = ["for a joint family", "for a nuclear family", "with a North-facing entrance", "with a West-facing entrance", "featuring floor-to-ceiling windows", "with an open-plan kitchen"]


def _indefinite_article(phrase):
    """Return 'an' when `phrase` starts with a vowel letter, otherwise 'a'."""
    return "an" if phrase[:1].lower() in ("a", "e", "i", "o", "u") else "a"


def _combo_at(index, option_lists):
    """Decode `index` into one choice per list (mixed-radix digits).

    Every index in range(product of list lengths) maps to a distinct
    combination, so combinations can be sampled without materialising the
    whole Cartesian product.
    """
    picked = []
    for options in reversed(option_lists):
        index, position = divmod(index, len(options))
        picked.append(options[position])
    picked.reverse()
    return picked


# --- Generation Logic ---
os.makedirs(PROMPT_OUTPUT_DIR, exist_ok=True)

# Site description: a plot with explicit dimensions stands on its own; a plot
# without dimensions (e.g. "irregular plot") is paired with a total area.
# Folding plot and area into one component keeps every combination unique.
site_options = []
for plot in plot_sizes:
    if any(ch.isdigit() for ch in plot):
        site_options.append(f"on a {plot}")
    else:
        site_options.extend(
            f"on {_indefinite_article(plot)} {plot} with a total area of {area}"
            for area in total_areas
        )

all_options = [styles, stories, bhk_options, structures, site_options, features, misc]

total_combinations = 1
for options in all_options:
    total_combinations *= len(options)

# Draw distinct combination indices directly instead of building and shuffling
# the full product (millions of tuples) just to keep the first few thousand.
target_count = max(0, min(NUM_PROMPTS_TO_GENERATE, total_combinations))
sampled_indices = random.sample(range(total_combinations), target_count)

prompts_generated = 0
with open(PROMPT_OUTPUT_FILE, "w", encoding="utf-8") as f:
    for index in sampled_indices:
        style, story, bhk, structure, site, feature, extra = _combo_at(index, all_options)
        # Construct a natural-sounding, single-sentence prompt
        prompt = f"Design {_indefinite_article(style)} {style} {story} {bhk} {structure} {site}, {feature}, {extra}."
        f.write(prompt + "\n")
        prompts_generated += 1

print(f"✅ Successfully generated and saved {prompts_generated} prompts to {PROMPT_OUTPUT_FILE}")



In [None]:
# @title ## 5. (Optional) Download Generated Dataset
# @markdown Zip the entire generated dataset directory and download it to your local machine.
from google.colab import files
import os
import datetime

# --- Configuration ---
# This path should match the DATASET_PATH in Cell 3
DATASET_DIR_TO_ZIP = "/content/drive/MyDrive/housebrain_platinum_dataset"

# --- Zipping Logic ---
if os.path.exists(DATASET_DIR_TO_ZIP):
    print(f"Zipping directory: {DATASET_DIR_TO_ZIP}...")
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_filename = f"housebrain_dataset_{timestamp}.zip"
    
    # The -r flag zips recursively, and -j junks paths to store files at the top level of the zip.
    !zip -r -j {zip_filename} "{DATASET_DIR_TO_ZIP}"
    
    print(f"✅ Created {zip_filename}. Offering for download...")
    files.download(zip_filename)
else:
    print(f"❌ Directory not found: {DATASET_DIR_TO_ZIP}")
    print("   Please run the data factory in Cell 3 to generate some data first.")
