In [None]:
import random
import json
import time
import re
import os
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
from pydantic import ValidationError
import hashlib

# This cell contains the entire data generation pipeline.
# It does not import the schema itself: `HouseOutput` must already be defined
# in the notebook namespace before this cell runs.
# Run Cell 1 and Cell 2 from the original notebook first; they set up the
# environment, install dependencies, and define the HouseBrain schema.

# --- Configuration ---
# These would typically be defined in another cell
# Number of prompts drawn at random from the master list for one run.
BATCH_SIZE = 10
# Model name sent to the Ollama API for every generation stage.
MODEL_NAME = "deepseek-coder-v2:16b-lite-instruct-q4_0" # Or another model from Cell 2
# Text file holding the master prompt list; it is read and split into lines.
MASTER_PROMPT_LIST_PATH = "/content/drive/MyDrive/housebrain_master_prompts.txt"
# Directory that receives one `plan_<hash>.json` file per validated plan.
OUTPUT_DIR = "/content/drive/MyDrive/housebrain_platinum_dataset"

# Create the output directory up front; exist_ok=True makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate_file_hash(prompt_text):
    """Return a short, stable identifier for a prompt.

    The prompt is encoded as UTF-8, hashed with SHA-256, and the first
    16 hexadecimal characters of the digest are returned.
    """
    hasher = hashlib.sha256()
    hasher.update(prompt_text.encode('utf-8'))
    full_digest = hasher.hexdigest()
    return full_digest[:16]

# --- Prompts ---

# Stage 1 template: asks the model for levels and rooms only (no doors or windows).
# It is filled with str.format(user_prompt=...), so every literal JSON brace in
# the example is doubled ({{ / }}) to survive formatting.
STAGE_1_LAYOUT_PROMPT = """
You are an expert architectural AI. Your task is to generate the foundational layout for a house based on a user's prompt.

**Instructions:**
1.  **Analyze the Request:** Carefully read the user's prompt to understand the constraints (e.g., plot size, number of floors, total area, number of rooms).
2.  **Design the Layout:** Create a logical and functional floor plan. Ensure rooms are reasonably sized and placed.
3.  **Define Levels and Rooms:** Structure your output with levels (e.g., ground floor, first floor) and the rooms within each level.
4.  **Specify Room Bounds:** For each room, define its `bounds` as a rectangle with `x`, `y`, `width`, and `height`. The origin (0,0) is the top-left corner of the plot.
5.  **Adhere to the Schema:** The output MUST be a single JSON object that validates against the `HouseOutput` schema, but **ONLY include the `levels` and `rooms`**. Do NOT include `doors` or `windows` at this stage.
6.  **Use Unique IDs:** Assign a unique string `id` to every level and every room (e.g., "level_0", "room_0", "kitchen_0").

**User Prompt:**
{user_prompt}

**Output Format (JSON Object only):**
```json
{{
  "levels": [
    {{
      "id": "level_0",
      "level_number": 0,
      "rooms": [
        {{
          "id": "living_room_0",
          "room_type": "living_room",
          "bounds": {{"x": 0, "y": 0, "width": 20, "height": 15}}
        }},
        {{
          "id": "kitchen_0",
          "room_type": "kitchen",
          "bounds": {{"x": 20, "y": 0, "width": 10, "height": 15}}
        }}
      ]
    }}
  ]
}}
```
"""

# Stage 2 template: given the Stage 1 layout, asks the model for a JSON list of doors.
# It is filled with str.format(layout_json=...); literal JSON braces are doubled
# ({{ / }}) so formatting leaves them intact.
STAGE_2_DOORS_PROMPT = """
You are an expert architectural AI. You will be given a JSON object describing a house layout. Your task is to add doors to this layout.

**Instructions:**
1.  **Analyze the Layout:** Review the provided `levels` and `rooms` JSON. Understand the room adjacencies.
2.  **Place Doors:** Add doors logically. Every room should be accessible. Add an exterior door for the main entrance. Connect adjacent rooms where appropriate.
3.  **Define Door Properties:** For each door, specify its `position` (`x`, `y`), `width`, `type` ('interior', 'exterior', 'sliding', 'pocket'), and the two room IDs it connects (`room1`, `room2`). For exterior doors, `room2` should be a descriptive string like "exterior_front" or "exterior_backyard".
4.  **Adhere to the Schema:** The output MUST be a single JSON list of `Door` objects. Do NOT include any other text, conversation, or markdown.

**House Layout:**
{layout_json}

**Golden Example:**
A door connecting `living_room_0` and `kitchen_0` might look like this:
`{{ "position": {{"x": 19.5, "y": 7}}, "width": 3, "type": "interior", "room1": "living_room_0", "room2": "kitchen_0" }}`

**Output Format (JSON List only):**
```json
[
  {{
    "position": {{"x": 0, "y": 7}},
    "width": 3.5,
    "type": "exterior",
    "room1": "entrance_0",
    "room2": "exterior_front"
  }},
  {{
    "position": {{"x": 10, "y": 7}},
    "width": 3,
    "type": "interior",
    "room1": "entrance_0",
    "room2": "living_room_0"
  }}
]
```
"""

# Stage 3 template: given the Stage 1 layout, asks the model for a JSON list of windows.
# It is filled with str.format(layout_json=...); literal JSON braces are doubled
# ({{ / }}) so formatting leaves them intact.
STAGE_3_WINDOWS_PROMPT = """
You are an expert architectural AI. You will be given a JSON object describing a house layout. Your task is to add windows to this layout.

**Instructions:**
1.  **Analyze the Layout:** Review the provided `levels` and `rooms` JSON.
2.  **Place Windows:** Add windows logically to exterior walls. Rooms like living rooms and bedrooms should have ample windows. Bathrooms might have smaller or fewer windows.
3.  **Define Window Properties:** For each window, specify its `position` (`x`, `y`), `width`, `height`, `type` ('fixed', 'casement', 'sliding', 'bay'), and the `room_id` it belongs to.
4.  **Adhere to the Schema:** The output MUST be a single JSON list of `Window` objects. Do NOT include any other text, conversation, or markdown.

**House Layout:**
{layout_json}

**Output Format (JSON List only):**
```json
[
  {{
    "position": {{"x": 5, "y": 0}},
    "width": 8,
    "height": 5,
    "type": "bay",
    "room_id": "living_room_0"
  }},
  {{
    "position": {{"x": 25, "y": 0}},
    "width": 4,
    "height": 3,
    "type": "sliding",
    "room_id": "kitchen_0"
  }}
]
```
"""

# --- Helper Functions ---

def call_ollama_colab(model, prompt, max_retries=3, delay=5, *,
                      endpoint="http://localhost:11434/api/generate",
                      timeout=None):
    """Send a non-streaming generate request to an Ollama server.

    Args:
        model: Model name placed in the request's ``model`` field.
        prompt: Prompt text placed in the request's ``prompt`` field.
        max_retries: Maximum number of attempts. Values <= 0 make no request.
        delay: Seconds to sleep between attempts after a connection error.
        endpoint: URL of the generate endpoint. Defaults to the local server
            this function has always used.
        timeout: Optional socket timeout in seconds. ``None`` (the default)
            keeps the library default, i.e. the previous behaviour.

    Returns:
        The stripped ``response`` field of the JSON reply (an empty string
        when the field is absent), or ``None`` when every attempt failed or
        an unexpected error occurred. This function does not raise for
        ordinary failures; errors are printed instead.
    """
    headers = {"Content-Type": "application/json"}
    data = {"model": model, "prompt": prompt, "stream": False}
    # Only forward `timeout` when the caller set one, so the default call
    # still honours any process-wide socket default timeout.
    open_kwargs = {} if timeout is None else {"timeout": timeout}

    for attempt in range(max_retries):
        try:
            req = Request(endpoint, data=json.dumps(data).encode("utf-8"), headers=headers, method="POST")
            with urlopen(req, **open_kwargs) as response:
                response_body = response.read().decode("utf-8")
                response_json = json.loads(response_body)
                return response_json.get("response", "").strip()
        # HTTPError is a subclass of URLError, so it is covered here as well.
        # TimeoutError is retried because a read timeout surfaces unwrapped.
        except (URLError, ConnectionResetError, TimeoutError) as e:
            print(f"ERROR: Ollama connection error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("ERROR: Max retries reached. Failing.")
                return None
        except Exception as e:
            # Malformed replies (bad JSON, non-dict body, null response) are
            # not retried: the request itself succeeded.
            print(f"ERROR: Unexpected error calling Ollama: {e}")
            return None

    # Reached only when max_retries <= 0 and no attempt was made.
    return None

def repair_json(text, target_type=dict):
    """Find and parse a JSON object or list embedded in arbitrary text.

    The input is first parsed as-is. If that does not yield ``target_type``,
    the JSON value that starts at the first opening bracket (``[`` for
    ``list``, ``{`` otherwise) is decoded, ignoring whatever follows it. This
    tolerates surrounding conversation or markdown fences, including trailing
    prose that itself contains brackets.

    Args:
        text: Raw model output. It is converted with ``str()`` first.
        target_type: ``dict`` (default) or ``list``; the type to return.

    Returns:
        The parsed value of ``target_type``, or ``None`` if none was found.
        When a list is requested but the text is a JSON object, the first
        list-valued entry of that object is returned.
    """
    text = str(text)  # Ensure input is a string

    wants_list = target_type is list
    start_char = '[' if wants_list else '{'
    type_name = target_type.__name__

    # 1. First, try a direct parse. This is the ideal case.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass  # Not clean JSON, so move on to extraction.
    else:
        if isinstance(parsed, target_type):
            print("✅ Initial parse successful.")
            return parsed
        # A list wrapped in an object, e.g. {"doors": [...]}: unwrap it here,
        # where the wrapping object is actually visible.
        if wants_list and isinstance(parsed, dict):
            for key, value in parsed.items():
                if isinstance(value, list):
                    print(f"✅ Repaired JSON by extracting list from key '{key}'.")
                    return value

    # 2. Decode the JSON value that begins at the first opening bracket.
    print(f"Initial parse for type {type_name} failed. Attempting aggressive extraction...")
    start = text.find(start_char)
    if start == -1:
        print(f"❌ No JSON structure of type {type_name} found in the text.")
        return None

    try:
        # raw_decode stops at the end of the first complete value, so text
        # after it (even text containing closing brackets) cannot break the
        # parse. Inner candidates are deliberately NOT tried: returning a
        # nested fragment of a truncated reply would be silently wrong.
        candidate, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse the extracted JSON blob: {e}")
        return None

    if isinstance(candidate, target_type):
        print(f"✅ Successfully extracted and parsed JSON of type {type_name}.")
        return candidate

    print(f"❌ Extracted JSON is not of the target type {type_name}.")
    return None


def _index_rooms(levels):
    """Normalise every room in ``levels`` in place and return an id -> room map."""
    rooms_by_id = {}
    for level_idx, level in enumerate(levels):
        if not isinstance(level, dict):
            print(f"⚠️ Assembly Warning: Skipping non-dict level at index {level_idx}.")
            continue

        rooms = level.get("rooms")
        if not isinstance(rooms, list):
            # Missing or malformed room list: give the level an empty one.
            level["rooms"] = []
            continue

        for room_idx, room in enumerate(rooms):
            if not isinstance(room, dict):
                print(f"⚠️ Assembly Warning: Skipping non-dict room at index {room_idx}.")
                continue

            room_id = room.get("id")
            if not isinstance(room_id, str):
                print(f"⚠️ Assembly Warning: Room has invalid ID: {room_id}")
                continue

            # Every addressable room gets list-typed doors/windows slots.
            for slot in ("doors", "windows"):
                if not isinstance(room.get(slot), list):
                    room[slot] = []

            # A later room with the same id replaces the earlier one here.
            rooms_by_id[room_id] = room
    return rooms_by_id


def _attach_fixtures(items, rooms_by_id, *, kind, room_key, id_label, required_fields, slot):
    """Append each valid door/window dict in ``items`` to its room's ``slot`` list."""
    if not isinstance(items, list):
        return

    label = kind.capitalize()
    for idx, item in enumerate(items):
        try:
            if not isinstance(item, dict):
                print(f"⚠️ Assembly Warning: Discarding non-dict {kind} at index {idx}: {item}")
                continue

            room_id = item.get(room_key)
            if not isinstance(room_id, str) or room_id not in rooms_by_id:
                print(f"⚠️ Assembly Warning: {label} has invalid {id_label}: {room_id}")
                continue

            # Report only the first missing field, then drop the item.
            missing = next((name for name in required_fields if name not in item), None)
            if missing is not None:
                print(f"⚠️ Assembly Warning: {label} missing required field: {missing}")
                continue

            rooms_by_id[room_id][slot].append(item)
        except Exception as e:
            # Best effort: one bad item must not abort the rest.
            print(f"⚠️ Assembly Warning: Error processing {kind}: {str(e)}")


def assemble_plan(layout_dict, doors_list, windows_list):
    """Merge generated doors and windows into a copy of the room layout.

    Malformed input is tolerated: anything unusable is reported with a
    printed warning and skipped rather than raising.

    Args:
        layout_dict: Stage 1 output; expected to be a dict with a ``levels``
            list whose entries carry a ``rooms`` list.
        doors_list: Door dicts. A door is attached to the room named by its
            ``room1`` key and must also have ``position``, ``width``,
            ``type`` and ``room2``. Non-list values are ignored.
        windows_list: Window dicts. A window is attached to the room named by
            its ``room_id`` key and must also have ``position``, ``width``,
            ``height`` and ``type``. Non-list values are ignored.

    Returns:
        ``{"levels": [...]}`` where each room with a string ``id`` has
        ``doors`` and ``windows`` lists. The levels are a deep copy, so
        ``layout_dict`` is left untouched and repeated calls do not
        accumulate duplicate doors or windows. If the layout is not a dict
        or ``levels`` is not a list, ``{"levels": []}`` is returned.
    """
    import copy  # local import: this is the only place the module is needed

    result = {"levels": []}

    if not isinstance(layout_dict, dict):
        print("⚠️ Assembly Error: Layout is not a dictionary. Creating empty layout.")
        return result

    levels = layout_dict.get("levels", [])
    if not isinstance(levels, list):
        print("⚠️ Assembly Error: 'levels' key is not a list. Creating empty layout.")
        return result

    try:
        result["levels"] = copy.deepcopy(levels)
        rooms_by_id = _index_rooms(result["levels"])

        # Windows are processed before doors, as before.
        _attach_fixtures(
            windows_list, rooms_by_id,
            kind="window", room_key="room_id", id_label="room_id",
            required_fields=("position", "width", "height", "type"),
            slot="windows",
        )
        _attach_fixtures(
            doors_list, rooms_by_id,
            kind="door", room_key="room1", id_label="room1_id",
            required_fields=("position", "width", "type", "room2"),
            slot="doors",
        )
    except Exception as e:
        # Deliberate top-level boundary: assembly is best effort and returns
        # whatever was built so far.
        print(f"⚠️ Assembly Error: Unexpected error during assembly: {str(e)}")

    return result

# --- Execution ---

def _summarize_levels(levels):
    """Return (total_area, bedroom_count, bathroom_count) for assembled levels.

    Malformed levels, rooms or bounds are skipped instead of raising.
    """
    total_area = 0.0
    bedrooms = 0
    bathrooms = 0
    if not isinstance(levels, list):
        return total_area, bedrooms, bathrooms

    for level in levels:
        if not isinstance(level, dict) or not isinstance(level.get('rooms'), list):
            continue
        for room in level['rooms']:
            if not isinstance(room, dict):
                continue

            # Area is width * height of the room's bounding rectangle.
            bounds = room.get('bounds')
            if isinstance(bounds, dict):
                width = bounds.get('width', 0)
                height = bounds.get('height', 0)
                if isinstance(width, (int, float)) and isinstance(height, (int, float)):
                    total_area += width * height

            # The Stage 1 prompt asks for `room_type`; `type` is kept as a
            # fallback for replies that use the shorter key.
            room_type = room.get('room_type', room.get('type'))
            if room_type in ("bedroom", "master_bedroom"):
                bedrooms += 1
            if room_type in ("bathroom", "half_bath"):
                bathrooms += 1
    return total_area, bedrooms, bathrooms


print("--- Starting Data Factory Run (V3: Assembly Line) ---")

# Load the master list of prompts, one per line. Blank lines are dropped so
# the model is never asked to design a house from an empty prompt.
with open(MASTER_PROMPT_LIST_PATH, 'r', encoding='utf-8') as f:
    master_prompt_list = [line for line in f.read().splitlines() if line.strip()]
print(f"✅ Found {len(master_prompt_list)} total prompts in the master list.")


# In parallel mode, we process a small, random batch of prompts.
# random.sample raises ValueError when asked for more items than exist, so
# the batch size is capped at the number of available prompts.
prompts_to_process = random.sample(master_prompt_list, min(BATCH_SIZE, len(master_prompt_list)))
print(f"✅ This run will process a random batch of {len(prompts_to_process)} prompts.")


for i, prompt_text in enumerate(prompts_to_process):
    print("\n==================================================")
    print(f"Processing prompt {i+1}/{len(prompts_to_process)}")
    # Truncate for display
    print(prompt_text[:100] + "..." if len(prompt_text) > 100 else prompt_text)
    print("==================================================")

    # --- STAGE 1: Layout ---
    print("Running Stage 1: Layout Generation...")
    stage_1_prompt = STAGE_1_LAYOUT_PROMPT.format(user_prompt=prompt_text)
    stage_1_response_text = call_ollama_colab(MODEL_NAME, stage_1_prompt)
    if not stage_1_response_text:
        print("❌ Stage 1 Failed: No response from model.")
        continue
    layout_data = repair_json(stage_1_response_text, target_type=dict)
    if not layout_data:
        print("❌ Stage 1 Failed: Could not produce a valid layout JSON.")
        continue

    # --- STAGE 2: Doors ---
    print("Running Stage 2: Door Generation...")
    # The same serialised layout is reused for the Stage 3 prompt.
    layout_json_str = json.dumps(layout_data, indent=2)
    stage_2_prompt = STAGE_2_DOORS_PROMPT.format(layout_json=layout_json_str)
    stage_2_response_text = call_ollama_colab(MODEL_NAME, stage_2_prompt)
    if not stage_2_response_text:
        print("❌ Stage 2 Failed: No response from model.")
        continue
    doors_data = repair_json(stage_2_response_text, target_type=list)
    if doors_data is None: # Check for None specifically, as [] is a valid list
        print("❌ Stage 2 Failed: Could not produce a valid list of doors.")
        continue

    # --- STAGE 3: Windows ---
    print("Running Stage 3: Window Generation...")
    stage_3_prompt = STAGE_3_WINDOWS_PROMPT.format(layout_json=layout_json_str)
    stage_3_response_text = call_ollama_colab(MODEL_NAME, stage_3_prompt)
    if not stage_3_response_text:
        print("❌ Stage 3 Failed: No response from model.")
        continue
    windows_data = repair_json(stage_3_response_text, target_type=list)
    if windows_data is None: # Check for None specifically, as [] is a valid list
        print("❌ Stage 3 Failed: Could not produce a valid list of windows.")
        continue

    # --- STAGE 4: Assembly & Validation ---
    print("Running Stage 4: Assembling and Validating...")
    try:
        # Merge doors and windows into the layout's rooms.
        assembled_layout = assemble_plan(layout_data, doors_data, windows_data)
        assembled_levels = assembled_layout.get("levels", [])

        # Derive total area and room counts from the generated layout.
        calculated_area, bedroom_count, bathroom_count = _summarize_levels(assembled_levels)

        # Create a complete dictionary that matches the HouseOutput schema
        final_plan_dict_for_validation = {
            "input": {
                "basicDetails": {
                    "prompt": prompt_text,
                    # Add dummy/calculated values for other required fields
                    "totalArea": calculated_area,
                    "unit": "sqft",
                    "floors": len(assembled_levels),
                    "bedrooms": bedroom_count,
                    "bathrooms": bathroom_count,
                    "style": "unknown",
                    "budget": 0
                },
                "plot": {}, # Dummy value, as it's a required dict
                "roomBreakdown": [] # Dummy value, as it's a required list
            },
            "levels": assembled_levels,
            "total_area": calculated_area,
            "construction_cost": 0.0, # Dummy value
        }

        # Validate with Pydantic. HouseOutput is not defined in this cell; it
        # is expected to come from an earlier notebook cell.
        validated_plan = HouseOutput.model_validate(final_plan_dict_for_validation)

        # Save the validated plan. The file name is derived from the prompt,
        # so re-processing the same prompt overwrites its earlier plan.
        file_hash = generate_file_hash(prompt_text)
        output_path = os.path.join(OUTPUT_DIR, f"plan_{file_hash}.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(validated_plan.model_dump_json(indent=2))
        print(f"✅ SUCCESS! Saved validated plan to {output_path}")

    except ValidationError as e:
        print(f"❌ Stage 4 Failed: Pydantic validation error - {e}")
    except Exception as e:
        # Top-level boundary: one failed prompt must not stop the batch.
        print(f"❌ Stage 4 Failed: An unexpected error occurred - {e}")


print("\n🎉 Data Factory run complete!")

