In [None]:
%%bash
# --- 1. Install and Start Ollama (High-Concurrency Mode) ---
# NOTE: '%%bash' is a cell magic, so it has to be the very first line of the
# cell (it was previously placed after these comments, which makes IPython
# reject the cell). With the magic on line 1 the entire cell runs as a shell
# script.
# This is the most critical step. We must install and configure the Ollama service.

# 1. Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# 2. CONFIGURE FOR PARALLELISM: Set the number of parallel requests Ollama will handle.
# This is the key to maximizing A100 GPU utilization.
# The variable is exported before 'ollama serve' is launched so the server
# process inherits it.
export OLLAMA_NUM_PARALLEL=20

# 3. Start the server in the background and log its output for debugging
# (stdout and stderr both go to ollama_server.log in the current directory).
ollama serve > ollama_server.log 2>&1 &

# 4. Wait 10 seconds to ensure the server has time to fully initialize
sleep 10

# 5. Verify that the server process is running by checking for the 'ollama' process
# ('pgrep -x' matches the exact process name; this confirms the process exists,
# not that the HTTP API is already answering requests).
if pgrep -x "ollama" > /dev/null
then
    echo "✅ Ollama server is running successfully in high-concurrency mode."
else
    echo "❌ Ollama server failed to start. Please check the log below:"
    cat ollama_server.log
fi


In [None]:
# --- 2. Install Python Client ---
!pip install -q ollama ipywidgets pandas tqdm


In [None]:
# --- 3. Mount Drive & Load Prompts ---
from google.colab import drive
import os
import random

# Mount Google Drive at /content/drive using google.colab's `drive` helper
# (imported at the top of this cell). The success message is printed as soon
# as drive.mount() returns.
print("▶️ Mounting Google Drive...")
drive.mount('/content/drive')
print("✅ Google Drive mounted successfully.")

# Location of the prompt file on the mounted Drive; it is passed to
# load_prompts_from_file() below, which reads it one prompt per line.
PROMPT_FILE_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt" 

def load_prompts_from_file(filepath):
    """Read one prompt per line from a UTF-8 text file.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.

    Args:
        filepath: Path of the text file that holds the prompts.

    Returns:
        list[str]: The cleaned prompts in file order. An empty list is
        returned (and an error message printed) when ``filepath`` does not
        exist; no exception is raised for a missing file.
    """
    print(f"\nLoading prompts from {filepath}...")

    # Guard clause: a missing file is reported to the user, not raised.
    file_is_missing = not os.path.exists(filepath)
    if file_is_missing:
        print("❌ ERROR: Prompt file not found. Please check the path.")
        return []

    prompts = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                prompts.append(cleaned)

    print(f"✅ Successfully loaded {len(prompts)} prompts.")
    return prompts

ALL_PROMPTS = load_prompts_from_file(PROMPT_FILE_PATH)

# Seed for the prompt shuffle. The generation cell names every output folder
# "prompt_<index>_<hash>", where <index> is the prompt's position in
# ALL_PROMPTS. With an unseeded shuffle that position changes after every
# kernel restart, so outputs from earlier runs are no longer recognised as
# "already generated". A fixed seed keeps the order (and therefore the folder
# names) stable across runs as long as the prompt file itself is unchanged.
PROMPT_SHUFFLE_SEED = 42

if ALL_PROMPTS:
    # A private Random instance is used so the global `random` state is untouched.
    random.Random(PROMPT_SHUFFLE_SEED).shuffle(ALL_PROMPTS)
    print("✅ Prompts have been successfully shuffled for this run.")


In [None]:
# --- 4. Configure Run ---
import ipywidgets as widgets
from IPython.display import display
import os

# --- A. Select Model ---
# Model tags offered in the dropdown. The first entry, gpt-oss:120b, is the
# pre-selected default.
model_options = [
    "gpt-oss:120b",
    "gpt-oss:20b",
    "magistral:24b",
    "magistral:12b",
    "magistral:8b",
    "phi4-reasoning:latest",
    "phi4-reasoning:plus",
    "phi4-reasoning:14b",
    "phi4-reasoning:14b-plus-q4_K_M",
    "phi3:instruct",
    "llama3:instruct",
    "llama3.1:8b",
    "gemma3:27b",
    "qwen3:14b",
    "qwen2.5:72b",
    "qwen3:32b",
    "qwen2.5:32b",
]

# Build the widget, then render it so the user can pick a model before the
# later cells read `model_dropdown.value`.
model_dropdown = widgets.Dropdown(
    options=model_options,
    value=model_options[0],
    description='Select Model:',
    disabled=False,
    style={'description_width': 'initial'},
)
display(model_dropdown)

# --- B. Configure Paths and Counts ---
# All output is written under this Google Drive folder, so it is expected to
# outlive the Colab runtime. The path only exists once Drive has been mounted.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/HouseBrain/generated_data" # SAVING TO DRIVE!
# Sub-folder name placed between BASE_OUTPUT_DIR and the model name in every output path.
DATASET_TIER = "platinum_tier" # We are aiming for a higher quality tier now
# Upper bound on how many prompts the generation cell considers; the effective
# count is capped by the number of prompts actually loaded.
NUM_PLANS_TO_GENERATE = 10000 # The script will generate *up to* this many new plans.

# Create the base directory on Google Drive if it doesn't exist
# (exist_ok=True makes re-running this cell harmless).
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
print(f"✔️ Base output directory is set to: {BASE_OUTPUT_DIR}")


In [None]:
# --- 5. Pre-pull the LLM Model (Recommended) ---
# This step is highly recommended to avoid a long delay on the first generation.
# It explicitly downloads the selected model so you can see the progress.
# If the model is already local, this will finish instantly.
selected_model = model_dropdown.value
print("---  मॉडल डाउनलोड हो रहा है (Model is downloading) ---")
print(f"Selected model: {selected_model}")
print("This may take several minutes for large models like gpt-oss:120b...")

# Using a system call to ollama pull to get nice, clean progress bars.
get_ipython().system(f'ollama pull {selected_model}')

# system() does not return the exit status; IPython stores it in the user
# namespace as `_exit_code`. Check it so a failed download is not reported as
# success. If the value is unavailable, fall back to the previous behaviour
# (assume success).
pull_exit_code = get_ipython().user_ns.get('_exit_code', 0)

if pull_exit_code in (0, None):
    print("\n✅ Model is ready locally.")
else:
    print(
        f"\n❌ 'ollama pull {selected_model}' failed with exit code {pull_exit_code}. "
        "Check that the Ollama server is running and that the model name is valid."
    )


In [None]:
# Sanity check: run the `ollama list` CLI command in a shell to show which
# models the local Ollama installation currently reports.
!ollama list

In [None]:
# --- 6. Run Data Generation (V3.1 - Big Guns Prompting) ---
import ollama
import json
import hashlib
import time
from tqdm.notebook import tqdm
import concurrent.futures
import os

# --- A. Set Parallelism Level ---
# Size of the ThreadPoolExecutor used by the generation loop below, i.e. the
# maximum number of ollama.chat requests in flight at once. It mirrors the
# OLLAMA_NUM_PARALLEL=20 value exported when the server was started in cell 1;
# presumably the two should be kept in sync.
MAX_WORKERS = 20

# --- B. V3.1 Master Prompt Template ---
# Upgraded for more powerful models. Demands higher architectural realism.
# The template is filled in with str.format(user_prompt=...), so:
#   * `{user_prompt}` is the only substitution field, and
#   * every literal brace of the embedded JSON example is doubled (`{{` / `}}`)
#     so that str.format emits a single brace instead of treating it as a field.
# The text itself is sent verbatim to the model; editing it changes model behaviour.
V3_MASTER_PROMPT_TEMPLATE = """
You are a world-class AI architect with a deep understanding of spatial design, residential building codes, and architectural aesthetics. Your task is to generate a single, complete, and meticulously detailed JSON object representing a house plan based on the user's request. You must adhere strictly to the provided schema and instructions.

### USER REQUEST:
**{user_prompt}**

### CRITICAL INSTRUCTIONS:
1.  **Output ONLY JSON:** Your entire response must be a single JSON object. Do not include any introductory text, conversation, or markdown (```json) formatting.
2.  **Strict Schema Adherence:** The generated JSON MUST perfectly conform to the structure shown in the `GOLDEN_EXAMPLE`. Use the exact key names and data types.
3.  **Geometric Consistency & Realism:** Rooms must have realistic dimensions and be placed logically. The `bounds` (`x`, `y`, `width`, `height`) for each room must NOT overlap with other rooms on the same level. Avoid simple, unrealistic linear layouts; create plans with believable adjacencies and flow.
4.  **Connectivity is Paramount:** `doors` MUST be placed precisely on the shared wall between the two rooms they connect. `windows` MUST be placed on exterior walls (walls that do not touch another room).
5.  **Valid Room Types:** The `type` for each room must be one of the following: `living_room`, `dining_room`, `kitchen`, `master_bedroom`, `bedroom`, `bathroom`, `half_bath`, `family_room`, `study`, `garage`, `utility`, `storage`, `stairwell`, `corridor`, `entrance`, `balcony`.
6.  **Populate All Required Fields:** Ensure top-level fields like `input`, `total_area`, and `construction_cost` are present and filled with reasonable, calculated values based on the generated plan.

### GOLDEN_EXAMPLE (A realistic two-room layout):
```json
{{
  "input": {{ "basicDetails": {{"bedrooms": 2, "floors": 1, "totalArea": 840, "style": "Modern"}}, "plot": {{"shape": "rectangular", "length": 40, "width": 30}}, "roomBreakdown": [] }},
  "total_area": 840.0,
  "construction_cost": 150000.0,
  "levels": [
    {{
      "level_number": 0,
      "rooms": [
        {{
          "id": "living_room",
          "type": "living_room",
          "bounds": {{"x": 0, "y": 0, "width": 15, "height": 20}},
          "doors": [
            {{"position": {{"x": 15, "y": 10}}, "width": 3.0, "type": "interior", "room1": "living_room", "room2": "kitchen"}}
          ],
          "windows": [
            {{"position": {{"x": 7.5, "y": 0}}, "width": 8.0, "room_id": "living_room"}}
          ]
        }},
        {{
          "id": "kitchen",
          "type": "kitchen",
          "bounds": {{"x": 15, "y": 0, "width": 10, "height": 12}},
          "doors": [],
          "windows": [
            {{"position": {{"x": 25, "y": 6}}, "width": 4.0, "room_id": "kitchen"}}
          ]
        }}
      ]
    }}
  ]
}}
```

Now, generate the complete, valid, and architecturally sound JSON for the user request.
"""

# --- C. Generation Function (Updated for V3) ---
def generate_and_save_raw_plan(args):
    """Generate one house plan with Ollama and store the raw response on disk.

    Designed to run inside a thread pool: it never raises for per-task
    problems, it reports them through the returned tuple instead.

    Args:
        args: 5-tuple ``(task_id, prompt_content, model_name, base_dir, tier)``.
            ``task_id`` is an int used (zero-padded) in the output folder
            name, ``prompt_content`` is the user prompt text, ``model_name``
            is the Ollama model tag, and ``base_dir`` / ``tier`` are the
            leading components of the output path.

    Returns:
        tuple: ``(prompt_id, succeeded, detail)`` where ``detail`` is
        ``"Skipped"`` when the output already existed, the path of the
        written file on success, or the error message on failure.

    Side effects:
        Creates ``<base_dir>/<tier>/<model_name>/<prompt_id>/`` and writes
        either ``raw_output.json`` (success) or ``error.log`` (failure) there.
    """
    task_id, prompt_content, model_name, base_dir, tier = args
    # A short content hash keeps folder names unique per prompt text.
    prompt_hash = hashlib.sha1(prompt_content.encode()).hexdigest()[:10]
    prompt_id = f"prompt_{task_id:05d}_{prompt_hash}"

    output_dir_for_prompt = os.path.join(base_dir, tier, model_name, prompt_id)
    final_output_path = os.path.join(output_dir_for_prompt, "raw_output.json")

    if os.path.exists(final_output_path):
        tqdm.write(f"🟢 [Task {task_id:05d}] SKIPPING (Already exists): {prompt_id}")
        return (prompt_id, True, "Skipped")

    tqdm.write(f"⚪ [Task {task_id:05d}] STARTING: {prompt_id}")

    try:
        # Directory creation is inside the try block so that a filesystem
        # error is reported as a failed task instead of escaping the worker.
        os.makedirs(output_dir_for_prompt, exist_ok=True, mode=0o777)

        # Use the new V3 prompt template
        structured_prompt = V3_MASTER_PROMPT_TEMPLATE.format(user_prompt=prompt_content)

        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': structured_prompt}],
            format='json'
        )
        raw_output = response['message']['content']

        # An empty answer would otherwise be stored as raw_output.json and be
        # treated as "already generated" (and skipped) on every later run.
        if not raw_output or not raw_output.strip():
            raise ValueError("Model returned an empty response.")

        # Write to a temporary sibling file and rename it into place, so an
        # interrupted write can never leave a truncated raw_output.json that
        # the resume logic would mistake for a finished plan.
        partial_output_path = final_output_path + ".partial"
        with open(partial_output_path, 'w', encoding='utf-8') as f:
            f.write(raw_output)
        os.replace(partial_output_path, final_output_path)

        tqdm.write(f"✅ [Task {task_id:05d}] COMPLETED & SAVED: {prompt_id}")
        return (prompt_id, True, final_output_path)

    except Exception as e:
        error_message = f"An error occurred in task {task_id}: {str(e)}"
        error_log_path = os.path.join(output_dir_for_prompt, "error.log")
        try:
            os.makedirs(output_dir_for_prompt, exist_ok=True)
            with open(error_log_path, 'w', encoding='utf-8') as f:
                f.write(error_message)
            tqdm.write(f"❌ [Task {task_id:05d}] FAILED. Log saved to {error_log_path}")
        except OSError as log_error:
            # Logging is best-effort: failing to write the log must not crash
            # the worker and abort collection of the other results.
            tqdm.write(
                f"❌ [Task {task_id:05d}] FAILED ({error_message}). "
                f"Could not write error log: {log_error}"
            )
        return (prompt_id, False, error_message)

# --- D. Main Generation Loop ---
# Builds the list of prompts that still need an output file, runs them through
# a thread pool, and prints a summary. Relies on ALL_PROMPTS (cell 3) and on
# the configuration constants and model_dropdown from cell 4.
if globals().get('ALL_PROMPTS'):
    # The pre-pull cell that normally defines `selected_model` is optional.
    # Keep its value when present, otherwise read the dropdown directly so
    # this cell does not fail with a NameError.
    if 'selected_model' not in globals():
        selected_model = model_dropdown.value

    print("\n--- Checking for existing data in Google Drive to prevent re-work ---")

    total_prompts_to_consider = min(NUM_PLANS_TO_GENERATE, len(ALL_PROMPTS))

    # Queue only the prompts whose raw_output.json is not on disk yet. The
    # folder name must be built exactly as generate_and_save_raw_plan does.
    tasks_to_run = []
    for i in range(total_prompts_to_consider):
        prompt_content = ALL_PROMPTS[i]
        prompt_hash = hashlib.sha1(prompt_content.encode()).hexdigest()[:10]
        prompt_id = f"prompt_{i:05d}_{prompt_hash}"
        output_path = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model, prompt_id, "raw_output.json")

        if not os.path.exists(output_path):
            tasks_to_run.append((i, prompt_content, selected_model, BASE_OUTPUT_DIR, DATASET_TIER))

    total_already_generated = total_prompts_to_consider - len(tasks_to_run)
    print(f"Targeting {NUM_PLANS_TO_GENERATE} total plans.")
    print(f"Found {total_already_generated} plans already completed in previous runs.")

    if not tasks_to_run:
        print("\n🎉 All requested plans have already been generated. Nothing to do!")
    else:
        print(f"▶️ Preparing to generate {len(tasks_to_run)} new plans...")
        print(f"🚀 Starting PARALLEL data generation using {MAX_WORKERS} workers...")

        # as_completed() advances the progress bar whenever *any* task
        # finishes (executor.map yields in submission order, so one slow early
        # task would freeze the bar), and lets each future's exception be
        # handled on its own instead of aborting the whole collection.
        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            pending = [executor.submit(generate_and_save_raw_plan, task) for task in tasks_to_run]
            progress = tqdm(
                concurrent.futures.as_completed(pending),
                total=len(pending),
                desc=f"Generating with {selected_model}",
            )
            for finished in progress:
                try:
                    results.append(finished.result())
                except Exception as worker_error:
                    tqdm.write(f"❌ A worker raised an unexpected error: {worker_error}")

        # Each result is (prompt_id, succeeded, detail); "Skipped" marks
        # outputs that already existed and are therefore not new.
        successful_generations = sum(1 for res in results if res[1] is True and res[2] != "Skipped")
        failed_generations = len(tasks_to_run) - sum(1 for res in results if res[1] is True)

        print("\n" + "="*50)
        print("✅ Data Generation Run Complete!")
        print(f"Successfully generated {successful_generations} NEW raw plan files in this run.")
        if failed_generations:
            print(f"⚠️ {failed_generations} task(s) failed. Re-run this cell to retry them.")
        final_output_dir = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model)
        print(f"All outputs are saved in your Google Drive at: '{final_output_dir}'")
        print("="*50)
else:
    print("\n🛑 HALTED. Please run the prompt loading cell (Cell 3) successfully first.")


In [None]:
# --- 7. (Optional) Package and Download Results ---
import shutil
import time
import os

# Generation already wrote everything to Google Drive; this cell only bundles
# that folder into a single zip archive on the Colab VM's local disk.
output_directory_path = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model)

# Archive name: tier + model tag (':' replaced by '_') + current Unix timestamp.
zip_filename = "{}_{}_raw_data_{}".format(
    DATASET_TIER,
    selected_model.replace(':', '_'),
    int(time.time()),
)
# Base path of the archive WITHOUT the '.zip' suffix (make_archive appends it).
zip_filepath_in_colab = "/content/" + zip_filename

print(f"Locating generated data in your Google Drive: {output_directory_path}...")

if not os.path.isdir(output_directory_path):
    # Nothing to package: the generation step has not produced this folder.
    print(f"❌ Error: Could not find the output directory '{output_directory_path}'. Please ensure the generation step ran correctly.")
else:
    print(f"Compressing into '{zip_filename}.zip' in the Colab runtime...")
    # Zipping a large dataset can take a while; any failure is reported
    # rather than raised.
    try:
        shutil.make_archive(zip_filepath_in_colab, 'zip', output_directory_path)
        print(f"✅ Success! Your data is compressed and ready for download at {zip_filepath_in_colab}.zip")
        print("You can find it in the 'Files' panel on the left.")
    except Exception as zip_error:
        print(f"❌ An error occurred during zipping: {zip_error}")
