In [None]:
%%bash
# --- 1. Install and Start Ollama ---
# This is the most critical step. We must install the Ollama service in the Colab environment.
# NOTE: the '%%bash' cell magic has to be the very first line of the cell; it runs
# everything below it (these comments included) as a single shell script.

# 1. Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# 2. Start the server in the background and log its output for debugging
ollama serve > ollama_server.log 2>&1 &

# 3. Wait (up to 30 seconds) until the server answers on its default address,
#    instead of sleeping for a fixed time that may be too short or needlessly long.
for attempt in $(seq 1 30); do
    if curl -fsS http://127.0.0.1:11434/ > /dev/null 2>&1; then
        break
    fi
    sleep 1
done

# 4. Verify that the server process is running by checking for the 'ollama' process
if pgrep -x "ollama" > /dev/null
then
    echo "✅ Ollama server is running successfully."
else
    echo "❌ Ollama server failed to start. Please check the log below:"
    cat ollama_server.log
    # Fail the cell so a "Run all" does not continue without a server.
    exit 1
fi


In [None]:
# --- 2. Install Python Client ---
# With the server running, now we install the Python library that lets our code talk to it.
!pip install -q ollama ipywidgets pandas tqdm


In [None]:
# --- 3. Mount Drive & Load Prompts ---
from google.colab import drive
import os
import random

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount("/content/drive")

# --- Location of the prompt file ---
# IMPORTANT: adjust this path if your Drive folder layout differs.
PROMPT_FILE_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt"

# --- Load Prompts ---
def load_prompts_from_file(filepath):
    """Load prompts from a UTF-8 text file, one prompt per line.

    Each line is stripped of surrounding whitespace and blank lines are skipped.
    A leading UTF-8 byte-order mark, if present, is discarded.

    Args:
        filepath: Path to the prompt file.

    Returns:
        list[str]: The non-empty prompts in file order, or an empty list when
        ``filepath`` is not an existing regular file.
    """
    print(f"\nLoading prompts from {filepath}...")
    # isfile() rather than exists(): a directory path must be reported as
    # "not found" instead of crashing inside open().
    if not os.path.isfile(filepath):
        print("❌ ERROR: Prompt file not found. Please check the path.")
        return []
    # 'utf-8-sig' reads plain UTF-8 as usual but drops a BOM, which would
    # otherwise stay glued to the first prompt (strip() does not remove it).
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        prompts = [text for text in (line.strip() for line in f) if text]
    print(f"✅ Successfully loaded {len(prompts)} prompts.")
    return prompts

ALL_PROMPTS = load_prompts_from_file(PROMPT_FILE_PATH)

# --- SHUFFLE PROMPTS ---
# Randomise the prompt order in place so that each run (including runs started
# in parallel) walks through the prompts in a different sequence.
if len(ALL_PROMPTS) > 0:
    random.shuffle(ALL_PROMPTS)
    print("✅ Prompts have been successfully shuffled for this run.")


In [None]:
# --- 4. Configure Run & Pull Model ---
import ipywidgets as widgets
from IPython.display import display
import os

# --- A. Select Model ---
# Ollama model tags offered in the dropdown; the first entry is pre-selected.
model_options = [
    "phi4-reasoning:latest",
    "phi4-reasoning:plus",
    "phi4-reasoning:14b",
    "phi4-reasoning:14b-plus-q4_K_M",
    "phi3:instruct",
    "llama3:instruct",
]
model_dropdown = widgets.Dropdown(
    options=model_options,
    value=model_options[0],
    description='Select Model:',
    disabled=False,
    style={'description_width': 'initial'},
)
display(model_dropdown)

# --- B. Tier & Number of Samples ---
DATASET_TIER = "gold_tier"            # sub-folder name for this dataset tier
NUM_PLANS_TO_GENERATE = 15_000        # number of generation attempts in one run
BASE_OUTPUT_DIR = "raw_generated_data"

# Outputs for this tier go to <BASE_OUTPUT_DIR>/<DATASET_TIER>; create it up front.
TIER_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER)
os.makedirs(TIER_OUTPUT_DIR, exist_ok=True)


In [None]:
# --- 5. Pull & Verify Model ---
# This cell takes the model you selected above and downloads it.

selected_model = model_dropdown.value
print(f"--- Preparing Model: {selected_model} ---")
print(f"Attempting to download via Ollama... (This may take several minutes)")

# Pull the model. The ! allows us to run the shell command.
!ollama pull {selected_model}

print(f"\\n--- Verifying {selected_model} Installation ---")
!ollama list

print("\\n" + "="*50)
print("✅ Setup Complete!")
print("You are ready to run the data generation in the next cell.")
print("="*50)


In [None]:
# --- 6. Run Data Generation ---
import ollama
import json
import hashlib
import time
from tqdm.notebook import tqdm
import random

def generate_and_save_raw_plan(prompt, model_name, tier_dir):
    """Generate one floor plan with Ollama and save the raw response to disk.

    A per-call directory ``<tier_dir>/<model_name>/prompt_<hash>_<timestamp>``
    is created first. On success the model's reply is written verbatim to
    ``raw_output.json`` in that directory; if anything raises, the error text
    is written to ``error.log`` there instead.

    Args:
        prompt: The architectural request inserted into the prompt template.
        model_name: Ollama model tag passed to ``ollama.chat``; also used as a
            directory name.
        tier_dir: Base output directory for the current dataset tier.

    Returns:
        bool: True if the response was saved, False if an exception occurred.
    """
    # First 10 hex digits of the prompt's MD5 plus a second-resolution
    # timestamp form the directory name for this attempt.
    prompt_hash = hashlib.md5(prompt.encode('utf-8')).hexdigest()[:10]
    timestamp = int(time.time())
    unique_id = f"prompt_{prompt_hash}_{timestamp}"
    run_output_dir = os.path.join(tier_dir, model_name, unique_id)
    os.makedirs(run_output_dir, exist_ok=True)
    output_filename = os.path.join(run_output_dir, "raw_output.json")

    try:
        structured_prompt = f"""
        Please act as an expert architect specializing in Indian residential and commercial design. Your task is to generate a detailed JSON representation of a floor plan based on the following request, keeping local building norms and Vastu principles in mind where appropriate.
        **Architectural Request:** "{prompt}"
        **Instructions:** Provide ONLY the JSON output.
        **JSON Schema:** {{ "levels": [ {{ "level_id": "ground_floor", "rooms": [], "openings": [] }} ] }}
        Now, begin.
        """
        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': structured_prompt}],
            format='json'
        )
        raw_output = response['message']['content']
        # Explicit UTF-8: the reply may contain non-ASCII text, and the
        # platform default encoding is not guaranteed to be able to encode it.
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(raw_output)
        return True
    except Exception as e:
        # Record the failure next to where the output would have been, so a
        # long batch run keeps going and failures can be inspected afterwards.
        error_message = f"An error occurred: {str(e)}"
        with open(os.path.join(run_output_dir, "error.log"), 'w', encoding='utf-8') as f:
            f.write(error_message)
        return False

# --- Main Generation Loop ---
# Runs only when the prompt-loading cell has defined a non-empty ALL_PROMPTS list.
if 'ALL_PROMPTS' in globals() and ALL_PROMPTS:
    print(f"\n🚀 Starting data generation for {NUM_PLANS_TO_GENERATE} samples...")
    successful_generations = 0
    for i in tqdm(range(NUM_PLANS_TO_GENERATE), desc=f"Generating with {selected_model}"):
        # Cycle through the shuffled list of prompts; the modulo wraps around
        # when more samples are requested than there are prompts.
        current_prompt = ALL_PROMPTS[i % len(ALL_PROMPTS)]
        if generate_and_save_raw_plan(current_prompt, selected_model, TIER_OUTPUT_DIR):
            successful_generations += 1
    print("\n" + "="*50)
    print("✅ Gold Tier Data Generation Complete!")
    print(f"Successfully generated {successful_generations} / {NUM_PLANS_TO_GENERATE} raw plan files.")
    print(f"All outputs are saved in: '{TIER_OUTPUT_DIR}/{selected_model}'")
    print("="*50)
else:
    print("\n🛑 HALTED. Please run the prompt loading cell (Cell 3) successfully first.")


In [None]:
# --- 7. Package and Download Results ---
import shutil
import time

# Bundle the files produced for the selected model into one timestamped zip
# archive under /content.
output_directory_path = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model)
zip_filename = f"{DATASET_TIER}_{selected_model}_raw_data_{int(time.time())}"
zip_filepath = f"/content/{zip_filename}"

print(f"Locating generated data in: {output_directory_path}...")

# Nothing to package unless the generation cell created the model's output folder.
if not os.path.isdir(output_directory_path):
    print(f"❌ Error: Could not find the output directory '{output_directory_path}'.")
else:
    print(f"Compressing into '{zip_filename}.zip'...")
    # make_archive appends the '.zip' extension to the base path itself.
    shutil.make_archive(zip_filepath, 'zip', output_directory_path)
    print(f"✅ Success! Your data is compressed and ready at {zip_filepath}.zip")
