In [None]:
%%bash
# --- 1. Install and Start Ollama ---
# NOTE: the %%bash cell magic has to be the very first line of the cell;
# any line above it (even a comment) stops the cell from running as bash.

# Download and run the official Ollama install script.
curl -fsSL https://ollama.com/install.sh | sh

# Start the server in the background; stdout and stderr go to ollama_server.log.
ollama serve > ollama_server.log 2>&1 &

# Give the server a few seconds to come up before checking for the process.
sleep 10
if pgrep -x "ollama" > /dev/null; then
    echo "✅ Ollama server is running."
else
    echo "❌ Ollama server failed to start."
    # Show the server log so the failure reason is visible in the cell output.
    cat ollama_server.log
fi


In [None]:
# --- 2. Install Python Client ---
!pip install -q ollama ipywidgets pandas tqdm


In [None]:
# --- 3. Mount Drive & Load Prompts ---
from google.colab import drive
import os
import random
# Mount Google Drive at /content/drive so the prompt file below can be read.
drive.mount('/content/drive')
# Text file holding the prompts, one per line, inside the mounted Drive.
PROMPT_FILE_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt"
def load_prompts_from_file(filepath):
    """Load prompts from a UTF-8 text file, one prompt per line.

    Args:
        filepath: Path of the prompt file to read.

    Returns:
        A list with every non-blank line of the file, stripped of leading and
        trailing whitespace, in file order. An empty list is returned (and an
        error message printed) when the path does not exist.
    """
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print(f"\nLoading prompts from {filepath}...")
    if not os.path.exists(filepath):
        print("❌ ERROR: Prompt file not found. Please check the path.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines so they never become empty prompts.
        prompts = [line.strip() for line in f if line.strip()]
    print(f"✅ Successfully loaded {len(prompts)} prompts.")
    return prompts
# Read the prompt list once; load_prompts_from_file returns an empty list when
# the file is missing or contains no non-blank lines.
ALL_PROMPTS = load_prompts_from_file(PROMPT_FILE_PATH)
if len(ALL_PROMPTS) > 0:
    # Shuffle in place so each run walks the prompts in a different order.
    random.shuffle(ALL_PROMPTS)
    print("✅ Prompts have been successfully shuffled for this run.")


In [None]:
# --- 4. Configure Run ---
import ipywidgets as widgets
from IPython.display import display
import os
# Ollama model tags offered in the selector; the first entry is the default.
model_options = [
    "phi4-reasoning:latest",
    "phi4-reasoning:plus",
    "phi4-reasoning:14b",
    "phi4-reasoning:14b-plus-q4_K_M",
    "phi3:instruct",
    "llama3:instruct",
]

# Dropdown whose current value is read later via `model_dropdown.value`.
model_dropdown = widgets.Dropdown(
    description='Select Model:',
    options=model_options,
    value=model_options[0],
    style={'description_width': 'initial'},
    disabled=False,
)
display(model_dropdown)
# Run configuration: how many plans to generate and where the raw outputs go.
NUM_PLANS_TO_GENERATE = 15_000
BASE_OUTPUT_DIR = "raw_generated_data"
DATASET_TIER = "gold_tier"

# <BASE_OUTPUT_DIR>/<DATASET_TIER>, created up front (no error if it already exists).
TIER_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER)
os.makedirs(TIER_OUTPUT_DIR, exist_ok=True)


In [None]:
# --- 5. Pull & Verify Model ---
selected_model = model_dropdown.value
print(f"--- Preparing Model: {selected_model} ---")
print(f"Attempting to download via Ollama... (This may take several minutes)")
!ollama pull {selected_model}
print(f"\\n--- Verifying {selected_model} Installation ---")
!ollama list
print("\\n" + "="*50)
print("✅ Setup Complete!")
print("You are ready to run the data generation in the next cell.")
print("="*50)


In [None]:
# --- 6. Run Data Generation (Parallelized for Max GPU Utilization) ---
import ollama
import json
import hashlib
import time
from tqdm.notebook import tqdm
import random
import concurrent.futures

# --- A. Set Parallelism Level ---
# Size of the thread pool used below, i.e. the maximum number of requests this
# notebook sends to the Ollama server at the same time.
# For an A100 with 40GB VRAM, a value between 32 and 96 is a good range.
# We will start with an aggressive value of 64.
# MONITOR your "GPU RAM" in the Colab resource panel. If it gets too close to the limit,
# stop this cell, lower the number, and run it again.
# NOTE(review): the Ollama server presumably applies its own limit on requests
# processed in parallel (OLLAMA_NUM_PARALLEL) — confirm that the server was
# started with a setting that matches this value.
MAX_WORKERS = 64

# --- B. Generation Function (Modified for Parallelism & Logging) ---
def generate_and_save_raw_plan(task_tuple):
    """
    Generate one floor plan with Ollama and save the raw response to disk.

    Each call creates its own directory
    ``<tier_dir>/<model_name>/prompt_<index>_<md5 prefix>_<unix time>/`` and
    writes ``raw_output.json`` there on success or ``error.log`` on failure.

    Args:
        task_tuple: ``(index, prompt, model_name, tier_dir)`` where ``index`` is
            the integer task number, ``prompt`` the request text, ``model_name``
            the Ollama model tag and ``tier_dir`` the base output directory.

    Returns:
        True if the model response was written to ``raw_output.json``; False if
        any exception was raised while generating or saving the response.
    """
    index, prompt, model_name, tier_dir = task_tuple

    # tqdm.write is used for logging so messages do not corrupt the progress bar.
    tqdm.write(f"[Task {index:05d}] STARTING...")

    # Index + prompt hash + timestamp give every task its own output directory.
    prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:10]
    timestamp = int(time.time())
    unique_id = f"prompt_{index:05d}_{prompt_hash}_{timestamp}"
    run_output_dir = os.path.join(tier_dir, model_name, unique_id)
    os.makedirs(run_output_dir, exist_ok=True)
    output_filename = os.path.join(run_output_dir, "raw_output.json")

    try:
        structured_prompt = f"""
        Please act as an expert architect specializing in Indian residential and commercial design. Your task is to generate a detailed JSON representation of a floor plan based on the following request, keeping local building norms and Vastu principles in mind where appropriate.
        **Architectural Request:** "{prompt}"
        **Instructions:** Provide ONLY the JSON output.
        **JSON Schema:** {{ "levels": [ {{ "level_id": "ground_floor", "rooms": [], "openings": [] }} ] }}
        Now, begin.
        """
        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': structured_prompt}],
            format='json'
        )
        raw_output = response['message']['content']
        # Explicit UTF-8: the model output may contain non-ASCII text and must
        # not depend on the platform's default encoding.
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(raw_output)

        tqdm.write(f"[Task {index:05d}] COMPLETED SUCCESSFULLY.")
        return True
    except Exception as e:
        # Local import: only needed on the failure path.
        import traceback

        error_message = f"An error occurred: {str(e)}"
        try:
            with open(os.path.join(run_output_dir, "error.log"), 'w', encoding='utf-8') as f:
                # Keep the original one-line summary first, then the full
                # traceback so the exception type and origin are not lost.
                f.write(error_message + "\n\n" + traceback.format_exc())
        except OSError as log_error:
            # A failure to write the log must not escape the worker, otherwise
            # it would abort collection of all remaining results.
            tqdm.write(f"[Task {index:05d}] Could not write error.log: {log_error}")

        tqdm.write(f"[Task {index:05d}] FAILED. See error.log for details.")
        return False

# --- C. Main Generation Loop (Parallelized) ---
# Only run when the prompt-loading cell has defined a non-empty ALL_PROMPTS.
if 'ALL_PROMPTS' in locals() and ALL_PROMPTS:
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print(f"\n🚀 Starting PARALLEL data generation for {NUM_PLANS_TO_GENERATE} samples using {MAX_WORKERS} workers...")

    # One task per requested plan; prompts are reused cyclically when more
    # plans are requested than there are prompts.
    tasks = [
        (i, ALL_PROMPTS[i % len(ALL_PROMPTS)], selected_model, TIER_OUTPUT_DIR)
        for i in range(NUM_PLANS_TO_GENERATE)
    ]

    # executor.map yields results in task order; wrapping it in tqdm advances
    # the progress bar as results are consumed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results = list(tqdm(executor.map(generate_and_save_raw_plan, tasks), total=len(tasks), desc=f"Generating with {selected_model}"))

    # Each worker returns True on success and False on failure.
    successful_generations = sum(1 for res in results if res is True)

    print("\n" + "="*50)
    print("✅ Gold Tier Data Generation Complete!")
    print(f"Successfully generated {successful_generations} / {NUM_PLANS_TO_GENERATE} raw plan files.")
    print(f"All outputs are saved in: '{TIER_OUTPUT_DIR}/{selected_model}'")
    print("="*50)
else:
    print("\n🛑 HALTED. Please run the prompt loading cell (Cell 3) successfully first.")


In [None]:
# --- 7. Package and Download Results ---
import shutil
import time
# Directory that holds the generated outputs for the selected model.
output_directory_path = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model)
# Model tags contain ':' (e.g. "phi3:instruct"), which is not a valid file name
# character on Windows; replace it (and any '/') in the archive name only, so
# the archive can be downloaded or synced anywhere. The source directory keeps
# its original name.
safe_model_name = selected_model.replace(":", "_").replace("/", "_")
zip_filename = f"{DATASET_TIER}_{safe_model_name}_raw_data_{int(time.time())}"
zip_filepath = f"/content/{zip_filename}"
print(f"Locating generated data in: {output_directory_path}...")
if os.path.isdir(output_directory_path):
    print(f"Compressing into '{zip_filename}.zip'...")
    # make_archive appends ".zip" to the base name and archives the contents of
    # output_directory_path.
    shutil.make_archive(zip_filepath, 'zip', output_directory_path)
    print(f"✅ Success! Your data is compressed and ready at {zip_filepath}.zip")
else:
    print(f"❌ Error: Could not find the output directory '{output_directory_path}'.")
