In [None]:
%%bash
# --- 1. Install and Start Ollama (High-Concurrency Mode) ---
# NOTE: the '%%bash' cell magic must be the very first line of the cell;
# IPython does not recognise it when comments or code come before it.
# Everything below this point is executed as a single shell script.

# 1. Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# 2. CONFIGURE FOR PARALLELISM: Set the number of parallel requests Ollama will handle.
# The generation cell uses the same number of worker threads (MAX_WORKERS = 20).
export OLLAMA_NUM_PARALLEL=20

# 3. Start the server in the background and log its output for debugging
ollama serve > ollama_server.log 2>&1 &

# 4. Wait 10 seconds to give the server time to initialize
sleep 10

# 5. Verify that the server process is running by checking for the 'ollama' process
if pgrep -x "ollama" > /dev/null
then
    echo "✅ Ollama server is running successfully in high-concurrency mode."
else
    echo "❌ Ollama server failed to start. Please check the log below:"
    cat ollama_server.log
fi


In [None]:
# --- 2. Install Python Client ---
!pip install -q ollama ipywidgets pandas tqdm


In [None]:
# --- 3. Mount Drive & Load Prompts ---
from google.colab import drive
import os
import random
# Location of the newline-delimited prompt file inside the mounted Drive.
PROMPT_FILE_PATH = "/content/drive/MyDrive/housebrain_prompts/platinum_prompts.txt"
# Attach Google Drive so the path above becomes readable.
drive.mount('/content/drive')
def load_prompts_from_file(filepath):
    """
    Read prompts from a UTF-8 text file, one prompt per line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with every non-blank line, stripped of surrounding whitespace.
        An empty list is returned (and an error is printed) when ``filepath``
        is not an existing regular file.
    """
    # A real newline: the previous "\\n" printed a literal backslash followed by 'n'.
    print(f"\nLoading prompts from {filepath}...")
    # isfile (not exists) so that a directory path is reported instead of crashing in open().
    if not os.path.isfile(filepath):
        print("❌ ERROR: Prompt file not found. Please check the path.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        prompts = [line.strip() for line in f if line.strip()]
    print(f"✅ Successfully loaded {len(prompts)} prompts.")
    return prompts
# The generation cell names every output folder "prompt_<index>_<hash>", where
# <index> is the prompt's position in ALL_PROMPTS. The shuffle therefore has to
# produce the same order in every session; otherwise the auto-resume check looks
# for different folder names on each run and regenerates finished prompts.
# Changing SHUFFLE_SEED (or the prompt file) changes the order.
SHUFFLE_SEED = 42
ALL_PROMPTS = load_prompts_from_file(PROMPT_FILE_PATH)
if ALL_PROMPTS:
    # A private Random instance keeps the global `random` state untouched.
    random.Random(SHUFFLE_SEED).shuffle(ALL_PROMPTS)
    print(f"✅ Prompts have been shuffled deterministically (seed={SHUFFLE_SEED}).")


In [None]:
# --- 4. Configure Run & Mount Google Drive ---
import ipywidgets as widgets
from IPython.display import display
import os
from google.colab import drive

# --- A. Mount Google Drive ---
# This is now a critical step. All data will be saved directly to your Drive.
# The prompt-loading cell (Cell 3) mounts the same '/content/drive' mount point;
# the call is repeated here, presumably so this cell can be run without Cell 3.
print("▶️ Mounting Google Drive...")
drive.mount('/content/drive')
# Printed unconditionally once drive.mount() has returned.
print("✅ Google Drive mounted successfully.")

# --- B. Select Model ---
# Model tags offered in the dropdown; the chosen value is read by the
# "Pull & Verify Model" cell via model_dropdown.value.
model_options = [
    "gpt-oss:120b",
    "gpt-oss:20b",
    "magistral:24b",
    "magistral:12b",
    "magistral:8b",
    "phi4-reasoning:latest",
    "phi4-reasoning:plus",
    "phi4-reasoning:14b",
    "phi4-reasoning:14b-plus-q4_K_M",
    "phi3:instruct",
    "llama3:instruct",
    "llama3.1:8b",
    "gemma3:27b",
    "qwen3:14b",
    "qwen2.5:72b",
    "qwen3:32b",
    "qwen2.5:32b",
]
model_dropdown = widgets.Dropdown(
    description='Select Model:',
    options=model_options,
    value='phi4-reasoning:latest',  # pre-selected entry
    style={'description_width': 'initial'},
    disabled=False,
)
display(model_dropdown)

# --- C. Configure Paths and Counts ---
# Outputs are written under BASE_OUTPUT_DIR/DATASET_TIER/<model>/<prompt_id>
# by the generation cell, i.e. directly inside the mounted Google Drive.
DATASET_TIER = "gold_tier"
NUM_PLANS_TO_GENERATE = 3500  # upper bound on how many prompts are considered
BASE_OUTPUT_DIR = "/content/drive/MyDrive/HouseBrain/generated_data"

# Make sure the root folder exists before anything tries to write into it.
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
print(f"✔️ Base output directory is set to: {BASE_OUTPUT_DIR}")



In [None]:
# --- 5. Pull & Verify Model ---
selected_model = model_dropdown.value
print(f"--- Preparing Model: {selected_model} ---")
print(f"Attempting to download via Ollama... (This may take several minutes)")
!ollama pull {selected_model}
print(f"\\n--- Verifying {selected_model} Installation ---")
!ollama list
print("\\n" + "="*50)
print("✅ Setup Complete!")
print("You are ready to run the data generation in the next cell.")
print("="*50)


In [None]:
# --- 6. Run Data Generation (Resilient & Parallelized) ---
import ollama
import json
import hashlib
import time
from tqdm.notebook import tqdm
import concurrent.futures
import os

# --- A. Set Parallelism Level ---
# We still use a high number of workers to keep the GPU busy.
# Passed as max_workers to the ThreadPoolExecutor in section C below. The value
# equals OLLAMA_NUM_PARALLEL=20 exported before `ollama serve` in Cell 1.
MAX_WORKERS = 20

# --- B. Generation Function (Now with Auto-Resume Logic) ---
def generate_and_save_raw_plan(args):
    """
    Generate one floor-plan JSON with Ollama and save it under ``base_dir``.

    The output goes to ``<base_dir>/<tier>/<model_name>/<prompt_id>/raw_output.json``
    where ``prompt_id`` is ``prompt_<task_id:05d>_<first 10 hex chars of the
    SHA-1 of the prompt>``. If that file already exists the task is skipped.

    Args:
        args: Tuple ``(task_id, prompt_content, model_name, base_dir, tier)``.
            A single tuple is used so the function can be passed to
            ``executor.map``.

    Returns:
        ``(prompt_id, True, "Skipped")`` when the output already exists,
        ``(prompt_id, True, final_output_path)`` after a successful save, or
        ``(prompt_id, False, error_message)`` when anything went wrong.
        Errors are reported through the return value, never raised, so one
        failing task cannot abort the whole batch.
    """
    task_id, prompt_content, model_name, base_dir, tier = args
    prompt_hash = hashlib.sha1(prompt_content.encode()).hexdigest()[:10]
    prompt_id = f"prompt_{task_id:05d}_{prompt_hash}"

    # Construct the final, persistent output path on Google Drive
    output_dir_for_prompt = os.path.join(base_dir, tier, model_name, prompt_id)
    final_output_path = os.path.join(output_dir_for_prompt, "raw_output.json")
    # Data is written here first and renamed once complete (see below).
    temp_output_path = final_output_path + ".tmp"

    # --- RESILIENCE CHECK ---
    # If the final output file already exists, we skip this task entirely.
    if os.path.exists(final_output_path):
        tqdm.write(f"🟢 [Task {task_id:05d}] SKIPPING (Already exists): {prompt_id}")
        return (prompt_id, True, "Skipped")

    # If not skipped, proceed with generation
    tqdm.write(f"⚪ [Task {task_id:05d}] STARTING: {prompt_id}")

    try:
        # Inside the try block so a Drive/filesystem error is reported as a
        # failed task instead of propagating out of executor.map.
        os.makedirs(output_dir_for_prompt, exist_ok=True, mode=0o777)

        structured_prompt = f"""
        Please act as an expert architect specializing in Indian residential and commercial design. Your task is to generate a detailed JSON representation of a floor plan based on the following request, keeping local building norms and Vastu principles in mind where appropriate.
        **Architectural Request:** "{prompt_content}"
        **Instructions:** Provide ONLY the JSON output.
        **JSON Schema:** {{ "levels": [ {{ "level_id": "ground_floor", "rooms": [], "openings": [] }} ] }}
        Now, begin.
        """
        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': structured_prompt}],
            format='json'
        )
        raw_output = response['message']['content']

        # An empty answer must not be stored as a finished plan, otherwise the
        # resume check would skip this prompt forever.
        if not raw_output or not raw_output.strip():
            raise ValueError("Model returned an empty response.")

        # --- PERSISTENT SAVE ---
        # Write to a temporary file and rename it afterwards: raw_output.json
        # only ever appears once it is complete, so an interrupted session
        # cannot leave a truncated file that later runs treat as done.
        with open(temp_output_path, 'w', encoding='utf-8') as f:
            f.write(raw_output)
        os.replace(temp_output_path, final_output_path)

        tqdm.write(f"✅ [Task {task_id:05d}] COMPLETED & SAVED: {prompt_id}")
        return (prompt_id, True, final_output_path)

    except Exception as e:
        error_message = f"An error occurred in task {task_id}: {str(e)}"

        # Best-effort cleanup of a partially written temporary file.
        try:
            if os.path.exists(temp_output_path):
                os.remove(temp_output_path)
        except OSError:
            pass

        error_log_path = os.path.join(output_dir_for_prompt, "error.log")
        try:
            with open(error_log_path, 'w', encoding='utf-8') as f:
                f.write(error_message)
            tqdm.write(f"❌ [Task {task_id:05d}] FAILED. Log saved to {error_log_path}")
        except OSError:
            # The output directory itself may be unavailable; still report the failure.
            tqdm.write(f"❌ [Task {task_id:05d}] FAILED (error log could not be written): {error_message}")
        return (prompt_id, False, error_message)

# --- C. Main Generation Loop (Now with Pre-computation) ---
# Builds the list of prompts whose raw_output.json is still missing, runs them
# through a thread pool, and prints a summary of successes and failures.
if 'ALL_PROMPTS' in locals() and ALL_PROMPTS:
    # --- Pre-computation: Find which prompts actually need to be generated ---
    print("\n--- Checking for existing data in Google Drive to prevent re-work ---")

    # Only the first NUM_PLANS_TO_GENERATE prompts (or all, if fewer) are considered.
    tasks_to_run = []
    total_prompts_to_consider = min(NUM_PLANS_TO_GENERATE, len(ALL_PROMPTS))

    for i in range(total_prompts_to_consider):
        prompt_content = ALL_PROMPTS[i]
        # Must mirror the path scheme used inside generate_and_save_raw_plan.
        prompt_hash = hashlib.sha1(prompt_content.encode()).hexdigest()[:10]
        prompt_id = f"prompt_{i:05d}_{prompt_hash}"
        output_path = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model, prompt_id, "raw_output.json")

        if not os.path.exists(output_path):
            tasks_to_run.append((i, prompt_content, selected_model, BASE_OUTPUT_DIR, DATASET_TIER))

    total_already_generated = total_prompts_to_consider - len(tasks_to_run)
    print(f"Targeting {NUM_PLANS_TO_GENERATE} total plans.")
    print(f"Found {total_already_generated} plans already completed in previous runs.")

    if not tasks_to_run:
        print("\n🎉 All requested plans have already been generated. Nothing to do!")
    else:
        print(f"▶️ Preparing to generate {len(tasks_to_run)} new plans...")
        print(f"🚀 Starting PARALLEL data generation using {MAX_WORKERS} workers...")

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # executor.map yields results in task order; list() drains it so the
            # progress bar advances as results become available.
            results = list(tqdm(executor.map(generate_and_save_raw_plan, tasks_to_run), total=len(tasks_to_run), desc=f"Generating with {selected_model}"))

        # Each result is a tuple (prompt_id, success_boolean, message).
        successful_generations = sum(1 for res in results if res[1] is True and res[2] != "Skipped")
        failed_generations = sum(1 for res in results if res[1] is False)

        print("\n" + "="*50)
        print("✅ Data Generation Run Complete!")
        print(f"Successfully generated {successful_generations} NEW raw plan files in this run.")
        if failed_generations:
            # Failed tasks leave no raw_output.json, so re-running this cell retries them.
            print(f"⚠️ {failed_generations} tasks failed. Re-run this cell to retry them.")
        final_output_dir = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model)
        print(f"All outputs are saved in your Google Drive at: '{final_output_dir}'")
        print("="*50)
else:
    print("\n🛑 HALTED. Please run the prompt loading cell (Cell 3) successfully first.")


In [None]:
# --- 7. (Optional) Package and Download Results ---
import shutil
import time
import os

# The data is already safe in Google Drive. This cell only builds a zip archive
# of the selected model's output folder inside the Colab runtime; downloading
# it is a manual step from the 'Files' panel.
output_directory_path = os.path.join(BASE_OUTPUT_DIR, DATASET_TIER, selected_model)
# Model tags contain ':' (e.g. "phi4-reasoning:latest"), which is not a legal
# filename character on Windows, so it is replaced in the archive name.
safe_model_name = selected_model.replace(":", "_").replace("/", "_")
zip_filename = f"{DATASET_TIER}_{safe_model_name}_raw_data_{int(time.time())}"
zip_filepath_in_colab = f"/content/{zip_filename}" # We'll create the zip in the local runtime for speed

print(f"Locating generated data in your Google Drive: {output_directory_path}...")

if os.path.isdir(output_directory_path):
    print(f"Compressing into '{zip_filename}.zip' in the Colab runtime...")
    # This might take a while if the dataset is very large
    try:
        # make_archive appends the '.zip' extension to base_name itself.
        shutil.make_archive(
            base_name=zip_filepath_in_colab,
            format='zip',
            root_dir=output_directory_path
        )
        print(f"✅ Success! Your data is compressed and ready for download at {zip_filepath_in_colab}.zip")
        print("You can find it in the 'Files' panel on the left.")
    except Exception as e:
        print(f"❌ An error occurred during zipping: {e}")
else:
    print(f"❌ Error: Could not find the output directory '{output_directory_path}'. Please ensure the generation step ran correctly.")

