# In [None]:
# ==========================================
# PyCaret - Automated ML Comparison
# ==========================================
# Team: Error 400
# Purpose: Use PyCaret to automatically compare multiple models
# ==========================================

import pandas as pd
import numpy as np
from pathlib import Path
from pycaret.regression import *

# Reaching this line means every import above resolved.
print("‚úÖ Libraries loaded")

# ==========================================
# 1. Load Clean Data
# ==========================================
# Read the cleaned parquet file (path is relative to the working directory),
# then report the row count and the span of the 'year' column.
print("\nüìÇ Loading clean data...")
clean_data_path = "data/cleaned/housing_clean.parquet"
df = pd.read_parquet(clean_data_path)

record_count = len(df)
print(f"‚úÖ Loaded {record_count:,} records")

first_year = df['year'].min()
last_year = df['year'].max()
print(f"Date range: {first_year} - {last_year}")

# ==========================================
# 2. Prepare Data for PyCaret
# ==========================================
print("\nüîß Preparing data...")


def _bucket_rare_categories(values, top_n, other_label='OTHER'):
    """Keep the ``top_n`` most frequent labels of a column, relabel the rest.

    Args:
        values: pandas Series of category labels.
        top_n: number of most frequent labels to keep unchanged.
        other_label: replacement for every value outside the top ``top_n``.

    Returns:
        A ``(kept_labels, bucketed)`` tuple: the Index of kept labels (most
        frequent first) and an object-dtype Series aligned with ``values``.
    """
    kept_labels = values.value_counts().head(top_n).index
    # Work on an object-dtype copy so the replacement label can always be
    # written, whatever dtype the column was loaded with.
    as_object = values.astype(object)
    # Vectorised membership test + mask instead of a Python lambda per row.
    # Missing values are never among the kept labels (value_counts drops
    # them), so they are relabelled as well.
    is_kept = as_object.isin(kept_labels)
    return kept_labels, as_object.where(is_kept, other_label)


# Cap the cardinality of the location columns: keep the 30 most frequent
# towns and the 50 most frequent districts, and fold everything else into a
# single 'OTHER' label.
top_towns, town_encoded = _bucket_rare_categories(df['town_city'], 30)
df['town_encoded'] = town_encoded

top_districts, district_encoded = _bucket_rare_categories(df['district'], 50)
df['district_encoded'] = district_encoded

# Columns handed to PyCaret. 'price' is the prediction target; every other
# entry is a predictor.
features = [
    'price',  # TARGET
    'property_type',
    'is_new_build',
    'tenure_type',
    'county',
    'district_encoded',
    'town_encoded',
    'year',
    'month',
    'quarter',
]

# Copy so that later steps never write back into the full frame.
df_model = df[features].copy()
print(f"‚úÖ Data prepared: {df_model.shape}")

# ==========================================
# 3. Setup PyCaret
# ==========================================
print("\n‚öôÔ∏è Setting up PyCaret environment...")

# All experiment options gathered in one place, then unpacked into setup().
_setup_options = {
    "data": df_model,
    "target": "price",   # column to predict
    "train_size": 0.8,   # share of the rows used for training
    "session_id": 42,    # fixed id so repeated runs are comparable
    "n_jobs": -1,
    "verbose": False,
}
exp = setup(**_setup_options)

print("‚úÖ PyCaret setup complete!")

# ==========================================
# 4. Compare Models
# ==========================================
print("\nüîç Comparing models (this will take a few minutes)...")

# Rank the candidate models by R2 and keep the five best.
best_models = compare_models(sort="R2", n_select=5)

# One call, two lines of output: same text as two consecutive prints.
print(
    "\n‚úÖ Model comparison complete!",
    "Top 5 models shown above ‚òùÔ∏è",
    sep="\n",
)