In [0]:
%pip install xgboost


In [0]:
# =============================================================================
# Classification + Regression + MLflow Registry
# =============================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import joblib

# Load final processed data
df = pd.read_csv("./03_final_with_features_and_target.csv")

# === CREATE REGRESSION TARGET: Future_Price_5Y ===
# Synthetic target: today's price compounded annually for 5 years at a fixed
# per-city rate. Only the city drives the rate; cities missing from the table
# fall back to 7%.
growth_rate = {
    'Mumbai': 0.10, 'Delhi': 0.09, 'Bangalore': 0.09, 'Pune': 0.085,
    'Hyderabad': 0.08, 'Chennai': 0.07, 'Kolkata': 0.06
}
df['growth_rate'] = df['City'].map(growth_rate).fillna(0.07)
df['Future_Price_5Y'] = df['Price_in_Lakhs'] * (1 + df['growth_rate'])**5

print("Targets ready:")
print("Good_Investment distribution:", df['Good_Investment'].value_counts().to_dict())
print("Future_Price_5Y range: ₹", df['Future_Price_5Y'].min().round(1), "→", df['Future_Price_5Y'].max().round(1), "Lakhs")

# Features (use only numeric + encoded)
# Any column whose name contains '_encoded' is picked up, plus the explicit
# numeric columns listed below.
# NOTE(review): Future_Price_5Y is a deterministic function of Price_in_Lakhs
# and City; if Price_per_SqFt * Size_in_SqFt reproduces the price, the
# regression target is largely recoverable from the features — confirm this is
# intended.
feature_cols = [col for col in df.columns if '_encoded' in col or col in [
    'Size_in_SqFt', 'BHK', 'Age_of_Property', 'Price_per_SqFt',
    'School_Density_Score', 'Amenities_Count', 'Security_Score', 'Transport_Score',
    'Is_Tier1_City', 'Is_New_Property', 'Is_Ready_to_Move'
]]

X = df[feature_cols]
y_class = df['Good_Investment']
y_reg = df['Future_Price_5Y']

# Train-test split
# Features and BOTH targets are split in one call so each row keeps its own
# labels. A separate, unstratified split for y_reg would pick different rows
# than the stratified split (even with the same random_state), leaving the
# regression targets misaligned with X_train / X_test.
X_train, X_test, y_train_c, y_test_c, y_train_r, y_test_r = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42, stratify=y_class
)

# Fail fast if features and regression targets ever drift out of alignment.
assert X_train.index.equals(y_train_r.index), "train features/targets misaligned"
assert X_test.index.equals(y_test_r.index), "test features/targets misaligned"

# Scale features
# Fit on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler
joblib.dump(scaler, "./scaler.pkl")

# Set MLflow Experiment
# NOTE(review): the experiment path is user-specific; consider moving it to a
# configuration constant.
mlflow.set_experiment("/Users/rsangramofficial@gmail.com/real_estate_investment_advisor")

print("Starting MLflow Experiments...")


In [0]:
# === CLASSIFICATION MODELS ===
# Candidate classifiers, keyed by the name used for the MLflow run.
models_class = {
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "RandomForest_Classifier": RandomForestClassifier(n_estimators=300, max_depth=15, random_state=42),
    "XGBoost_Classifier": XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42)
}

# Running best by test-set AUC across the candidates.
best_auc = 0
best_model_name = ""

for name, model in models_class.items():
    with mlflow.start_run(run_name=name):
        # Train on the scaled training rows, then score the held-out rows.
        model.fit(X_train_scaled, y_train_c)
        preds = model.predict(X_test_scaled)
        # Second probability column is used as the score for ROC AUC.
        proba = model.predict_proba(X_test_scaled)[:, 1]

        # Metrics
        acc = accuracy_score(y_test_c, preds)
        prec = precision_score(y_test_c, preds)
        rec = recall_score(y_test_c, preds)
        auc = roc_auc_score(y_test_c, proba)
        f1 = f1_score(y_test_c, preds)

        # Log to MLflow
        for metric_name, metric_value in (
            ("accuracy", acc),
            ("precision", prec),
            ("recall", rec),
            ("auc", auc),
            ("f1", f1),
        ):
            mlflow.log_metric(metric_name, metric_value)
        mlflow.log_param("model", name)

        # XGBoost models go through the xgboost flavor; everything else is
        # logged with the sklearn flavor.
        flavor = mlflow.xgboost if "XGBoost" in name else mlflow.sklearn
        flavor.log_model(model, "model")

        # Track the best-so-far model by AUC: persist it locally and log an
        # extra "best_classifier" artifact to the current run. This fires for
        # every run that beats the previous ones, so earlier runs may also
        # carry that artifact.
        if auc > best_auc:
            best_auc, best_model_name = auc, name
            joblib.dump(model, "./best_classifier.pkl")
            mlflow.sklearn.log_model(model, "best_classifier")

        print(f"{name} → AUC: {auc:.4f} | Acc: {acc:.4f}")

# === REGRESSION MODELS ===
# Candidate regressors, keyed by the name used for the MLflow run.
models_reg = {
    "Linear_Regression": LinearRegression(),
    "RandomForest_Regressor": RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42),
    "XGBoost_Regressor": XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42)
}

# R² has no lower bound, so start from -inf rather than a magic sentinel;
# otherwise a run where every model scores below the sentinel would select
# (and save) nothing.
best_r2 = float("-inf")
best_reg_name = ""

for name, model in models_reg.items():
    with mlflow.start_run(run_name=name + "_reg"):
        # Train on the scaled training rows, then score the held-out rows.
        model.fit(X_train_scaled, y_train_r)
        preds = model.predict(X_test_scaled)

        rmse = np.sqrt(mean_squared_error(y_test_r, preds))
        mae = mean_absolute_error(y_test_r, preds)
        r2 = r2_score(y_test_r, preds)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.log_param("model", name)

        # XGBoost models go through the xgboost flavor; everything else is
        # logged with the sklearn flavor.
        flavor = mlflow.xgboost if "XGBoost" in name else mlflow.sklearn
        flavor.log_model(model, "model")

        # Keep the best-so-far regressor (by test-set R²) on local disk.
        if r2 > best_r2:
            best_r2 = r2
            best_reg_name = name
            joblib.dump(model, "./best_regressor.pkl")

        print(f"{name} → R²: {r2:.4f} | RMSE: {rmse:.2f}")

# Final summary of both model families.
print(f"\nBEST CLASSIFIER: {best_model_name} (AUC = {best_auc:.4f})")
print(f"BEST REGRESSOR: {best_reg_name} (R² = {best_r2:.4f})")