In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib

In [2]:
# Load dataset
df = pd.read_csv("houseprice_ml.csv")

# --- Binary columns ---
binary_cols = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'parking',
    'prefarea'
]

# --- Numeric and categorical ---
numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories']
categorical_features = ['furnishingstatus', 'price_category']  # include price_category

# --- Step 1: Split before making price_category ---
X = df.drop(columns=['price'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Step 2: Create price_category ---
Q1 = y_train.quantile(0.25)
Q3 = y_train.quantile(0.75)

def categorize_price(p):
    if p < Q1:
        return 'Low Pricing'
    elif p < Q3:
        return 'Medium Pricing'
    else:
        return 'High Pricing'

X_train = X_train.copy()
X_test = X_test.copy()

X_train['price_category'] = y_train.apply(categorize_price)
X_test['price_category'] = y_test.apply(categorize_price)

# --- Step 3: Preprocessing ---
# Define a named function instead of lambda
def apply_binary_mapper(df):
    def binary_mapper(x):
        return x.map({'yes': 1, 'no': 0, 'Required': 1, 'Not Required': 0}).fillna(x)
    return df.apply(binary_mapper)

binary_transformer = FunctionTransformer(apply_binary_mapper)

# One-hot encode furnishingstatus and price_category
categorical_transformer = OneHotEncoder(drop="first", handle_unknown="ignore")

# Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("bin", binary_transformer, binary_cols),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# --- Step 4: Pipeline with Random Forest ---
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=2,
        random_state=42
    ))
])

# --- Step 5: Train pipeline ---
rf_pipeline.fit(X_train, y_train)

# --- Step 6: Evaluate ---
def evaluate_model(y_true, y_pred, dataset_name):
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    print(f"{dataset_name} → R²: {r2:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")

y_train_pred = rf_pipeline.predict(X_train)
y_test_pred = rf_pipeline.predict(X_test)

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")

# --- Step 7: Save pipeline ---
joblib.dump(rf_pipeline, "house_price_rf_pipeline.pkl")
print("✅ Pipeline saved as house_price_rf_pipeline.pkl")

Train → R²: 0.9518, MSE: 148543333897.1914, RMSE: 385413.1989
Test → R²: 0.8177, MSE: 921250923508.8689, RMSE: 959818.1721
✅ Pipeline saved as house_price_rf_pipeline.pkl
