
# 🏙️ Apartment Price Prediction — End-to-End Machine Learning Project

This notebook showcases a **complete machine learning pipeline** for predicting apartment prices.  
It allows the user to interactively select one of three models to train and evaluate:

1. **Linear Regression**
2. **HistGradientBoostingRegressor**
3. **Neural Network (Keras)**

---

### ✨ Features
- User-driven model selection (`input()` prompt)
- Automatic preprocessing (missing values, encoding, scaling)
- Evaluation metrics: RMSE, MAE, R², Accuracy (within ±10%)
- Model visualizations (Predicted vs Actual, Residuals, Error Distribution)
- Neural network training visualization (loss & MAE curves)

---

### ⚙️ Requirements
```bash
pip install numpy pandas matplotlib scikit-learn tensorflow
```


In [None]:

import os
import math
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# Try importing TensorFlow
_keras_ok = True
try:
    from tensorflow import keras
    from tensorflow.keras import layers
except Exception:
    _keras_ok = False


# ========== Configuration ==========
# Module-level settings read by the helper functions and the main workflow below.

# CSV file handed to load_dataset(); update if the data lives elsewhere.
DATA_PATH = "apartment_prices.csv"   # Update if needed
# Passed as random_state to the train/test split and the gradient-boosting model.
SEED = 42
# Fraction of the rows held out for evaluation by train_test_split.
TEST_SIZE = 0.2
# Relative-error tolerance for the "accuracy" metric (0.10 == within ±10%).
ACCURACY_TOL_PCT = 0.10
# Upper bound on training epochs for the neural network.
NN_EPOCHS = 200
# Mini-batch size passed to model.fit for the neural network.
NN_BATCH_SIZE = 64
# Value passed as validation_split to model.fit for the neural network.
NN_VAL_SPLIT = 0.15
# Value passed as patience to the EarlyStopping callback.
NN_PATIENCE = 15


# ========== Helper Functions ==========
def load_dataset(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file on disk.

    Returns:
        The parsed DataFrame.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"Dataset not found at: {path}")

def infer_target_column(df):
    """Choose the column to use as the regression target.

    A column whose name equals ``"price"`` ignoring case is preferred. If
    there is none, the last numeric column of ``df`` is used instead.

    Args:
        df: DataFrame holding both the features and the target.

    Returns:
        The label of the selected column, exactly as it appears in
        ``df.columns``.

    Raises:
        ValueError: If there is no "price" column and ``df`` has no numeric
            columns to fall back on.
    """
    # str() keeps the lookup working for non-string labels, e.g. the integer
    # column names pandas assigns when a CSV is read without a header.
    cols_lower = {str(c).lower(): c for c in df.columns}
    if "price" in cols_lower:
        return cols_lower["price"]

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        raise ValueError(
            "Cannot infer target column: no 'price' column and no numeric columns found"
        )
    return num_cols[-1]

def split_features_target(df, target_col):
    """Separate ``df`` into a feature frame and a target array.

    Args:
        df: DataFrame containing the features and the target column.
        target_col: Label of the target column in ``df``.

    Returns:
        A tuple ``(X, y, num_cols, cat_cols)``: the feature frame without the
        target column, the target values, the names of the numeric feature
        columns, and the names of the object/category feature columns.
    """
    features = df.drop(columns=[target_col])

    numeric_names = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_names = features.select_dtypes(
        include=["object", "category"]
    ).columns.tolist()

    target_values = df[target_col].values
    return features, target_values, numeric_names, categorical_names

def build_preprocessor(num_cols, cat_cols):
    """Assemble the (unfitted) feature-preprocessing transformer.

    Numeric columns are median-imputed and then standardized; categorical
    columns are imputed with the most frequent value and then one-hot encoded
    into a dense array, ignoring categories not seen during fitting.

    Args:
        num_cols: Names of the numeric feature columns.
        cat_cols: Names of the categorical feature columns.

    Returns:
        A ``ColumnTransformer`` with a "num" and a "cat" branch.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
    transformers = [
        ("num", Pipeline(numeric_steps), num_cols),
        ("cat", Pipeline(categorical_steps), cat_cols),
    ]
    return ColumnTransformer(transformers)

def regression_accuracy(y_true, y_pred, tol_pct=0.10):
    """Return the fraction of predictions within a relative tolerance.

    A prediction counts as a hit when ``|y_pred - y_true| / |y_true|`` is at
    most ``tol_pct``.

    Args:
        y_true: Actual target values (array-like; flattened to 1-D).
        y_pred: Predicted values (array-like; flattened to 1-D).
        tol_pct: Maximum allowed relative error, as a fraction (0.10 == 10%).

    Returns:
        The hit rate as a float in ``[0, 1]``, or NaN when there are no
        samples.
    """
    # Flatten both inputs: plain lists would fail on subtraction, and an
    # (n, 1) prediction column would broadcast against an (n,) target into an
    # (n, n) matrix and silently produce a wrong score.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Floor the denominator so zero-valued targets do not divide by zero.
    denom = np.maximum(np.abs(actual), 1e-8)
    rel_err = np.abs(predicted - actual) / denom
    if rel_err.size == 0:
        return float("nan")
    return float(np.mean(rel_err <= tol_pct))

def evaluate(y_true, y_pred, label="Model"):
    """Compute, print and return the regression metrics for one model.

    Args:
        y_true: Actual target values.
        y_pred: Predicted values.
        label: Model name used in the printed header and the returned dict.

    Returns:
        A dict with the keys "Model", "RMSE", "MAE", "R2" and "Acc", where
        "Acc" is the fraction of predictions within ``ACCURACY_TOL_PCT``
        relative error.
    """
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    acc = regression_accuracy(y_true, y_pred, ACCURACY_TOL_PCT)

    # Format instead of int(): int() truncates float noise (0.29 * 100 is
    # 28.999999999999996, which would be labelled "28%") and drops fractional
    # tolerances such as 12.5%.
    tol_label = f"{ACCURACY_TOL_PCT * 100:g}"

    print(f"\n=== {label} Evaluation ===")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE : {mae:.4f}")
    print(f"R²  : {r2:.4f}")
    print(f"Accuracy (±{tol_label}%): {acc*100:.2f}%")
    return {"Model": label, "RMSE": rmse, "MAE": mae, "R2": r2, "Acc": acc}

def plot_results(y_true, y_pred, title="Model Results"):
    """Show three diagnostic figures for a fitted regression model.

    The figures are, in order: predicted vs. actual values with the identity
    line, residuals vs. predicted values, and a histogram of the residuals.
    Residuals are computed as ``y_pred - y_true``.

    Args:
        y_true: Actual target values (array-like; flattened to 1-D).
        y_pred: Predicted values (array-like; flattened to 1-D).
        title: Model name appended to each figure title.
    """
    # Flatten to 1-D float arrays so lists and (n, 1) prediction columns work:
    # ``.min()``/``.max()`` need arrays, and an (n, 1) - (n,) subtraction would
    # broadcast into an (n, n) residual matrix.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    residuals = predicted - actual

    # Predicted vs. actual, with the y = x reference line spanning both ranges.
    plt.figure(figsize=(6,5))
    plt.scatter(actual, predicted, alpha=0.6)
    lims = [min(actual.min(), predicted.min()), max(actual.max(), predicted.max())]
    plt.plot(lims, lims, 'r--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(f"Predicted vs Actual - {title}")
    plt.show()

    # Residuals against predictions, with a zero-error reference line.
    plt.figure(figsize=(6,5))
    plt.scatter(predicted, residuals, alpha=0.6)
    plt.axhline(0, color='r', linestyle='--')
    plt.xlabel("Predicted")
    plt.ylabel("Residuals")
    plt.title(f"Residuals Plot - {title}")
    plt.show()

    # Distribution of the residuals.
    plt.figure(figsize=(6,5))
    plt.hist(residuals, bins=30, color='gray')
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.title(f"Error Distribution - {title}")
    plt.show()

def build_keras_model(input_dim):
    """Create and compile the feed-forward regression network.

    The network has three ReLU hidden layers (256, 128 and 64 units) followed
    by a single linear output unit. It is compiled with the Adam optimizer,
    MSE loss and MAE as an additional metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    hidden_units = (256, 128, 64)

    stack = [layers.Input(shape=(input_dim,))]
    stack.extend(layers.Dense(units, activation="relu") for units in hidden_units)
    stack.append(layers.Dense(1))

    network = keras.Sequential(stack)
    network.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return network

def _plot_training_curve(curves, key, short_name, ylabel, title):
    """Plot the training series ``key`` and, when recorded, its ``val_`` twin."""
    plt.figure(figsize=(6,5))
    plt.plot(curves[key], label=f"Train {short_name}")
    val_key = f"val_{key}"
    # Validation series only exist when fit() was given validation data.
    if val_key in curves:
        plt.plot(curves[val_key], label=f"Val {short_name}")
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()


def plot_nn_history(history):
    """Plot the per-epoch training curves stored on a Keras ``History``.

    The loss curve is always drawn. The MAE curve is drawn as a second figure
    when the history contains a "mae" series. Validation curves are overlaid
    only when the history contains them, so a run without validation data no
    longer fails with ``KeyError``.

    Args:
        history: Object with a ``history`` dict mapping metric names to
            per-epoch value lists, as returned by ``model.fit``.
    """
    curves = history.history
    _plot_training_curve(
        curves, "loss", "Loss", "Loss (MSE)", "Neural Network Training Curve"
    )
    if "mae" in curves:
        _plot_training_curve(curves, "mae", "MAE", "MAE", "Neural Network MAE Curve")


# ========== Main Workflow ==========
# Load the data, let the user pick a model, then train, evaluate and plot it.
df = load_dataset(DATA_PATH)
print(f"Loaded dataset: {df.shape[0]} samples, {df.shape[1]} columns")

target = infer_target_column(df)
print(f"Target column: {target}")

# The preprocessor imputes missing *features* only. A missing target cannot be
# imputed and makes every model below fail (or train to NaN), so drop such rows.
n_missing_target = int(df[target].isna().sum())
if n_missing_target:
    df = df.dropna(subset=[target])
    print(f"Dropped {n_missing_target} rows with missing target values")

X, y, num_cols, cat_cols = split_features_target(df, target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

print("\nChoose a model to train:")
print("1) Linear Regression")
print("2) HistGradientBoostingRegressor")
print("3) Neural Network (Keras)")
choice = input("Enter 1, 2, or 3: ").strip()

results = None

if choice in ("1", "2"):
    # Both scikit-learn options share the same fit/predict/evaluate flow;
    # only the estimator and its display names differ.
    if choice == "1":
        train_name, label = "Linear Regression", "Linear Regression"
        model = LinearRegression()
    else:
        train_name, label = "HistGradientBoostingRegressor", "HistGradientBoosting"
        model = HistGradientBoostingRegressor(
            random_state=SEED, max_iter=300, learning_rate=0.08
        )

    print(f"\nTraining {train_name}...")
    preprocessor = build_preprocessor(num_cols, cat_cols)
    pipe = Pipeline([("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    results = evaluate(y_test, y_pred, label)
    plot_results(y_test, y_pred, label)

elif choice == "3":
    if not _keras_ok:
        print("TensorFlow/Keras not installed. Please install it to use this option.")
    else:
        print("\nTraining Neural Network...")
        # Seed Python, NumPy and the backend so the run is reproducible, like
        # the other two options that receive random_state=SEED.
        keras.utils.set_random_seed(SEED)

        # Fit the preprocessor on the training split only to avoid leaking
        # test-set statistics into training.
        preprocessor = build_preprocessor(num_cols, cat_cols)
        X_train_p = preprocessor.fit_transform(X_train)
        X_test_p = preprocessor.transform(X_test)

        model = build_keras_model(X_train_p.shape[1])
        cb = keras.callbacks.EarlyStopping(patience=NN_PATIENCE, restore_best_weights=True)
        history = model.fit(X_train_p, y_train, validation_split=NN_VAL_SPLIT,
                            epochs=NN_EPOCHS, batch_size=NN_BATCH_SIZE,
                            verbose=0, callbacks=[cb])

        # predict() returns an (n, 1) column; flatten it to match y_test.
        y_pred = model.predict(X_test_p).reshape(-1)
        results = evaluate(y_test, y_pred, "Neural Network")
        plot_nn_history(history)
        plot_results(y_test, y_pred, "Neural Network")

else:
    print("Invalid choice. Please enter 1, 2, or 3.")

if results is not None:
    # A bare expression nested inside an ``if`` is never echoed, neither by a
    # notebook cell nor by a script, so print the summary table explicitly.
    print()
    print(pd.DataFrame([results]).to_string(index=False))



---

## 📊 Interpretation of Results

| Metric | Meaning |
|:--|:--|
| **RMSE** | Root Mean Squared Error – penalizes large errors more heavily than MAE. |
| **MAE** | Mean Absolute Error – average absolute deviation from actual prices. |
| **R²** | Coefficient of Determination – the proportion of variance in the target that the model explains. |
| **Accuracy (±10%)** | Fraction of predictions within ±10% of actual values. |

---

### 🧠 What This Project Demonstrates
- Data preprocessing (imputation, encoding, scaling)
- Training multiple regression models
- Evaluating and visualizing performance
- Integrating deep learning (Keras) with scikit-learn preprocessing pipelines

You can include this project in your **portfolio** to demonstrate end-to-end ML engineering skills.
