# Modeling
---

In [21]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

train_df = pd.read_csv("../data/processed/train.csv")
test_df = pd.read_csv("../data/processed/test.csv")

X_train = train_df.drop("price", axis=1)
y_train = np.log10(train_df["price"])

X_test = test_df.drop("price", axis=1)
y_test = np.log10(test_df["price"])


## Linear Regression

### Normal Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression

lr_pipline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

scores = cross_val_score(
        estimator= lr_pipline,
        X=X_train,
        y=y_train,
        cv=5,
        n_jobs=1,
        scoring="r2")

print(f"R2 Scores: {scores}")
print(f"Mean R2: {scores.mean():.4f}")

R2 Scores: [0.66469696 0.69045355 0.57597432 0.60679453 0.62421665]
Mean R2: 0.6324


### Ridge Linear Regression

In [26]:
from sklearn.linear_model import Ridge

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

param_grid = {
    'ridge__alpha': [0.1, 1, 10, 100, 1000] 
}

grid_ridge = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='r2')

grid_ridge.fit(X_train, y_train)

print(f"Best Alpha: {grid_ridge.best_params_}")
print(f"Best Score: {grid_ridge.best_score_:.4f}")

Best Alpha: {'ridge__alpha': 10}
Best Score: 0.6325


### Lasso Linear Regression

In [25]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso())
])

param_grid = {
    'lasso__alpha': [0.01, 0.1, 1, 10] 
}

grid_lasso = GridSearchCV(lasso_pipeline, param_grid, cv=5, scoring='r2')

grid_lasso.fit(X_train, y_train)

print(f"Best Alpha: {grid_lasso.best_params_}")
print(f"Best Score: {grid_lasso.best_score_:.4f}")

Best Alpha: {'lasso__alpha': 0.01}
Best Score: 0.6121


## kNN

In [38]:
from sklearn.neighbors import KNeighborsRegressor

kNN_pipeline = Pipeline(
    steps= [
        ("scaler", StandardScaler()),
        ("kNN", KNeighborsRegressor())
    ]
)

param_grid = {"kNN__n_neighbors": [1, 3, 5, 7]}

grid_kNN = GridSearchCV(kNN_pipeline, param_grid, cv=5, scoring='r2')

grid_kNN.fit(X_train, y_train)

print(f"Best Neighbor: {grid_kNN.best_params_}")
print(f"Best Score: {grid_kNN.best_score_:.4f}")


Best Neighbor: {'kNN__n_neighbors': 5}
Best Score: 0.8178


## SVM

In [28]:
from sklearn.svm import SVR

svm_pipeline = Pipeline(
    steps= [
        ("scaler", StandardScaler()),
        ("SVM", SVR())
    ]
)

scores = cross_val_score(
        estimator= svm_pipeline,
        X=X_train,
        y=y_train,
        cv=5,
        n_jobs=1,
        scoring="r2")

print(f"R2 Scores: {scores}")
print(f"Mean R2: {scores.mean():.4f}")

R2 Scores: [0.80945886 0.8420065  0.8355616  0.83207355 0.85112025]
Mean R2: 0.8340


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_pipeline = Pipeline(
    steps= [
        ("scaler", StandardScaler()),
        ("rf", RandomForestRegressor())
    ]
)
RandomForestRegressor()
param_grid = {"rf__n_estimators": [100, 200, 300, 500],
            "rf__max_depth": [10, 20, 30, None]}

grid_rf = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='r2')

grid_rf.fit(X_train, y_train)

print(f"Best parameters: {grid_rf.best_params_}")
print(f"Best Score: {grid_rf.best_score_:.4f}")

Best parameters: {'rf__max_depth': 20, 'rf__n_estimators': 500}
Best Score: 0.8595


In [30]:
from sklearn.metrics import mean_absolute_error

y_pred_log = grid_rf.best_estimator_.predict(X_test)


y_pred_actual = 10**y_pred_log
y_test_actual = 10**y_test

mae = mean_absolute_error(y_test_actual, y_pred_actual)

print(f"Average Error (MAE): ${mae:,.2f}")

Average Error (MAE): $563,641.04


In [31]:
mape = np.mean(np.abs((y_test_actual - y_pred_actual) / y_test_actual)) * 100
print(f"Average Error Percentage: {mape:.2f}%")

Average Error Percentage: 28.11%


In [39]:
import joblib

joblib.dump(grid_rf.best_estimator_, "../models/house_price_model.joblib")

['../models/house_price_model.joblib']