# House Price Prediction

The goal of this project is to predict the sale price of residential homes in Ames, Iowa, USA, based on a variety of attributes. It is a supervised regression problem.

# Import Packages

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from sklearn.linear_model import LinearRegression, ElasticNetCV, LassoCV, RidgeCV, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler, StandardScaler

import joblib

from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 1. Load the Data

In [2]:
# Read the cleaned train/test CSVs (train first, then test); row 0 of each file
# holds the column names.
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ("data/train_cleaned_2.csv", "data/test_cleaned_2.csv")
)

# 6. Choose an Evaluation Metric
For regression problems, common metrics include:
- Mean Absolute Error (MAE)
- Mean Squared Error (MSE)
- Root Mean Squared Error (RMSE)
- R-squared

For classification problems, common metrics include:
- Accuracy
- Precision
- Recall
- F1-score
- AUC

The confusion matrix and ROC curve can also provide useful insights.

In [3]:
# Shared 10-fold splitter: shuffling uses a fixed seed so the folds are the same
# on every run. Used by `cv_rmse` and by the *CV estimators defined further down.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

def rmse(y, y_pred):
    """Return the root mean squared error between targets `y` and predictions `y_pred`."""
    error = root_mean_squared_error(y, y_pred)
    return error

def cv_rmse(model, X, y):
    """Return the per-fold RMSE of `model` on (X, y), using the shared `kfolds` splitter.

    The scorer yields negated MSE values, so the sign is flipped back before
    the square root is taken. One value per fold is returned (no averaging).
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )
    return np.sqrt(-neg_mse_per_fold)

In [4]:
# Split the frames into target and model inputs.
# 'Id' is left out of the features; it is reused later as the submission key.
y_train = train['SalePrice']
X_train = train.drop(columns=['SalePrice', 'Id'])
X_test = test.drop(columns=['Id'])

# 7. Select Algorithms
- Start with simple regression algorithms like Linear Regression and gradually explore more complex models like Random Forest, Gradient Boosting, or XGBoost.
- Consider simple ensemble methods, such as simple average, weighted average, or voting ensembles, to combine multiple models for potentially better results.
- The model chosen depends on the data. A more complex model does not always constitute a better model.

In [5]:
# Candidate regularisation strengths searched by the cross-validated linear models.

# Ridge (RidgeCV) alphas
alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9,
    15,
    15.1, 15.2, 15.3, 15.4, 15.5,
]

# Lasso (LassoCV) alphas
alphas2 = [
    5e-05,
    0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007, 0.0008,
]

# ElasticNet (ElasticNetCV) alphas and L1/L2 mixing ratios
e_alphas = [
    0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007,
]
e_l1ratio = [
    0.8, 0.85, 0.9, 0.95, 0.99,
    1,
]

In [6]:
# Baseline candidates, one lightly configured estimator per algorithm family.
# Keys are the display names used in the results table.
# Every estimator that accepts a seed is given random_state=42 so repeated runs
# produce the same scores (SGDRegressor previously had none).
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'SGD Regressor': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
    'K-Neighbors': KNeighborsRegressor()
}

In [7]:
# NOTE(review): this rebinds `models`, replacing the larger dictionary defined in
# the previous cell, so only Linear Regression is evaluated by the cells below.
models = {'Linear Regression': LinearRegression()}

In [8]:
# Plain least-squares baseline.
lr = LinearRegression()

# Regularised linear models: features are scaled with RobustScaler, then the
# estimator picks its regularisation strength by cross-validation on `kfolds`.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

# max_iter must be an integer: scikit-learn's parameter validation rejects the
# float 1e7 when the estimator is fitted.
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=10_000_000, alphas=alphas2, random_state=42, cv=kfolds),
)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=10_000_000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)

# Default-configured estimators. The stochastic ones are seeded with 42, matching
# the seeds used in the `models` dictionary, so results are reproducible.
sdrg = SGDRegressor(random_state=42)
dtr = DecisionTreeRegressor(random_state=42)
rfr = RandomForestRegressor(random_state=42)
knr = KNeighborsRegressor()

# Support vector regressor on robust-scaled features.
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))
# Gradient boosting with Huber loss: 3000 depth-4 trees at a 0.05 learning rate.
gbr = GradientBoostingRegressor(
    loss='huber',
    # ensemble size and step
    n_estimators=3000,
    learning_rate=0.05,
    # per-tree constraints
    max_depth=4,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    # reproducibility
    random_state=42,
)
# LightGBM regressor: 5000 small trees (4 leaves each) at a 0.01 learning rate,
# with bagging / feature sub-sampling driven by fixed seeds.
lightgbm = LGBMRegressor(
    objective='regression',
    # ensemble size and step
    n_estimators=5000,
    learning_rate=0.01,
    # tree / histogram shape
    num_leaves=4,
    max_bin=200,
    # sub-sampling
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    # silence training logs
    verbose=-1,
)
# XGBoost regressor: 3460 depth-3 trees at a 0.01 learning rate.
# Uses the current parameter names:
#   - 'reg:squarederror' replaces the deprecated objective alias 'reg:linear'
#   - n_jobs / random_state replace the legacy nthread / seed arguments
xgboost = XGBRegressor(
    objective='reg:squarederror',
    # ensemble size and step
    learning_rate=0.01,
    n_estimators=3460,
    # tree constraints
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    # row / column sub-sampling
    subsample=0.7,
    colsample_bytree=0.7,
    # regularisation
    reg_alpha=0.00006,
    scale_pos_weight=1,
    # runtime and reproducibility
    n_jobs=-1,
    random_state=27,
)
# Stacked ensemble: the six base regressors feed an XGBoost meta-regressor, which
# also receives the original features (use_features_in_secondary=True).
# random_state fixes the stacker's internal fold shuffling so that repeated fits
# are reproducible.
stack_gen = StackingCVRegressor(
    regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
    random_state=42)

# 8. Model Validation
- Split the data into training and validation sets. A common split is 70-30 or 80-20 for training and validation, respectively. This method is computationally less intensive and often used for initial model exploration or when dealing with very large datasets.
- K-Fold Cross Validation. This method provides a more reliable evaluation, especially with smaller datasets.
- Model validation is important to assess the model's generalization performance (i.e. assess how well the model performs on unseen data). This helps prevent overfitting and gives you a more reliable estimate of your model's performance.

In [9]:
def regression_model(model, X_train, X_test, y_train, cv_folds=5, random_state=42):
    """
    Fit a regressor, score it with RMSE and predict on both data sets.

    Parameters
    ----------
    model : estimator with fit/predict, e.g. LinearRegression(); it is fitted
        in place on the full training data
    X_train : training features (dataframe without the target column)
    X_test : features to generate predictions for
    y_train : training target
    cv_folds : number of shuffled K-fold splits for the cross-validated RMSE
    random_state : seed for the K-fold shuffling

    Returns
    -------
    (y_pred_test, y_pred_train, results_dict)
        Predictions for X_test, predictions for X_train, and a dict with the
        keys 'Model_Name', 'Train_RMSE' and 'CV_RMSE_Mean'.
    """
    # The estimator's class name doubles as its label
    model_name = model.__class__.__name__
    print(f"Training: {model_name}")

    # Fit on the complete training data
    model.fit(X_train, y_train)

    # Cross-validated RMSE: the scorer returns negated MSE, so flip the sign
    # before taking the square root of each fold's score
    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=splitter,
        n_jobs=-1,
    )
    fold_rmse = np.sqrt(-neg_mse_per_fold)

    # Predictions from the fitted model
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    return y_pred_test, y_pred_train, {
        'Model_Name': model_name,
        # In-sample error, computed on the same rows the model was fitted on
        'Train_RMSE': np.sqrt(mean_squared_error(y_train, y_pred_train)),
        'CV_RMSE_Mean': np.mean(fold_rmse),
    }

In [10]:
# Evaluate every entry of `models` and collect its metrics, keyed by display name
all_results = {}

for model_name in models:
    model = models[model_name]
    y_pred_test, y_pred_train, results = regression_model(
        model=model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        cv_folds=5,
    )
    all_results[model_name] = results

Training: LinearRegression


In [11]:
# One row per model, ordered by cross-validated RMSE (lowest first), rounded to 4 decimals
results_df = (
    pd.DataFrame.from_dict(all_results, orient='index')
    .sort_values('CV_RMSE_Mean')
    .round(4)
)

results_df

Unnamed: 0,Model_Name,Train_RMSE,CV_RMSE_Mean
Linear Regression,LinearRegression,0.12,0.13


## 8.1. Hyperparameter Tuning
- Tune the hyperparameters of your chosen algorithms on the validation dataset using techniques like grid search or random search to find the best combination.
- Optuna is an efficient and effective way to search for optimal hyperparameters.

## 8.2. Regularization
- Implement regularization techniques like L1 (Lasso) or L2 (Ridge) regularization to prevent overfitting.
- Many ML algorithms include regularization parameters, including L1 and L2, sometimes called reg_alpha or reg_lambda. Read up on your chosen algorithm's regularization parameters and tune them accordingly on your validation set.

# 9. Train the final model

Fit the best model, with the optimal hyperparameters found during validation, on the whole training set (including the validation set).

In [12]:
# NOTE(review): this is a GradientBoostingRegressor with default hyperparameters
# (only the seed is set), not the tuned `gbr` defined earlier — confirm this is intended.
best_model = GradientBoostingRegressor(random_state=42)
y_pred_test, y_pred_train, _ = regression_model(
    model=best_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
)

Training: GradientBoostingRegressor


# 10. Predictions

Generate predictions on the test set (unseen data).

In [13]:
# Revert the log transform to get SalePrice back on its original scale
# (expm1 is the inverse of log1p; presumably the target was log1p-transformed
# when the data was cleaned — TODO confirm).
# Note: this adds a 'SalePrice' column to `test` in place.
test['SalePrice'] = np.expm1(y_pred_test)

In [14]:
# Two-column submission frame: each test Id alongside its predicted sale price
submission = test[['Id', 'SalePrice']].copy()

# Write without the index so the file contains only the Id and SalePrice columns
submission.to_csv('predictions/gbr.csv', index=False)

In [15]:
# Preview the first ten rows of the submission
submission.head(n=10)

Unnamed: 0,Id,SalePrice
0,1461,124828.882404
1,1462,160974.37086
2,1463,184168.642459
3,1464,184974.732709
4,1465,203586.188866
5,1466,176096.563794
6,1467,171886.047427
7,1468,162649.686668
8,1469,190877.728835
9,1470,119355.577115


# 11. Model Persistence

Save the fitted model for future use.

In [17]:
# Persist the fitted estimator to disk
joblib.dump(value=best_model, filename='models/gbr.joblib')

# Read it back from the same file
loaded_model = joblib.load(filename='models/gbr.joblib')