# Regression : House Price Prediction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Read the prepared training split into a DataFrame and preview its first rows
TRAIN_CSV = 'housing_train.csv'
df_train = pd.read_csv(TRAIN_CSV)
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_1,ocean_2,ocean_3,ocean_4,ocean_5,median_house_value
0,-1.498801e-15,0.872078,-0.821368,1.060025,0.762658,1.810501,0.078757,-1.545452,1.586059,-2.511716,1.0,0.0,0.0,0.0,0.0,240000.0
1,-1.471046e-15,0.861019,-0.652535,1.449561,1.413273,1.505998,0.637755,0.033086,-0.145523,-0.097845,1.0,0.0,0.0,0.0,0.0,250000.0
2,2.053913e-15,-1.372369,0.606412,-1.014326,-1.57272,-1.511673,-0.326826,1.523596,0.446924,-0.226179,0.0,1.0,0.0,0.0,0.0,134400.0
3,-1.526557e-15,0.890389,-0.243525,0.579703,0.36723,0.54759,0.833675,0.152906,-0.305407,-0.401924,1.0,0.0,0.0,0.0,0.0,287600.0
4,-2.220446e-16,0.660449,1.407146,-0.671688,-0.045275,-0.356063,-1.907712,-0.961195,1.693039,0.889069,0.0,1.0,0.0,0.0,0.0,49600.0


In [2]:
# Separate the regression target from the feature matrix (n_samples, n_features)
y_train = df_train["median_house_value"].to_numpy()
X_train = df_train.drop(columns="median_house_value")

print('X_train:', X_train.shape, '; y_train:', y_train.shape)


X_train: (16512, 15) ; y_train: (16512,)


In [3]:
# Read the prepared testing split, then separate the target from the features
df_test = pd.read_csv('housing_test.csv')
y_test = df_test["median_house_value"].to_numpy()
X_test = df_test.drop(columns="median_house_value")
print('X_test:', X_test.shape, '; y_test:', y_test.shape)

X_test: (4128, 15) ; y_test: (4128,)


# Modeling & Validation

In [4]:
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
from sklearn.pipeline import Pipeline

# https://scikit-learn.org/stable/modules/linear_model.html#
from sklearn.linear_model import LinearRegression, Ridge

# https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.metrics import mean_absolute_percentage_error, r2_score 

# https://scikit-learn.org/stable/model_selection.html
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

In [5]:
# Show the default hyperparameters of both linear models
linear_defaults = LinearRegression().get_params()
ridge_defaults = Ridge().get_params()
print("LinearRegression: ", linear_defaults)
print("Ridge:", ridge_defaults)

LinearRegression:  {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False}
Ridge: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': 'deprecated', 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.001}


In [6]:
# Candidate models are kept in three parallel lists (same position = same model):
#   names[k]     -> label printed when reporting scores
#   pipelines[k] -> single-step Pipeline wrapping the regressor (step name 'clf')
#   params[k]    -> GridSearchCV parameter grid for that pipeline
names = ['LinearRegression', 'Ridge']

pipelines = [
    Pipeline([('clf', LinearRegression())]),
    Pipeline([('clf', Ridge())]),
]

params = [
    {},  # LinearRegression: use default hyperparameters
    {'clf__alpha': np.linspace(1, 10, 10)},  # Ridge: fine-tune alpha over 1, 2, ..., 10
]

In [7]:
# Model selection: K-fold cross-validation combined with a GridSearchCV sweep
# over the hyperparameters. n_jobs=-1 means using all processors.

def modeling(pipeline, parameters, name, X, y):
    """Tune ``pipeline`` with a cross-validated grid search and return the best model.

    Parameters
    ----------
    pipeline : estimator
        Estimator (here a scikit-learn ``Pipeline``) handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid to explore; an empty dict evaluates the estimator as configured.
    name : str
        Label printed next to the cross-validated score.
    X, y :
        Training features and training targets.

    Returns
    -------
    estimator
        ``best_estimator_`` of the search, i.e. the candidate with the highest
        mean cross-validated R2, already refit on all of ``X`` / ``y``.
    """
    # Shuffled 5-fold split with a fixed seed so every model sees the same folds
    cv = KFold(n_splits=5, shuffle=True, random_state=32)
    # refit=True (the default, spelled out here) retrains the winning candidate
    # on the whole training set, so no extra .fit() call is needed afterwards.
    grid_obj = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=cv,
                            scoring='r2', n_jobs=-1, refit=True)
    grid_obj.fit(X, y)
    print(name, 'R2:', grid_obj.best_score_)
    return grid_obj.best_estimator_
# Tune and fit every pipeline registered so far, keeping the results in list order
estimators = [
    modeling(candidate, grid, label, X_train, y_train)
    for candidate, grid, label in zip(pipelines, params, names)
]

LinearRegression R2: 0.6119046502450434
Ridge R2: 0.6119141413021839


# RandomForestRegressor

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters of the random forest
print(RandomForestRegressor().get_params())

# Register the RandomForestRegressor. A fixed random_state makes the bootstrap
# sampling repeatable, so the CV scores and the saved model can be reproduced.
pipelines.append(Pipeline([('clf', RandomForestRegressor(random_state=32))]))
# Fine-tune both the number of trees and the maximum depth of each tree
params.append({'clf__n_estimators': [50, 100, 150, 200, 250, 300], 'clf__max_depth': [3, 5, 10]})
names.append('RandomForestRegressor')

# Build the tuned RandomForestRegressor estimator
estimators.append(modeling(pipelines[-1], params[-1], names[-1], X_train, y_train))

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
RandomForestRegressor R2: 0.7818638196618466


In [9]:
# Performance evaluation on the testing dataset.
# names and estimators are parallel lists, so they are walked together.
for model_name, estimator in zip(names, estimators):
    print('\nPerformance of', model_name)
    y_pred = estimator.predict(X_test)
    # The label now matches the metric that is actually computed (MAPE, not MSE)
    print('mean_absolute_percentage_error :', mean_absolute_percentage_error(y_test, y_pred))
    print('r2_score :', r2_score(y_test, y_pred))


Performance of LinearRegression
mean_squared_error : 0.3170187308083347
r2_score : 0.6222041218131589

Performance of Ridge
mean_squared_error : 0.3170325274330896
r2_score : 0.622161481526379

Performance of RandomForestRegressor
mean_squared_error : 0.19993418663279036
r2_score : 0.7881038287539117


In [10]:
# Save the best model (the RandomForestRegressor pipeline) with joblib.dump()
import joblib

# Look the model up by name instead of relying on its position in the lists
best_idx = names.index('RandomForestRegressor')
model_path = names[best_idx] + ".pkl"  # -> "RandomForestRegressor.pkl"

# To load the model again later:
#   RF_regressor = joblib.load(model_path)
# joblib files are pickles: only load files that come from a trusted source.
joblib.dump(estimators[best_idx], model_path)

['RandomForestRegressor.pkl']