# House Prices Competition : Term Project 

### Importing Libraries:

In [152]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn import neighbors
from sklearn.neural_network import MLPRegressor
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


# Replace warnings.warn with the no-op above so warnings raised through it
# (from sklearn and seaborn) are not shown in the notebook.
warnings.warn = ignore_warn

import matplotlib.pyplot as plt
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

In [153]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import lightgbm as lgb


In [154]:
# getting data from feature engineering notebook
# `%store -r <name>` restores a variable that was previously saved with `%store`.
# NOTE(review): this cell only works after the feature engineering notebook has
# stored train_set, test_set and ytrain — confirm before a fresh run.
%store -r train_set
%store -r test_set
%store -r ytrain

In [155]:
train_set.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_0,YrSold_1,YrSold_2,YrSold_3,YrSold_4
0,856,854,0,1,3,3,3,706.0,0.0,2,...,0,0,0,0,1,0,0,1,0,0
1,1262,0,0,1,3,3,1,978.0,0.0,0,...,0,0,0,0,1,0,1,0,0,0
2,920,866,0,1,3,3,2,486.0,0.0,2,...,0,0,0,0,1,0,0,1,0,0
3,961,756,0,1,3,1,3,216.0,0.0,0,...,0,0,0,0,1,1,0,0,0,0
4,1145,1053,0,1,4,3,0,655.0,0.0,2,...,0,0,0,0,1,0,0,1,0,0


In [156]:
print ("Train data shape:", train_set.shape)

Train data shape: (1454, 701)


In [157]:
test_set.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_0,YrSold_1,YrSold_2,YrSold_3,YrSold_4
1454,896,0,0,1,2,3,3,468.0,144.0,4,...,0,0,0,0,1,0,0,0,0,1
1455,1329,0,0,1,3,3,3,923.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1
1456,928,701,0,1,3,3,3,791.0,0.0,2,...,0,0,0,0,1,0,0,0,0,1
1457,926,678,0,1,3,3,3,602.0,0.0,2,...,0,0,0,0,1,0,0,0,0,1
1458,1280,0,0,1,2,3,3,263.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1


In [158]:
print ("Test data shape:", test_set.shape)


Test data shape: (1459, 701)


## Building the model:

In [159]:
# Keep only the numeric columns, fill gaps with interpolate() and drop whatever
# rows are still incomplete afterwards.
n_train_rows, n_test_rows = len(train_set), len(test_set)
train_set = train_set.select_dtypes(include=[np.number]).interpolate().dropna()
test_set = test_set.select_dtypes(include=[np.number]).interpolate().dropna()

# dropna() removes rows silently. A removed training row would no longer line up
# with ytrain, and a removed test row would no longer line up with the stored
# test Ids used for the submission, so fail here instead of further down.
assert len(train_set) == n_train_rows, (
    "dropna() removed %d training rows; ytrain would be misaligned"
    % (n_train_rows - len(train_set)))
assert len(test_set) == n_test_rows, (
    "dropna() removed %d test rows; the submission Ids would be misaligned"
    % (n_test_rows - len(test_set)))
train_set.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_0,YrSold_1,YrSold_2,YrSold_3,YrSold_4
0,856,854,0,1,3,3,3,706.0,0.0,2,...,0,0,0,0,1,0,0,1,0,0
1,1262,0,0,1,3,3,1,978.0,0.0,0,...,0,0,0,0,1,0,1,0,0,0
2,920,866,0,1,3,3,2,486.0,0.0,2,...,0,0,0,0,1,0,0,1,0,0
3,961,756,0,1,3,1,3,216.0,0.0,0,...,0,0,0,0,1,1,0,0,0,0
4,1145,1053,0,1,4,3,0,655.0,0.0,2,...,0,0,0,0,1,0,0,1,0,0


In [160]:
test_set.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_0,YrSold_1,YrSold_2,YrSold_3,YrSold_4
1454,896,0,0,1,2,3,3,468.0,144.0,4,...,0,0,0,0,1,0,0,0,0,1
1455,1329,0,0,1,3,3,3,923.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1
1456,928,701,0,1,3,3,3,791.0,0.0,2,...,0,0,0,0,1,0,0,0,0,1
1457,926,678,0,1,3,3,3,602.0,0.0,2,...,0,0,0,0,1,0,0,0,0,1
1458,1280,0,0,1,2,3,3,263.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1


In [161]:
# Features are the cleaned training frame; the target is modelled on the log
# scale to reduce its skew.
X = train_set
y = np.log(ytrain)

train_test_split() returns four objects:

* X_train is the subset of our features used for training.
* X_test is the subset which will be our 'hold-out' set - what we'll use to test the model.
* y_train is the target variable (log-transformed SalePrice) which corresponds to X_train.
* y_test is the target variable (log-transformed SalePrice) which corresponds to X_test.

Setting random_state=42 makes the split reproducible.

In [162]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

### Random Forest:

In [163]:
from sklearn.ensemble import RandomForestRegressor

# Train a 300-tree random forest on the training split (seeded for repeatability).
regressor = RandomForestRegressor(random_state=0, n_estimators=300)
model_random_forest = regressor.fit(X_train, y_train)

# Report R^2 on the hold-out split.
print("R^2 is: \n", model_random_forest.score(X_test, y_test))

R^2 is: 
 0.881126538035


### KNN:

In [164]:
# find best k number
# Fits a k-nearest-neighbours regressor for k = 1..14 and records its score on
# the training and hold-out splits.
# NOTE(review): k is compared on the hold-out split itself, so the best score is
# an optimistic estimate — consider cross-validating on X_train instead.

ourScore = []  # one row per k: [k, train R^2, hold-out R^2]
for nn in range(1, 15):
    knn = neighbors.KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                                        metric_params=None, n_jobs=1, n_neighbors=nn, p=2,
                                        weights='uniform')
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    # score() on a regressor is the coefficient of determination (R^2), not an
    # accuracy, so the printed label says R^2.
    print('k: %d, Train R^2: %.3f, Test R^2: %.3f' % (nn, train_score, test_score))
    rowScore = [nn, train_score, test_score]
    ourScore.append(rowScore)

k: 1, Train Acc: 1.000, Test Acc: 0.496
k: 2, Train Acc: 0.890, Test Acc: 0.616
k: 3, Train Acc: 0.851, Test Acc: 0.670
k: 4, Train Acc: 0.825, Test Acc: 0.668
k: 5, Train Acc: 0.810, Test Acc: 0.673
k: 6, Train Acc: 0.796, Test Acc: 0.681
k: 7, Train Acc: 0.786, Test Acc: 0.687
k: 8, Train Acc: 0.773, Test Acc: 0.683
k: 9, Train Acc: 0.767, Test Acc: 0.689
k: 10, Train Acc: 0.762, Test Acc: 0.683
k: 11, Train Acc: 0.757, Test Acc: 0.679
k: 12, Train Acc: 0.754, Test Acc: 0.679
k: 13, Train Acc: 0.746, Test Acc: 0.676
k: 14, Train Acc: 0.742, Test Acc: 0.672


In [165]:
# Refit the k-nearest-neighbours regressor with a fixed neighbourhood size and
# report its score on both splits.
k = 5
knn = neighbors.KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                                    metric_params=None, n_jobs=1, n_neighbors=k, p=2,
                                    weights='uniform')
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
# score() on a regressor is R^2, not an accuracy, so it is labelled as R^2.
print('k: %d, Train R^2: %.3f, Test R^2: %.3f' % (k, train_score, test_score))

k: 5, Train Acc: 0.810, Test Acc: 0.673


### Linear Regression:

In [166]:
# Plain least-squares baseline without any regularisation, scored on the hold-out split.
lr = linear_model.LinearRegression()
model = lr.fit(X_train, y_train)
print("R^2 is: \n", model.score(X_test, y_test))

R^2 is: 
 -16941.283784


### Linear Regression with ridge regularization:

In [167]:
# to check what alpha value is better for the model
# Sweeps the ridge penalty over the powers of ten 10**-5 ... 10**4 and draws one
# predicted-vs-actual scatter per alpha, annotated with R^2 and RMSE.
# NOTE(review): alpha is compared on the same hold-out split that is later used
# to report scores — consider cross-validating on X_train instead.

for i in range(-5, 5):
    alpha = 10**i
    rm = linear_model.Ridge(alpha=alpha)
    ridge_model = rm.fit(X_train, y_train)
    preds_ridge = ridge_model.predict(X_test)
    # mean_squared_error returns the MSE; take the square root so the value
    # matches its "RMSE" label.
    rmse_ridge = np.sqrt(mean_squared_error(y_test, preds_ridge))

    # A fresh figure per alpha; otherwise every scatter is drawn on the same axes
    # and only the last title survives.
    fig, ax = plt.subplots()
    ax.scatter(preds_ridge, y_test, alpha=.75, color='b')
    ax.set_xlabel('Predicted Price')
    ax.set_ylabel('Actual Price')
    ax.set_title('Ridge Regularization with alpha = {}'.format(alpha))
    overlay = 'R^2 is: {}\nRMSE is: {}'.format(
                    ridge_model.score(X_test, y_test),
                    rmse_ridge)
    # The annotation text is passed positionally: the keyword was renamed from
    # `s` to `text` between matplotlib releases, positional works with both.
    ax.annotate(overlay, xy=(12.1, 10.6), size='x-large')
    plt.show()

In [168]:
from sklearn.metrics import mean_squared_error

# Ridge regression with penalty alpha = 10, scored on the hold-out split.
linm = linear_model.Ridge(alpha=10)
linm.fit(X_train, y_train)

print("R^2 is: \n", linm.score(X_test, y_test))

R^2 is: 
 0.904031801562


In [169]:
# Hold-out predictions of the ridge model `linm`.
predictions = linm.predict(X_test)

# The metric computed is mean_squared_error, so the label says "Squared"
# (it previously claimed "Absolute").
print("Mean Squared Error : " + str(mean_squared_error(y_test, predictions)))

Mean Absolute Error : 0.0155060339193


### XGBoost

In [170]:
# Candidate grid: starts at 0.0 and advances in steps of 0.0015 towards 0.09.
learning_rates = np.arange(0.0, 0.09, 0.0015)

# Learning rates that are actually swept by the XGBoost loop.
# NOTE(review): presumably hand-picked from an earlier run over the grid above — confirm.
best_learning_rates = [
    0.0885, 0.0735, 0.0705, 0.0615,
    0.06, 0.0585, 0.057, 0.0555,
]

In [171]:
# to find best parameters
# Fits one XGBoost regressor per candidate learning rate and draws a
# predicted-vs-actual scatter for each, annotated with R^2, RMSE and the rate.

# Imported here because this cell runs before the cell that otherwise imports
# XGBRegressor; without it a fresh-kernel run stops with a NameError.
from xgboost import XGBRegressor

actual_values = y_test
n_estimators = 1000  # cap on boosting rounds; early stopping may end sooner

# NOTE(review): early stopping watches the same hold-out split that is used for
# scoring, so the reported numbers are optimistic.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor rather than on fit() — confirm against the installed version.
for i in best_learning_rates:
    my_model = XGBRegressor(n_estimators=n_estimators, learning_rate=i)
    xgboost_model = my_model.fit(X_train, y_train, early_stopping_rounds=5,
                                 eval_set=[(X_test, y_test)], verbose=False)
    preds_xgboost = xgboost_model.predict(X_test)
    # mean_squared_error returns the MSE; take the square root so the value
    # matches its "RMSE" label.
    rmse_xgboost = np.sqrt(mean_squared_error(y_test, preds_xgboost))

    # A fresh figure per learning rate; otherwise every scatter lands on the
    # same axes and only the last title survives.
    fig, ax = plt.subplots()
    ax.scatter(preds_xgboost, actual_values, alpha=.75, color='b')
    ax.set_xlabel('Predicted Price')
    ax.set_ylabel('Actual Price')
    ax.set_title('XGBoost with n_estimators = {}'.format(n_estimators))
    overlay = 'R^2 is: {}\nRMSE is: {}\nlearning Rate is: {}'.format(
                        xgboost_model.score(X_test, y_test),
                        rmse_xgboost,
                        i)
    # The annotation text is passed positionally: the keyword was renamed from
    # `s` to `text` between matplotlib releases, positional works with both.
    ax.annotate(overlay, xy=(12.1, 10.6), size='x-large')
    plt.show()

In [172]:
# XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# XGBoost with 1000 boosting rounds at learning rate 0.0585, trained on the
# training split.
XGBoost = XGBRegressor(learning_rate=0.0585, n_estimators=1000)
XGBoost.fit(X_train, y_train)

# Hold-out predictions, followed by the model's score on the same split.
prediction = XGBoost.predict(X_test)
print("Test Score: \n", XGBoost.score(X_test, y_test))

Test Score: 
 0.909256761397


### Lasso regression:

In [173]:
# Lasso regression
# The scaler sits inside the pipeline, so it is fitted together with the model
# on the rows passed to fit().
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)
lasso.fit(X_train, y_train)
print("Test Score: \n", lasso.score(X_test, y_test))

Test Score: 
 0.908285770304


### Elastic Net Regression:

In [174]:
# Elastic Net regression
# Same scaler-plus-model pipeline as the lasso, with l1_ratio=.9 mixing in a
# small L2 share.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)
ENet.fit(X_train, y_train)
print("Test Score: \n", ENet.score(X_test, y_test))

Test Score: 
 0.908254691296


### Gradient Boosting Regression:

In [175]:
# Gradient boosting regression
# 3000 depth-4 trees at learning rate 0.05, trained with the Huber loss.
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)

GBoost.fit(X_train, y_train)
print("Test Score: \n", GBoost.score(X_test, y_test))


Test Score: 
 0.90376415


### Light GBM:

In [176]:
# LightGBM
# Only builds the regressor; fitting happens in the next cell.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [177]:
# Train the LightGBM regressor and report its score on the hold-out split.
model_lgb.fit(X_train, y_train)
print("Test Score: \n", model_lgb.score(X_test, y_test))

Test Score: 
 0.908112444063


### Averaging Models:

In [178]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the unweighted mean of its base models.

    Parameters
    ----------
    models : sequence of estimators
        Base regressors. ``fit`` works on clones of them, so the estimators
        passed in are not fitted through this wrapper.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return ``self``."""
        # The clones live in ``models_``; the originals stay in ``models``.
        self.models_ = list(map(clone, self.models))

        for fitted in self.models_:
            fitted.fit(X, y)

        return self

    def predict(self, X):
        """Return the mean of the fitted clones' predictions for each row of ``X``."""
        # Each model's predictions become one column; averaging across columns
        # gives one value per row.
        per_model = np.column_stack([member.predict(X) for member in self.models_])
        return per_model.mean(axis=1)

### Enet regression + GBoost + LGBM + Lasso regression:

In [179]:
# Best version
# Average of the elastic net, gradient boosting, LightGBM and lasso models,
# fitted on the training split and scored on the hold-out split.
averaged_models = AveragingModels(models=(ENet, GBoost, model_lgb, lasso))
averaged_models.fit(X_train, y_train)

print("Test Score: \n", averaged_models.score(X_test, y_test))

Test Score: 
 0.913520882028


### Making a submission

In [180]:
# Start an empty frame for the submission and fill its Id column from the
# stored test_ID variable (`%store -r` restores a previously stored variable).
submission = pd.DataFrame()
%store -r test_ID
# NOTE(review): assumes test_ID has one entry per row of test_set, in the same
# order — confirm against the feature engineering notebook.
submission['Id'] = test_ID

In [181]:
# Numeric test features, with gaps filled by interpolate(), that are fed to the ensemble.
feats = test_set.select_dtypes(include=[np.number]).interpolate()
feats.shape

(1459, 701)

In [182]:
# The ensemble predicts on the log scale (it was trained on the log target), so
# exponentiate to get back to prices.
# NOTE(review): averaged_models was fitted on X_train only — consider refitting
# on all of X, y before producing the submission.
predictions = averaged_models.predict(feats)
final_predictions = np.exp(predictions)

In [183]:
# Add the back-transformed predictions as the SalePrice column and preview the result.
submission['SalePrice'] = final_predictions
submission.head(5)

Unnamed: 0,Id,SalePrice
0,1461,125663.288071
1,1462,136981.295813
2,1463,185402.021033
3,1464,197464.691003
4,1465,191086.25611


In [184]:
#submission.to_csv("name_of_file.csv", index=False)