# House Prices Competition : Term Project 

### Importing Libraries:

In [102]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn import neighbors
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

import matplotlib.pyplot as plt
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

In [103]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import lightgbm as lgb


In [104]:
# getting data from feature engineering notebook
%store -r train_set_1
%store -r test_set_1
%store -r ytrain_1

test = pd.read_csv("test.csv")

train_set = train_set_1
test_set = test_set_1

In [105]:
train_set.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
0,856,854,0,0,3,706.0,0.0,1.0,0,150.0,...,0,0,0,0,0,0,0,1,0,1
1,1262,0,0,0,3,978.0,0.0,0.0,1,284.0,...,0,0,0,0,0,0,0,1,0,1
2,920,866,0,0,3,486.0,0.0,1.0,0,434.0,...,0,0,0,0,0,0,0,1,0,1
3,961,756,0,0,3,216.0,0.0,1.0,0,540.0,...,0,0,0,0,0,0,0,1,0,1
4,1145,1053,0,0,4,655.0,0.0,1.0,0,490.0,...,0,0,0,0,0,0,0,1,0,1


In [106]:
print ("Train data shape:", train_set.shape)

Train data shape: (1452, 291)


In [107]:
test_set.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
1452,896,0,0,0,2,468.0,144.0,0.0,0,270.0,...,0,0,0,0,0,0,0,1,0,1
1453,1329,0,0,0,3,923.0,0.0,0.0,0,406.0,...,0,0,0,0,0,0,0,1,0,1
1454,928,701,0,0,3,791.0,0.0,0.0,0,137.0,...,0,0,0,0,0,0,0,1,0,1
1455,926,678,0,0,3,602.0,0.0,0.0,0,324.0,...,0,0,0,0,0,0,0,1,0,1
1456,1280,0,0,0,2,263.0,0.0,0.0,0,1017.0,...,0,0,0,0,0,0,0,1,0,1


In [108]:
print ("Test data shape:", test_set.shape)


Test data shape: (1459, 291)


## Building the model:

In [109]:
train_set = train_set.select_dtypes(include=[np.number]).interpolate().dropna()
test_set = test_set.select_dtypes(include=[np.number]).interpolate().dropna()
train_set.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
0,856,854,0,0,3,706.0,0.0,1.0,0,150.0,...,0,0,0,0,0,0,0,1,0,1
1,1262,0,0,0,3,978.0,0.0,0.0,1,284.0,...,0,0,0,0,0,0,0,1,0,1
2,920,866,0,0,3,486.0,0.0,1.0,0,434.0,...,0,0,0,0,0,0,0,1,0,1
3,961,756,0,0,3,216.0,0.0,1.0,0,540.0,...,0,0,0,0,0,0,0,1,0,1
4,1145,1053,0,0,4,655.0,0.0,1.0,0,490.0,...,0,0,0,0,0,0,0,1,0,1


In [110]:
test_set.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
1452,896,0,0,0,2,468.0,144.0,0.0,0,270.0,...,0,0,0,0,0,0,0,1,0,1
1453,1329,0,0,0,3,923.0,0.0,0.0,0,406.0,...,0,0,0,0,0,0,0,1,0,1
1454,928,701,0,0,3,791.0,0.0,0.0,0,137.0,...,0,0,0,0,0,0,0,1,0,1
1455,926,678,0,0,3,602.0,0.0,0.0,0,324.0,...,0,0,0,0,0,0,0,1,0,1
1456,1280,0,0,0,2,263.0,0.0,0.0,0,1017.0,...,0,0,0,0,0,0,0,1,0,1


In [111]:
# apply log to have 0 skew
test_ID = test["Id"]
y = np.log(ytrain_1)
X = train_set#.drop(['SalePrice'], axis=1)

### PCA:

In [112]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)                
t_train = scaler.transform(X)
pca_hp = PCA(30)
x_fit = pca_hp.fit_transform(t_train)
np.exp(pca_hp.explained_variance_ratio_)

array([ 1.0736579 ,  1.03167675,  1.02673249,  1.02388893,  1.02276856,
        1.01835777,  1.01627425,  1.01571856,  1.01227279,  1.01219373,
        1.01183313,  1.01121734,  1.01068471,  1.01009807,  1.00991963,
        1.00967788,  1.0093183 ,  1.0090842 ,  1.00888568,  1.00856901,
        1.00840546,  1.00805222,  1.00798035,  1.00793523,  1.0078155 ,
        1.00767133,  1.0074948 ,  1.00746451,  1.00728964,  1.00713671])

train_test_split() returns four objects:

* X_train is the subset of our features used for training.
* X_test is the subset which will be our 'hold-out' set - what we'll use to test the model.
* y_train is the target variable SalePrice which corresponds to X_train.
* y_test is the target variable SalePrice which corresponds to X_test. 

random_state=42 allow reproducible results.

In [113]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
                                   # X, y, random_state=42, test_size=.33)

# X_train1, X_test1, y_train1, y_test1 = train_test_split(
                                   # x_fit, y, random_state=42, test_size=.33)
    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.cross_validation import KFold, cross_val_score, cross_val_predict
k_fold = KFold(len(y), n_folds=10, shuffle=True, random_state=0)

### Random Forest:

In [114]:
from sklearn.ensemble import RandomForestRegressor

# Fit Random Forest on Training Set
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
model_random_forest = regressor.fit(X_train, y_train)


# Score model
cross_valid_scores = cross_val_score(model_random_forest, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", model_random_forest.score(X_test, y_test)) 

print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(model_random_forest, X_test, y_test, cv=5 ))

Test score is: 
 0.875852404421
Cross validation test scores are: 
 [ 0.84573491  0.87476335  0.82467794  0.87904882  0.74409852]
Cross validation test scores mean is: 
 0.833664706943


In [115]:
predictions_1 = model_random_forest.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_1)))

Mean squared Error : 0.019184894702


### KNN:

In [116]:
# find best k number

ourScore=[]
for nn in range(1,15):
    knn = neighbors.KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                                        metric_params=None, n_jobs=1, n_neighbors=nn,p=2,
                                        weights='uniform')
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    #print('k: %d, Train Acc: %.3f, Test Acc: %.3f' % (nn, train_score, test_score))
    rowScore=[nn,train_score,test_score]
    ourScore.append(rowScore)

In [117]:
k=5
knn = neighbors.KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=k, p=2,
          weights='uniform')
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
cross_valid_scores = cross_val_score(knn, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", knn.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(knn, X_test, y_test, cv=5 ))

Test score is: 
 0.680137288511
Cross validation test scores are: 
 [ 0.77000175  0.71250919  0.63535927  0.63135714  0.52528313]
Cross validation test scores mean is: 
 0.654902095849


In [118]:
predictions_2 = knn.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_2)))

Mean squared Error : 0.0494293297457


### Linear Regression:

In [119]:
# Linear Regression
from sklearn.linear_model import LinearRegression 

regressor = LinearRegression() 
regressor.fit(X_train, y_train)

# Score model
cross_valid_scores = cross_val_score(regressor, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", regressor.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(regressor, X_test, y_test, cv=5 ))

Test score is: 
 -345840.431275
Cross validation test scores are: 
 [ -7.51624459e+07  -2.32520598e+06  -2.54217721e+08  -8.33177760e+14
  -1.75764522e+08]
Cross validation test scores mean is: 
 -1.66635653458e+14


In [120]:
predictions_3 = regressor.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_3)))

Mean squared Error : 53443.8980607


### Linear Regression with ridge regularization:

In [121]:
#to check what alpha value is better for the model

for i in range (-5, 5):
    alpha = 10**i
    rm = linear_model.Ridge(alpha=alpha)
    ridge_model = rm.fit(X_train, y_train)
    preds_ridge = ridge_model.predict(X_test)

    plt.scatter(preds_ridge, y_test, alpha=.75, color='b')
    plt.xlabel('Predicted Price')
    plt.ylabel('Actual Price')
    plt.title('Ridge Regularization with alpha = {}'.format(alpha))
    overlay = 'R^2 is: {}\nRMSE is: {}'.format(
                    ridge_model.score(X_test, y_test),
                    mean_squared_error(y_test, preds_ridge))
    plt.annotate(s=overlay,xy=(12.1,10.6),size='x-large')
    #plt.show()

In [122]:
from sklearn.metrics import mean_squared_error

linm = linear_model.Ridge(alpha = 10)

linm.fit(X_train, y_train)

# Score model
cross_valid_scores = cross_val_score(linm, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", linm.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(linm, X_test, y_test, cv=5 ))

Test score is: 
 0.901239708666
Cross validation test scores are: 
 [ 0.90647181  0.90401869  0.88215003  0.87922977  0.84182268]
Cross validation test scores mean is: 
 0.882738595711


In [123]:
predictions = linm.predict(X_test)
print("Mean Squared Error : " + str(mean_squared_error(y_test,predictions)))

Mean Squared Error : 0.0152617195777


### MLP with PCA:

In [124]:
mlp_regressor = MLPRegressor(solver='lbfgs',hidden_layer_sizes = (200,5,5),alpha = 1.0, activation = 'relu', max_iter = 100)
#mlp_regressor.fit(X_train1,y_train1)

# Score model
#cross_valid_scores = cross_val_score(mlp_regressor, X_test, y_test,cv=5, n_jobs=1)

#print ("Test score is: \n", mlp_regressor.score(X_test, y_test)) 
#print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
#print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(mlp_regressor, X_test, y_test, cv=5 ))

In [125]:
#predictions7 = mlp_regressor.predict(X_test1)
#print("Mean Squared Error : " + str(mean_squared_error(y_test1,predictions7)))

### SVM:

In [126]:
from sklearn.svm import SVR

# SVM
svr_model = SVR(kernel='rbf', C=2, epsilon=0.05)
svr_model.fit(X, y)

# Score model
cross_valid_scores = cross_val_score(svr_model, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", svr_model.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(svr_model, X_test, y_test, cv=5 ))

Test score is: 
 0.984626677384
Cross validation test scores are: 
 [-0.00696766 -0.01162428 -0.0065101  -0.00469497 -0.01529933]
Cross validation test scores mean is: 
 -0.00901926798044


In [127]:
predictions_8 = svr_model.predict(X_test)
print("Mean Squared Error : " + str(mean_squared_error(y_test,predictions_8)))

Mean Squared Error : 0.0023756849598


### Basic Decision tree:

In [128]:
# Define model
basic_decision_tree = DecisionTreeRegressor()

# Fit model
basic_decision_tree.fit(X, y)

# Score model
cross_valid_scores = cross_val_score(basic_decision_tree, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", basic_decision_tree.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(basic_decision_tree, X_test, y_test, cv=5 ))

Test score is: 
 0.999999996944
Cross validation test scores are: 
 [ 0.56334312  0.72504334  0.69594589  0.65350629  0.60375484]
Cross validation test scores mean is: 
 0.648318693287


In [129]:
predictions_5 = basic_decision_tree.predict(X_test)
print("Mean Squared Error : " + str(mean_squared_error(y_test,predictions_5)))

Mean Squared Error : 4.72276373045e-10


### XGBoost

In [130]:
learning_rates = np.arange(0.0, 0.09, 0.0015)
learning_rates
best_learning_rates = [0.0885, 0.0735, 0.0705, 0.0615, 0.06, 0.0585, 0.057, 0.0555]

In [131]:
# to find best parameters
from xgboost import XGBRegressor

actual_values = y_test

for i in best_learning_rates:
    n_estimators = 1000
    my_model = XGBRegressor(n_estimators=n_estimators,learning_rate=i)
    xgboost_model = my_model.fit(X_train, y_train, early_stopping_rounds=5, 
                 eval_set=[(X_test, y_test)], verbose=False)
    preds_xgboost = xgboost_model.predict(X_test)

    plt.scatter(preds_xgboost, actual_values, alpha=.75, color='b')
    plt.xlabel('Predicted Price')
    plt.ylabel('Actual Price')
    plt.title('XGBoost with  = {}'.format(n_estimators))
    overlay = 'R^2 is: {}\nRMSE is: {}\nlearning Rate is: {}'.format(
                        xgboost_model.score(X_test, y_test),
                        mean_squared_error(y_test, preds_xgboost),
                        i)
    plt.annotate(s=overlay,xy=(12.1,10.6),size='x-large')
    #plt.show()

In [132]:
# XGBoost
XGBoost = XGBRegressor(n_estimators = 1000,learning_rate=0.0585)
XGBoost.fit(X_train, y_train)

# Score model
cross_valid_scores = cross_val_score(XGBoost, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", XGBoost.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(XGBoost, X_test, y_test, cv=5 ))

Test score is: 
 0.905846405165
Cross validation test scores are: 
 [ 0.8728141   0.90323918  0.84306822  0.8939824   0.81593535]
Cross validation test scores mean is: 
 0.865807852417


In [133]:
prediction = XGBoost.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,prediction)))

Mean squared Error : 0.0145498331587


### Lasso regression:

In [134]:
# Lasso regression
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(lasso, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", lasso.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(lasso, X_test, y_test, cv=5 ))

Test score is: 
 0.904965675904
Cross validation test scores are: 
 [ 0.9097912   0.911444    0.8775565   0.8667184   0.85165889]
Cross validation test scores mean is: 
 0.883433798795


In [135]:
predictions_lasso = lasso.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_lasso)))

Mean squared Error : 0.0146859348533


### Elastic Net Regression:

In [136]:
# Elastic Net regression
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
ENet.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(ENet, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", ENet.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(ENet, X_test, y_test, cv=5 ))

Test score is: 
 0.90495230902
Cross validation test scores are: 
 [ 0.90871428  0.91066253  0.87583394  0.86303148  0.8514074 ]
Cross validation test scores mean is: 
 0.881929926572


In [137]:
predictions_ENet = ENet.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_ENet)))

Mean squared Error : 0.0146880004772


### Gradient Boosting Regression:

In [138]:
# Gradient boosting regression
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

GBoost.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(GBoost, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", GBoost.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(GBoost, X_test, y_test, cv=5 ))

Test score is: 
 0.900061935524
Cross validation test scores are: 
 [ 0.87968122  0.8804556   0.85831129  0.90637986  0.80641654]
Cross validation test scores mean is: 
 0.866248902207


In [139]:
predictions_GBoost = GBoost.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_GBoost)))

Mean squared Error : 0.0154437243408


### Light GBM:

In [140]:
# LightGBM
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
model_lgb.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(model_lgb, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", model_lgb.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(model_lgb, X_test, y_test, cv=5 ))

Test score is: 
 0.902791193686
Cross validation test scores are: 
 [ 0.86630833  0.88846564  0.86431452  0.88149719  0.80869944]
Cross validation test scores mean is: 
 0.861857025353


In [141]:
predictions_model_lgb = model_lgb.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_model_lgb)))

Mean squared Error : 0.0150219640142


### AdaBoost:

In [152]:
from sklearn.ensemble import AdaBoostRegressor

ada_boost = AdaBoostRegressor(n_estimators=100)
ada_boost.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(ada_boost, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", ada_boost.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(model_lgb, X_test, y_test, cv=5 ))

Test score is: 
 0.841973305734
Cross validation test scores are: 
 [ 0.81137849  0.85855711  0.77033045  0.83148453  0.67782507]
Cross validation test scores mean is: 
 0.78991512951


### Avereging Models:

In [142]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

### Enet regression + GBoost + LGBM + Lasso regression (Best Score):

In [143]:
#Best version
averaged_models = AveragingModels(models = (ENet, GBoost, model_lgb, lasso))

averaged_models.fit(X_train, y_train)
print ("Test Score: \n", averaged_models.score(X_test, y_test))
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean())

Test Score: 
 0.910253536225
Cross validation test scores are: 
 [ 0.86630833  0.88846564  0.86431452  0.88149719  0.80869944]
Cross validation test scores mean is: 
 0.861857025353


In [144]:
### Ridge LR + XGBoost + 

In [145]:
#Best version
averaged_models_1 = AveragingModels(models = (linm, XGBoost, model_lgb, lasso))

averaged_models_1.fit(X_train, y_train)
print ("Test Score: \n", averaged_models_1.score(X_test, y_test))
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean())

Test Score: 
 0.911361191946
Cross validation test scores are: 
 [ 0.86630833  0.88846564  0.86431452  0.88149719  0.80869944]
Cross validation test scores mean is: 
 0.861857025353


In [146]:
predictions = averaged_models_1.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions)))

Mean squared Error : 0.0136976168656


### Making a submission

In [147]:
submission = pd.DataFrame()
%store -r test_ID
submission['Id'] = test_ID

In [148]:
feats = test_set.select_dtypes(
        include=[np.number]).interpolate()
feats.shape

(1459, 291)

In [149]:
predictions = averaged_models_1.predict(feats)
final_predictions = np.exp(predictions)

In [150]:
submission['SalePrice'] = final_predictions
submission.head(10)

Unnamed: 0,Id,SalePrice
0,1461,119362.363249
1,1462,137190.902649
2,1463,184790.952414
3,1464,194707.129109
4,1465,188869.410467
5,1466,171158.002981
6,1467,176501.323929
7,1468,165348.480997
8,1469,187015.501099
9,1470,120568.067025


In [151]:
submission.to_csv("new_avereged_submission.csv", index=False)