# House Prices Competition : Term Project 

### Importing Libraries:

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn import neighbors
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

import matplotlib.pyplot as plt
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import lightgbm as lgb


In [16]:
# getting data from feature engineering notebook
%store -r train_set_1
%store -r test_set_1
%store -r ytrain_1

test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")


train_set = train_set_1
test_set = test_set_1

In [17]:
train_set.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave,SalePrice
0,6.75227,6.749931,0.0,0,1.098612,6.559615,0.0,0.0,0.0,5.010635,...,0,0,0,0,0,0,1,0,1,208500
1,7.140453,0.0,0.0,0,1.098612,6.88551,0.0,0.0,0.0,5.648974,...,0,0,0,0,0,0,1,0,1,181500
2,6.824374,6.763885,0.0,0,1.098612,6.186209,0.0,0.0,0.0,6.073045,...,0,0,0,0,0,0,1,0,1,223500
3,6.867974,6.628041,0.0,0,1.098612,5.375278,0.0,0.0,0.0,6.291569,...,0,0,0,0,0,0,1,0,1,140000
4,7.04316,6.959399,0.0,0,1.386294,6.484635,0.0,0.0,0.0,6.194405,...,0,0,0,0,0,0,1,0,1,250000


In [18]:
print ("Train data shape:", train_set.shape)

Train data shape: (1452, 292)


In [19]:
test_set.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
1452,6.79794,0.0,0.0,0,0.693147,6.148468,4.969813,0.0,0.0,5.598422,...,0,0,0,0,0,0,0,1,0,1
1453,7.192182,0.0,0.0,0,1.098612,6.827629,0.0,0.0,0.0,6.006353,...,0,0,0,0,0,0,0,1,0,1
1454,6.833032,6.552508,0.0,0,1.098612,6.673298,0.0,0.0,0.0,4.919981,...,0,0,0,0,0,0,0,1,0,1
1455,6.830874,6.519147,0.0,0,1.098612,6.400257,0.0,0.0,0.0,5.780744,...,0,0,0,0,0,0,0,1,0,1
1456,7.154615,0.0,0.0,0,0.693147,5.572154,0.0,0.0,0.0,6.924612,...,0,0,0,0,0,0,0,1,0,1


In [20]:
print ("Test data shape:", test_set.shape)


Test data shape: (1459, 291)


## Building the model:

In [21]:
train_set = train_set.select_dtypes(include=[np.number]).interpolate().dropna()
test_set = test_set.select_dtypes(include=[np.number]).interpolate().dropna()
train_set.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave,SalePrice
0,6.75227,6.749931,0.0,0,1.098612,6.559615,0.0,0.0,0.0,5.010635,...,0,0,0,0,0,0,1,0,1,208500
1,7.140453,0.0,0.0,0,1.098612,6.88551,0.0,0.0,0.0,5.648974,...,0,0,0,0,0,0,1,0,1,181500
2,6.824374,6.763885,0.0,0,1.098612,6.186209,0.0,0.0,0.0,6.073045,...,0,0,0,0,0,0,1,0,1,223500
3,6.867974,6.628041,0.0,0,1.098612,5.375278,0.0,0.0,0.0,6.291569,...,0,0,0,0,0,0,1,0,1,140000
4,7.04316,6.959399,0.0,0,1.386294,6.484635,0.0,0.0,0.0,6.194405,...,0,0,0,0,0,0,1,0,1,250000


In [22]:
test_set.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
1452,6.79794,0.0,0.0,0,0.693147,6.148468,4.969813,0.0,0.0,5.598422,...,0,0,0,0,0,0,0,1,0,1
1453,7.192182,0.0,0.0,0,1.098612,6.827629,0.0,0.0,0.0,6.006353,...,0,0,0,0,0,0,0,1,0,1
1454,6.833032,6.552508,0.0,0,1.098612,6.673298,0.0,0.0,0.0,4.919981,...,0,0,0,0,0,0,0,1,0,1
1455,6.830874,6.519147,0.0,0,1.098612,6.400257,0.0,0.0,0.0,5.780744,...,0,0,0,0,0,0,0,1,0,1
1456,7.154615,0.0,0.0,0,0.693147,5.572154,0.0,0.0,0.0,6.924612,...,0,0,0,0,0,0,0,1,0,1


In [23]:
# apply log to have 0 skew
test_ID = test["Id"]
from sklearn.preprocessing import MinMaxScaler
ytrain_1 = train_set['SalePrice']

ytrain_1

0       208500
1       181500
2       223500
3       140000
4       250000
5       143000
6       307000
7       200000
8       129900
9       118000
10      129500
11      345000
12      144000
13      279500
14      157000
15      132000
16      149000
17       90000
18      159000
19      139000
20      325300
21      139400
22      230000
23      129900
24      154000
25      256300
26      134800
27      306000
28      207500
29       68500
         ...  
1422    192140
1423    143750
1424     64500
1425    186500
1426    160000
1427    174000
1428    120500
1429    394617
1430    149700
1431    197000
1432    191000
1433    149300
1434    310000
1435    121000
1436    179600
1437    129000
1438    157900
1439    240000
1440    112000
1441     92000
1442    136000
1443    287090
1444    145000
1445     84500
1446    185000
1447    175000
1448    210000
1449    266500
1450    142125
1451    147500
Name: SalePrice, Length: 1452, dtype: int64

In [24]:
scalery = MinMaxScaler().fit(ytrain_1.reshape(-1,1))

#ytrain_1 = scalery.transform(ytrain_1.reshape(-1,1))


y = np.log(ytrain_1)
X = train_set.drop(['SalePrice'], axis=1)
y

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
5       11.870600
6       12.634603
7       12.206073
8       11.774520
9       11.678440
10      11.771436
11      12.751300
12      11.877569
13      12.540758
14      11.964001
15      11.790557
16      11.911702
17      11.407565
18      11.976659
19      11.842229
20      12.692503
21      11.845103
22      12.345835
23      11.774520
24      11.944708
25      12.454104
26      11.811547
27      12.631340
28      12.242887
29      11.134589
          ...    
1422    12.165980
1423    11.875831
1424    11.074421
1425    12.136187
1426    11.982929
1427    12.066811
1428    11.699405
1429    12.885671
1430    11.916389
1431    12.190959
1432    12.160029
1433    11.913713
1434    12.644328
1435    11.703546
1436    12.098487
1437    11.767568
1438    11.969717
1439    12.388394
1440    11.626254
1441    11.429544
1442    11.820410
1443    12.567551
1444    11.884489
1445    11.344507
1446    12

### PCA:

In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)                
t_train = scaler.transform(X)
pca_hp = PCA(30)
x_fit = pca_hp.fit_transform(t_train)
np.exp(pca_hp.explained_variance_ratio_)

array([ 1.07256389,  1.03336776,  1.02966177,  1.02418463,  1.02263217,
        1.017437  ,  1.0162862 ,  1.01562187,  1.01250237,  1.01199846,
        1.01175213,  1.0110723 ,  1.01064457,  1.01040284,  1.00999696,
        1.00981712,  1.00948806,  1.00930801,  1.00872576,  1.00866775,
        1.00843132,  1.00839249,  1.00802618,  1.00780494,  1.00776914,
        1.00768878,  1.00758132,  1.00751713,  1.0074031 ,  1.0071763 ])

train_test_split() returns four objects:

* X_train is the subset of our features used for training.
* X_test is the subset which will be our 'hold-out' set - what we'll use to test the model.
* y_train is the target variable SalePrice which corresponds to X_train.
* y_test is the target variable SalePrice which corresponds to X_test. 

random_state=42 allow reproducible results.

In [26]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
                                   # X, y, random_state=42, test_size=.33)

# X_train1, X_test1, y_train1, y_test1 = train_test_split(
                                   # x_fit, y, random_state=42, test_size=.33)
    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.cross_validation import KFold, cross_val_score, cross_val_predict
k_fold = KFold(len(y), n_folds=10, shuffle=True, random_state=0)

### Random Forest:

In [27]:
from sklearn.ensemble import RandomForestRegressor

# Fit Random Forest on Training Set
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
model_random_forest = regressor.fit(X_train, y_train)


# Score model
cross_valid_scores = cross_val_score(model_random_forest, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", model_random_forest.score(X_test, y_test)) 

print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(model_random_forest, X_test, y_test, cv=5 ))

Test score is: 
 0.8754758075
Cross validation test scores are: 
 [ 0.84899148  0.87587195  0.82510258  0.88049997  0.74345674]
Cross validation test scores mean is: 
 0.834784544912


In [28]:
predictions_1 = model_random_forest.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_1)))

Mean squared Error : 0.0192430913368


### KNN:

In [29]:
# find best k number

ourScore=[]
for nn in range(1,15):
    knn = neighbors.KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                                        metric_params=None, n_jobs=1, n_neighbors=nn,p=2,
                                        weights='uniform')
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    #print('k: %d, Train Acc: %.3f, Test Acc: %.3f' % (nn, train_score, test_score))
    rowScore=[nn,train_score,test_score]
    ourScore.append(rowScore)

In [30]:
k=5
knn = neighbors.KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=k, p=2,
          weights='uniform')
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
cross_valid_scores = cross_val_score(knn, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", knn.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(knn, X_test, y_test, cv=5 ))

Test score is: 
 0.715497005517
Cross validation test scores are: 
 [ 0.67494026  0.76897377  0.70339879  0.67084375  0.60238022]
Cross validation test scores mean is: 
 0.684107358399


In [31]:
predictions_2 = knn.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_2)))

Mean squared Error : 0.043965088217


### Linear Regression:

In [32]:
# Linear Regression
from sklearn.linear_model import LinearRegression 

regressor = LinearRegression() 
regressor.fit(X_train, y_train)

# Score model
cross_valid_scores = cross_val_score(regressor, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", regressor.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(regressor, X_test, y_test, cv=5 ))

Test score is: 
 0.885697265407
Cross validation test scores are: 
 [  7.97529319e-01   8.29158601e-01  -4.19454935e+01  -1.42448350e+02
  -1.71619920e+03]
Cross validation test scores mean is: 
 -379.793270517


In [33]:
predictions_3 = regressor.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_3)))

Mean squared Error : 0.0176635392501


### Linear Regression with ridge regularization:

In [34]:
#to check what alpha value is better for the model

for i in range (-5, 5):
    alpha = 10**i
    rm = linear_model.Ridge(alpha=alpha)
    ridge_model = rm.fit(X_train, y_train)
    preds_ridge = ridge_model.predict(X_test)

    plt.scatter(preds_ridge, y_test, alpha=.75, color='b')
    plt.xlabel('Predicted Price')
    plt.ylabel('Actual Price')
    plt.title('Ridge Regularization with alpha = {}'.format(alpha))
    overlay = 'R^2 is: {}\nRMSE is: {}'.format(
                    ridge_model.score(X_test, y_test),
                    mean_squared_error(y_test, preds_ridge))
    plt.annotate(s=overlay,xy=(12.1,10.6),size='x-large')
    #plt.show()

In [35]:
from sklearn.metrics import mean_squared_error

linm = linear_model.Ridge(alpha = 10)

linm.fit(X_train, y_train)

# Score model
cross_valid_scores = cross_val_score(linm, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", linm.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(linm, X_test, y_test, cv=5 ))

Test score is: 
 0.902661626488
Cross validation test scores are: 
 [ 0.89017587  0.89994917  0.87510542  0.92186206  0.82871156]
Cross validation test scores mean is: 
 0.883160817513


In [36]:
predictions = linm.predict(X_test)
print("Mean Squared Error : " + str(mean_squared_error(y_test,predictions)))

Mean Squared Error : 0.0150419864161


### MLP with PCA:

In [37]:
mlp_regressor = MLPRegressor(solver='lbfgs',hidden_layer_sizes = (200,5,5),alpha = 1.0, activation = 'relu', max_iter = 100)
#mlp_regressor.fit(X_train1,y_train1)

# Score model
#cross_valid_scores = cross_val_score(mlp_regressor, X_test, y_test,cv=5, n_jobs=1)

#print ("Test score is: \n", mlp_regressor.score(X_test, y_test)) 
#print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
#print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(mlp_regressor, X_test, y_test, cv=5 ))

In [38]:
#predictions7 = mlp_regressor.predict(X_test1)
#print("Mean Squared Error : " + str(mean_squared_error(y_test1,predictions7)))

### SVM:

In [39]:
from sklearn.svm import SVR

# SVM
svr_model = SVR(kernel='rbf', C=2, epsilon=0.05)
svr_model.fit(X, y)

# Score model
cross_valid_scores = cross_val_score(svr_model, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", svr_model.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(svr_model, X_test, y_test, cv=5 ))

Test score is: 
 0.986044145018
Cross validation test scores are: 
 [ 0.29159063  0.2800178   0.19889673  0.31789074  0.29590738]
Cross validation test scores mean is: 
 0.276860655368


In [40]:
predictions_8 = svr_model.predict(X_test)
print("Mean Squared Error : " + str(mean_squared_error(y_test,predictions_8)))

Mean Squared Error : 0.0021566394988


### Basic Decision tree:

In [41]:
# Define model
basic_decision_tree = DecisionTreeRegressor()

# Fit model
basic_decision_tree.fit(X, y)

# Score model
cross_valid_scores = cross_val_score(basic_decision_tree, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", basic_decision_tree.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(basic_decision_tree, X_test, y_test, cv=5 ))

Test score is: 
 0.999999996944
Cross validation test scores are: 
 [ 0.59845402  0.71014058  0.72930218  0.51766057  0.65055252]
Cross validation test scores mean is: 
 0.641221973964


In [42]:
predictions_5 = basic_decision_tree.predict(X_test)
print("Mean Squared Error : " + str(mean_squared_error(y_test,predictions_5)))

Mean Squared Error : 4.72276373045e-10


### XGBoost

In [43]:
learning_rates = np.arange(0.0, 0.09, 0.0015)
learning_rates
best_learning_rates = [0.0885, 0.0735, 0.0705, 0.0615, 0.06, 0.0585, 0.057, 0.0555]

In [44]:
# to find best parameters
from xgboost import XGBRegressor

actual_values = y_test

for i in best_learning_rates:
    n_estimators = 1000
    my_model = XGBRegressor(n_estimators=n_estimators,learning_rate=i)
    xgboost_model = my_model.fit(X_train, y_train, early_stopping_rounds=5, 
                 eval_set=[(X_test, y_test)], verbose=False)
    preds_xgboost = xgboost_model.predict(X_test)

    plt.scatter(preds_xgboost, actual_values, alpha=.75, color='b')
    plt.xlabel('Predicted Price')
    plt.ylabel('Actual Price')
    plt.title('XGBoost with  = {}'.format(n_estimators))
    overlay = 'R^2 is: {}\nRMSE is: {}\nlearning Rate is: {}'.format(
                        xgboost_model.score(X_test, y_test),
                        mean_squared_error(y_test, preds_xgboost),
                        i)
    plt.annotate(s=overlay,xy=(12.1,10.6),size='x-large')
    #plt.show()

In [45]:
# XGBoost
XGBoost = XGBRegressor(n_estimators = 1000,learning_rate=0.0585)
XGBoost.fit(X_train, y_train)

# Score model
cross_valid_scores = cross_val_score(XGBoost, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", XGBoost.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(XGBoost, X_test, y_test, cv=5 ))

Test score is: 
 0.904333573085
Cross validation test scores are: 
 [ 0.87026815  0.89698366  0.84196832  0.88867976  0.81518447]
Cross validation test scores mean is: 
 0.862616871264


In [46]:
prediction = XGBoost.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,prediction)))

Mean squared Error : 0.0147836155691


### Lasso regression:

In [47]:
# Lasso regression
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(lasso, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", lasso.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(lasso, X_test, y_test, cv=5 ))

Test score is: 
 0.910054829989
Cross validation test scores are: 
 [ 0.90894524  0.90800054  0.88153855  0.91038455  0.84606603]
Cross validation test scores mean is: 
 0.890986983248


In [48]:
predictions_lasso = lasso.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_lasso)))

Mean squared Error : 0.0138994928381


### Elastic Net Regression:

In [49]:
# Elastic Net regression
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
ENet.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(ENet, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", ENet.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(ENet, X_test, y_test, cv=5 ))

Test score is: 
 0.910029351699
Cross validation test scores are: 
 [ 0.90817287  0.90671866  0.88042411  0.91041826  0.84490656]
Cross validation test scores mean is: 
 0.890128090157


In [50]:
predictions_ENet = ENet.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_ENet)))

Mean squared Error : 0.0139034300735


### Gradient Boosting Regression:

In [51]:
# Gradient boosting regression
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

GBoost.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(GBoost, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", GBoost.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(GBoost, X_test, y_test, cv=5 ))

Test score is: 
 0.90059787427
Cross validation test scores are: 
 [ 0.87818367  0.87965382  0.86103002  0.90096045  0.79996106]
Cross validation test scores mean is: 
 0.863957802646


In [52]:
predictions_GBoost = GBoost.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_GBoost)))

Mean squared Error : 0.015360904143


### Light GBM:

In [53]:
# LightGBM
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
model_lgb.fit(X_train, y_train.ravel())
# Score model
cross_valid_scores = cross_val_score(model_lgb, X_test, y_test.ravel(),cv=5, n_jobs=1)

print ("Test score is: \n", model_lgb.score(X_test, y_test.ravel())) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(model_lgb, X_test, y_test, cv=5 ))

Test score is: 
 0.900108707523
Cross validation test scores are: 
 [ 0.86479062  0.89212898  0.85439393  0.8801254   0.79452261]
Cross validation test scores mean is: 
 0.857192309692


In [54]:
predictions_model_lgb = model_lgb.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions_model_lgb)))

Mean squared Error : 0.0154364965256


### AdaBoost:

In [55]:
from sklearn.ensemble import AdaBoostRegressor

ada_boost = AdaBoostRegressor(n_estimators=100)
ada_boost.fit(X_train, y_train)
# Score model
cross_valid_scores = cross_val_score(ada_boost, X_test, y_test,cv=5, n_jobs=1)

print ("Test score is: \n", ada_boost.score(X_test, y_test)) 
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean()) # mean of all cross validations tests
#print("Cross validation test prediction is: \n", cross_val_predict(model_lgb, X_test, y_test, cv=5 ))

Test score is: 
 0.842546999761
Cross validation test scores are: 
 [ 0.80575271  0.84585664  0.76866779  0.81159899  0.68821553]
Cross validation test scores mean is: 
 0.784018333418


### Avereging Models:

In [56]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

### Enet regression + GBoost + LGBM + Lasso regression (Best Score):

In [57]:
#Best version
averaged_models = AveragingModels(models = (ENet, GBoost, model_lgb, lasso))

averaged_models.fit(X_train, y_train.ravel())
print ("Test Score: \n", averaged_models.score(X_test, y_test))
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean())

Test Score: 
 0.912363451983
Cross validation test scores are: 
 [ 0.80575271  0.84585664  0.76866779  0.81159899  0.68821553]
Cross validation test scores mean is: 
 0.784018333418


### Ridge LR + XGBoost + LGBM + Lasso regression:

In [58]:

averaged_models_1 = AveragingModels(models = (linm, XGBoost, model_lgb, lasso))

averaged_models_1.fit(X_train, y_train.ravel())
print ("Test Score: \n", averaged_models_1.score(X_test, y_test.ravel()))
print("Cross validation test scores are: \n", cross_valid_scores) #return a np.array of each test trill
print("Cross validation test scores mean is: \n", cross_valid_scores.mean())

Test Score: 
 0.912688599295
Cross validation test scores are: 
 [ 0.80575271  0.84585664  0.76866779  0.81159899  0.68821553]
Cross validation test scores mean is: 
 0.784018333418


In [59]:
predictions = averaged_models_1.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(y_test,predictions)))

Mean squared Error : 0.0134924886865


### Making a submission

In [60]:
submission = pd.DataFrame()
%store -r test_ID
submission['Id'] = test_ID

In [61]:
feats = test_set.select_dtypes(
        include=[np.number]).interpolate()
feats.shape

(1459, 291)

In [62]:

predictions = averaged_models_1.predict(feats)

#scalery = MinMaxScaler().fit(predictions.reshape(-1,1))

#final_predictions = scalery.inverse_transform(predictions.reshape(-1,1))
final_predictions = predictions


In [63]:
submission['SalePrice'] = final_predictions
submission.head(10)

Unnamed: 0,Id,SalePrice
0,1461,11.693361
1,1462,11.954011
2,1463,12.159277
3,1464,12.204537
4,1465,12.159554
5,1466,12.045458
6,1467,12.052121
7,1468,12.004721
8,1469,12.139637
9,1470,11.707953


In [64]:
#submission.to_csv("new_avereged_submission.csv", index=False)