In [2]:
#######################
# Importing Libraries #
#######################

#--Adding Data Types--#
import numpy as np
import pandas as pd
#--Processing--#
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#--RandomForest--#
from sklearn.ensemble import RandomForestRegressor
#--Gradient Boosting--#
from sklearn.ensemble import GradientBoostingRegressor
#--Extreme Gradient Boosting--#
from xgboost import XGBRegressor
#--Linear ElasticNet Regression--#
from sklearn.linear_model import ElasticNet
#--Error Metric--#
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
#--Optimization--#
from bayes_opt import BayesianOptimization

In [3]:
pd.set_option('display.max_columns', 100)

# Data Preperation For Tree's

In [4]:
####################
# Loading the Data #
####################

train_clean = pd.read_csv("train_clean.csv")
test_clean = pd.read_csv("test_clean.csv")

print("Training Dimensions: ", train_clean.shape)
print("Testing Dimensions: ", test_clean.shape)

######################
# Getting Id Columns #
######################
colId = pd.read_csv("test.csv")
colId = colId.Id

Training Dimensions:  (1460, 65)
Testing Dimensions:  (1459, 64)


In [5]:
####################################
# Applying Transforms to Functions #
####################################

train_clean['SalePrice'] = train_clean['SalePrice'].apply(lambda x: np.log(x + 1))
train_clean['GarageArea'] = train_clean['GarageArea'].apply(lambda x: np.log(x + 1))
train_clean['X2ndFlrSF'] = train_clean['X2ndFlrSF'].apply(lambda x: np.log(x + 1))
train_clean['TotalBsmtSF'] = train_clean['TotalBsmtSF'].apply(lambda x: np.log(x + 1))
# train_clean['SaleType'] = train_clean['SaleType'].astype(str)
# train_clean['OverallCond'] = train_clean['OverallCond'].astype(str)

test_clean['GarageArea'] = test_clean['GarageArea'].apply(lambda x: np.log(x + 1))
test_clean['X2ndFlrSF'] = test_clean['X2ndFlrSF'].apply(lambda x: np.log(x + 1))
test_clean['TotalBsmtSF'] = test_clean['TotalBsmtSF'].apply(lambda x: np.log(x + 1))
# test_clean['SaleType'] = test_clean['SaleType'].astype(str)
# test_clean['OverallCond'] = test_clean['OverallCond'].astype(str)

In [7]:
###################################
# Adding New Columns to DataFrame #
###################################

# train_clean['BsmnQual_BsmtCond'] = train_clean['BsmtQual'] + train_clean['BsmtCond']
# train_clean['ExterQual_ExterCond'] = train_clean['ExterQual'] + train_clean['ExterCond']
# train_clean['GarageType_GarageQual'] = train_clean['GarageType'] + train_clean['GarageQual']
# train_clean['MoSold_YrSold'] = train_clean['MoSold'] + train_clean['YrSold']
# train_clean['SaleType_SaleCondition'] = train_clean['SaleType'] + train_clean['SaleCondition']
# train_clean['SaleType_OverallCond'] = train_clean['SaleType'] + train_clean['OverallCond']

# test_clean['BsmnQual_BsmtCond'] = test_clean['BsmtQual'] + test_clean['BsmtCond']
# test_clean['ExterQual_ExterCond'] = test_clean['ExterQual'] + test_clean['ExterCond']
# test_clean['GarageType_GarageQual'] = test_clean['GarageType'] + test_clean['GarageQual']
# test_clean['MoSold_YrSold'] = test_clean['MoSold'] + test_clean['YrSold']
# test_clean['SaleType_SaleCondition'] = test_clean['SaleType'] + test_clean['SaleCondition']
# test_clean['SaleType_OverallCond'] = test_clean['SaleType'] + test_clean['OverallCond']

In [8]:
###################################
# Applying Label Encoding to Data #
###################################

for c in train_clean.columns:
    if train_clean[c].dtype == 'object':
        le = LabelEncoder()
        # Need to convert the column type to string in order to encode missing values
        train_clean[c] = le.fit_transform(train_clean[c].astype(str))
for c in test_clean.columns:
    if test_clean[c].dtype == 'object':
        le = LabelEncoder()
        # Need to convert the column type to string in order to encode missing values
        test_clean[c] = le.fit_transform(test_clean[c].astype(str))

In [9]:
# train_clean.head()
# test_clean.head()

In [10]:
##################
# Splitting Data #  #(Only splitting the training data into two more sets called train_set, and test_set)
##################

train_set, test_set = train_test_split(train_clean, test_size = 0.2, random_state = 42)

print(train_set.shape)
print(test_set.shape)

X_train = train_set.drop("SalePrice", axis = 1)
Y_train = train_set.SalePrice

X_test = test_set.drop("SalePrice", axis = 1)
Y_test = test_set.SalePrice

#########################################
# The Full Original Training Set to Use #
#########################################

X_full_train = train_clean.drop("SalePrice", axis = 1)
Y_full_train = train_clean.SalePrice

(1168, 65)
(292, 65)


# Random Forest Section

In [11]:
#################################################
# RandomForest Model To See Best Features Split #
#################################################
mse = []
for i in range(1,65):
    randForest = RandomForestRegressor(n_estimators=1000, min_samples_leaf= 5, 
                                       max_features=i, oob_score = True, random_state=42, n_jobs=3)
    randForest.fit(X_train, Y_train)
    forestPredictions = randForest.predict(X_test)
    mse.append(mean_squared_error(Y_test, forestPredictions))

In [13]:
############################################
# Just to See The Index of the Lowest Tree #
############################################
lowest = 100000
index = 100000
for i,k in enumerate(mse):
    if k < lowest:
        lowest = k
        index = i
print(index, ':', lowest)

48 : 0.0224897570761


In [14]:
############################################
# Running the Forest on The Whole Training #
############################################

randForest = RandomForestRegressor(n_estimators=10000, min_samples_leaf= 5, 
                                       max_features=48, oob_score = True, random_state=42, n_jobs=3)
randForest.fit(X_full_train, Y_full_train)
# forestPredictions = randForest.predict(X_test)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=48, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10000, n_jobs=3, oob_score=True, random_state=42,
           verbose=0, warm_start=False)

In [15]:
###################################################
# Predicting The Kaggle DataSet with RandomForest #
###################################################

KagglePredictions = randForest.predict(test_clean)
KagglePredictions = [np.exp(x) - 1 for x in KagglePredictions]
pd.DataFrame({"SalePrice":KagglePredictions, "Id": colId}).to_csv("KaggleSubmitPythonForest.csv", index = False)
print(KagglePredictions[0:5])

[122753.72779751537, 152625.27336808146, 178481.14250168705, 182418.27386204046, 197268.41760919656]


# Gradient Boosting Section

In [16]:
################################
# Setting Up Gradient Boosting #
################################

def gradBoostCV(n_estimators, max_depth, max_features, min_samples_leaf):
    val = cross_val_score(GradientBoostingRegressor(
    n_estimators = int(n_estimators), max_depth = int(max_depth), min_samples_leaf = int(min_samples_leaf), 
        max_features = int(max_features), random_state = 42, learning_rate = 0.05
    ),X_train, Y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = 3).mean()
    return val

gradBoostBaye = BayesianOptimization(gradBoostCV, {
    'n_estimators': (100, 10000),
    'max_depth': (1,15),
    "max_features": (1,65),
    'min_samples_leaf': (2,10)
})

In [17]:
gradBoostBaye.maximize(n_iter=30)

[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   max_features |   min_samples_leaf |   n_estimators | 
    1 | 00m03s | [35m  -0.01674[0m | [32m     3.5806[0m | [32m       32.0371[0m | [32m            2.4146[0m | [32m      680.2051[0m | 
    2 | 00m21s | [35m  -0.01626[0m | [32m     5.1711[0m | [32m       38.3526[0m | [32m            6.2481[0m | [32m     1979.0189[0m | 
    3 | 00m09s |   -0.01691 |     12.1591 |         9.0590 |             6.1274 |      5360.8641 | 
    4 | 00m58s |   -0.01919 |     12.4161 |        64.9074 |             9.9310 |      8964.6972 | 
    5 | 00m04s |   -0.01812 |     11.3823 |         4.1912 |             3.2435 |      2682.1776 | 
[31mBayesian Optimization[0m
[94m--------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_dep



    9 | 01m08s |   -0.01894 |     14.9550 |        62.4174 |             9.9662 |      9957.7299 | 
   10 | 00m08s |   -0.01922 |      1.0748 |         2.6378 |             4.5007 |      1424.4686 | 
   11 | 00m13s |   -0.02045 |     10.5849 |        63.0691 |             7.6110 |       109.0739 | 
   12 | 00m47s |   -0.02153 |     14.7419 |        63.1997 |             6.6354 |      1057.1127 | 
   13 | 00m40s |   -0.01729 |      2.2298 |        64.0690 |             4.6209 |      6053.6558 | 
   14 | 00m16s |   -0.01652 |      2.9570 |         4.7527 |             8.4139 |      8229.8027 | 
   15 | 00m11s |   -0.01739 |      1.0040 |         3.9660 |             4.8930 |      3976.6927 | 
   16 | 00m13s |   -0.01881 |      1.2904 |         2.5428 |             2.6455 |      6493.6360 | 
   17 | 00m17s |   -0.01833 |      1.1745 |        62.9760 |             5.0624 |      3257.9672 | 


  " state: %s" % convergence_dict)


   18 | 00m16s |   -0.01826 |      1.1532 |         3.5306 |             6.5945 |      9618.5282 | 
   19 | 00m14s |   -0.01924 |      1.0501 |        54.4267 |             3.2683 |      2315.6812 | 
   20 | 00m11s |   -0.01760 |     14.4244 |         3.5033 |             5.9629 |       587.6630 | 
   21 | 00m15s |   -0.01969 |     13.5469 |         2.4114 |             2.9288 |      8636.2477 | 
   22 | 00m11s |   -0.01863 |      1.4134 |         1.3990 |             9.2130 |      3227.4706 | 
   23 | 00m37s |   -0.02364 |     11.5495 |        63.0892 |             3.7163 |      8020.5608 | 


  " state: %s" % convergence_dict)


   24 | 00m14s |   -0.01794 |      1.9573 |         2.2218 |             9.0062 |      7532.7359 | 
   25 | 00m22s |   -0.01860 |      1.1132 |        63.6928 |             7.2892 |      5416.2626 | 
   26 | 01m12s |   -0.01918 |     14.3352 |        63.0232 |             9.1120 |      4335.6579 | 


  " state: %s" % convergence_dict)


   27 | 00m14s |   -0.01824 |      1.0876 |         5.0451 |             9.0042 |      5829.4599 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   28 | 00m10s |   -0.01847 |     14.3115 |         5.0504 |             3.4412 |      3608.1079 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   29 | 00m10s |   -0.01760 |      3.7891 |         1.8701 |             3.2528 |      5017.0229 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   30 | 00m11s |   -0.01782 |      1.0000 |         1.0000 |            10.0000 |      9032.2535 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 00m26s |   -0.01660 |     15.0000 |        18.9174 |            10.0000 |      6109.3864 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   32 | 00m08s |   -0.01739 |      3.4564 |         1.8182 |             2.3462 |      1883.1191 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   33 | 00m11s |   -0.01855 |      1.6808 |        64.8069 |             7.6449 |      1834.2592 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   34 | 00m09s | [35m  -0.01621[0m | [32m     6.0967[0m | [32m        3.8362[0m | [32m            9.8754[0m | [32m     2053.0914[0m | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   35 | 00m09s |   -0.01792 |      1.6748 |         2.9668 |             9.8187 |      8366.6109 | 


In [18]:
print('Final Results')
print('Gradient Boosting: ', gradBoostBaye.res['max']['max_val'])
print('Gradient Boosting: ', gradBoostBaye.res['max']['max_params'])

Final Results
Gradient Boosting:  -0.016208618031
Gradient Boosting:  {'n_estimators': 2053.0914166680031, 'max_depth': 6.0966770324132415, 'max_features': 3.8361866099820716, 'min_samples_leaf': 9.8753783728969999}


In [19]:
################################
# MSE of Running the GradBoost #
################################

testGradBoost = GradientBoostingRegressor(n_estimators=2053, max_depth=6, max_features=4, random_state=42, min_samples_leaf = 10, learning_rate=0.05)
testGradBoost.fit(X_train, Y_train)
testGradBoostPredictions = testGradBoost.predict(X_test)
mean_squared_error(Y_test, testGradBoostPredictions) ** 0.5

0.14493461727703502

In [21]:
#############################
# Running Gradient Boosting #
#############################

gradBoost = GradientBoostingRegressor(n_estimators=2053, max_depth=6, max_features=4, random_state=42, min_samples_leaf = 10, learning_rate=0.05)
gradBoost.fit(X_full_train, Y_full_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=6, max_features=4,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=10,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=2053, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [22]:
########################################################
# Predicting The Kaggle DataSet with Gradient Boosting #
########################################################

KagglePredictionsGradBoost = gradBoost.predict(test_clean)
KagglePredictionsGradBoost = [np.exp(x) - 1 for x in KagglePredictionsGradBoost]
pd.DataFrame({"SalePrice":KagglePredictionsGradBoost, "Id": colId}).to_csv("KaggleSubmitPythonGradBoost.csv", index = False)
print(KagglePredictionsGradBoost[0:5])

[117153.32159245171, 153449.96971387681, 189146.32494695971, 196336.86585194804, 185043.01627257734]


# XG Boost Section

In [23]:
###########################
# Setting Up XG Boosting #
###########################

def xgBoostCV(n_estimators, max_depth, gamma, min_child_weight):
    val = cross_val_score(XGBRegressor(n_estimators=int(n_estimators), max_depth=int(max_depth), 
                                      gamma = gamma, min_child_weight = min_child_weight, learning_rate = 0.05),
                          X_train, Y_train, scoring = 'neg_mean_squared_error', 
                          cv = 10, n_jobs = 3).mean()
    return val

xgBoostBaye = BayesianOptimization(xgBoostCV, {
    'n_estimators': (100, 10000),
    'max_depth': (1,30),
    "gamma": (0,50),
    'min_child_weight': (1,50)
})

In [24]:
xgBoostBaye.maximize(n_iter=30)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     gamma |   max_depth |   min_child_weight |   n_estimators | 
    1 | 00m02s | [35m  -0.06261[0m | [32m  11.9017[0m | [32m    10.7694[0m | [32m           39.4609[0m | [32m      195.1660[0m | 
    2 | 00m32s |   -0.08452 |   22.2988 |     12.4642 |            14.7793 |      1393.3907 | 
    3 | 01m51s |   -0.12805 |   49.9858 |      9.1508 |            44.4861 |      7892.7456 | 
    4 | 01m36s | [35m  -0.04481[0m | [32m   5.4106[0m | [32m    24.6559[0m | [32m           29.1603[0m | [32m     4517.4215[0m | 
    5 | 01m30s |   -0.10083 |   30.6031 |      7.4938 |             4.4037 |      4648.2942 | 
[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     gamma |   max_depth |   min_child_weight |   

  " state: %s" % convergence_dict)


    9 | 01m13s |   -0.02131 |    0.3329 |     10.9412 |            48.9991 |      3851.2743 | 
   10 | 02m02s | [35m  -0.01733[0m | [32m   0.0000[0m | [32m    30.0000[0m | [32m           50.0000[0m | [32m     6459.4191[0m | 
   11 | 00m46s |   -0.02140 |    0.3447 |     25.3323 |            45.2507 |      2225.2212 | 
   12 | 00m30s |   -0.12805 |   50.0000 |      1.0000 |            50.0000 |      9526.7138 | 
   13 | 01m59s |   -0.01734 |    0.0000 |     30.0000 |            50.0000 |      5767.0668 | 
   14 | 37m27s | [35m  -0.01732[0m | [32m   0.0000[0m | [32m    30.0000[0m | [32m           50.0000[0m | [32m     8618.2341[0m | 
   15 | 00m17s |   -0.03590 |    2.9868 |     20.7251 |            44.1409 |       723.1147 | 
   16 | 00m35s |   -0.12806 |   49.7749 |     21.5569 |            49.5279 |      2642.3450 | 
   17 | 00m52s |   -0.08330 |   21.6742 |     16.9016 |            34.5698 |      2825.5382 | 
   18 | 00m32s |   -0.01956 |    0.0000 |     30.0000 |

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   28 | 01m39s |   -0.02767 |    1.2629 |     29.8592 |            46.5494 |      5418.0479 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   29 | 23m48s |   -0.01736 |    0.0000 |     30.0000 |            50.0000 |      5036.0929 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   30 | 02m19s |   -0.01732 |    0.0000 |     30.0000 |            50.0000 |      8845.2401 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 01m26s |   -0.01740 |    0.0000 |     30.0000 |            50.0000 |      3426.0241 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   32 | 00m18s |   -0.01737 |    0.0000 |      1.0000 |            50.0000 |      5127.7293 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   33 | 01m59s |   -0.01736 |    0.0000 |     30.0000 |            50.0000 |      5187.2194 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   34 | 00m56s |   -0.01734 |    0.0000 |     30.0000 |            50.0000 |      2061.9749 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   35 | 00m22s |   -0.01738 |    0.0000 |      1.0000 |            50.0000 |      5779.5854 | 


In [26]:
print('Final Results')
print('XG Boosting: ', xgBoostBaye.res['max']['max_val'])
print('XG Boosting: ', xgBoostBaye.res['max']['max_params'])

Final Results
XG Boosting:  -0.0173196140455
XG Boosting:  {'n_estimators': 5212.5805130619019, 'max_depth': 8.2932709034167491, 'gamma': 0.0, 'min_child_weight': 45.073588297632938}


In [27]:
###############################
# MSE of Running the XG Boost #
###############################

testXGBoost = XGBRegressor(n_estimators=5212, max_depth=8, gamma=0, min_child_weight = 45, learning_rate=0.05, nthread = 3)
testXGBoost.fit(X_train, Y_train)
testXGBoostPredictions = testXGBoost.predict(X_test)
mean_squared_error(Y_test, testXGBoostPredictions) ** 0.5

0.14249108398714966

In [28]:
#######################
# Running XG Boosting #
#######################

XGBoost = XGBRegressor(n_estimators=2312, max_depth=1, gamma = 0, min_child_weight=50, learning_rate=0.05, nthread = 3)
XGBoost.fit(X_full_train, Y_full_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=1,
       min_child_weight=50, missing=None, n_estimators=2312, nthread=3,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [29]:
##################################################
# Predicting The Kaggle DataSet with XG Boosting #
##################################################

KagglePredictionsXGBoost = XGBoost.predict(test_clean)
KagglePredictionsXGBoost = [np.exp(x) - 1 for x in KagglePredictionsXGBoost]
pd.DataFrame({"SalePrice":KagglePredictionsXGBoost, "Id": colId}).to_csv("KaggleSubmitPythonXGBoost.csv", index = False)
print(KagglePredictionsXGBoost[0:5])

[115965.375, 163844.09375, 183128.4375, 200638.640625, 196824.90625]


# Data Preparation For Linear

In [30]:
####################
# Loading the Data #
####################

train_clean_2 = pd.read_csv("train_clean.csv")
test_clean_2 = pd.read_csv("test_clean.csv")

print("Training Dimensions: ", train_clean_2.shape)
print("Testing Dimensions: ", test_clean_2.shape)

######################
# Getting Id Columns #
######################
colId_2 = pd.read_csv("test.csv")
colId_2 = colId_2.Id

Training Dimensions:  (1460, 65)
Testing Dimensions:  (1459, 64)


In [10]:
###################################
# Adding New Columns to DataFrame #
###################################

# train_clean_2['SaleType'] = train_clean_2['SaleType'].astype(str)
# train_clean_2['OverallCond'] = train_clean_2['OverallCond'].astype(str)

# test_clean_2['SaleType'] = test_clean_2['SaleType'].astype(str)
# test_clean_2['OverallCond'] = test_clean_2['OverallCond'].astype(str)


# train_clean_2['BsmnQual_BsmtCond'] = train_clean_2['BsmtQual'] + train_clean_2['BsmtCond']
# train_clean_2['ExterQual_ExterCond'] = train_clean_2['ExterQual'] + train_clean_2['ExterCond']
# train_clean_2['GarageType_GarageQual'] = train_clean_2['GarageType'] + train_clean_2['GarageQual']
# train_clean_2['MoSold_YrSold'] = train_clean_2['MoSold'] + train_clean_2['YrSold']
# train_clean_2['SaleType_SaleCondition'] = train_clean_2['SaleType'] + train_clean_2['SaleCondition']
# train_clean_2['SaleType_OverallCond'] = train_clean_2['SaleType'] + train_clean_2['OverallCond']

# test_clean_2['BsmnQual_BsmtCond'] = test_clean_2['BsmtQual'] + test_clean_2['BsmtCond']
# test_clean_2['ExterQual_ExterCond'] = test_clean_2['ExterQual'] + test_clean_2['ExterCond']
# test_clean_2['GarageType_GarageQual'] = test_clean_2['GarageType'] + test_clean_2['GarageQual']
# test_clean_2['MoSold_YrSold'] = test_clean_2['MoSold'] + test_clean_2['YrSold']
# test_clean_2['SaleType_SaleCondition'] = test_clean_2['SaleType'] + test_clean_2['SaleCondition']
# test_clean_2['SaleType_OverallCond'] = test_clean_2['SaleType'] + test_clean_2['OverallCond']

In [31]:
####################################
# Applying Transforms to Functions #
####################################

full_one_hot = pd.concat([train_clean_2, test_clean_2])
full_one_hot['SalePrice'] = full_one_hot['SalePrice'].apply(lambda x: np.log(x+1))
full_one_hot['GarageArea'] = full_one_hot['GarageArea'].apply(lambda x: np.log(x+1))
full_one_hot['X2ndFlrSF'] = full_one_hot['X2ndFlrSF'].apply(lambda x: np.log(x+1))
full_one_hot['TotalBsmtSF'] = full_one_hot['TotalBsmtSF'].apply(lambda x: np.log(x+1))

full_one_hot = pd.get_dummies(full_one_hot, drop_first=True, dummy_na=True)

one_hot_train = full_one_hot[0:1460]
one_hot_test = full_one_hot[1460:].drop('SalePrice', axis = 1)
# train_clean_2['SalePrice'] = train_clean_2['SalePrice'].apply(lambda x: np.log(x + 1))
# train_clean_2['GarageArea'] = train_clean_2['GarageArea'].apply(lambda x: np.log(x + 1))
# train_clean_2['X2ndFlrSF'] = train_clean_2['X2ndFlrSF'].apply(lambda x: np.log(x + 1))
# train_clean_2['TotalBsmtSF'] = train_clean_2['TotalBsmtSF'].apply(lambda x: np.log(x + 1))

# test_clean_2['GarageArea'] = test_clean_2['GarageArea'].apply(lambda x: np.log(x + 1))
# test_clean_2['X2ndFlrSF'] = test_clean_2['X2ndFlrSF'].apply(lambda x: np.log(x + 1))
# test_clean_2['TotalBsmtSF'] = test_clean_2['TotalBsmtSF'].apply(lambda x: np.log(x + 1))

# one_hot_train = pd.get_dummies(train_clean_2, drop_first=True, dummy_na=True)
# one_hot_test = pd.get_dummies(test_clean_2, drop_first=True, dummy_na=True)

In [68]:
one_hot_train.tail()

Unnamed: 0,BedroomAbvGr,BsmtFinSF1,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,Fireplaces,FullBath,GarageArea,GarageCars,GarageYrBlt,GrLivArea,HalfBath,KitchenAbvGr,LotArea,LotFrontage,MSSubClass,MasVnrArea,MoSold,OpenPorchSF,OverallCond,OverallQual,SalePrice,ScreenPorch,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,X1stFlrSF,X2ndFlrSF,YearBuilt,YearRemodAdd,YrSold,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BldgType_nan,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_Po,BsmtCond_TA,BsmtCond_nan,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_nan,BsmtFinType1_Absent,BsmtFinType1_BLQ,BsmtFinType1_GLQ,...,MasVnrType_nan,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Neighborhood_nan,PavedDrive_P,PavedDrive_Y,PavedDrive_nan,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofStyle_nan,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan
1455,3,0.0,0,0,6.860664,0.0,1,2,6.133398,2,1,7.407318,1,1,8.976894,4.143135,60,0.0,8,3.713572,5,6,12.072547,0.0,7,6.860664,0.0,6.860664,6.543912,1999,2000,2007,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1456,3,6.673298,1,0,6.380123,0.0,2,2,6.216606,2,1,7.637234,0,1,9.486152,4.454347,20,4.787492,2,0.0,6,6,12.254868,0.0,7,7.341484,5.857933,7.637234,0.0,1978,1988,2010,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1457,4,5.620401,0,0,6.777647,0.0,2,2,5.533389,1,1,7.758333,0,1,9.109746,4.204693,70,0.0,5,4.110874,9,7,12.493133,0.0,9,7.050123,0.0,7.080868,7.050123,1941,2006,2010,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1458,2,3.912023,1,0,0.0,4.727388,0,1,5.484797,1,1,6.98379,0,1,9.181735,4.234107,20,0.0,4,0.0,6,5,11.864469,0.0,5,6.98379,5.905362,6.98379,0.0,1950,1996,2010,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1459,3,6.72263,1,0,4.919981,0.0,0,1,5.624018,1,1,7.136483,1,1,9.204121,4.330733,20,0.0,6,4.234107,6,5,11.90159,0.0,6,7.136483,6.602588,7.136483,0.0,1965,1965,2008,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


In [33]:
##################
# Splitting Data #  #(Only splitting the training data into two more sets called train_set, and test_set)
##################

train_set_2, test_set_2 = train_test_split(one_hot_train, test_size = 0.2, random_state = 42)

print("Train Shape: ", train_set_2.shape)
print("Test Shape: ", test_set_2.shape)

X_train_2 = train_set_2.drop("SalePrice", axis = 1)
Y_train_2 = train_set_2.SalePrice

X_test_2 = test_set_2.drop("SalePrice", axis = 1)
Y_test_2 = test_set_2.SalePrice

#########################################
# The Full Original Training Set to Use #
#########################################

X_full_train_2 = one_hot_train.drop("SalePrice", axis = 1)
Y_full_train_2 = one_hot_train.SalePrice
print("Full Shape: ", X_full_train_2.shape)

Train Shape:  (1168, 250)
Test Shape:  (292, 250)
Full Shape:  (1460, 249)


# Linear Regression Section

In [69]:
def LinRegCV(alpha, l1_ratio):
    val = cross_val_score(ElasticNet(alpha = alpha, l1_ratio = l1_ratio, random_state=42),
                         X_train_2, Y_train_2, scoring = 'neg_mean_squared_error', 
                          cv = 10, n_jobs = 3).mean()
    return val

LinRegBaye = BayesianOptimization(LinRegCV,{
    'alpha': (0,1),
    'l1_ratio': (0,1)
})

In [70]:
LinRegBaye.maximize(n_iter=30)

[31mInitialization[0m
[94m------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   l1_ratio | 
    1 | 00m00s | [35m-132.15608[0m | [32m   0.5908[0m | [32m    0.6247[0m | 
    2 | 00m00s | [35m  -4.08939[0m | [32m   0.0030[0m | [32m    0.1142[0m | 
    3 | 00m00s |  -17.27027 |    0.6051 |     0.0677 | 
    4 | 00m00s | -183.31642 |    0.3775 |     0.8329 | 
    5 | 00m00s | -106.48408 |    0.6524 |     0.3642 | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   l1_ratio | 
    6 | 00m00s | [35m  -1.80595[0m | [32m   0.0139[0m | [32m    0.0027[0m | 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)


    7 | 00m01s | [35m  -0.30658[0m | [32m   0.0000[0m | [32m    0.0000[0m | 

    9 | 00m00s | -116.72580 |    1.0000 |     1.0000 | 




   10 | 00m02s |   -4.73056 |    1.0000 |     0.0000 | 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)


   11 | 00m01s |   -4.91471 |    0.0000 |     0.4100 | 
   12 | 00m00s |  -71.71224 |    1.0000 |     0.2113 | 




   13 | 00m01s |   -0.83542 |    0.2979 |     0.0000 | 
   14 | 00m00s |  -62.74803 |    0.2794 |     0.0804 | 
   15 | 00m01s |   -0.63396 |    0.3504 |     0.0005 | 




   16 | 00m02s |   -0.42854 |    0.3242 |     0.0000 | 





   18 | 00m02s |   -3.00298 |    0.3780 |     0.0000 | 




  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)


   21 | 00m02s |   -7.98471 |    0.0000 |     0.6294 | 





   23 | 00m02s |   -2.42576 |    0.3400 |     0.0000 | 



   27 | 00m01s | -101.93034 |    0.9375 |     0.5918 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)





  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)





  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   30 | 00m01s |  -76.47505 |    0.4478 |     0.7099 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)










  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)





  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)





  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)





In [71]:
print('Final Results')
print('Linear Regression: ', LinRegBaye.res['max']['max_val'])
print('Linear Regression: ', LinRegBaye.res['max']['max_params'])

Final Results
Linear Regression:  -0.0813131519646
Linear Regression:  {'alpha': 0.34540609895217467, 'l1_ratio': 0.3449065268224436}


In [72]:
########################################
# MSE of Running the Linear Regression #
########################################

testLinReg = ElasticNet(alpha = 0.1145, l1_ratio = 0.0001, normalize = True, random_state=42)
testLinReg.fit(X_train_2, Y_train_2)
testLinRegPredictions = testLinReg.predict(X_test_2)
mean_squared_error(Y_test_2, testLinRegPredictions) ** 0.5

0.40384835031844579

In [55]:
#############################
# Running Linear Regression #
#############################

LinReg = ElasticNet(alpha = 0.1145, l1_ratio = 0.0001, random_state=42)
LinReg.fit(X_full_train_2, Y_full_train_2)

ElasticNet(alpha=0.1145, copy_X=True, fit_intercept=True, l1_ratio=0.0001,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

In [19]:
########################################################
# Predicting The Kaggle DataSet with Linear Regression #
########################################################

KagglePredictionsLinReg = LinReg.predict(one_hot_test)
KagglePredictionsLinReg = [np.exp(x) - 1 for x in KagglePredictionsLinReg]
pd.DataFrame({"SalePrice":KagglePredictionsLinReg, "Id": colId}).to_csv("KaggleSubmitPythonLinReg.csv", index = False)
print(KagglePredictionsLinReg[0:5])

[131916.00941133834, 136634.70584519126, 182618.13776871475, 195083.05667648453, 161032.24151391396]


# Support Vector Section