In [13]:
import pandas as pd
import os
import sklearn
import math
import joblib
from sklearn import tree
from sklearn import ensemble 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import feature_selection
from sklearn import neighbors
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import metrics
from sklearn.compose import TransformedTargetRegressor
from feature_engine.selection import DropDuplicateFeatures, DropCorrelatedFeatures
import numpy as np
import xgboost as xgb
import lightgbm as lgbm
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from shutil import rmtree

In [14]:
# Notebook-wide configuration.
# Silence every warning category so the long training logs below stay readable.
warnings.filterwarnings('ignore')
# Ask scikit-learn to render estimators as diagrams when they are displayed.
sklearn.set_config(display="diagram")
# Folder holding train.csv / test.csv and the pickled models. Set the
# HOUSE_PRICES_DIR environment variable to run on another machine; the
# original hard-coded location remains the default.
# NOTE: `dir` shadows the Python builtin, but later cells reference this
# name, so it is kept as-is.
dir = os.environ.get("HOUSE_PRICES_DIR", "C:/Users/pc/Downloads/ai-level1/house-prices")

In [15]:
# Load the training split, then report its dimensions and per-column summary.
house_train = pd.read_csv(os.path.join(dir, "train.csv"))
# A bare `house_train.shape` in the middle of a cell is never displayed,
# so print it explicitly.
print(house_train.shape)
house_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [16]:
#Create new features
house_train['TotalSF'] = house_train['TotalBsmtSF'] + house_train['GrLivArea'] + house_train['1stFlrSF'] + house_train['2ndFlrSF']
house_train['TotalBathRooms'] = house_train['FullBath'] + house_train['BsmtFullBath'] + 0.5 * house_train['HalfBath'] +  0.5 * house_train['BsmtHalfBath']
house_train['TotalPorchSF'] = house_train['OpenPorchSF'] + house_train['3SsnPorch'] + house_train['EnclosedPorch'] + house_train['ScreenPorch']

In [17]:
def cont_selector(df):
    """Return the column labels of `df` whose dtype is numeric."""
    numeric_part = df.select_dtypes(include='number')
    return numeric_part.columns

def cat_selector(df):
    """Return the column labels of `df` whose dtype is not numeric."""
    non_numeric_part = df.select_dtypes(exclude='number')
    return non_numeric_part.columns

def cast_to_cat(df, features):
    """Convert each column named in `features` to the pandas 'category' dtype.

    `df` is modified in place, one column at a time; nothing is returned.
    """
    for column_name in features:
        as_category = df[column_name].astype('category')
        df[column_name] = as_category

In [18]:
# Show which columns are currently numeric and which are not.
print(cont_selector(house_train), cat_selector(house_train), sep="\n")

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice', 'TotalSF', 'TotalBathRooms',
       'TotalPorchSF'],
      dtype='object')
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure',

In [19]:
# MSSubClass is stored as an integer but is cast to a category here, together
# with every column that is currently non-numeric.
features_to_cast = ["MSSubClass", *cat_selector(house_train)]
cast_to_cat(df=house_train, features=features_to_cast)
# Re-print both selections to confirm the effect of the cast.
print(cont_selector(house_train), cat_selector(house_train), sep="\n")

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice', 'TotalSF', 'TotalBathRooms',
       'TotalPorchSF'],
      dtype='object')
Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',

In [20]:
def get_features_to_drop_on_missingdata(df, threshold) :
    """List the columns of `df` whose fraction of missing values exceeds `threshold`.

    The comparison is strict: a column whose missing fraction equals
    `threshold` is not included. Columns are returned in frame order.
    """
    missing_fraction = df.isna().sum() / len(df)
    return [
        column
        for column, fraction in missing_fraction.items()
        if fraction > threshold
    ]

def drop_features(df, features):
    """Remove the columns named in `features` from `df` in place.

    Always returns None, because the in-place drop has no return value.
    """
    df.drop(columns=features, inplace=True)
    return None

In [21]:
# Columns with more than 25% missing values are removed from the training frame.
# The list is kept in a variable because the test frame is trimmed with it later.
missing_data_features_to_drop = get_features_to_drop_on_missingdata(
    df=house_train,
    threshold=0.25,
)
print(missing_data_features_to_drop)
drop_features(df=house_train, features=missing_data_features_to_drop)
house_train.info()

['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 78 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Id              1460 non-null   int64   
 1   MSSubClass      1460 non-null   category
 2   MSZoning        1460 non-null   category
 3   LotFrontage     1201 non-null   float64 
 4   LotArea         1460 non-null   int64   
 5   Street          1460 non-null   category
 6   LotShape        1460 non-null   category
 7   LandContour     1460 non-null   category
 8   Utilities       1460 non-null   category
 9   LotConfig       1460 non-null   category
 10  LandSlope       1460 non-null   category
 11  Neighborhood    1460 non-null   category
 12  Condition1      1460 non-null   category
 13  Condition2      1460 non-null   category
 14  BldgType        1460 non-null   category
 15  HouseStyle      1460 non-null   c

In [22]:
# Keep the label aside before removing it (and the row identifier) from the features.
target = house_train.loc[:, 'SalePrice']
features_to_drop = ['Id', 'SalePrice']
drop_features(df=house_train, features=features_to_drop)
house_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   MSSubClass      1460 non-null   category
 1   MSZoning        1460 non-null   category
 2   LotFrontage     1201 non-null   float64 
 3   LotArea         1460 non-null   int64   
 4   Street          1460 non-null   category
 5   LotShape        1460 non-null   category
 6   LandContour     1460 non-null   category
 7   Utilities       1460 non-null   category
 8   LotConfig       1460 non-null   category
 9   LandSlope       1460 non-null   category
 10  Neighborhood    1460 non-null   category
 11  Condition1      1460 non-null   category
 12  Condition2      1460 non-null   category
 13  BldgType        1460 non-null   category
 14  HouseStyle      1460 non-null   category
 15  OverallQual     1460 non-null   int64   
 16  OverallCond     1460 non-null   int64   
 17  YearBuilt     

In [23]:
#define train, target data and metric
X_train = house_train
y_train = target
scoring = metrics.make_scorer(metrics.root_mean_squared_log_error, greater_is_better=False)

In [24]:
def _load_and_show(file_name):
    """Load one persisted model from the data folder, print it and return it.

    joblib.load unpickles the file, so only load files from a trusted source.
    """
    loaded = joblib.load(os.path.join(dir, file_name))
    print(loaded)
    return loaded


# Previously tuned base models, loaded and printed in the original order.
lasso_best_model = _load_and_show('houseprice_lasso.pkl')
ridge_best_model = _load_and_show('houseprice_ridge.pkl')
cb_best_model = _load_and_show('houseprice_cb.pkl')
hgb_best_model = _load_and_show('houseprice_hgb.pkl')
xgb_best_model = _load_and_show('houseprice_xgb.pkl')
lgb_best_model = _load_and_show('houseprice_lgb.pkl')

Pipeline(memory='C:/Users/pc/Downloads/ai-level1/house-prices\\pipeline_cache',
         steps=[('linear_preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  Index(['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandConto...
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'Yr

In [25]:
#grid search of voting regressor-I
estimators1 = [('lasso', lasso_best_model), ('ridge', ridge_best_model), ('hgb', hgb_best_model), ('xgb', xgb_best_model), ('lgb', lgb_best_model), ('cb', cb_best_model) ]
voting_regressor1 = ensemble.VotingRegressor(estimators=estimators1)
voting_params1 = {
            'weights' : [[1, 1, 2, 2, 2, 3], [1, 1, 1, 1, 1, 2], [1, 1, 1, 1, 1, 1]]
        }
cv = model_selection.KFold(10)
voting_grid1 = model_selection.GridSearchCV(voting_regressor1, voting_params1, cv=cv, scoring=scoring, n_jobs=4)
voting_grid1.fit(X_train, y_train)
print(voting_grid1.best_params_)
print(voting_grid1.best_score_)
print(voting_grid1.best_estimator_)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3873
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 72
[LightGBM] [Info] Start training from score 180921.195890
0:	learn: 78906.1285749	total: 144ms	remaining: 11m 59s
1:	learn: 78443.9597312	total: 150ms	remaining: 6m 15s
2:	learn: 77966.4062326	total: 156ms	remaining: 4m 19s
3:	learn: 77456.0137825	total: 162ms	remaining: 3m 22s
4:	learn: 76924.7021215	total: 169ms	remaining: 2m 48s
5:	learn: 76420.9375246	total: 175ms	remaining: 2m 25s
6:	learn: 75912.7719441	total: 181ms	remaining: 2m 9s
7:	learn: 75445.6004742	total: 187ms	remaining: 1m 56s
8:	learn: 74934.0744692	total: 193ms	remaining: 1m 47s
9:	learn: 74460.1829433	total: 199ms	remaining: 1m 39s
10:	learn: 73988.7076070	total: 205ms	re

In [36]:
#grid search of voting regressor-II
estimators2 = [
               ('ridge', ridge_best_model), 
               ('cb', cb_best_model) 
              ]
voting_regressor2 = ensemble.VotingRegressor(estimators=estimators2)
voting_params2 = {
            'weights' : [[1, 1], [1, 2], [1, 3], [2, 1], [0.5, 0.5]]
        }
cv = model_selection.KFold(10)
voting_grid2 = model_selection.GridSearchCV(voting_regressor2, voting_params2, cv=cv, scoring=scoring, n_jobs=8)
voting_grid2.fit(X_train, y_train)
print(voting_grid2.best_params_)
print(voting_grid2.best_score_)
print(voting_grid2.best_estimator_)

0:	learn: 78906.1285749	total: 12.8ms	remaining: 1m 4s
1:	learn: 78443.9597312	total: 25.6ms	remaining: 1m 3s
2:	learn: 77966.4062326	total: 39.4ms	remaining: 1m 5s
3:	learn: 77456.0137825	total: 55ms	remaining: 1m 8s
4:	learn: 76924.7021215	total: 67.9ms	remaining: 1m 7s
5:	learn: 76420.9375246	total: 77ms	remaining: 1m 4s
6:	learn: 75912.7719441	total: 88.5ms	remaining: 1m 3s
7:	learn: 75445.6004742	total: 100ms	remaining: 1m 2s
8:	learn: 74934.0744692	total: 111ms	remaining: 1m 1s
9:	learn: 74460.1829433	total: 121ms	remaining: 1m
10:	learn: 73988.7076070	total: 132ms	remaining: 59.7s
11:	learn: 73549.4718425	total: 143ms	remaining: 59.3s
12:	learn: 73079.0240681	total: 152ms	remaining: 58.5s
13:	learn: 72631.6318557	total: 162ms	remaining: 57.7s
14:	learn: 72190.2169136	total: 174ms	remaining: 57.7s
15:	learn: 71730.5186864	total: 184ms	remaining: 57.2s
16:	learn: 71298.3793959	total: 194ms	remaining: 56.8s
17:	learn: 70820.9081043	total: 218ms	remaining: 1m
18:	learn: 70416.323780

In [44]:
#grid search of voting regressor-III
estimators3 = [
               ('lasso', lasso_best_model), 
               ('ridge', ridge_best_model), 
               ('cb', cb_best_model) 
              ]
voting_regressor3 = ensemble.VotingRegressor(estimators=estimators3)
voting_params3 = {
            'weights' : [ [1, 1, 1], [1, 1, 2], [1, 1, 3], [1, 2, 3] ]
        }
cv = model_selection.KFold(10)
voting_grid3 = model_selection.GridSearchCV(voting_regressor3, voting_params3, cv=cv, scoring=scoring, n_jobs=8)
voting_grid3.fit(X_train, y_train)
print(voting_grid3.best_params_)
print(voting_grid3.best_score_)
print(voting_grid3.best_estimator_)

0:	learn: 78906.1285749	total: 21.4ms	remaining: 1m 47s
1:	learn: 78443.9597312	total: 28.9ms	remaining: 1m 12s
2:	learn: 77966.4062326	total: 35.5ms	remaining: 59.1s
3:	learn: 77456.0137825	total: 43.4ms	remaining: 54.2s
4:	learn: 76924.7021215	total: 50.3ms	remaining: 50.3s
5:	learn: 76420.9375246	total: 57.6ms	remaining: 47.9s
6:	learn: 75912.7719441	total: 64.9ms	remaining: 46.3s
7:	learn: 75445.6004742	total: 73.2ms	remaining: 45.6s
8:	learn: 74934.0744692	total: 80.2ms	remaining: 44.5s
9:	learn: 74460.1829433	total: 87.4ms	remaining: 43.6s
10:	learn: 73988.7076070	total: 95.6ms	remaining: 43.4s
11:	learn: 73549.4718425	total: 103ms	remaining: 42.8s
12:	learn: 73079.0240681	total: 110ms	remaining: 42.3s
13:	learn: 72631.6318557	total: 117ms	remaining: 41.8s
14:	learn: 72190.2169136	total: 125ms	remaining: 41.4s
15:	learn: 71730.5186864	total: 132ms	remaining: 41.3s
16:	learn: 71298.3793959	total: 141ms	remaining: 41.3s
17:	learn: 70820.9081043	total: 148ms	remaining: 41s
18:	learn

In [None]:
# Persist the refitted best estimator of each voting search next to the data.
voting1_path = os.path.join(dir, "houseprice_voting1.pkl")
voting2_path = os.path.join(dir, "houseprice_voting2.pkl")
voting3_path = os.path.join(dir, "houseprice_voting3.pkl")
joblib.dump(voting_grid1.best_estimator_, voting1_path)
joblib.dump(voting_grid2.best_estimator_, voting2_path)
joblib.dump(voting_grid3.best_estimator_, voting3_path)

In [49]:
#grid search of stacking regressor-I
stacking_regressor1 = ensemble.StackingRegressor(estimators=estimators2, final_estimator=linear_model.Ridge())
stacking_params1 = {
            'final_estimator__alpha' : [0, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20]
        }
cv = model_selection.KFold(10)
stacking_grid1 = model_selection.GridSearchCV(stacking_regressor1, stacking_params1, cv=cv, scoring=scoring, n_jobs=8)
stacking_grid1.fit(X_train, y_train)
print(stacking_grid1.best_params_)
print(stacking_grid1.best_score_)
print(stacking_grid1.best_estimator_)

0:	learn: 78906.1285749	total: 10.7ms	remaining: 53.5s
1:	learn: 78443.9597312	total: 21.5ms	remaining: 53.7s
2:	learn: 77966.4062326	total: 31.3ms	remaining: 52.1s
3:	learn: 77456.0137825	total: 40.4ms	remaining: 50.5s
4:	learn: 76924.7021215	total: 49.7ms	remaining: 49.6s
5:	learn: 76420.9375246	total: 58.1ms	remaining: 48.4s
6:	learn: 75912.7719441	total: 67ms	remaining: 47.8s
7:	learn: 75445.6004742	total: 76.4ms	remaining: 47.7s
8:	learn: 74934.0744692	total: 86.2ms	remaining: 47.8s
9:	learn: 74460.1829433	total: 94.5ms	remaining: 47.1s
10:	learn: 73988.7076070	total: 103ms	remaining: 46.7s
11:	learn: 73549.4718425	total: 112ms	remaining: 46.5s
12:	learn: 73079.0240681	total: 121ms	remaining: 46.3s
13:	learn: 72631.6318557	total: 129ms	remaining: 46.1s
14:	learn: 72190.2169136	total: 139ms	remaining: 46.3s
15:	learn: 71730.5186864	total: 148ms	remaining: 46.2s
16:	learn: 71298.3793959	total: 158ms	remaining: 46.3s
17:	learn: 70820.9081043	total: 167ms	remaining: 46.3s
18:	learn: 7

In [54]:
#grid search of stacking regressor-II
target_transformer = preprocessing.PowerTransformer(method='box-cox')

ridge_with_trans_target = TransformedTargetRegressor(
    regressor = linear_model.Ridge(),
    transformer= target_transformer
)

stacking_regressor2 = ensemble.StackingRegressor(estimators=estimators1, final_estimator=ridge_with_trans_target)
stacking_params2 = {
            'final_estimator__regressor__alpha' : [0, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20]
        }
cv = model_selection.KFold(10)
stacking_grid2 = model_selection.GridSearchCV(stacking_regressor2, stacking_params2, cv=cv, scoring=scoring, n_jobs=8)
stacking_grid2.fit(X_train, y_train)
print(stacking_grid2.best_params_)
print(stacking_grid2.best_score_)
print(stacking_grid2.best_estimator_)

KeyboardInterrupt: 

In [None]:
#grid search of stacking regressor-III
stacking_regressor3 = ensemble.StackingRegressor(estimators=estimators2, final_estimator=xgb.XGBRegressor())
stacking_params3 = {
            'final_estimator__' : [0, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20]
        }
cv = model_selection.KFold(10)
stacking_grid3 = model_selection.GridSearchCV(stacking_regressor3, stacking_params3, cv=cv, scoring=scoring, n_jobs=8)
stacking_grid3.fit(X_train, y_train)
print(stacking_grid3.best_params_)
print(stacking_grid3.best_score_)
print(stacking_grid3.best_estimator_)

In [53]:
# Persist the tuned stacking ensembles.
# The stacking-II search may have been interrupted before finishing; an
# unfitted GridSearchCV has no `best_estimator_`, so dump it only when the
# fit completed instead of failing the whole cell.
if hasattr(stacking_grid2, "best_estimator_"):
    joblib.dump(stacking_grid2.best_estimator_, os.path.join(dir, "houseprice_stacking2.pkl"))
# houseprice_stacking1.pkl is the file reloaded for the final predictions,
# so this dump must stay active.
joblib.dump(stacking_grid1.best_estimator_, os.path.join(dir, "houseprice_stacking1.pkl"))

['C:/Users/pc/Downloads/ai-level1/house-prices\\houseprice_stacking1.pkl']

In [26]:
# Read the test split, print its shape and preview the first rows.
test_csv_path = os.path.join(dir, "test.csv")
house_test = pd.read_csv(test_csv_path)
print(house_test.shape)
house_test.head()

(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [27]:
#Create new features
house_test['TotalSF'] = house_test['TotalBsmtSF'] + house_test['GrLivArea'] + house_test['1stFlrSF'] + house_train['2ndFlrSF']
house_test['TotalBathRooms'] = house_test['FullBath'] + house_test['BsmtFullBath'] + 0.5 * house_test['HalfBath'] +  0.5 * house_test['BsmtHalfBath']
house_test['TotalPorchSF'] = house_test['OpenPorchSF'] + house_test['3SsnPorch'] + house_test['EnclosedPorch'] + house_test['ScreenPorch']

In [28]:
# Cast the same columns to 'category' on the test frame as on the training frame.
cast_to_cat(df=house_test, features=features_to_cast)

In [29]:
# Print the test frame's column summary after feature creation and casting.
house_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 83 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Id              1459 non-null   int64   
 1   MSSubClass      1459 non-null   category
 2   MSZoning        1455 non-null   category
 3   LotFrontage     1232 non-null   float64 
 4   LotArea         1459 non-null   int64   
 5   Street          1459 non-null   category
 6   Alley           107 non-null    category
 7   LotShape        1459 non-null   category
 8   LandContour     1459 non-null   category
 9   Utilities       1457 non-null   category
 10  LotConfig       1459 non-null   category
 11  LandSlope       1459 non-null   category
 12  Neighborhood    1459 non-null   category
 13  Condition1      1459 non-null   category
 14  Condition2      1459 non-null   category
 15  BldgType        1459 non-null   category
 16  HouseStyle      1459 non-null   category
 17  OverallQual   

In [30]:
# Remove from the test frame the same sparse columns that were dropped from
# the training frame (the list was computed on the training data).
drop_features(df=house_test, features=missing_data_features_to_drop)

In [66]:
# Reload the persisted stacking-I ensemble that produces the submission predictions.
final_estimator_path = os.path.join(dir, 'houseprice_stacking1.pkl')
final_estimator = joblib.load(final_estimator_path)

In [67]:
# Predict a sale price for every test row, round to two decimals and store
# the result as a new column, then preview the frame.
predicted_prices = final_estimator.predict(house_test)
house_test['SalePrice'] = np.round(predicted_prices, decimals=2)
house_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalSF,TotalBathRooms,TotalPorchSF,SalePrice
0,1461,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,6,2010,WD,Normal,3528.0,1.0,120,128873.3
1,1462,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,12500,6,2010,WD,Normal,3987.0,1.5,36,163901.9
2,1463,60,RL,74.0,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,3,2010,WD,Normal,4351.0,2.5,34,192143.08
3,1464,60,RL,78.0,9978,Pave,IR1,Lvl,AllPub,Inside,...,0,0,6,2010,WD,Normal,4212.0,2.5,36,195513.13
4,1465,120,RL,43.0,5005,Pave,IR1,HLS,AllPub,Inside,...,0,0,1,2010,WD,Normal,4893.0,2.0,226,198473.03


In [68]:
# Write the submission file with only the Id and SalePrice columns and no index.
submission_path = os.path.join(dir, "house_prices_submit.csv")
house_test.to_csv(submission_path, index=False, columns=["Id", "SalePrice"])