In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,chi2

In [2]:
X_test=pd.read_csv("/Users/MacAir/Documents/DataScience/Datasets/home-data-for-ml-course/test.csv",index_col='Id')
traindata=pd.read_csv("/Users/MacAir/Documents/DataScience/Datasets/home-data-for-ml-course/train.csv",index_col='Id')

In [3]:
y=traindata["SalePrice"]
X=traindata.drop("SalePrice", axis=1)

In [4]:
X.shape

(1460, 79)

In [5]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [6]:
low_cardinality_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and X[cname].dtype == "object"]
numeric_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
print(len(numeric_cols))
print(len(low_cardinality_cols))
my_cols = numeric_cols+low_cardinality_cols
X=X[my_cols]
X_test=X_test[my_cols]

36
40


In [7]:
X_test.shape

(1459, 76)

In [8]:
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=0)
bad_cols=[col for col in low_cardinality_cols if (bool(set(X_valid[col].unique()).difference(set(X_train[col].unique()))))]
X_train.drop(columns=bad_cols,inplace=True)
X_valid.drop(columns=bad_cols,inplace=True)
X_test.drop(columns=bad_cols,inplace=True)
X.drop(columns=bad_cols,inplace=True)
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
print(bad_cols)

(1168, 72)
(292, 72)
(1459, 72)
['Condition2', 'RoofMatl', 'Functional', 'MiscFeature']


In [9]:
#Imputer
trf1=ColumnTransformer([
    ("Numerical Imputer",SimpleImputer(strategy='median'),slice(0,36)),
    ("Text Imputer",SimpleImputer(strategy='most_frequent'),slice(36,76))
],remainder='passthrough')

In [10]:
#Encoder column
trf2=ColumnTransformer([
    ("Ordinal Encoding",OrdinalEncoder(),slice(36,76))
],remainder='passthrough')

In [11]:
#Scaler
trf3 = ColumnTransformer([
    ('scale',StandardScaler(),slice(0,76))
])

In [12]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3)
])

In [13]:
from sklearn import set_config
set_config(display='diagram')
X_train_final=pd.DataFrame(pipe.fit_transform(X_train),columns=X_train.columns)
X_valid_final=pd.DataFrame(pipe.fit_transform(X_valid),columns=X_valid.columns)
X_test_final=pd.DataFrame(pipe.fit_transform(X_test),columns=X_test.columns)

In [14]:
# train the model
model = XGBRegressor(n_estimators=5000,learning_rate=0.05,n_jobs=8)

In [15]:
model.fit(X_train_final, y_train, 
             early_stopping_rounds=10, 
             eval_set=[(X_valid_final, y_valid)], 
             verbose=True)

[0]	validation_0-rmse:190005.04460
[1]	validation_0-rmse:181046.40228
[2]	validation_0-rmse:172552.94766
[3]	validation_0-rmse:164555.09972
[4]	validation_0-rmse:157019.08541
[5]	validation_0-rmse:149830.72131
[6]	validation_0-rmse:142881.13624
[7]	validation_0-rmse:136523.85782
[8]	validation_0-rmse:130387.36884
[9]	validation_0-rmse:124418.64888
[10]	validation_0-rmse:118911.38238
[11]	validation_0-rmse:113678.63878
[12]	validation_0-rmse:108643.49995
[13]	validation_0-rmse:104003.62078
[14]	validation_0-rmse:99624.61132
[15]	validation_0-rmse:95390.81465
[16]	validation_0-rmse:91492.17050
[17]	validation_0-rmse:87730.34177
[18]	validation_0-rmse:84163.69836
[19]	validation_0-rmse:80717.46506
[20]	validation_0-rmse:77474.08057
[21]	validation_0-rmse:74526.71580
[22]	validation_0-rmse:71741.38298
[23]	validation_0-rmse:69124.12813
[24]	validation_0-rmse:66696.47215
[25]	validation_0-rmse:64347.28121
[26]	validation_0-rmse:61965.94747
[27]	validation_0-rmse:59825.23802
[28]	validation_



[44]	validation_0-rmse:37665.47636
[45]	validation_0-rmse:36966.08019
[46]	validation_0-rmse:36269.20848
[47]	validation_0-rmse:35773.54193
[48]	validation_0-rmse:35305.62476
[49]	validation_0-rmse:34715.00716
[50]	validation_0-rmse:34330.44112
[51]	validation_0-rmse:33923.79328
[52]	validation_0-rmse:33469.08572
[53]	validation_0-rmse:33161.64071
[54]	validation_0-rmse:32750.24298
[55]	validation_0-rmse:32493.23117
[56]	validation_0-rmse:32118.33883
[57]	validation_0-rmse:31834.42241
[58]	validation_0-rmse:31504.65745
[59]	validation_0-rmse:31261.24925
[60]	validation_0-rmse:31031.02311
[61]	validation_0-rmse:30861.70556
[62]	validation_0-rmse:30688.42590
[63]	validation_0-rmse:30467.55570
[64]	validation_0-rmse:30283.31589
[65]	validation_0-rmse:30110.03861
[66]	validation_0-rmse:29917.99894
[67]	validation_0-rmse:29781.12691
[68]	validation_0-rmse:29664.45190
[69]	validation_0-rmse:29543.95509
[70]	validation_0-rmse:29433.95057
[71]	validation_0-rmse:29290.47550
[72]	validation_0-rm

Hence, around 225 value for n_estimators is good.

In [16]:
y_pred = model.predict(X_valid_final)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_pred,y_valid)

16016.64613655822

In [17]:
model1 = XGBRegressor(n_estimators=225,learning_rate=0.05,n_jobs=8)
X=pd.DataFrame(pipe.fit_transform(X))

In [18]:
model1.fit(X,y)

In [19]:
predictions=model1.predict(X_test_final)
X_test_final.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,SaleType,SaleCondition
0,-1.501777,0.06426,-0.161306,0.74262,0.3187,0.0,0.572445,-0.207992,-1.193183,-0.426625,...,1.185921,0.366678,-0.701628,-0.360738,-0.088827,1.81896,-0.057227,-0.092244,-0.038281,1.713905
1,-0.039098,0.06426,-0.161306,-1.384637,0.3187,0.0,-1.950811,-0.207992,-0.056113,-0.426625,...,-0.741235,2.347867,-0.178826,-0.360738,-0.088827,-0.301543,-0.057227,19.730438,-0.038281,1.713905
2,-0.039098,0.06426,-0.161306,-1.384637,0.3187,0.0,0.572445,-0.207992,-0.056113,-0.426625,...,0.042537,0.930495,-0.207871,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-1.140614,1.713905
3,-0.039098,0.06426,-0.161306,-1.384637,0.3187,0.0,0.572445,-0.207992,-0.056113,-0.426625,...,-0.012788,2.089451,-0.178826,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.038281,1.713905
4,-0.039098,0.06426,-0.161306,-1.384637,-2.53396,0.0,0.572445,-0.207992,-0.056113,2.866737,...,0.153187,-0.729632,0.489198,-0.360738,-0.088827,2.24306,-0.057227,-0.092244,-1.875504,1.713905


In [20]:
print(X_test.shape)
print(X_test_final.shape)

(1459, 72)
(1459, 72)


In [22]:
output=pd.DataFrame({"Id": X_test.index,"SalePrice": predictions})
output.to_csv('submission_house_kaggle_4.csv', index=False)