In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('train.csv')

In [3]:
droplist = ['Id','Alley','PoolQC','Fence','MiscFeature']

In [4]:
df.drop(droplist, axis='columns',inplace=True)

# target, cat, num

In [5]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF'

In [29]:
categorical_mask = (df.dtypes == object)

In [30]:
categorical_columns = df.columns[categorical_mask].tolist()

In [31]:
numeric_columns = df.columns[~categorical_mask].tolist()

In [32]:
numeric_columns = [x for x in numeric_columns if x not in ['SalePrice','MSSubClass','OverallCond','YrSold','MoSold',
                                                          'YearBuilt','YearRemodAdd','GarageYrBlt']]

In [33]:
categorical_columns.extend(['MSSubClass','OverallCond','YrSold','MoSold','YearBuilt','YearRemodAdd','GarageYrBlt'])

## make separate df's for cat, num, target

In [34]:
cat_feats = df[categorical_columns]

In [35]:
num_feats = df[numeric_columns]

In [36]:
target = df['SalePrice']

In [37]:
# some of them need to be converted to string so as represent truly categorical

In [38]:
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

In [39]:
from sklearn.preprocessing import Imputer

In [40]:
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature],Imputer(strategy="median")) for numeric_feature in numeric_columns],
                                            input_df=True,
                                            df_out=True
                                           )

In [41]:
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )

In [42]:
num_trans = numeric_imputation_mapper.fit_transform(num_feats)

In [43]:
cat_trans = categorical_imputation_mapper.fit_transform(cat_feats)

In [44]:
cat_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 46 columns):
MSZoning         1460 non-null object
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1460 non-null object
ExterQual        1460 non-null object
ExterCond        1460 non-null object
Foundation       1460 non-null object
BsmtQual         1460 non-null object
BsmtCond         1460 non-null object
BsmtExposure     1460 non-null object
BsmtFinType1     14

In [45]:
from sklearn.preprocessing import StandardScaler

In [46]:
scl = StandardScaler()

In [47]:
num_trans_scale = scl.fit_transform(num_trans)

In [48]:
num_trans_scale

array([[-0.22087509, -0.20714171,  0.65147924, ..., -0.27020835,
        -0.06869175, -0.08768781],
       [ 0.46031974, -0.09188637, -0.07183611, ..., -0.27020835,
        -0.06869175, -0.08768781],
       [-0.08463612,  0.07347998,  0.65147924, ..., -0.27020835,
        -0.06869175, -0.08768781],
       ..., 
       [-0.1754621 , -0.14781027,  0.65147924, ..., -0.27020835,
        -0.06869175,  4.95311151],
       [-0.08463612, -0.08016039, -0.79515147, ..., -0.27020835,
        -0.06869175, -0.08768781],
       [ 0.23325479, -0.05811155, -0.79515147, ..., -0.27020835,
        -0.06869175, -0.08768781]])

In [49]:
cols_num = num_trans.columns

In [50]:
cols_num

Index(['LotFrontage', 'LotArea', 'OverallQual', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'],
      dtype='object')

In [51]:
num_trans_scale_df = pd.DataFrame(num_trans_scale,columns=cols_num)

In [52]:
num_trans_scale_df.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,-0.220875,-0.207142,0.651479,0.514104,0.575425,-0.288653,-0.944591,-0.459303,-0.793434,1.161852,...,-0.951226,0.311725,0.351,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688
1,0.46032,-0.091886,-0.071836,-0.57075,1.171992,-0.288653,-0.641228,0.466465,0.25714,-0.795163,...,0.600495,0.311725,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688
2,-0.084636,0.07348,0.651479,0.325915,0.092907,-0.288653,-0.301643,-0.313369,-0.627826,1.189351,...,0.600495,0.311725,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688
3,-0.44794,-0.096897,0.651479,-0.57075,-0.499274,-0.288653,-0.06167,-0.687324,-0.521734,0.937276,...,0.600495,1.650307,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688
4,0.641972,0.375148,1.374795,1.366489,0.463568,-0.288653,-0.174865,0.19968,-0.045611,1.617877,...,0.600495,1.650307,1.698485,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688


# we need to take care of cat variables & convert them to labels & dummies

In [53]:
from sklearn.feature_extraction import DictVectorizer

In [54]:
cat_trans_dict = cat_trans.to_dict('records')

In [55]:
dv = DictVectorizer(sparse=False)

In [56]:
cat_trans_dummies = dv.fit_transform(cat_trans_dict)

In [57]:
cat_cols = cat_trans.columns

In [58]:
cat_trans_dummies_df = pd.DataFrame(cat_trans_dummies)

In [59]:
cat_trans_dummies_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,236,237,238,239,240,241,242,243,244,245
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2003.0,2003.0,2008.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1976.0,1976.0,2007.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2001.0,2002.0,2008.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1915.0,1970.0,2006.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2000.0,2000.0,2008.0


# lets concat num & cat df's

In [60]:
result = pd.concat([num_trans_scale_df,cat_trans_dummies_df],axis=1)

In [61]:
target.head(1)

0    208500
Name: SalePrice, dtype: int64

In [62]:
import xgboost

In [63]:
from sklearn.model_selection import cross_val_score

In [64]:
xgmodel = xgboost.XGBRegressor(learning_rate=0.05,n_estimators=300,objective="reg:linear")

In [65]:
cross_val_scores = cross_val_score(xgmodel,X=result,y=target,cv=20, scoring="neg_mean_squared_error")

In [66]:
print("20-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val_scores))))

20-fold RMSE:  26002.110598
