In [24]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [25]:
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

In [26]:
X_full.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [27]:
X_full.dropna(subset=['SalePrice'], axis=0, inplace=True)
y = X_full.SalePrice
X_full.drop('SalePrice', axis=1, inplace=True)

In [66]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, test_size=0.2, random_state=1)

In [67]:
cat_cols = [col for col in X_train_full.columns if X_train_full[col].nunique() < 10 
                                                and X_train_full[col].dtypes == 'object']

num_cols = [col for col in X_train_full.columns if X_train_full[col].dtypes in ['int64', 'float64']]

In [68]:
my_cols = cat_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Train

**preprocessing and bundle numerical and categorical columns**

In [69]:
num_trans = SimpleImputer(strategy='mean')

cat_trans = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('num', num_trans, num_cols),
                                              ('cat', cat_trans, cat_cols)])

**model**

In [77]:
for i in range(1, 16):
    my_model_i = XGBRegressor(n_estimators=500, learning_rate=i/50, random_state=1)
    model = Pipeline(steps=[('preprocessor', preprocessor), ('model', my_model)])

    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    print(i/50, ': ', mean_absolute_error(y_valid, preds))

0.02 :  15459.873983304795
0.04 :  15459.873983304795
0.06 :  15459.873983304795
0.08 :  15459.873983304795
0.1 :  15459.873983304795
0.12 :  15459.873983304795
0.14 :  15459.873983304795
0.16 :  15459.873983304795
0.18 :  15459.873983304795
0.2 :  15459.873983304795
0.22 :  15459.873983304795
0.24 :  15459.873983304795
0.26 :  15459.873983304795
0.28 :  15459.873983304795
0.3 :  15459.873983304795


In [70]:
my_model = XGBRegressor(n_estimators=500, learning_rate=0.1, random_state=1)

In [71]:
model = Pipeline(steps=[('preprocessor', preprocessor), ('model', my_model)])

model.fit(X_train, y_train)
preds = model.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 15459.873983304795


In [72]:
#preds_test = model.predict(X_test)

In [73]:
#output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})

#output.to_csv('Submission_2.csv', index=False)