In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


# Import libraries

In [2]:
import numpy as np                                      # Package for scientific computing
import pandas as pd                                     # Data analysis tool
from sklearn.base import clone                          # Builds an unfitted copy of an estimator
from sklearn.compose import ColumnTransformer           # Applies transformers to columns of DataFrames
from sklearn.impute import SimpleImputer                # Imputation transformer for completing missing values
from sklearn.metrics import mean_absolute_error         # One of many statistical measures of error
from sklearn.model_selection import train_test_split    # Splits arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold               # Cross-validator
from sklearn.model_selection import cross_validate      # Evaluate metrics by cross-validation
from sklearn.model_selection import GridSearchCV        # Search over specified parameter values for an estimator
from sklearn.pipeline import Pipeline                   # Helps building a chain of transforms and estimators
from sklearn.preprocessing import OneHotEncoder         # Encode categorical features
from xgboost import XGBRegressor                        # Our model estimator

In [3]:
# Load the training and test tables, using the 'Id' column as the row index
X_full = pd.read_csv('../input/home-data-for-ml-course/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/home-data-for-ml-course/test.csv', index_col='Id')

# Separate the target (SalePrice) from the predictors; X_full itself is left untouched
y = X_full['SalePrice'].copy()
X = X_full.drop(columns=['SalePrice'])

# Training and validation split

In [4]:

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [5]:
# Display the training split of the predictors (rendered as the cell's output)
X_train_full

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,RL,90.0,11694,Pave,,Reg,Lvl,AllPub,Inside,...,260,0,,,,0,7,2007,New,Partial
871,20,RL,60.0,6600,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
93,30,RL,80.0,13360,Pave,Grvl,IR1,HLS,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
818,20,RL,,13265,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,,,,0,7,2008,WD,Normal
303,20,RL,118.0,13704,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,1,2006,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,60,RL,82.0,9430,Pave,,Reg,Lvl,AllPub,Inside,...,180,0,,,,0,7,2009,WD,Normal
836,20,RL,60.0,9600,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2010,WD,Normal
1217,90,RM,68.0,8930,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal
560,120,RL,,3196,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,10,2006,WD,Normal


# Preprocessing

In [6]:
# Largest number of distinct values a text column may have and still be one-hot encoded
max_cardinality = 15

# Select low-cardinality text columns. The pandas dtype helpers are used instead of
# comparing the dtype against 'object', so columns that pandas stores with its
# dedicated string dtype are selected as well rather than silently skipped.
categorical_cols = [
    col for col in X_train_full.columns
    if (pd.api.types.is_object_dtype(X_train_full[col].dtype)
        or pd.api.types.is_string_dtype(X_train_full[col].dtype))
    and X_train_full[col].nunique() <= max_cardinality
]

# Select numeric columns (64-bit integers and floats), in their original order
numeric_cols = X_train_full.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Keep the same selected columns, in the same order, for every split.
# Text columns above the cardinality limit are dropped here.
my_columns = categorical_cols + numeric_cols
X_train = X_train_full[my_columns].copy()
X_valid = X_valid_full[my_columns].copy()
X_test = X_test_full[my_columns].copy()

# Pipeline

In [7]:
# Numeric features: replace missing entries with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill missing entries with the constant 'NA', then one-hot encode
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Model

In [8]:

# Baseline regressor: only logging verbosity and the random seed are set explicitly
model = XGBRegressor(random_state=0, verbosity=0)

# Chain preprocessing and the regressor so they are fitted and applied together
my_pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Fit on the training split, then predict the held-out validation split
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17158.38955479452


# Cross validation

In [9]:

# Shuffled K-fold splitter with a fixed seed so the folds are reproducible
kfold = KFold(shuffle=True, random_state=0)

# Cross-validate the whole pipeline on the training split
scores = cross_validate(my_pipeline, X_train, y_train,
                        cv=kfold, scoring='neg_mean_absolute_error')

# The scorer returns negated MAE, so flip the sign before reporting
print('Average MAE score:', -scores['test_score'].mean())

Average MAE score: 18309.7395617824


# Hyperparameter tuning

In [10]:

# Hyperparameter candidates, addressed through the pipeline's 'model' step.
# NOTE(review): every list holds a single value, so exactly one combination is
# evaluated; add more values per list to make this an actual search.
param_grid = {'model__n_estimators': [10],
              'model__max_depth': [2],
              'model__min_child_weight': [0.0001],
              'model__learning_rate': [0.01]}

# Cross-validated search over the grid, scored by (negated) mean absolute error.
# The splitter is re-created here so this cell does not rely on the one above.
kfold = KFold(shuffle=True, random_state=0)
grid_search = GridSearchCV(my_pipeline, param_grid, scoring='neg_mean_absolute_error', cv=kfold, n_jobs=-1)
grid_result = grid_search.fit(X_train, y_train)

# Report the outcome so the search result is not silently discarded.
# best_score_ is a negated MAE, hence the sign flip.
print('Best parameters:', grid_result.best_params_)
print('Best CV MAE:', -grid_result.best_score_)

# Test predictions

In [11]:
# Define the final model.
# NOTE(review): these hyperparameter values are not among the candidates listed in
# param_grid above; confirm where they were tuned.
final_model = XGBRegressor(n_estimators=400,
                           max_depth=3,
                           min_child_weight=0.0001,
                           learning_rate=0.1,
                           verbosity=0,
                           random_state=0
                           )

# Create a pipeline around an unfitted copy of the preprocessor. Pipeline keeps a
# reference to the objects it is given, so passing `preprocessor` itself would make
# this fit overwrite the transformer already fitted inside `my_pipeline`.
final_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                 ('final_model', final_model)
                                 ])

# Fit on every labelled row (training + validation splits): the validation rows are
# no longer needed for evaluation at this point, so the submission model uses them too.
X_labelled = pd.concat([X_train, X_valid])
y_labelled = pd.concat([y_train, y_valid])
final_pipeline.fit(X_labelled, y_labelled)

# Get predictions on the test set
final_prediction = final_pipeline.predict(X_test)

# Submit results

In [12]:
# Build the submission table: the test-set Id values next to their predicted SalePrice
output = pd.DataFrame({'Id': X_test.index}).assign(SalePrice=final_prediction)

# Write it to disk without the DataFrame's own positional index
output.to_csv('housing_price_submission.csv', index=False)