# XGboost with gridsearchcv pipeline (Mean Absolute Error: 15585)

Data from the [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course). 
Some of the codes are taken from the tutorial [Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning)

### Reading the file

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
from math import sqrt

In [None]:
# Read the data
X = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

In [5]:
print(X.shape)
print(X_test_full.shape)

(1460, 80)
(1459, 79)


In [6]:
display(X.head())

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
X.shape

(1460, 79)

In [9]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

In [10]:
len(low_cardinality_cols)

40

In [11]:
# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [12]:
#total features retain 40+36 = 76
len(numeric_cols)

36

In [13]:
# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encode the data
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [14]:
X_train.shape

(1022, 228)

In [15]:
# Define the model
my_model_1 = XGBRegressor(random_state=0)

# Fit the model
my_model_1.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [16]:
# Get predictions
predictions_1 = my_model_1.predict(X_valid) # Your code here

Finally, use the `mean_absolute_error()` function to calculate the mean absolute error (MAE) corresponding to the predictions for the validation set.  Recall that the labels for the validation data are stored in `y_valid`.

In [17]:
# Calculate MAE
mae_1 = mean_absolute_error(predictions_1,y_valid) # Your code here

print("Mean Absolute Error:" , mae_1)

Mean Absolute Error: 16233.512191424086


### Improving the model

In [18]:
param_grid = {
    "n_estimators": [10, 50, 100, 500, 1000, 1500],
    "learning_rate": [0.01, 0.05, 0.1],
}

fit_params = {"early_stopping_rounds": 10,
              "eval_set": [(X_valid, y_valid)], 
              "eval_metric" : "mae", 
              "verbose": False,
              "objective":"reg:squarederror"}

# model = XGBRegressor()
gridsearch = GridSearchCV(XGBRegressor(**fit_params), param_grid=param_grid, verbose=False,cv=5)

gridsearch.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1,
                                    early_stopping_rounds=10, eval_metric='mae',
                                    eval_set=[(      MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
Id                                                                            
893           20         70.0     8414            6            8       1963   
1106          60         98.0    12256            8            5       1994   
414           30         56.0     8960            5            6       1927   
523           50         50.0     500...
                                    nthread=None, objective='reg:squarederror',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                      

In [29]:
# Calculate MAE
predict_grid = gridsearch.predict(X_valid)
mae_2 = mean_absolute_error(predict_grid,y_valid)

print("Mean Absolute Error:" , mae_2)

#Calculate RMSE
mse_2 = mean_squared_error(predict_grid, y_valid)
rootMeanSquaredError_2 = sqrt(mse_2)
print("RMSE:", rootMeanSquaredError_2)

print("==============================================")
print("-gridsearch.best_score_ {}".format(-gridsearch.best_score_))
print("==============================================")
print("gridsearch.best_params_ {}".format(gridsearch.best_params_))
print("==============================================")
print("Mean_test_score: {}".format(gridsearch.cv_results_['mean_test_score'].mean()))


Mean Absolute Error: 15585.552520333904
RMSE: 26866.33724899692
-gridsearch.best_score_ -0.8030738383471413
gridsearch.best_params_ {'learning_rate': 0.1, 'n_estimators': 1500}
Mean_test_score: 0.13992102569026238


In [20]:
predict_test = gridsearch.predict(X_test)

In [21]:
predict_test.shape

(1459,)

In [22]:

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': X_test.index,
                        'SalePrice': predict_test})
output.to_csv('submission.csv', index=False)

In [23]:
output.shape

(1459, 2)