<a href="https://colab.research.google.com/github/gisandnes/Extreme-Gradient-Boosting-with-XGBoost_DataCamp/blob/master/Fine_tuning_your_XGBoost_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Create the local ./data directory for downloads (-p: no error if it already exists).
!mkdir -p data #Make folders for downloads

# Downloads of two other datasets from the same repository are left disabled;
# none of the cells shown here read them.
#!wget --no-verbose https://raw.githubusercontent.com/gisandnes/Extreme-Gradient-Boosting-with-XGBoost_DataCamp/master/data/chronic_kidney_disease.csv -O ./data/chronic_kidney_disease.csv
#!wget --no-verbose https://raw.githubusercontent.com/gisandnes/Extreme-Gradient-Boosting-with-XGBoost_DataCamp/master/data/ames_unprocessed_data.csv -O ./data/ames_unprocessed_data.csv
# Fetch the preprocessed Ames housing CSV into ./data; -O overwrites any existing copy on re-run.
!wget --no-verbose https://raw.githubusercontent.com/gisandnes/Extreme-Gradient-Boosting-with-XGBoost_DataCamp/master/data/ames_housing_trimmed_processed.csv -O ./data/ames_housing_trimmed_processed.csv

2019-01-11 15:51:44 URL:https://raw.githubusercontent.com/gisandnes/Extreme-Gradient-Boosting-with-XGBoost_DataCamp/master/data/ames_housing_trimmed_processed.csv [196754/196754] -> "./data/ames_housing_trimmed_processed.csv" [1]


In [0]:
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb

%matplotlib inline

In [0]:
# Read the preprocessed Ames housing table that the download cell saved under ./data.
ames_processed = pd.read_csv("./data/ames_housing_trimmed_processed.csv")

# Predictors: label-based column slice from "MSSubClass" through "PavedDrive_Y"
# (both endpoints included) -> pandas DataFrame.
features = ames_processed.loc[:, "MSSubClass":"PavedDrive_Y"]

# Target: the "SalePrice" column -> pandas Series.
response = ames_processed["SalePrice"]

# Both objects stay as pandas containers; they are handed to xgb.DMatrix as-is
# later on, without converting to NumPy arrays first.

### Automated boosting round selection using early stopping

In [14]:
# Wrap the feature frame and the target series in XGBoost's DMatrix container.
housing_dmatrix = xgb.DMatrix(data=features, label=response)

# Booster parameters applied to every tree.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the former alias "reg:linear" is deprecated and emits a warning
# on XGBoost >= 0.90.
params = {"objective": "reg:squarederror", "max_depth": 4}

# 3-fold cross-validation for at most 50 boosting rounds, tracking RMSE.
# early_stopping_rounds=10 halts boosting once the held-out metric has not
# improved for 10 consecutive rounds; seed fixes the fold assignment so the
# table below is reproducible.
cv_results = xgb.cv(
    dtrain=housing_dmatrix,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,  # return the per-round history as a DataFrame
    seed=123,
)

# One row per boosting round: mean/std of train and test RMSE across folds.
print(cv_results)

    test-rmse-mean  test-rmse-std  train-rmse-mean  train-rmse-std
0    142640.656250     705.559400    141871.630208      403.632626
1    104907.664063     111.113862    103057.036458       73.769561
2     79262.059895     563.766991     75975.966146      253.726099
3     61620.136719    1087.694282     57420.529948      521.658354
4     50437.562500    1846.448017     44552.955729      544.170190
5     43035.658854    2034.471024     35763.949219      681.798925
6     38600.880208    2169.796232     29861.464844      769.571318
7     36071.817708    2109.795430     25994.675781      756.521419
8     34383.184896    1934.546688     23306.836588      759.238254
9     33509.139974    1887.375633     21459.770833      745.624404
10    32916.805990    1850.893363     20148.721354      749.612769
11    32197.832682    1734.456935     19215.382813      641.387376
12    31770.852865    1802.155484     18627.389323      716.256596
13    31482.782552    1779.123767     17960.695312      557.04

### Grid Search with XGBoost

In [15]:
# Candidate values per hyperparameter; the grid search fits every combination
# (2 x 1 x 2 = 4 candidates).
gbm_param_grid = dict(
    colsample_bytree=[0.3, 0.7],
    n_estimators=[50],
    max_depth=[2, 5],
)

# Base estimator with default settings; the search overrides the grid's keys
# on each candidate fit.
gbm = xgb.XGBRegressor()

# Exhaustive search scored by negated MSE (scikit-learn maximises scores),
# evaluated with 4-fold cross-validation.
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring="neg_mean_squared_error",
    cv=4,
    verbose=1,
)

# The scikit-learn API takes the pandas feature frame and target series directly.
grid_mse.fit(features, response)

# best_score_ is a negated mean squared error, so take the absolute value
# before the square root to report it as an RMSE.
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best parameters found:  {'colsample_bytree': 0.3, 'max_depth': 5, 'n_estimators': 50}
Lowest RMSE found:  29655.33697347771


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    1.4s finished


### Random Search with XGBoost

In [16]:
# Search space: n_estimators is fixed at 25, max_depth is drawn from 2..11
# (10 possible candidates in total).
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# Base estimator. The n_estimators=10 given here is replaced by the search
# space's n_estimators=25 on every candidate fit.
gbm = xgb.XGBRegressor(n_estimators=10)

# Perform random search: randomized_mse
# Only n_iter=5 of the 10 candidates are sampled, so which ones get evaluated
# depends on the random state. Fixing random_state (same value as the seed
# passed to xgb.cv earlier in this notebook) makes the sampled candidates, and
# therefore the reported best parameters, reproducible across runs.
randomized_mse = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=gbm_param_grid,
    n_iter=5,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1,
    random_state=123,
)

# Fit randomized_mse to the data
randomized_mse.fit(features, response)

# best_score_ is a negated mean squared error, so take the absolute value
# before the square root to report it as an RMSE.
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best parameters found:  {'n_estimators': 25, 'max_depth': 5}
Lowest RMSE found:  36636.35808132903


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    2.4s finished
