## XGBoost Autotuning

### Classes

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

class CSVData:
    """Load train/test CSV files and prepare model-ready feature frames.

    The training CSV is split 80/20 into training and validation parts.
    Only numeric columns (``int64``/``float64``) and ``object`` columns with
    fewer than 10 unique values are kept; the latter are one-hot encoded with
    ``pd.get_dummies``. The validation and test frames are then aligned to the
    training frame's columns.

    Attributes set by the constructor:
        X_train, y_train: training features and target.
        X_valid, y_valid: validation features and target.
        X_test: test features, aligned to the columns of ``X_train``.
    """

    def __init__(self, train_csv, test_csv, index_col, label_col):
        """Read both CSV files and build the train/validation/test frames.

        Args:
            train_csv: path of the training CSV; must contain ``label_col``.
            test_csv: path of the test CSV.
            index_col: column used as the row index when reading both files.
            label_col: name of the target column in the training CSV.
        """
        # Read the data
        X = pd.read_csv(train_csv, index_col=index_col)
        X_test_full = pd.read_csv(test_csv, index_col=index_col)

        # Remove rows with missing target, separate target from predictors.
        # The target is looked up through label_col so the class works for any
        # label column, not only one that happens to be called "SalePrice".
        X = X.dropna(axis=0, subset=[label_col])
        y = X[label_col]
        X = X.drop([label_col], axis=1)

        # Break off validation set from training data
        X_train_full, X_valid_full, y_train, y_valid = train_test_split(
            X, y, train_size=0.8, test_size=0.2, random_state=0)

        # "Cardinality" means the number of unique values in a column
        # Select categorical columns with relatively low cardinality (convenient but arbitrary)
        low_cardinality_cols = [
            cname for cname in X_train_full.columns
            if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"
        ]

        # Select numeric columns
        numeric_cols = [
            cname for cname in X_train_full.columns
            if X_train_full[cname].dtype in ['int64', 'float64']
        ]

        # Keep selected columns only
        my_cols = low_cardinality_cols + numeric_cols
        X_train = X_train_full[my_cols].copy()
        X_valid = X_valid_full[my_cols].copy()
        X_test = X_test_full[my_cols].copy()

        # One-hot encode the data (to shorten the code, we use pandas)
        X_train = pd.get_dummies(X_train)
        X_valid = pd.get_dummies(X_valid)
        X_test = pd.get_dummies(X_test)
        # join='left' keeps exactly the training columns in the other frames
        X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
        X_train, X_test = X_train.align(X_test, join='left', axis=1)

        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.X_test = X_test

    def train_test_split(self):
        """Return ``(X_train, X_valid, y_train, y_valid)`` as prepared by the constructor."""
        return self.X_train, self.X_valid, self.y_train, self.y_valid
        

[XGBoost Tuning](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)

In [87]:
import time
import random
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

class XGBAutoTunning:
    """Stage-wise hyper-parameter tuner for an ``XGBRegressor``.

    ``best_model`` tunes groups of parameters one after another
    (``max_depth``/``min_child_weight``, ``gamma``,
    ``subsample``/``colsample_bytree``, ``reg_alpha``), storing each winner on
    the instance before moving on, and finally lowers the learning rate while
    raising the number of estimators. Candidates are chosen either by
    ``GridSearchCV`` or, when ``self.random`` is true, by a random pick.

    The current estimator is available as the ``model`` attribute; it is
    ``None`` until ``create_model`` has been called.
    """

    def __init__(self, csv_data):
        """Store the data splits and the initial hyper-parameter values.

        Args:
            csv_data: object whose ``train_test_split()`` returns
                ``(X_train, X_valid, y_train, y_valid)``.
        """
        self.start = time.time()
        self.random = False
        # The estimator lives in this attribute. It used to be declared as a
        # method named ``model`` as well, which the attribute shadowed as soon
        # as create_model() ran, so the attribute is the only working access.
        self.model = None

        self.X_train, self.X_valid, self.y_train, self.y_valid = csv_data.train_test_split()

        self.learning_rate = 0.1
        self.n_estimators = 1000
        self.max_depth = 5
        self.min_child_weight = 1
        self.gamma = 0
        self.subsample = 0.8
        self.colsample_bytree = 0.8
        self.objective = 'reg:linear'
        self.reg_alpha = 0

    def create_model(self):
        """Build a fresh ``XGBRegressor`` from the current attribute values.

        Returns:
            The new estimator, which is also stored in ``self.model``.
        """
        self.model = XGBRegressor(
                         learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=4,
                         scale_pos_weight=1,
                         seed=27,
                         reg_alpha=self.reg_alpha,
                         silent=True)
        return self.model

    def best_params(self, test_params):
        """Pick one value per parameter from ``test_params``.

        Args:
            test_params: mapping of parameter name to a sequence of candidates.

        Returns:
            A dict with one chosen value per key of ``test_params``. In random
            mode each value is drawn with ``random.choice`` (no model is
            fitted); otherwise it is ``best_params_`` of a ``GridSearchCV``
            fitted on the training split with ``self.model`` as estimator.
        """
        if self.random:
            return {key: random.choice(value) for key, value in test_params.items()}

        search = GridSearchCV(estimator=self.model, param_grid=test_params)
        search.fit(self.X_train, self.y_train)
        return search.best_params_

    def _search(self, test_params):
        """Rebuild the model from the current attributes, then search ``test_params``."""
        self.create_model()
        return self.best_params(test_params=test_params)

    @staticmethod
    def _refined_fraction_grid(fraction):
        """Return ``fraction`` and its +/-0.05 neighbours that lie in (0, 1]."""
        # round() guards against float artefacts such as 0.7 * 100 == 70.00000000000001
        center = int(round(fraction * 100))
        return [pct / 100.0 for pct in (center - 5, center, center + 5) if 0 < pct <= 100]

    def best_model(self):
        """Run every tuning stage and leave the final estimator in ``self.model``.

        The returned estimator is configured but not fitted; call ``fit``.
        """
        # --- max_depth and min_child_weight: coarse grid --------------------
        coarse = self._search({
            'max_depth': range(3, 10, 2),
            'min_child_weight': range(1, 6, 2)
        })
        self.max_depth = coarse['max_depth']
        self.min_child_weight = coarse['min_child_weight']

        # --- same parameters: +/-1 around the coarse winner -----------------
        fine = self._search({
            'max_depth': [coarse['max_depth'] - 1, coarse['max_depth'], coarse['max_depth'] + 1],
            'min_child_weight': [coarse['min_child_weight'] - 1, coarse['min_child_weight'],
                                 coarse['min_child_weight'] + 1]
        })
        self.max_depth = fine['max_depth']
        self.min_child_weight = fine['min_child_weight']

        # --- if the fine search moved upward, keep exploring larger values --
        extended = {}
        if fine['min_child_weight'] == coarse['min_child_weight'] + 1:
            extended['min_child_weight'] = range(fine['min_child_weight'],
                                                 fine['min_child_weight'] + 8, 2)
        if fine['max_depth'] == coarse['max_depth'] + 1:
            extended['max_depth'] = range(fine['max_depth'], fine['max_depth'] + 8, 2)

        if extended:
            wider = self._search(extended)
            if 'min_child_weight' in wider:
                self.min_child_weight = wider['min_child_weight']
            if 'max_depth' in wider:
                # Each result goes to its own attribute; previously the
                # max_depth winner was written into min_child_weight.
                self.max_depth = wider['max_depth']

        # --- gamma ----------------------------------------------------------
        gamma_choice = self._search({
            'gamma': [i / 10.0 for i in range(0, 5)]
        })
        self.gamma = gamma_choice['gamma']

        # --- subsample and colsample_bytree: coarse grid --------------------
        sampling = self._search({
            'subsample': [i / 10.0 for i in range(6, 10)],
            'colsample_bytree': [i / 10.0 for i in range(6, 10)]
        })
        self.subsample = sampling['subsample']
        self.colsample_bytree = sampling['colsample_bytree']

        # --- same parameters: +/-0.05 around each parameter's own winner ----
        sampling = self._search({
            'subsample': self._refined_fraction_grid(self.subsample),
            'colsample_bytree': self._refined_fraction_grid(self.colsample_bytree)
        })
        self.subsample = sampling['subsample']
        self.colsample_bytree = sampling['colsample_bytree']

        # --- reg_alpha: coarse grid, then values around the winner ----------
        alpha_choice = self._search({
            'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
        })
        self.reg_alpha = alpha_choice['reg_alpha']

        alpha_choice = self._search({
            'reg_alpha': [0, self.reg_alpha * 0.1, self.reg_alpha * 0.5, self.reg_alpha,
                          self.reg_alpha * 5]
        })
        self.reg_alpha = alpha_choice['reg_alpha']

        # Set learning rate and increase n_estimators
        self.learning_rate = 0.01
        self.n_estimators = 5000
        self.create_model()

    def fit(self):
        """Fit ``self.model`` on the training split.

        The validation split is passed as ``eval_set`` together with
        ``early_stopping_rounds=500``.
        """
        self.model.fit(self.X_train, self.y_train,
             early_stopping_rounds=500,
             eval_set=[(self.X_valid, self.y_valid)],
             verbose=False)

    def accuracy(self):
        """Print the model, its cross-validated MAE and its validation MAE.

        The cross-validation uses a shuffled 5-fold split of the training data;
        the validation MAE uses the predictions of the already fitted model.
        """
        print(self.model)
        kfold = KFold(n_splits=5, random_state=42, shuffle=True)
        # scoring returns negated MAE, so flip the sign back
        scores = -1 * cross_val_score(self.model, self.X_train, self.y_train, cv=kfold,
                                      scoring='neg_mean_absolute_error')
        print(scores)
        print("Average MAE score: ", scores.mean())

        # Get predictions
        predictions = self.model.predict(self.X_valid)
        # Calculate MAE
        mae = mean_absolute_error(self.y_valid, predictions)
        # Print MAE
        print("Mean Absolute Error: ", mae)

    def tunning(self, random=False):
        """Tune the hyper-parameters, fit the final model and report the duration.

        Args:
            random: when true, candidates are picked at random instead of
                being evaluated with a grid search.

        Returns:
            The fitted estimator (also stored in ``self.model``).
        """
        # Restart the timer here so the reported duration covers this run only,
        # not the time elapsed since the tuner was constructed.
        self.start = time.time()
        self.random = random
        self.best_model()
        self.fit()
        self.end = time.time()
        print('Execution time is ' + str(self.end - self.start) + ' seconds')
        print(self.model)
        return self.model

    def predict(self, X_test):
        """Return the predictions of ``self.model`` for ``X_test``."""
        return self.model.predict(X_test)

### Use XGBAutoTunning

In [92]:
# Names of the row-identifier column and of the regression target column
index_col = 'Id'
label_col = 'SalePrice'

# Load and preprocess both CSV files; keep the processed test features for prediction
csv_data = CSVData(
    train_csv='../input/train.csv',
    test_csv='../input/test.csv',
    index_col=index_col,
    label_col=label_col,
)
X_test = csv_data.X_test

# Tune with randomly picked candidates; the returned model is shown as the cell output
xgba = XGBAutoTunning(csv_data=csv_data)
xgba.tunning(random=True)

  if getattr(data, 'base', None) is not None and \


Execution time is 19.136436462402344 seconds
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.65, gamma=0.4,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=4, min_child_weight=8, missing=None, n_estimators=5000,
             n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
             reg_alpha=0.005, reg_lambda=1, scale_pos_weight=1, seed=27,
             silent=True, subsample=0.7, verbosity=1)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.65, gamma=0.4,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=4, min_child_weight=8, missing=None, n_estimators=5000,
             n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
             reg_alpha=0.005, reg_lambda=1, scale_pos_weight=1, seed=27,
             silent=True, subsample=0.7, verbosity=1)

In [93]:
# Print the tuned model, its 5-fold cross-validated MAE on the training split,
# and its MAE on the validation split.
xgba.accuracy()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.65, gamma=0.4,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=4, min_child_weight=8, missing=None, n_estimators=5000,
             n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
             reg_alpha=0.005, reg_lambda=1, scale_pos_weight=1, seed=27,
             silent=True, subsample=0.7, verbosity=1)


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


[17333.02572449 14043.00719485 16664.10022703 14440.05597841
 15788.87692798]
Average MAE score:  15653.813210550326
Mean Absolute Error:  15586.836271939212


In [94]:
# Predict the target for the processed test features
preds_test = xgba.predict(X_test)

# Save predictions in format used for competition scoring:
# one row per test index value with its predicted label, no extra index column
submission_columns = {index_col: X_test.index, label_col: preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv(path_or_buf='../output/submission.csv', index=False)