In [1]:
# What is XGBoost?

# What is XGBoost?
# Optimized gradient-boosting machine learning library
# Originally written in C++
# Has APIs in several languages:
# Python
# R
# Scala
# Julia
# Java


In [None]:
# What makes XGBoost so popular?

# Speed and performance
# Core algorithm is parallelizable
# Consistently outperforms single-algorithm methods
# State-of-the-art performance in many ML tasks


In [9]:
# Using XGBoost: a quick example
# (Breast-Cancer dataset)

import os
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split

# NOTE(review): machine-specific absolute path. It is kept as-is because the
# CSV files in this notebook are opened relative to the working directory.
os.chdir('C:/Users/stayde/Documents/Python Scripts/Python_24_MLwithTreeBasedModels')

BC = pd.read_csv('BreastCancer.csv')

# Diagnosis codes:
# M - Malignant
# B - Benign

# Features: every column except the target and the 'id' column.
X = BC.drop(['diagnosis', 'id'], axis=1)

# Raw string labels. The name `y` keeps the untouched column because the
# cross-validation cell further down encodes `y` on its own.
y = BC['diagnosis']

# Recent XGBoost releases refuse string class labels in XGBClassifier.fit, so
# train on integer codes instead: B -> 0, M -> 1.
y_binary = (y == 'M').astype(int)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=123)

# `random_state` is the documented seed argument of the scikit-learn wrapper.
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, random_state=123)

xg_cl.fit(X_train, y_train)

preds = xg_cl.predict(X_test)

# Share of test rows whose predicted class equals the true class.
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]

print('Accuracy:', accuracy)


Accuracy: 0.9736842105263158


pandas.core.series.Series

In [15]:
# Cross-validation in XGBoost example:

import xgboost as xgb
from sklearn import preprocessing

# Convert the labels held in y into a numeric array before building the DMatrix.
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)

print(type (y))

# Features and encoded labels bundled into XGBoost's own data container.
cancer_dmatrix = xgb.DMatrix(data=X, label=y)

params = dict(objective='binary:logistic', max_depth=4)

# 4-fold cross-validation over 10 boosting rounds, tracking the 'error' metric.
cv_results = xgb.cv(
    dtrain=cancer_dmatrix,
    params=params,
    nfold=4,
    num_boost_round=10,
    metrics='error',
    as_pandas=True,
)

# Last expression of the cell: display the per-round results table.
cv_results


<class 'numpy.ndarray'>


Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.023432,0.004962,0.070312,0.005138
1,0.016986,0.004489,0.059761,0.007919
2,0.011716,0.001648,0.059786,0.01179
3,0.013472,0.002543,0.059785,0.010687
4,0.009373,1e-05,0.05805,0.017602
5,0.00703,0.001656,0.054528,0.017592
6,0.00703,0.001656,0.052768,0.017675
7,0.005858,0.001168,0.052768,0.017675
8,0.005272,0.001013,0.052768,0.017675
9,0.0041,0.001941,0.047498,0.017573


In [16]:
# When should I use XGBoost?:

# When to use XGBoost:
    
# You have a large number of training samples
    # Greater than 1000 training samples and fewer than 100 features
    # The number of features < number of training samples
# You have a mixture of categorical and numeric features
    # Or just numeric features



In [None]:
# When to NOT use XGBoost:
    
    # Image recognition
    # Computer vision
    # Natural language processing and understanding problems

# When the number of training samples is significantly smaller than the number of features


In [17]:
# Regression review:

# Objective (loss) functions and base learners:

# Objective Functions and Why We Use Them:

# Quantifies how far off a prediction is from the actual result
# Measures the difference between estimated and true values for some collection of data
# Goal: Find the model that yields the minimum value of the loss function


In [None]:
# Regression review:

# Common loss functions and XGBoost:
    
# Loss function names in xgboost:

    # reg:linear - use for regression problems (deprecated alias; newer XGBoost releases call it reg:squarederror)
    
    # reg:logistic - use for classification problems when you want just decision, not probability
    
    # binary:logistic - use when you want probability rather than just decision


In [18]:
# Base learners and why we need them:

# XGBoost involves creating a meta-model that is composed of many individual models that combine to give a final prediction
# Individual models = base learners
# Want base learners that when combined create final prediction that is non-linear
# Each base learner should be good at distinguishing or predicting different parts of the dataset
# Two kinds of base learners: tree and linear


In [33]:
# Trees as base learners example: Scikit-learn API:

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Seed used by the split below, for reproducibility.
SEED = 1

# Read the raw table, then turn categorical columns into dummy columns
# (drop_first=True drops the first level of each encoded column).
auto_old = pd.read_csv('auto.csv')
auto = pd.get_dummies(auto_old, drop_first=True)

# 'mpg' is the regression target; all remaining columns are features.
y = auto['mpg']
X = auto.drop(columns='mpg')

# 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)


In [42]:
# Tree-based regressor through the scikit-learn API.
# 'reg:squarederror' is the current name of the squared-error objective; the
# old spelling 'reg:linear' is a deprecated alias for the same loss.
# `random_state` is the documented seed argument of the wrapper.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=123)

xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)

# Root-mean-squared error on the held-out test rows.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("RMSE: %f" % (rmse))

RMSE: 9.447065


In [35]:
# Linear base learners example: learning API only:

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Seed used by the split below, for reproducibility.
SEED = 1

# Load the table and dummy-encode its categorical columns.
auto_old = pd.read_csv('auto.csv')
auto = pd.get_dummies(auto_old, drop_first=True)

# Target column versus feature columns.
y = auto['mpg']
X = auto.drop(columns='mpg')

# Split dataset into 70% train and 30% test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)


In [52]:
# The learning API works on DMatrix objects rather than DataFrames.
DM_train = xgb.DMatrix(data=X_train, label=y_train)

DM_test = xgb.DMatrix(data=X_test, label=y_test)

# Linear booster with squared-error loss. 'reg:squarederror' replaces the
# deprecated alias 'reg:linear'; both name the same objective.
params = {'booster': 'gblinear', 'objective': 'reg:squarederror'}

xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)

preds = xg_reg.predict(DM_test)

# Root-mean-squared error on the held-out test rows.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))


RMSE: 5.356493


In [1]:
# Regularization and base learners in XGBoost:

#Regularization in XGBoost:
    
# Regularization is a control on model complexity
# Want models that are both accurate and as simple as possible
# Regularization parameters in XGBoost:
    # gamma - minimum loss reduction allowed for a split to occur
    # alpha - l1 regularization on leaf weights, larger values mean more regularization
    # lambda - l2 regularization on leaf weights


In [14]:
# L1 regularization in XGBoost example:

import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Seed used by the split below, for reproducibility.
SEED = 1

# Relative CSV paths below are resolved against this directory.
os.chdir('C:/Users/stayde/Documents/Python Scripts/Python_24_MLwithTreeBasedModels')

# Load the table and dummy-encode its categorical columns.
auto_old = pd.read_csv('auto.csv')
auto = pd.get_dummies(auto_old, drop_first=True)

# Target column versus feature columns.
y = auto['mpg']
X = auto.drop(columns='mpg')

# Split dataset into 70% train and 30% test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# The complete (unsplit) data as a DMatrix.
auto_DMatrix = xgb.DMatrix(data=X, label=y)


In [23]:
# Settings shared by every run of the sweep. 'reg:squarederror' replaces the
# deprecated alias 'reg:linear'; both name the same squared-error objective.
params = {'objective': 'reg:squarederror', 'max_depth': 4}

# L1 (alpha) strengths to compare.
l1_params = [1, 10, 100]

# Final-round cross-validated test RMSE, one entry per alpha value.
rmses_l1 = []

for reg in l1_params:
    # Build a fresh dict per run so the shared `params` is never mutated.
    cv_results = xgb.cv(dtrain=auto_DMatrix, params=dict(params, alpha=reg), nfold=4,
                        num_boost_round=10, metrics='rmse', as_pandas=True, seed=123)
    rmses_l1.append(cv_results['test-rmse-mean'].tail(1).values[0])

# Print the header once, after the loop; it used to sit inside the loop body
# and was repeated for every alpha value.
print('Best rmse as a function of l1:')

print(pd.DataFrame(list(zip(l1_params, rmses_l1)), columns=["l1", "rmse"]))


Best rmse as a fucntion of l1:
Best rmse as a fucntion of l1:
Best rmse as a fucntion of l1:
    l1      rmse
0    1  3.969474
1   10  4.109471
2  100  4.310505


In [None]:
# Base learners in XGBoost:

    # Linear Base Learner:
        #Sum of linear terms
        #Boosted model is weighted sum of linear models (thus is itself linear)
        #Rarely used
        
    # Tree Base Learner:
        #Decision tree
        #Boosted model is weighted sum of decision trees (nonlinear)
        #Almost exclusively used in XGBoost


In [12]:
# Why tune your model?

# Untuned model example:

import os

import pandas as pd
import xgboost as xgb

# Relative CSV paths in the housing cells are resolved against this directory.
housing_dir = 'C:/Users/stayde/Documents/Python Scripts/XGBoost'
os.chdir(housing_dir)


In [13]:
housing_data = pd.read_csv('Ames_housing_trimmed_processed.csv')

# 'SalePrice' is the regression target; every other column is a feature.
X = housing_data.drop('SalePrice', axis=1)

y = housing_data['SalePrice']

housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Baseline: only the objective is set, everything else stays at its default.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear'.
untuned_params = {'objective': 'reg:squarederror'}

# Cross-validation with xgb.cv's default fold count and number of rounds.
untuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=untuned_params,
                                 metrics='rmse', as_pandas=True, seed=123)

# Last expression of the cell: display the per-round results table.
untuned_cv_results_rmse

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,141760.239583,417.078594,142563.78125,767.918982
1,102689.513021,119.371304,104973.690105,555.198377
2,75277.822917,94.41718,79158.580729,660.998064
3,56039.447917,256.403853,61316.019531,978.407785
4,42472.856771,292.832042,50255.43099,1647.817557
5,33088.195313,296.928635,43090.342448,1564.793036
6,26456.966146,239.910827,38370.486979,1657.280555
7,21774.842448,213.561293,35764.684896,1518.868932
8,18601.769531,166.786679,34405.320313,1500.546509
9,16316.602213,123.657185,33649.875,1080.103649


In [15]:
# Print only the final-round value; passing the one-row Series itself also
# printed its index label, name and dtype next to the number.
print('Untuned rmse: ', untuned_cv_results_rmse["test-rmse-mean"].tail(1).values[0])

Untuned rmse:  9    33649.875
Name: test-rmse-mean, dtype: float64


In [20]:
# Tuned model example:

# Hand-picked settings to compare against the untuned baseline.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear'.
tuned_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
}

# 4-fold cross-validation over 200 boosting rounds.
tuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=tuned_params, nfold=4,
                               num_boost_round=200, metrics='rmse',
                               as_pandas=True, seed=123)

# Last expression of the cell: show the final-round test RMSE.
tuned_cv_results_rmse["test-rmse-mean"].tail(1)




199    29965.411133
Name: test-rmse-mean, dtype: float64

In [21]:
# Tunable parameters in XGBoost:

# Common tree tunable parameters :
    # learning rate: learning rate/eta
    # gamma: min loss reduction to create new tree split
    # lambda: L2 reg on leaf weights
    # alpha: L1 reg on leaf weights
    # max_depth: max depth per tree
    # subsample: % samples used per tree
    # colsample_bytree: % features used per tree


In [22]:
# Linear tunable parameters:
    
# lambda: L2 reg on weights
# alpha: L1 reg on weights
# lambda_bias: L2 reg term on bias
# You can also tune the number of estimators used for both base model types !


In [None]:
# Grid search: review:

# Search exhaustively over a given set of hyperparameters, once per set of hyperparameters

# Number of models = number of distinct values per hyperparameter multiplied across each hyperparameter

# Pick final model hyperparameter values that give best cross-validated evaluation metric value


In [9]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Relative CSV paths below are resolved against this directory.
os.chdir('C:/Users/stayde/Documents/Python Scripts/XGBoost')

housing_data = pd.read_csv('Ames_housing_trimmed_processed.csv')

# 'SalePrice' is the regression target; every other column is a feature.
y = housing_data['SalePrice']
X = housing_data.drop(columns='SalePrice')

housing_dmatrix = xgb.DMatrix(data=X, label=y)

# 4 learning rates x 1 estimator count x 3 subsample ratios = 12 candidates.
gbm_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.5, 0.9],
    n_estimators=[200],
    subsample=[0.3, 0.5, 0.9],
)

gbm = xgb.XGBRegressor()

# Exhaustive search over the grid, 4-fold CV, scored by negated MSE.
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1,
)
grid_mse.fit(X, y)

# The score is a negated MSE, so take its magnitude before the square root.
best_rmse = np.sqrt(np.abs(grid_mse.best_score_))

print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", best_rmse)


Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   51.4s finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Best parameters found:  {'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.5}
Lowest RMSE found:  28410.039476552454


In [15]:
# Random search: review:

# Create a (possibly infinite) range of hyperparameter values per hyperparameter that you would like to search over
# Set the number of iterations you would like for the random search to continue
# During each iteration, randomly draw a value in the range of specified values for each hyperparameter searched over and train/evaluate a model with those hyperparameters
# After you've reached the maximum number of iterations, select the hyperparameter configuration with the best evaluated score

import os
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


# Relative CSV paths below are resolved against this directory.
os.chdir('C:/Users/stayde/Documents/Python Scripts/XGBoost')

housing_data = pd.read_csv('Ames_housing_trimmed_processed.csv')

# 'SalePrice' is the regression target; every other column is a feature.
X = housing_data.drop('SalePrice', axis=1)

y = housing_data['SalePrice']

housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Candidate values: learning_rate and subsample on a 0.05-step grid, with a
# single fixed n_estimators value.
gbm_param_grid = {'learning_rate': np.arange(0.05, 1.05, 0.05),
                  'n_estimators': [200],
                  'subsample': np.arange(0.05, 1.05, 0.05)}

gbm = xgb.XGBRegressor()

# random_state pins which 75 candidates are drawn; without it every run
# samples a different subset and reports a different "best" result.
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,
                                    n_iter=75, scoring='neg_mean_squared_error',
                                    cv=4, verbose=1, random_state=123)

randomized_mse.fit(X, y)

print("Best parameters found: ", randomized_mse.best_params_)
# The score is a negated MSE, so take its magnitude before the square root.
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))


Fitting 4 folds for each of 75 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  4.9min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Best parameters found:  {'subsample': 0.45, 'n_estimators': 200, 'learning_rate': 0.15000000000000002}
Lowest RMSE found:  27709.159575188245


In [None]:
# Preprocessing I: LabelEncoder and OneHotEncoder:
        
# LabelEncoder : Converts a categorical column of strings into integers
# OneHotEncoder : Takes the column of integers and encodes them as dummy variables


In [None]:
#Pipeline Operator theory and code not covered.