### Installing XGBoost

* Execute below command in terminal


* !pip install xgboost
   *   OR
* sudo conda install -c conda-forge xgboost (Run in terminal)

In [217]:
import pandas as pd
import numpy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score,classification_report, recall_score, precision_score,confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

#### Problem Statement - Tennis Australia is trying to better automate how tennis points get categorized into three outcomes – winners, forced errors and unforced errors.

#### Dataset Description:

The dataset includes point outcomes of rallies only (where the number of shots hit exceeds two, which represents the serve and return). All points were played at a past Australian Open.


In [173]:
data = pd.read_csv("./DataSet/train.csv")

In [174]:
data.shape

(8001, 27)

#### See the top 5 rows of the data

In [175]:
data.head()

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender,ID
0,4,1,B,35.515042,-0.021725,3.474766,6.797621,False,False,1.46757,...,0.705435,12.5628,2.0724,True,F,0.445318,False,UE,mens,8644
1,4,2,B,33.38264,1.114202,2.540801,2.608708,False,True,2.311931,...,3.8566,12.3544,5.1124,False,B,0.432434,False,FE,mens,1182
2,23,1,B,22.31669,-0.254046,3.533166,9.435749,False,False,3.903728,...,2.908892,13.862,1.6564,False,F,0.397538,True,FE,mens,9042
3,9,1,F,36.837309,0.766694,0.586885,3.34218,True,False,0.583745,...,0.557554,14.2596,0.1606,True,B,0.671984,True,UE,mens,1222
4,4,1,B,35.544208,0.116162,0.918725,5.499119,False,False,2.333456,...,3.945317,11.3658,1.1082,False,F,0.340411,False,W,mens,4085


#### Different classes in Outcome variable

In [176]:
pd.unique(data.outcome)

array(['UE', 'FE', 'W'], dtype=object)

#### Outcome variable - classes
* Winner – the point winning player hits a shot that is not touched by the opponent
* Forced error – the point winning player hits a shot that causes the opponent to not be able to return it, i.e. a good shot that is hard to handle
* Unforced error – the player attempting to return the ball makes an error on an otherwise normal looking rally shot

#### Check the number of columns

In [177]:
len(data.columns)

27

#### Display data type of each variable

In [178]:
data.dtypes

rally                                   int64
serve                                   int64
hitpoint                               object
speed                                 float64
net.clearance                         float64
distance.from.sideline                float64
depth                                 float64
outside.sideline                         bool
outside.baseline                         bool
player.distance.travelled             float64
player.impact.depth                   float64
player.impact.distance.from.center    float64
player.depth                          float64
player.distance.from.center           float64
previous.speed                        float64
previous.net.clearance                float64
previous.distance.from.sideline       float64
previous.depth                        float64
opponent.depth                        float64
opponent.distance.from.center         float64
same.side                                bool
previous.hitpoint                 

#### Identifying categorical attributes

In [179]:
categorical_list = ["hitpoint","outside.sideline",
                    "outside.baseline","same.side","previous.hitpoint","server.is.impact.player","gender","outcome"]

#### Converting to appropriate datatype

In [180]:
for i in data[categorical_list]:
    data[i] = data[i].astype("category")    

#### Display data type of each variable after conversion

In [181]:
data.dtypes

rally                                    int64
serve                                    int64
hitpoint                              category
speed                                  float64
net.clearance                          float64
distance.from.sideline                 float64
depth                                  float64
outside.sideline                      category
outside.baseline                      category
player.distance.travelled              float64
player.impact.depth                    float64
player.impact.distance.from.center     float64
player.depth                           float64
player.distance.from.center            float64
previous.speed                         float64
previous.net.clearance                 float64
previous.distance.from.sideline        float64
previous.depth                         float64
opponent.depth                         float64
opponent.distance.from.center          float64
same.side                             category
previous.hitp

#### Dropping ID column and checking the length of columns

In [182]:
data.drop(["ID"], axis=1, inplace=True)
len(data.columns)

26

#### Display summary statistics 

In [183]:
data.describe()

Unnamed: 0,rally,serve,speed,net.clearance,distance.from.sideline,depth,player.distance.travelled,player.impact.depth,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net
count,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0,8001.0
mean,5.966004,1.3987,30.806938,0.629658,1.46763,4.421146,2.690463,11.899694,1.919544,12.253954,1.213795,28.763676,0.821562,2.19342,4.218717,12.61681,2.367952,0.549988
std,3.548182,0.489661,7.298917,0.982504,1.108697,3.144965,1.713136,2.788231,1.205449,2.039085,0.964364,6.47747,0.674663,1.038942,2.052946,2.075401,1.313927,0.186788
min,3.0,1.0,5.176078,-0.998184,0.000497,0.003135,0.0,2.156,0.0002,1.3898,0.0004,8.449117,0.028865,0.000164,0.000467,2.1612,0.0002,0.003201
25%,3.0,1.0,26.77029,-0.027092,0.5395,1.641161,1.444233,11.2214,0.9424,11.3742,0.5518,24.033218,0.404815,1.354458,2.733674,12.0824,1.3522,0.432164
50%,5.0,1.0,32.41769,0.44587,1.210847,3.860266,2.360894,12.6918,1.8294,12.5516,0.9838,29.793417,0.658382,2.168822,4.126864,12.9016,2.332,0.507559
75%,7.0,2.0,35.681431,0.970844,2.215955,7.029345,3.565853,13.553,2.7452,13.498,1.5966,33.581003,1.021397,3.022677,5.595515,13.7128,3.259,0.624135
max,38.0,2.0,55.052795,12.815893,7.569757,11.886069,14.480546,18.1256,7.7462,18.7458,9.3526,54.207506,6.730275,4.114361,9.997963,20.211,6.8526,1.635257


In [184]:
data.describe(include=['category'])

Unnamed: 0,hitpoint,outside.sideline,outside.baseline,same.side,previous.hitpoint,server.is.impact.player,outcome,gender
count,8001,8001,8001,8001,8001,8001,8001,8001
unique,4,2,2,2,4,2,3,2
top,F,False,False,False,F,True,UE,mens
freq,4402,6500,6380,6036,3684,4670,3501,4005


#### Checking for null values

In [185]:
data.isnull().sum()

rally                                 0
serve                                 0
hitpoint                              0
speed                                 0
net.clearance                         0
distance.from.sideline                0
depth                                 0
outside.sideline                      0
outside.baseline                      0
player.distance.travelled             0
player.impact.depth                   0
player.impact.distance.from.center    0
player.depth                          0
player.distance.from.center           0
previous.speed                        0
previous.net.clearance                0
previous.distance.from.sideline       0
previous.depth                        0
opponent.depth                        0
opponent.distance.from.center         0
same.side                             0
previous.hitpoint                     0
previous.time.to.net                  0
server.is.impact.player               0
outcome                               0


#### Display all the columns

In [186]:
data.columns

Index(['rally', 'serve', 'hitpoint', 'speed', 'net.clearance',
       'distance.from.sideline', 'depth', 'outside.sideline',
       'outside.baseline', 'player.distance.travelled', 'player.impact.depth',
       'player.impact.distance.from.center', 'player.depth',
       'player.distance.from.center', 'previous.speed',
       'previous.net.clearance', 'previous.distance.from.sideline',
       'previous.depth', 'opponent.depth', 'opponent.distance.from.center',
       'same.side', 'previous.hitpoint', 'previous.time.to.net',
       'server.is.impact.player', 'outcome', 'gender'],
      dtype='object')

#### Creating a list of numerical attributes

In [187]:
numeric_list = ['rally','serve','speed','net.clearance','distance.from.sideline','depth','player.distance.travelled','player.impact.depth','player.impact.distance.from.center','player.depth','player.distance.from.center','previous.speed','previous.net.clearance','previous.distance.from.sideline','previous.depth','opponent.depth','opponent.distance.from.center','previous.time.to.net']
numeric_data = data[numeric_list]

#### Creating a list of categorical attributes

In [188]:
categorical_list = ["hitpoint","outside.sideline",
                    "outside.baseline","same.side","previous.hitpoint","server.is.impact.player","gender"]

In [189]:
from sklearn.preprocessing import LabelEncoder

le1 = LabelEncoder()
for i in categorical_list:
    le1.fit(data[i])
    data[i] = le1.transform(data[i])

In [190]:
data.head()

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender
0,4,1,0,35.515042,-0.021725,3.474766,6.797621,0,0,1.46757,...,2.449182,0.705435,12.5628,2.0724,1,1,0.445318,0,UE,0
1,4,2,0,33.38264,1.114202,2.540801,2.608708,0,1,2.311931,...,0.583291,3.8566,12.3544,5.1124,0,0,0.432434,0,FE,0
2,23,1,0,22.31669,-0.254046,3.533166,9.435749,0,0,3.903728,...,1.11525,2.908892,13.862,1.6564,0,1,0.397538,1,FE,0
3,9,1,1,36.837309,0.766694,0.586885,3.34218,1,0,0.583745,...,3.256695,0.557554,14.2596,0.1606,1,0,0.671984,1,UE,0
4,4,1,0,35.544208,0.116162,0.918725,5.499119,0,0,2.333456,...,1.431146,3.945317,11.3658,1.1082,0,1,0.340411,0,W,0


In [191]:
# Divide into train and test
y=data["outcome"]
X=data.drop('outcome', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6400, 25)
(1601, 25)
(6400,)
(1601,)


In [192]:
y_train.value_counts()

UE    2767
W     2171
FE    1462
Name: outcome, dtype: int64

#### With Standardization

In [193]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [194]:
X_train.dtypes

rally                                   int64
serve                                   int64
hitpoint                                int64
speed                                 float64
net.clearance                         float64
distance.from.sideline                float64
depth                                 float64
outside.sideline                        int64
outside.baseline                        int64
player.distance.travelled             float64
player.impact.depth                   float64
player.impact.distance.from.center    float64
player.depth                          float64
player.distance.from.center           float64
previous.speed                        float64
previous.net.clearance                float64
previous.distance.from.sideline       float64
previous.depth                        float64
opponent.depth                        float64
opponent.distance.from.center         float64
same.side                               int64
previous.hitpoint                 

In [195]:
continuous_cols=['rally','serve','speed','net.clearance','distance.from.sideline','depth','player.distance.travelled','player.impact.depth','player.impact.distance.from.center','player.depth','player.distance.from.center','previous.speed','previous.net.clearance','previous.distance.from.sideline','previous.depth','opponent.depth','opponent.distance.from.center','previous.time.to.net']
categorical_cols=["hitpoint","outside.sideline",
                    "outside.baseline","same.side","previous.hitpoint","server.is.impact.player","gender"]

In [196]:
scaler = StandardScaler()
scaler.fit(X_train[continuous_cols])

X_train_num = pd.DataFrame(scaler.transform(X_train[continuous_cols]), columns=continuous_cols)
X_test_num = pd.DataFrame(scaler.transform(X_test[continuous_cols]), columns=continuous_cols)

In [197]:
X_train_num.head()

Unnamed: 0,rally,serve,speed,net.clearance,distance.from.sideline,depth,player.distance.travelled,player.impact.depth,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net
0,-0.826632,-0.815168,0.334783,0.199363,-0.765017,-1.204024,0.130585,-0.741256,-0.401646,-0.712978,0.086026,-1.400818,-0.01784,1.034988,0.634947,-0.480712,0.814835,0.682134
1,0.00756,-0.815168,0.29371,0.487594,0.857224,-1.30757,1.093291,0.631147,1.074472,0.838741,0.121077,0.016195,0.394119,-1.153055,-1.403107,0.327623,-1.783663,-0.172335
2,-0.826632,-0.815168,0.078221,-0.692645,0.015536,0.742131,-1.320338,-0.292535,-0.052576,-0.635704,0.232414,0.331064,-0.991767,1.173984,1.169029,-0.006076,1.339355,-0.382251
3,-0.826632,-0.815168,0.461045,-0.051382,-0.649263,-1.257422,-0.572307,-0.851849,-0.884169,-0.515542,-0.484681,-1.33022,0.4291,0.436658,0.53281,0.186206,0.397691,1.359821
4,0.563688,1.226741,-2.094231,-0.408501,-0.479723,0.096463,4.840486,-2.770874,0.56008,0.979555,0.41076,-2.859353,2.300631,-0.194781,2.459385,-3.632265,-0.55953,2.128145


In [198]:
print(X_train.hitpoint.value_counts())
print(X_train['outside.sideline'].value_counts())

1    3526
0    2430
2     344
3     100
Name: hitpoint, dtype: int64
0    5199
1    1201
Name: outside.sideline, dtype: int64


In [199]:
ohe = OneHotEncoder()

ohe.fit(X_train[categorical_cols])

columns_ohe = list(ohe.get_feature_names())
print(columns_ohe)

['x0_0.0', 'x0_1.0', 'x0_2.0', 'x0_3.0', 'x1_0.0', 'x1_1.0', 'x2_0.0', 'x2_1.0', 'x3_0.0', 'x3_1.0', 'x4_0.0', 'x4_1.0', 'x4_2.0', 'x4_3.0', 'x5_0.0', 'x5_1.0', 'x6_0.0', 'x6_1.0']


In [200]:
X_train_cat = ohe.transform(X_train[categorical_cols])
X_test_cat = ohe.transform(X_test[categorical_cols])

In [201]:
X_train_cat = pd.DataFrame(X_train_cat.todense(), columns=columns_ohe)
X_test_cat = pd.DataFrame(X_test_cat.todense(), columns=columns_ohe)

In [202]:
X_train_cat.head()

Unnamed: 0,x0_0.0,x0_1.0,x0_2.0,x0_3.0,x1_0.0,x1_1.0,x2_0.0,x2_1.0,x3_0.0,x3_1.0,x4_0.0,x4_1.0,x4_2.0,x4_3.0,x5_0.0,x5_1.0,x6_0.0,x6_1.0
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [203]:
# X_train_num = X_train.drop(columns=categorical_cols, axis=1)
# X_test_num = X_test.drop(columns=categorical_cols, axis=1)

In [204]:
X_train = pd.concat([X_train_cat, X_train_num], axis=1)
X_test = pd.concat([X_test_cat, X_test_num], axis=1)

In [205]:
X_train.head()

Unnamed: 0,x0_0.0,x0_1.0,x0_2.0,x0_3.0,x1_0.0,x1_1.0,x2_0.0,x2_1.0,x3_0.0,x3_1.0,...,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,-0.401646,-0.712978,0.086026,-1.400818,-0.01784,1.034988,0.634947,-0.480712,0.814835,0.682134
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.074472,0.838741,0.121077,0.016195,0.394119,-1.153055,-1.403107,0.327623,-1.783663,-0.172335
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,-0.052576,-0.635704,0.232414,0.331064,-0.991767,1.173984,1.169029,-0.006076,1.339355,-0.382251
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,-0.884169,-0.515542,-0.484681,-1.33022,0.4291,0.436658,0.53281,0.186206,0.397691,1.359821
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.56008,0.979555,0.41076,-2.859353,2.300631,-0.194781,2.459385,-3.632265,-0.55953,2.128145


In [206]:
X_train.isna().sum()

x0_0.0                                0
x0_1.0                                0
x0_2.0                                0
x0_3.0                                0
x1_0.0                                0
x1_1.0                                0
x2_0.0                                0
x2_1.0                                0
x3_0.0                                0
x3_1.0                                0
x4_0.0                                0
x4_1.0                                0
x4_2.0                                0
x4_3.0                                0
x5_0.0                                0
x5_1.0                                0
x6_0.0                                0
x6_1.0                                0
rally                                 0
serve                                 0
speed                                 0
net.clearance                         0
distance.from.sideline                0
depth                                 0
player.distance.travelled             0


In [207]:
print(type(X_train))
print(type(X_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [208]:
X_train = X_train.values
X_test = X_test.values

print(type(X_train))
print(type(X_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## Model Building

### Build AdaBoost Classifier
<img src="img/bigd.png">

### Getting Started
To begin with, a __weak classifier__ is trained, and __all of the example data samples are given an equal weight__. 

Once the initial classifier is trained, two things happen. 
- A weight is calculated for the classifier, with more accurate classifiers being given a higher weight, and less accurate a lower weight. The weight is calculated based on the classifier’s error rate, which is the __number of misclassifications in the training set, divided by total training set size__. This output weight per model is known as the __“alpha”__.

    <img src="img/weightage_to_weak_learner.png">

- Secondly, the AdaBoost algorithm directs its attention to __misclassified data examples from our first weak classifier__, by assigning weights to each data sample, the value of which is defined by whether the classifier correctly or incorrectly classified the sample.

    <img src="img/adaboost_formula.png" >

### Final Model

Once all of the iterations have been completed, all of the __weak learners are combined with their weights to form a strong classifier__, as expressed in the below equation:

<img src="img/final_model.png">

The final classifier is therefore __built up of “T” weak classifiers, ht(x) is the output of the weak classifier, with at the weight applied to the classifier__. 

The final output is therefore a combination of all of the classifiers.

#### Create Adaboost Classifier

The most important parameters are base_estimator, n_estimators, and learning_rate.
-  **base_estimator** is the learning algorithm to use to train the weak models. This will almost always not needed to be changed because by far the most common learner to use with AdaBoost is a decision tree – this parameter’s default argument.
-  **n_estimators** is the number of models to iteratively train.
-  **learning_rate** is the contribution of each model to the weights and defaults to 1. Reducing the learning rate will mean the weights will be increased or decreased to a small degree, forcing the model train slower (but sometimes resulting in better performance scores).
-  **loss** is exclusive to AdaBoostRegressor and sets the loss function to use when updating weights. This defaults to a linear loss function however can be changed to square or exponential.

In [211]:
# Create adaboost-decision tree classifer object
Adaboost_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators = 600,
    learning_rate = 1)

In [212]:
%time Adaboost_model.fit(X_train, y_train)

CPU times: user 22 s, sys: 0 ns, total: 22 s
Wall time: 22.1 s


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [213]:
y_preds = Adaboost_model.predict(X_test)

In [214]:
print(accuracy_score(y_test, y_preds))

0.830106183635228


In [218]:
param_grid = {'n_estimators' : [100, 150, 200],
              'learning_rate' : [0.1, 0.5, 0.9]}

Adaboost_model_clf = GridSearchCV(AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2)), param_grid, n_jobs=-1)

In [219]:
%time Adaboost_model_clf.fit(X_train, y_train)

CPU times: user 5.63 s, sys: 55.7 ms, total: 5.69 s
Wall time: 36.7 s


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=2,
                                                                                max_features=None,
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
                                                                              

In [220]:
best_ada_model = Adaboost_model_clf.best_estimator_
print (Adaboost_model_clf.best_score_, Adaboost_model_clf.best_params_) 

0.855 {'learning_rate': 0.1, 'n_estimators': 150}


In [221]:
y_pred_test = best_ada_model.predict(X_test)

In [222]:
print(accuracy_score(y_test,y_pred_test))

0.8557151780137414


In [223]:
print(confusion_matrix(y_test, y_pred_test))

[[230 110  16]
 [ 52 652  30]
 [ 10  13 488]]


## Build Gradient Boosting Classifier

<img src="img/grad_1.png">
<img src="img/grad_2.png">
<img src="img/grad_3.png">

Gradient boosting is a type of boosting. 

The key idea behind gradient boosting is to set the target outcomes for this next model in order to minimize the error. The target outcome for each case in the data set depends on how much a change in that case’s prediction impacts the overall prediction error.

If, for case X(i), a small change in the prediction causes a large drop in error, then the next target outcome is a high value. Predictions from the new model that are close to its targets will reduce the error.

If, for case X(j), a small change in the prediction causes no change in error, then the next target outcome is zero because changing this prediction does not decrease the error.

The name gradient boosting arises because of setting target outcomes based on the gradient of the error with respect to the prediction of each case. Each new model takes a step in the direction that minimizes prediction error, in the space of possible predictions for each training case.

1. Initialize the outcome
2. Iterate from 1 to total number of trees
  <br>2.1 Update the weights for targets based on previous run (higher for the ones mis-classified)
  <br>2.2 Fit the model on selected subsample of data
  <br>2.3 Make predictions on the full set of observations
  <br>2.4 Update the output with current results taking into account the learning rate
3. Return the final output.

The most important parameters are learning_rate, n_estimators and subsample
- **learning_rate**
    -  This determines the impact of each tree on the final outcome (step 2.4). GBM works by starting with an initial estimate which is updated using the output of each tree. The learning parameter controls the magnitude of this change in the estimates.
    -  Lower values are generally preferred as they make the model robust to the specific characteristics of tree and thus allowing it to generalize well.
    -  Lower values would require higher number of trees to model all the relations and will be computationally expensive.
- **n_estimators**
    -  The number of sequential trees to be modeled (step 2)
    -  Though GBM is fairly robust at higher number of trees but it can still overfit at a point. Hence, this should be tuned using CV for a particular learning rate.
- **subsample**
    -  The fraction of observations to be selected for each tree. Selection is done by random sampling.
    -  Values slightly less than 1 make the model robust by reducing the variance.
    -  Typical values ~0.8 generally work fine but can be fine-tuned further.

In [224]:
GBM_model = GradientBoostingClassifier(n_estimators=50,
                                       learning_rate=0.3,
                                       subsample=0.8)

In [225]:
%time GBM_model.fit(X=X_train, y=y_train)

CPU times: user 6.99 s, sys: 116 ms, total: 7.1 s
Wall time: 2.49 s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.3, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [226]:
y_pred = GBM_model.predict(X_test)

In [227]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred,digits=4))

0.8663335415365396
              precision    recall  f1-score   support

          FE     0.7593    0.7444    0.7518       356
          UE     0.8788    0.8597    0.8691       734
           W     0.9195    0.9609    0.9397       511

    accuracy                         0.8663      1601
   macro avg     0.8525    0.8550    0.8535      1601
weighted avg     0.8652    0.8663    0.8656      1601



In [228]:
# Model in use
GBM = GradientBoostingClassifier() 
 
# Use a grid over parameters of interest
param_grid = { 
           "n_estimators" : [100,150],
           "max_depth" : [5, 10],
           "learning_rate" : [0.1,0.2]}
 
CV_GBM = GridSearchCV(estimator=GBM, param_grid=param_grid, cv= 5)

In [229]:
%time CV_GBM.fit(X=X_train, y=y_train)

CPU times: user 21min 33s, sys: 11.8 s, total: 21min 45s
Wall time: 10min 42s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort=

In [230]:
# Find best model
best_gbm_model = CV_GBM.best_estimator_
print (CV_GBM.best_score_, CV_GBM.best_params_)

0.87359375 {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}


In [231]:
y_pred_test=best_gbm_model.predict(X_test)

In [232]:
print(accuracy_score(y_test,y_pred_test))

0.8775765146783261


In [233]:
print(confusion_matrix(y_test, y_pred_test))

[[273  72  11]
 [ 73 639  22]
 [  8  10 493]]


### Build XGBOOST Classifier
XGBoost (eXtreme Gradient Boosting) is an advanced implementation of gradient boosting algorithm.

<img src="img/xgboost.png" >

#### The XGBoost Advantages
-  Regularization:
    -  Standard GBM implementation has no regularization like XGBoost, therefore it also helps to reduce overfitting.
    -  In fact, XGBoost is also known as ‘regularized boosting‘ technique.
-  Parallel Processing:
    -  XGBoost implements parallel processing and is blazingly faster as compared to GBM.
    -  Boosting is sequential process that each tree can be built only after the previous one, and it  can be parallelized with making a tree using all cores. Refer. http://zhanpengfang.github.io/418home.html
    
    -  XGBoost also supports implementation on Hadoop.
-  High Flexibility
    -  XGBoost allow users to define custom optimization objectives and evaluation criteria.
    -  This adds a whole new dimension to the model and there is no limit to what we can do.
-  Handling Missing Values
    -  XGBoost has an in-built routine to handle missing values.
    -  User is required to supply a different value than other observations and pass that as a parameter. XGBoost tries different things as it encounters a missing value on each node and learns which path to take for missing values in future.
-  Tree Pruning:
    -  A GBM would stop splitting a node when it encounters a negative loss in the split. Thus it is more of a greedy algorithm.
    -  XGBoost on the other hand make splits upto the max_depth specified and then start pruning the tree backwards and remove splits beyond which there is no positive gain.
    -  Another advantage is that sometimes a split of negative loss say -2 may be followed by a split of positive loss +10. GBM would stop as it encounters -2. But XGBoost will go deeper and it will see a combined effect of +8 of the split and keep both.
-  Built-in Cross-Validation
    -  XGBoost allows user to run a cross-validation at each iteration of the boosting process and thus it is easy to get the exact optimum number of boosting iterations in a single run.
    -  This is unlike GBM where we have to run a grid-search and only a limited values can be tested.
-  Continue on Existing Model
    -  User can start training an XGBoost model from its last iteration of previous run. This can be of significant advantage in certain specific applications.
    -  GBM implementation of sklearn also has this feature so they are even on this point.

#### Create XGBoost Classifier

There are different hyperparameters that we can tune and the parametres are different from baselearner to baselearner. 
<br>In tree based learners, which are the most common ones in xgboost applications, the following are the most commonly tuned hyperparameters:

-  **learning rate/eta:** governs how quickly the model fits the residual error using additional base learners. If it is a smaller learning rate, it will need more boosting rounds, hence more time, to achieve the same reduction in residual error as one with larger learning rate. Typically, it lies between 0.01 – 0.3
-  **max_depth:** max depth per tree. This controls how deep our tree can grow. The Larger the depth, more complex the model will be and higher chances of overfitting. Larger data sets require deep trees to learn the rules from data. Default = 6.
-  **subsample:** % samples used per tree. This is the fraction of the total training set that can be used in any boosting round. Low value may lead to underfitting issues. A very high value can cause over-fitting problems.
-  **colsample_bytree:** % features used per tree. This is the fraction of the number of columns that we can use in any boosting round. A smaller value is an additional regularization and a larger value may be cause overfitting issues.
-  **n_estimators:** number of estimators (base learners). This is the number of boosting rounds.
<br><br>The three hyperparameters below are regularization hyperparameters.
-  **gamma:** min loss reduction to create new tree split. default = 0 means no regularization.
-  **lambda:** L2 reg on leaf weights. Equivalent to Ridge regression.
-  **alpha:** L1 reg on leaf weights. Equivalent to Lasso regression.


Refer: https://xgboost.readthedocs.io/en/latest/python/python_api.html

In [234]:
XGB_model = XGBClassifier(n_estimators=500, gamma=0.5,learning_rate=0.1)
%time XGB_model.fit(X_train, y_train)
y_pred = XGB_model.predict(X_test)
print(accuracy_score(y_test,y_pred))

CPU times: user 14.2 s, sys: 3.97 ms, total: 14.2 s
Wall time: 14.4 s
0.8694565896314803


In [235]:
XGB = XGBClassifier(n_jobs=-1)
 
# Use a grid over parameters of interest
param_grid = {
     'colsample_bytree': np.linspace(0.5, 0.9, 2),
     'n_estimators':[100],
     'max_depth': [10, 15]
}

 
CV_XGB = GridSearchCV(estimator=XGB, param_grid=param_grid, cv= 10)

In [236]:
%time CV_XGB.fit(X = X_train, y=y_train)

CPU times: user 5min 31s, sys: 231 ms, total: 5min 31s
Wall time: 5min 31s


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bytree=1,
                                     gamma=0, learning_rate=0.1,
                                     max_delta_step=0, max_depth=3,
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=-1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=True,
                                     subsample=1),
             iid='warn', n_jobs=None,
             param_grid={'colsample_bytree': array([0.5, 0.9]),
                         'max_depth': [10, 15], 'n_estimators': [100]},
             pre_dispatch='2*n_jobs', refit=True, return_train

In [237]:
# Find best model
best_xgb_model = CV_XGB.best_estimator_
print (CV_XGB.best_score_, CV_XGB.best_params_)

0.87546875 {'colsample_bytree': 0.9, 'max_depth': 10, 'n_estimators': 100}


In [238]:
y_pred_test=best_xgb_model.predict(X_test)

In [239]:
print(accuracy_score(y_test,y_pred_test))

0.8750780762023735


In [240]:
print(confusion_matrix(y_test, y_pred_test))

[[267  76  13]
 [ 74 639  21]
 [  4  12 495]]
