# AdaBoosting
Trainer: Rajesh Jakhotia

In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import auc,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from numpy import mean, std
from sklearn.metrics import roc_auc_score

In [2]:
# Read the development (training) sample and the hold-out sample from CSV.
dev, holdout = (
    pd.read_csv(csv_name)
    for csv_name in ("DEV_SAMPLE.csv", "HOLDOUT_SAMPLE.csv")
)


In [3]:
# Preview the first rows of the development sample.
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31


In [4]:
## Class balance of the target: row count per class and each class's share of all rows
target_freq = pd.crosstab(index=dev["Target"], columns="count")

# Divide the count column by its own total to get the proportion per class.
class_counts = target_freq["count"]
target_freq["prop"] = class_counts / class_counts.sum()
target_freq

col_0,count,prop
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12765,0.911786
1,1235,0.088214


## Modeling Data Preparation

In [5]:
# Select the predictor columns: the label slice from 'Age' through 'Holding_Period'.
X =  dev.loc[:,'Age':'Holding_Period']
X.head()

Unnamed: 0,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,52,M,117288.96,SAL,17,>50,768,13
2,31,F,259827.44,SENP,8,31-35,816,5
3,45,F,26677.55,PROF,14,41-45,353,18
4,39,F,43440.31,SENP,1,36-40,751,31


In [6]:
# Data preprocessing: separate the predictors from the response variable.
X = dev.loc[:, "Age":"Holding_Period"]
y_train = dev["Target"]

# One-hot encode the categorical predictors so that every feature column is
# numeric/boolean; the numeric columns pass through unchanged.
X_train = pd.get_dummies(X)

# Show the resulting feature names and confirm the container types.
print(X_train.columns)
print(type(X_train), type(y_train))

Index(['Age', 'Balance', 'No_OF_CR_TXNS', 'SCR', 'Holding_Period', 'Gender_F',
       'Gender_M', 'Gender_O', 'Occupation_PROF', 'Occupation_SAL',
       'Occupation_SELF-EMP', 'Occupation_SENP', 'AGE_BKT_26-30',
       'AGE_BKT_31-35', 'AGE_BKT_36-40', 'AGE_BKT_41-45', 'AGE_BKT_46-50',
       'AGE_BKT_<25', 'AGE_BKT_>50'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [7]:
# Preview the one-hot encoded training features.
X_train.head()

Unnamed: 0,Age,Balance,No_OF_CR_TXNS,SCR,Holding_Period,Gender_F,Gender_M,Gender_O,Occupation_PROF,Occupation_SAL,Occupation_SELF-EMP,Occupation_SENP,AGE_BKT_26-30,AGE_BKT_31-35,AGE_BKT_36-40,AGE_BKT_41-45,AGE_BKT_46-50,AGE_BKT_<25,AGE_BKT_>50
0,41,91519.92,38,926,15,False,True,False,False,False,True,False,False,False,False,True,False,False,False
1,52,117288.96,17,768,13,False,True,False,False,True,False,False,False,False,False,False,False,False,True
2,31,259827.44,8,816,5,True,False,False,False,False,False,True,False,True,False,False,False,False,False
3,45,26677.55,14,353,18,True,False,False,True,False,False,False,False,False,False,True,False,False,False
4,39,43440.31,1,751,31,True,False,False,False,False,False,True,False,False,True,False,False,False,False


### AdaBoost Classifier

In [8]:
# Baseline AdaBoost model with 200 boosting rounds.
# The seed is fixed (same value the tuning cell uses) so that a
# Restart Kernel -> Run All reproduces the same fitted model and scores.
model = AdaBoostClassifier(n_estimators=200, random_state=1212)
model.fit(X_train, y_train)

In [9]:
# Hard class predictions of the baseline model on the training data itself.
pred_y_train = model.predict(X_train)
pred_y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
## In-sample classification accuracy of the baseline model.
## Note: about 91% of the development rows have Target == 0, so accuracy on its
## own says little here; the ROC AUC computed further down is more informative.
score = accuracy_score(y_true=y_train, y_pred=pred_y_train)
score

0.9152857142857143

In [11]:
## Predict Probability
# Keep column 1 of predict_proba (the second class) and store it on `dev`.
dev["predict_prob"] = model.predict_proba(X_train)[:,1]

In [12]:
# ROC AUC of the baseline model on the development (training) sample.
roc_auc_score(y_true=dev["Target"], y_score=dev["predict_prob"])

0.7977105921270681

In [13]:
# Build the hold-out feature matrix from the same predictor columns as training.
X_test = holdout.loc[:, "Age":"Holding_Period"]

# One-hot encode, then align to the training columns. Without the reindex, a
# category that is absent from the hold-out sample would drop a dummy column
# (and a category unseen in training would add one), so the model would receive
# a different feature set from the one it was fitted on.
# Missing dummies are filled with 0; columns not seen in training are discarded.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


In [14]:
## ROC AUC of the baseline model on the hold-out sample

# Score the hold-out rows, keeping column 1 of predict_proba.
holdout["predict_prob"] = model.predict_proba(X_test)[:, 1]

roc_auc_score(y_true=holdout["Target"], y_score=holdout["predict_prob"])


0.7571667987836478

### Tuning the AdaBoost Model

In [15]:
# Preview the candidate n_estimators values used by the grid search below.
np.arange(5,20,2)

array([ 5,  7,  9, 11, 13, 15, 17, 19])

In [16]:
# Hyper-parameter grid: 8 values of n_estimators x 4 learning rates = 32 candidates.
param_dist = dict(
    n_estimators=np.arange(5, 20, 2),
    learning_rate=[0.05, 0.1, 0.2, 0.3],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by ROC AUC.
# `tree` is an AdaBoost estimator despite its name.
tree = AdaBoostClassifier(random_state=1212)
tree_cv = GridSearchCV(
    estimator=tree,
    param_grid=param_dist,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=100,
)
tree_cv.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [17]:
# Mean cross-validated score (ROC AUC, per the `scoring` setting) of the best candidate.
tree_cv.best_score_

0.7619975546749003

In [18]:
## Take the model built with the best combination of parameters
# The tuned estimator is an AdaBoost classifier, so the label says so
# (it previously read "Decision Tree").
print(f"Tuned AdaBoost parameters : {tree_cv.best_params_}")

# best_estimator_ is already refitted on the full training data by GridSearchCV
# (refit=True is the default), so no further fit call is needed.
classifier = tree_cv.best_estimator_


Tuned Decision Tree parameter : {'learning_rate': 0.3, 'n_estimators': 19}


In [19]:
## ROC AUC of the tuned model on the development (training) sample
# This overwrites the baseline model's probabilities stored in dev["predict_prob"].
dev["predict_prob"] = classifier.predict_proba(X_train)[:, 1]
roc_auc_score(y_true=dev["Target"], y_score=dev["predict_prob"])

0.7718760337524639

In [20]:
# ROC AUC of the tuned model on the hold-out sample.
# This overwrites the baseline model's probabilities stored in holdout["predict_prob"].
holdout["predict_prob"] = classifier.predict_proba(X_test)[:, 1]

roc_auc_score(y_true=holdout["Target"], y_score=holdout["predict_prob"])

0.758850925329818

## Practice Exercise
1. Build a model using GradientBoostingClassifier
2. Build a model using Extreme Gradient Boosting (XGBClassifier)

### Thank you