# PART II. MODEL BUILDING - BOOSTING

### PREPROCESSING

In [3]:
!pip install xgboost



In [4]:
# Useful libraries

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, classification_report

In [5]:
# Loading dataset

dataset = pd.read_csv("Dataset_for_model_building.csv")

In [6]:
# Splitting the dataset in two tables : Y for the target 'class' and X for the explanatory features

target_name = "class"
Y = dataset.loc[:, target_name]
X = dataset.loc[:, [c for c in dataset.columns if c != target_name]]

display(Y.head())
print()
display(X.head())

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64




Unnamed: 0,purchase_value,source,browser,sex,age,country,month
0,34,SEO,Chrome,M,39,Japan,4
1,16,Ads,Chrome,F,53,United States,6
2,15,SEO,Opera,M,53,United States,1
3,44,SEO,Safari,M,41,Unknown country,5
4,39,Ads,Safari,M,45,United States,9


In [7]:
# One Hot Encoding for categorical variables of X

X = pd.get_dummies(X)
X.head()

Unnamed: 0,purchase_value,age,month,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,...,country_Tunisia,country_Turkey,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Unknown country,country_Uruguay,country_Venezuela,country_Viet Nam
0,34,39,4,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,53,6,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,15,53,1,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,44,41,5,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,39,45,9,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Splitting the dataset in Train and Test sets

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42, stratify = Y)

In [9]:
# Standardizing X_train and X_test

sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

### ADABOOST

In [22]:
params = { 'n_estimators' : [50, 100, 150, 200], 'learning_rate' : [1.0, 0.5, 0.1]}

grid_ab = GridSearchCV(AdaBoostClassifier(), params)

grid_ab.fit(X_train, Y_train)

GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [1.0, 0.5, 0.1],
                         'n_estimators': [50, 100, 150, 200]})

In [23]:
# Predicting

Y_train_pred = grid_ab.predict(X_train)
Y_test_pred = grid_ab.predict(X_test)

In [24]:
# Computing f1_score

print("f1 score on train set is : ", f1_score(Y_train, Y_train_pred))
print("f1 score on test set is : ", f1_score(Y_test, Y_test_pred))

f1 score on train set is :  0.638198516058735
f1 score on test set is :  0.6382716049382716


In [26]:
# Computing classification scores

print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     27171
           1       0.76      0.55      0.64      2816

    accuracy                           0.94     29987
   macro avg       0.86      0.77      0.80     29987
weighted avg       0.94      0.94      0.94     29987



In [25]:
# Visualizing a confusion matrix

print("confusion matric for Train set")
print(confusion_matrix(Y_train, Y_train_pred) )

print("confusion matric for Test set")
print(confusion_matrix(Y_test, Y_test_pred))

confusion matric for Train set
[[106823   1860]
 [  5113   6150]]
confusion matric for Test set
[[26678   493]
 [ 1265  1551]]


### XGBOOST

In [10]:
params = {'n_estimators': [50, 100, 150, 200],
              "learning_rate": [1.0, 0.5, 0.1]}

xgb_grid = GridSearchCV(XGBClassifier(), params)
xgb_grid.fit(X_train, Y_train)





GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [11]:
xgb_grid.best_params_

{'learning_rate': 0.5, 'n_estimators': 200}

In [12]:
# Predicting

Y_train_pred = xgb_grid.predict(X_train)
Y_test_pred = xgb_grid.predict(X_test)

In [13]:
# Computing f1_score

print("f1 score on train set is : ", f1_score(Y_train, Y_train_pred))
print("f1 score on test set is : ", f1_score(Y_test, Y_test_pred))

f1 score on train set is :  0.6801218583396802
f1 score on test set is :  0.6524132091447926


In [14]:
# Computing classification scores

print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     27171
           1       0.81      0.55      0.65      2816

    accuracy                           0.95     29987
   macro avg       0.88      0.77      0.81     29987
weighted avg       0.94      0.95      0.94     29987



In [15]:
# Visualizing a confusion matrix

print("confusion matric for Train set")
print(confusion_matrix(Y_train, Y_train_pred) )

print("confusion matric for Test set")
print(confusion_matrix(Y_test, Y_test_pred))

confusion matric for Train set
[[107815    868]
 [  5012   6251]]
confusion matric for Test set
[[26804   367]
 [ 1275  1541]]
