In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [3]:
# Import dataset
print("Loading dataset...")
df = pd.read_csv('conversion_data_train.csv')
print("...Done.")
print()

Loading dataset...
...Done.



In [4]:
# Basic stats
print("Number of rows : {}".format(df.shape[0]))
print()

print("Display of dataset: ")
display(df.head())
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 284580

Display of dataset: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0



Basics statistics: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,284580,284580.0,284580.0,284580,284580.0,284580.0
unique,4,,,3,,
top,US,,,Seo,,
freq,160124,,,139477,,
mean,,30.564203,0.685452,,4.873252,0.032258
std,,8.266789,0.464336,,3.341995,0.176685
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0



Percentage of missing values: 


country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

In [5]:
# Remove age outlier
mask = (df['age'] < 100) & (df['age'] >= 17)
df = df[mask]

df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [6]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = "converted"

X = df.drop(target_variable, axis = 1)
Y = df.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64

X :
   country  age  new_user  source  total_pages_visited
0    China   22         1  Direct                    2
1       UK   21         1     Ads                    3
2  Germany   20         0     Seo                   14
3       US   23         1     Seo                    3
4       US   28         1  Direct                    3


In [7]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['age', 'new_user', 'total_pages_visited']
Found categorical features  ['country', 'source']


In [8]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)
print("...Done.")
print()

Dividing into train and test sets...


...Done.



In [9]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first')

In [10]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
        country  age  new_user  source  total_pages_visited
70238        US   23         0  Direct                    5
11738        US   26         1     Ads                    4
51223   Germany   39         1     Ads                    2
251695       US   39         1     Seo                    8
30005        US   44         1     Seo                    2


...Done.
[[-0.91614386 -1.47777425  0.0376251   0.          0.          1.
   1.          0.        ]
 [-0.55333446  0.67669334 -0.2616379   0.          0.          1.
   0.          0.        ]
 [ 1.01883962  0.67669334 -0.86016391  1.          0.          0.
   0.          0.        ]
 [ 1.01883962  0.67669334  0.9354141   0.          0.          1.
   0.          1.        ]
 [ 1.62352196  0.67669334 -0.86016391  0.          0.          1.
   0.          1.        ]]

Encoding labels...
70238     0
11738     0
51223     0
251695    0
30005     0
Name: converted, dtype: int64
...Done
[0 0 0 0 0]
Performing preprocessings on test set...
       country  age  new_user  source  total_pages_visited
138303      UK   34         1     Ads                    1
133130      UK   32         0     Ads                    5
245758      US   44         1     Ads                    1
185267      US   35         1  Direct                    1
177637      US   29         1  Direct                    3


In [12]:
model_score_df = pd.DataFrame(columns=['model', 'f1_score', 'set'])

### Bagging with logistic regression as base estimator

In [13]:
# Perform grid search
from sklearn.ensemble import BaggingClassifier


print("Grid search...")
logistic_regression = LogisticRegression(max_iter = 1000) # max_iter changed because of convergence warning
model = BaggingClassifier(logistic_regression)

# Grid of values to be tested
params = {
    'base_estimator__C': [0.01, 0.05, 0.1, 0.5,], # base_estimator__ prefix because C is a parameter from LogisticRegression! 
    'n_estimators': [5, 10, 20, 30] # n_estimators is a hyperparameter of the ensemble method
}
print(params)
gridsearch_bag_lr = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch_bag_lr.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_bag_lr.best_params_)
print("Best validation accuracy : ", gridsearch_bag_lr.best_score_)
print()
print("Accuracy on training set : ", gridsearch_bag_lr.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch_bag_lr.score(X_test, Y_test))

Grid search...
{'base_estimator__C': [0.01, 0.05, 0.1, 0.5], 'n_estimators': [5, 10, 20, 30]}


  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(param

...Done.
Best hyperparameters :  {'base_estimator__C': 0.5, 'n_estimators': 5}
Best validation accuracy :  0.9862427633943264

Accuracy on training set :  0.9862383709182911
Accuracy on test set :  0.9861198959870686


In [14]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_bag_lr.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch_bag_lr.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

Predictions on training set...


...Done.
[0 0 0 ... 0 0 0]

Probabilities on training set...
...Done.
[[9.94626157e-01 5.37384329e-03]
 [9.99543001e-01 4.56998625e-04]
 [9.99940571e-01 5.94290607e-05]
 ...
 [9.99822255e-01 1.77745189e-04]
 [8.72880324e-01 1.27119676e-01]
 [9.95618450e-01 4.38155013e-03]]



In [15]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_bag_lr.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch_bag_lr.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

Predictions on test set...


...Done.
[0 0 0 ... 0 0 0]

Probabilities on test set...
...Done.
[[9.99962300e-01 3.77000410e-05]
 [9.95129351e-01 4.87064943e-03]
 [9.99987099e-01 1.29011604e-05]
 ...
 [9.99757881e-01 2.42118833e-04]
 [9.97376526e-01 2.62347406e-03]
 [9.99535081e-01 4.64919483e-04]]



In [16]:
print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

f1-score on training set :  0.762992662077313
f1-score on test set :  0.76247745039086


In [17]:
rows_model_score = pd.DataFrame({'model': ['bagging_log_reg', 'bagging_log_reg'], 'f1_score': [f1_score(Y_train, Y_train_pred), f1_score(Y_test, Y_test_pred)], 'set': ['train', 'test']})

model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)

  model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)


### Bagging with decision tree as base estimator

In [18]:
# Perform grid search
from sklearn.tree import DecisionTreeClassifier


print("Grid search...")
decision_tree = DecisionTreeClassifier()
model = BaggingClassifier(decision_tree)

# Grid of values to be tested
params = {
    'base_estimator__max_depth': [1, 2, 3],
    'base_estimator__min_samples_leaf': [1, 2, 3],
    'base_estimator__min_samples_split': [2, 3, 4],
    'n_estimators': [2, 4, 6, 8, 10]
}
print(params)
gridsearch_bag_dec_tr = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch_bag_dec_tr.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_bag_dec_tr.best_params_)
print("Best validation accuracy : ", gridsearch_bag_dec_tr.best_score_)
print()
print("Accuracy on training set : ", gridsearch_bag_dec_tr.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch_bag_dec_tr.score(X_test, Y_test))

  estimator = estimator.set_params(**clone(parameters, safe=False))


Grid search...
{'base_estimator__max_depth': [1, 2, 3], 'base_estimator__min_samples_leaf': [1, 2, 3], 'base_estimator__min_samples_split': [2, 3, 4], 'n_estimators': [2, 4, 6, 8, 10]}


  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(param

...Done.
Best hyperparameters :  {'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 1, 'base_estimator__min_samples_split': 2, 'n_estimators': 2}
Best validation accuracy :  0.9845033424425437

Accuracy on training set :  0.9845033426746669
Accuracy on test set :  0.9843804905474735


In [19]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_bag_dec_tr.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch_bag_dec_tr.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_bag_dec_tr.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch_bag_dec_tr.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on training set...
...Done.
[[0.99775517 0.00224483]
 [0.99775517 0.00224483]
 [0.99775517 0.00224483]
 ...
 [0.99775517 0.00224483]
 [0.9569952  0.0430048 ]
 [0.99775517 0.00224483]]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on test set...
...Done.
[[0.99775517 0.00224483]
 [0.99775517 0.00224483]
 [0.99775517 0.00224483]
 ...
 [0.99775517 0.00224483]
 [0.99775517 0.00224483]
 [0.99775517 0.00224483]]

f1-score on training set :  0.7282809611829943
f1-score on test set :  0.7265456782528452


In [20]:
rows_model_score = pd.DataFrame({'model': ['bagging_dec_tree', 'bagging_dec_tree'], 'f1_score': [f1_score(Y_train, Y_train_pred), f1_score(Y_test, Y_test_pred)], 'set': ['train', 'test']})

model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)

### Adaboost with logistic regression as base estimator


In [21]:
# Perform grid search
from sklearn.ensemble import AdaBoostClassifier


print("Grid search...")
logistic_regression = LogisticRegression(max_iter = 1000) # max_iter changed because of convergence warning
model = AdaBoostClassifier(logistic_regression)

# Grid of values to be tested
params = {
    'base_estimator__C': [0.01, 0.05, 0.1, 0.5,], # base_estimator__ prefix because C is a parameter from LogisticRegression! 
    'n_estimators': [5, 10, 20, 30] # n_estimators is a hyperparameter of the ensemble method
}
print(params)
gridsearch_adboost_lr = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch_adboost_lr.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_adboost_lr.best_params_)
print("Best validation accuracy : ", gridsearch_adboost_lr.best_score_)
print()
print("Accuracy on training set : ", gridsearch_adboost_lr.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch_adboost_lr.score(X_test, Y_test))

Grid search...
{'base_estimator__C': [0.01, 0.05, 0.1, 0.5], 'n_estimators': [5, 10, 20, 30]}


  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(param

...Done.
Best hyperparameters :  {'base_estimator__C': 0.5, 'n_estimators': 30}
Best validation accuracy :  0.9853466982905624

Accuracy on training set :  0.9852983809331377
Accuracy on test set :  0.9852765478951437


In [22]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_adboost_lr.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch_adboost_lr.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_adboost_lr.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch_adboost_lr.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on training set...
...Done.
[[0.54351849 0.45648151]
 [0.56414407 0.43585593]
 [0.58500618 0.41499382]
 ...
 [0.57213483 0.42786517]
 [0.52053715 0.47946285]
 [0.5452761  0.4547239 ]]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on test set...
...Done.
[[0.58702708 0.41297292]
 [0.54822317 0.45177683]
 [0.59245591 0.40754409]
 ...
 [0.56872437 0.43127563]
 [0.55264805 0.44735195]
 [0.54294059 0.45705941]]

f1-score on training set :  0.7423600954506966
f1-score on test set :  0.7445121951219512


In [23]:
rows_model_score = pd.DataFrame({'model': ['adboost_log_reg', 'adboost_log_reg'], 'f1_score': [f1_score(Y_train, Y_train_pred), f1_score(Y_test, Y_test_pred)], 'set': ['train', 'test']})

model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)

### Adaboost with decision tree as base estimator


In [24]:
# Perform grid search
print("Grid search...")
decision_tree = DecisionTreeClassifier()
model = AdaBoostClassifier(decision_tree)

# Grid of values to be tested
params = {
    'base_estimator__max_depth': [1, 2, 3],
    'base_estimator__min_samples_leaf': [1, 2, 3],
    'base_estimator__min_samples_split': [2, 3, 4],
    'n_estimators': [2, 4, 6, 8, 10]
}
print(params)
gridsearch_adboost_dec_tr = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch_adboost_dec_tr.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_adboost_dec_tr.best_params_)
print("Best validation accuracy : ", gridsearch_adboost_dec_tr.best_score_)
print()
print("Accuracy on training set : ", gridsearch_adboost_dec_tr.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch_adboost_dec_tr.score(X_test, Y_test))

Grid search...
{'base_estimator__max_depth': [1, 2, 3], 'base_estimator__min_samples_leaf': [1, 2, 3], 'base_estimator__min_samples_split': [2, 3, 4], 'n_estimators': [2, 4, 6, 8, 10]}


  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(parameters, safe=False))
  estimator = estimator.set_params(**clone(param

...Done.
Best hyperparameters :  {'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 2, 'base_estimator__min_samples_split': 2, 'n_estimators': 10}
Best validation accuracy :  0.9859836072383268

Accuracy on training set :  0.9862427633948573
Accuracy on test set :  0.985821210204512


In [25]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_adboost_dec_tr.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch_adboost_dec_tr.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_adboost_dec_tr.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch_adboost_dec_tr.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...


...Done.
[0 0 0 ... 0 0 0]

Probabilities on training set...
...Done.
[[0.62910286 0.37089714]
 [0.71516328 0.28483672]
 [0.98823825 0.01176175]
 ...
 [0.98660155 0.01339845]
 [0.537603   0.462397  ]
 [0.63915119 0.36084881]]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on test set...
...Done.
[[9.87706754e-01 1.22932460e-02]
 [6.39660169e-01 3.60339831e-01]
 [9.99681973e-01 3.18027435e-04]
 ...
 [9.99716389e-01 2.83611107e-04]
 [6.54500309e-01 3.45499691e-01]
 [6.99670121e-01 3.00329879e-01]]

f1-score on training set :  0.7644049947344667
f1-score on test set :  0.7590325470289639


In [26]:
rows_model_score = pd.DataFrame({'model': ['adboost_dec_tree', 'adboost_dec_tree'], 'f1_score': [f1_score(Y_train, Y_train_pred), f1_score(Y_test, Y_test_pred)], 'set': ['train', 'test']})

model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)

### Boosting with decision tree as base estimator


In [27]:
# Perform grid search
from sklearn.ensemble import GradientBoostingClassifier


print("Grid search...")
model = GradientBoostingClassifier()

# Grid of values to be tested
params = {
    'max_depth': [1, 2, 3], # no base_estimator_ prefix because these are all arguments of GradientBoostingClassifier
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [2, 4, 6, 8, 10]
}
print(params)
gridsearch_boost_dec_tr = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch_boost_dec_tr.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_boost_dec_tr.best_params_)
print("Best validation accuracy : ", gridsearch_boost_dec_tr.best_score_)
print()
print("Accuracy on training set : ", gridsearch_boost_dec_tr.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch_boost_dec_tr.score(X_test, Y_test))

Grid search...
{'max_depth': [1, 2, 3], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 3, 4], 'n_estimators': [2, 4, 6, 8, 10]}
...Done.
Best hyperparameters :  {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
Best validation accuracy :  0.9844418496421105

Accuracy on training set :  0.9846482944013494
Accuracy on test set :  0.9846791763300302


In [28]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_boost_dec_tr.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch_boost_dec_tr.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_boost_dec_tr.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch_boost_dec_tr.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...


...Done.
[0 0 0 ... 0 0 0]

Probabilities on training set...
...Done.
[[0.9873215 0.0126785]
 [0.9873215 0.0126785]
 [0.9873215 0.0126785]
 ...
 [0.9873215 0.0126785]
 [0.963796  0.036204 ]
 [0.9801348 0.0198652]]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on test set...
...Done.
[[0.9873215  0.0126785 ]
 [0.9873215  0.0126785 ]
 [0.9873215  0.0126785 ]
 ...
 [0.9873215  0.0126785 ]
 [0.98680659 0.01319341]
 [0.9873215  0.0126785 ]]

f1-score on training set :  0.719659902141654
f1-score on test set :  0.721938775510204


In [29]:
rows_model_score = pd.DataFrame({'model': ['boost_dec_tree', 'boost_dec_tree'], 'f1_score': [f1_score(Y_train, Y_train_pred), f1_score(Y_test, Y_test_pred)], 'set': ['train', 'test']})

model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)

### XGBoost

In [30]:
import xgboost as xgb
# Perform grid search
print("Grid search...")
xgboost = xgb.XGBClassifier()

# Grid of values to be tested
params = {
    'max_depth': [2, 4, 6], # exactly the same role as in scikit-learn
    'min_child_weight': [1, 2, 3], # effect is more or less similar to min_samples_leaf and min_samples_split
    'n_estimators': [2, 4, 6, 8,] # exactly the same role as in scikit-learn
}
print(params)
gridsearch_xgb = GridSearchCV(xgboost, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch_xgb.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_xgb.best_params_)
print("Best validation accuracy : ", gridsearch_xgb.best_score_)
print()
print("Accuracy on training set : ", gridsearch_xgb.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch_xgb.score(X_test, Y_test))

Grid search...
{'max_depth': [2, 4, 6], 'min_child_weight': [1, 2, 3], 'n_estimators': [2, 4, 6, 8]}
...Done.
Best hyperparameters :  {'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 8}
Best validation accuracy :  0.9858694036193535

Accuracy on training set :  0.9861944461526297
Accuracy on test set :  0.9854346756623796


In [31]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_xgb.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch_xgb.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_xgb.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch_xgb.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on training set...


...Done.
[[0.95882565 0.04117436]
 [0.95882565 0.04117436]
 [0.95882565 0.04117436]
 ...
 [0.95882565 0.04117436]
 [0.86079645 0.13920352]
 [0.9548335  0.04516648]]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

Probabilities on test set...
...Done.
[[0.95882565 0.04117436]
 [0.95882565 0.04117436]
 [0.95882565 0.04117436]
 ...
 [0.95882565 0.04117436]
 [0.9574031  0.04259688]
 [0.95882565 0.04117436]]

f1-score on training set :  0.7633104902477597
f1-score on test set :  0.7539329177797566


In [32]:
rows_model_score = pd.DataFrame({'model': ['gridsearch_xgb', 'gridsearch_xgb'], 'f1_score': [f1_score(Y_train, Y_train_pred), f1_score(Y_test, Y_test_pred)], 'set': ['train', 'test']})

model_score_df = pd.concat([model_score_df, rows_model_score], ignore_index=True)

In [33]:
model_score_df = model_score_df.sort_values(by='f1_score', ascending=False)

In [34]:
model_score_df

Unnamed: 0,model,f1_score,set
6,adboost_dec_tree,0.764405,train
10,gridsearch_xgb,0.76331,train
0,bagging_log_reg,0.762993,train
1,bagging_log_reg,0.762477,test
7,adboost_dec_tree,0.759033,test
11,gridsearch_xgb,0.753933,test
5,adboost_log_reg,0.744512,test
4,adboost_log_reg,0.74236,train
2,bagging_dec_tree,0.728281,train
3,bagging_dec_tree,0.726546,test


In [39]:
px.bar(model_score_df, x="model", y="f1_score", color="set", barmode='group')