## Looking at how the models perform on a dataset trimmed to have a 50/50 split on the target variable (Diabetes_binary). Full dataset was over 85% non-diabetic, which can add unwanted bias to the model.

## Part 1: Prepare the Data

In [1]:
# Import Dependencies
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

Load the data into a Pandas DataFrame and fetch the top 10 rows.

In [2]:
# Read in CSV
file_path = Path("../Resources/diabetes_data_5050split.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0
5,0.0,0.0,0.0,1.0,18.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,7.0,0.0,0.0,0.0,1.0,4.0,7.0
6,0.0,0.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,13.0,5.0,6.0
7,0.0,0.0,0.0,1.0,31.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,4.0,0.0,0.0,0.0,1.0,6.0,4.0,3.0
8,0.0,0.0,0.0,1.0,32.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,6.0,8.0
9,0.0,0.0,0.0,1.0,27.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,6.0,0.0,1.0,6.0,4.0,4.0


List the DataFrame's data types to ensure they're aligned to the type of data stored on each column.

In [3]:
# List dataframe data types
df.dtypes

Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

Remove all rows with `null` values if any.

In [4]:
# Find null values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column Diabetes_binary has 0 null values
Column HighBP has 0 null values
Column HighChol has 0 null values
Column CholCheck has 0 null values
Column BMI has 0 null values
Column Smoker has 0 null values
Column Stroke has 0 null values
Column HeartDiseaseorAttack has 0 null values
Column PhysActivity has 0 null values
Column Fruits has 0 null values
Column Veggies has 0 null values
Column HvyAlcoholConsump has 0 null values
Column AnyHealthcare has 0 null values
Column NoDocbcCost has 0 null values
Column GenHlth has 0 null values
Column MentHlth has 0 null values
Column PhysHlth has 0 null values
Column DiffWalk has 0 null values
Column Sex has 0 null values
Column Age has 0 null values
Column Education has 0 null values
Column Income has 0 null values


In [5]:
# Look at min and max values
for column in df.columns:
    print(f"Column {column} has {df[column].min()} as minimum value")
    print(f"Column {column} has {df[column].max()} as maximum value")

Column Diabetes_binary has 0.0 as minimum value
Column Diabetes_binary has 1.0 as maximum value
Column HighBP has 0.0 as minimum value
Column HighBP has 1.0 as maximum value
Column HighChol has 0.0 as minimum value
Column HighChol has 1.0 as maximum value
Column CholCheck has 0.0 as minimum value
Column CholCheck has 1.0 as maximum value
Column BMI has 12.0 as minimum value
Column BMI has 98.0 as maximum value
Column Smoker has 0.0 as minimum value
Column Smoker has 1.0 as maximum value
Column Stroke has 0.0 as minimum value
Column Stroke has 1.0 as maximum value
Column HeartDiseaseorAttack has 0.0 as minimum value
Column HeartDiseaseorAttack has 1.0 as maximum value
Column PhysActivity has 0.0 as minimum value
Column PhysActivity has 1.0 as maximum value
Column Fruits has 0.0 as minimum value
Column Fruits has 1.0 as maximum value
Column Veggies has 0.0 as minimum value
Column Veggies has 1.0 as maximum value
Column HvyAlcoholConsump has 0.0 as minimum value
Column HvyAlcoholConsump h

No null values were found. Check for duplicates.

In [6]:
# Find duplicate entries
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 1635


In [7]:
# Drop duplicate entries
df = df.drop_duplicates()

In [8]:
# Confirm that duplicate entries have been dropped
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


Check for duplicates but ignore the target variable.

In [9]:
# Find duplicate entries ignoring target column
features = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income']
print(f"Duplicate entries excluding the target variable: {df.duplicated(subset=features).sum()}")

Duplicate entries excluding the target variable: 408


In [10]:
# Drop duplicate entries
df = df.drop_duplicates(subset=features, keep=False)

In [11]:
# Confirm that duplicate entries have been dropped
print(f"Duplicate entries excluding the target variable: {df.duplicated(subset=features).sum()}")

Duplicate entries excluding the target variable: 0


In [12]:
# Evaluate shape after dropping duplicates
df.shape

(68241, 22)

Export CSV for Tableau Visualizations

In [13]:
# export csv. commented out to avoid overwriting file once exported
#df.to_csv('../Resources/viz_data.csv', index_label='index')

Standardize the dataset so that columns that contain larger values do not influence the outcome more than columns with smaller values. Exclude columns that already have binary or binned data from scaling.

In [14]:
# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['BMI', 'MentHlth', 'PhysHlth']])

In [15]:
# Get list of the columns from the original DataFrame so we know what to add to scaled data
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [16]:
# Create a DataFrame with the transformed data
scaled_data = pd.DataFrame(scaled_data, columns=['BMI', 'MentHlth', 'PhysHlth'])
scaled_data['Diabetes_binary'] = df['Diabetes_binary']
scaled_data['HighBP'] = df['HighBP']
scaled_data['HighChol'] = df['HighChol']
scaled_data['CholCheck'] = df['CholCheck']
scaled_data['Smoker'] = df['Smoker']
scaled_data['Stroke'] = df['Stroke']
scaled_data['HeartDiseaseorAttack'] = df['HeartDiseaseorAttack']
scaled_data['PhysActivity'] = df['PhysActivity']
scaled_data['Fruits'] = df['Fruits']
scaled_data['Veggies'] = df['Veggies']
scaled_data['HvyAlcoholConsump'] = df['HvyAlcoholConsump']
scaled_data['AnyHealthcare'] = df['AnyHealthcare']
scaled_data['NoDocbcCost'] = df['NoDocbcCost']
scaled_data['GenHlth'] = df['GenHlth']
scaled_data['DiffWalk'] = df['DiffWalk']
scaled_data['Sex'] = df['Sex']
scaled_data['Age'] = df['Age']
scaled_data['Education'] = df['Education']
scaled_data['Income'] = df['Income']
scaled_data.head()

Unnamed: 0,BMI,MentHlth,PhysHlth,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,...,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income
0,-0.555709,0.134741,2.356585,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,3.0,0.0,1.0,4.0,6.0,8.0
1,-0.555709,-0.469891,-0.590973,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,3.0,0.0,1.0,12.0,6.0,8.0
2,-0.555709,-0.469891,0.391546,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,13.0,6.0,8.0
3,-0.27697,-0.469891,-0.296217,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,-0.137601,-0.469891,-0.590973,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,8.0,5.0,8.0


In [17]:
# function by Boern found in the following link 
# https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [18]:
df_cleaned = clean_dataset(scaled_data)

In [19]:
# Assign feature (X) and target (y) variables
X = df_cleaned.drop('Diabetes_binary', axis=1)
y = df_cleaned['Diabetes_binary']

Look at X and y to make sure everything looks as expected.

In [20]:
# Preview X
X.head()

Unnamed: 0,BMI,MentHlth,PhysHlth,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income
0,-0.555709,0.134741,2.356585,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,3.0,0.0,1.0,4.0,6.0,8.0
1,-0.555709,-0.469891,-0.590973,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,3.0,0.0,1.0,12.0,6.0,8.0
2,-0.555709,-0.469891,0.391546,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,13.0,6.0,8.0
3,-0.27697,-0.469891,-0.296217,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,-0.137601,-0.469891,-0.590973,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,8.0,5.0,8.0


In [21]:
# Prevew y
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [22]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [23]:
def model_tester(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f'Training Score: {clf.score(X_train, y_train)}')
    print(f'Testing Score: {clf.score(X_test, y_test)}')

In [24]:
# Look at different Logistic Regression models and find bester performing for further tuning
model_tester(LogisticRegression(random_state=42), X, y)
model_tester(LogisticRegression(random_state=42, max_iter=500), X, y)
model_tester(LogisticRegression(random_state=42, max_iter=1000), X, y)
model_tester(LogisticRegression(random_state=42, max_iter=10000), X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76      8355
         1.0       0.76      0.76      0.76      8104

    accuracy                           0.76     16459
   macro avg       0.76      0.76      0.76     16459
weighted avg       0.76      0.76      0.76     16459

Training Score: 0.7601263771872975
Testing Score: 0.7616501610061365
              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76      8355
         1.0       0.76      0.76      0.76      8104

    accuracy                           0.76     16459
   macro avg       0.76      0.76      0.76     16459
weighted avg       0.76      0.76      0.76     16459

Training Score: 0.760146629941672
Testing Score: 0.7621969742997752
              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76      8355
         1.0       0.76      0.76      0.76      8104

    accuracy                           0.76  

The second model with max_iter=500 is the model selected as the 'best' LogisticRegression model. The accuracy scores leveled off for 1000 and 10000 suggesting there were diminishing returns for the additional iterations.

In [25]:
# Look at different RandomForest models and find bester performing for further tuning
model_tester(RandomForestClassifier(random_state=42), X, y)
model_tester(RandomForestClassifier(random_state=42, bootstrap=False), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=200), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=200, bootstrap=False), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=500), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=500, bootstrap=False), X, y)

              precision    recall  f1-score   support

         0.0       0.77      0.74      0.76      8355
         1.0       0.74      0.78      0.76      8104

    accuracy                           0.76     16459
   macro avg       0.76      0.76      0.76     16459
weighted avg       0.76      0.76      0.76     16459

Training Score: 0.9983595268956578
Testing Score: 0.7585515523421836
              precision    recall  f1-score   support

         0.0       0.76      0.74      0.75      8355
         1.0       0.74      0.76      0.75      8104

    accuracy                           0.75     16459
   macro avg       0.75      0.75      0.75     16459
weighted avg       0.75      0.75      0.75     16459

Training Score: 0.998400032404407
Testing Score: 0.7505316240354821
              precision    recall  f1-score   support

         0.0       0.77      0.74      0.76      8355
         1.0       0.75      0.78      0.76      8104

    accuracy                           0.76  

The third model with n_estimators=200 and the default bootstrap setting is selected as the 'best' Random Forest model. This is due to it having the highest testing accuracy score.

In [26]:
# Look at different AdaBoost models and find bester performing for further tuning
model_tester(AdaBoostClassifier(random_state=42, n_estimators=100), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=500, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=1000, learning_rate=0.1), X, y)

              precision    recall  f1-score   support

         0.0       0.77      0.76      0.77      8355
         1.0       0.76      0.77      0.76      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7665870058327933
Testing Score: 0.7666322376815117
              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77      8355
         1.0       0.76      0.77      0.77      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7666882696046662
Testing Score: 0.767604350203536
              precision    recall  f1-score   support

         0.0       0.77      0.76      0.77      8355
         1.0       0.76      0.77      0.76      8104

    accuracy                           0.77  

The models with n_estimators=200 are performing the best. Next, we will try to further tune by changing the learning_rate and keeping the n_estimators the same.

In [27]:
# further tune based on results of previous cell
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.05), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.01), X, y)

              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77      8355
         1.0       0.76      0.77      0.77      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7666882696046662
Testing Score: 0.767604350203536
              precision    recall  f1-score   support

         0.0       0.77      0.76      0.77      8355
         1.0       0.76      0.77      0.76      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7657768956578095
Testing Score: 0.7651133118658485
              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76      8355
         1.0       0.76      0.77      0.76      8104

    accuracy                           0.76  

The model with n_estimaors=200 and default learning_rate=1 is selected as the 'best' Ada Boost model. This is due to it having the highest testing accuracy score.

In [28]:
# The best of each type of model
model_tester(LogisticRegression(random_state=42, max_iter=500), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)

              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76      8355
         1.0       0.76      0.76      0.76      8104

    accuracy                           0.76     16459
   macro avg       0.76      0.76      0.76     16459
weighted avg       0.76      0.76      0.76     16459

Training Score: 0.760146629941672
Testing Score: 0.7621969742997752
              precision    recall  f1-score   support

         0.0       0.77      0.74      0.76      8355
         1.0       0.75      0.78      0.76      8104

    accuracy                           0.76     16459
   macro avg       0.76      0.76      0.76     16459
weighted avg       0.76      0.76      0.76     16459

Training Score: 0.998400032404407
Testing Score: 0.7601919922230999
              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77      8355
         1.0       0.76      0.77      0.77      8104

    accuracy                           0.77   

The best performing model of all models tested with scaled data is the Ada Boost Classifier. With a testing accuracy score of 0.76760 it edged out the Logistic Regression (0.76220) and the Random Forest Classifier (0.76020). It also had tied for the best recall with 0.76.

The models with scaled data performed better on the 50/50 split data set than the models with unscaled data.

In [31]:
# Will an xgboost perform better?
from xgboost import XGBClassifier

In [34]:
# Test different XGBClassifier models
model_tester(XGBClassifier(seed=42), X, y)
model_tester(XGBClassifier(seed=42, eta=0.2), X, y)
model_tester(XGBClassifier(seed=42, eta=0.1), X, y)
model_tester(XGBClassifier(seed=42, eta=0.01), X, y)

              precision    recall  f1-score   support

         0.0       0.78      0.75      0.77      8355
         1.0       0.75      0.78      0.77      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.8078418664938432
Testing Score: 0.7665714806488851
              precision    recall  f1-score   support

         0.0       0.78      0.75      0.77      8355
         1.0       0.75      0.79      0.77      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7982015554115359
Testing Score: 0.7681511634971748
              precision    recall  f1-score   support

         0.0       0.79      0.75      0.77      8355
         1.0       0.76      0.79      0.77      8104

    accuracy                           0.77 

In [40]:
# find best hyperparameters using HYPEROPT
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import accuracy_score

# initialize domain space for range of values
# using values from https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 42
    }

In [41]:
# define objective function
# using function from https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
def objective(space):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [42]:
# optimization algorithm

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                          
0.7581262531137979                                                              
  1%|        | 1/100 [00:01<02:50,  1.72s/trial, best loss: -0.7581262531137979]





SCORE:                                                                          
0.7595844218968345                                                              
  2%|▏       | 2/100 [00:03<03:01,  1.85s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7583085242116775                                                              
  3%|▏       | 3/100 [00:05<02:41,  1.67s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7559997569718695                                                              
  4%|▎       | 4/100 [00:06<02:34,  1.61s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7589160945379427                                                              
  5%|▍       | 5/100 [00:09<03:00,  1.90s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7581262531137979                                                              
  6%|▍       | 6/100 [00:11<03:01,  1.93s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7594629078315815                                                              
  7%|▌       | 7/100 [00:13<03:04,  1.98s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7580654960811714                                                              
  8%|▋       | 8/100 [00:14<02:54,  1.90s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.759402150798955                                                               
  9%|▋       | 9/100 [00:16<02:37,  1.73s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7571541405917736                                                              
 10%|▋      | 10/100 [00:17<02:30,  1.67s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7583085242116775                                                              
 11%|▊      | 11/100 [00:19<02:22,  1.60s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7563035421350022                                                              
 12%|▊      | 12/100 [00:20<02:14,  1.53s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7561820280697491                                                              
 13%|▉      | 13/100 [00:22<02:11,  1.51s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.7587945804726897                                                              
 14%|▉      | 14/100 [00:23<02:10,  1.52s/trial, best loss: -0.7595844218968345]





SCORE:                                                                          
0.759645178929461                                                               
 15%|█▏      | 15/100 [00:24<02:07,  1.49s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7563035421350022                                                              
 16%|█▎      | 16/100 [00:26<02:01,  1.44s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7561820280697491                                                              
 17%|█▎      | 17/100 [00:27<01:55,  1.40s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7590376086031958                                                              
 18%|█▍      | 18/100 [00:28<01:52,  1.38s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7566073272981347                                                              
 19%|█▌      | 19/100 [00:30<01:49,  1.36s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7583085242116775                                                              
 20%|█▌      | 20/100 [00:31<01:48,  1.36s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.758247767179051                                                               
 21%|█▋      | 21/100 [00:32<01:48,  1.37s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7584907953095571                                                              
 22%|█▊      | 22/100 [00:34<01:48,  1.39s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7587945804726897                                                              
 23%|█▊      | 23/100 [00:35<01:49,  1.42s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7589160945379427                                                              
 24%|█▉      | 24/100 [00:37<01:52,  1.47s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7584300382769306                                                              
 25%|██      | 25/100 [00:38<01:49,  1.45s/trial, best loss: -0.759645178929461]





SCORE:                                                                          
0.7597059359620876                                                              
 26%|█▊     | 26/100 [00:40<01:50,  1.49s/trial, best loss: -0.7597059359620876]





SCORE:                                                                          
0.7590376086031958                                                              
 27%|█▉     | 27/100 [00:41<01:48,  1.49s/trial, best loss: -0.7597059359620876]





SCORE:                                                                          
0.7586730664074367                                                              
 28%|█▉     | 28/100 [00:43<01:47,  1.49s/trial, best loss: -0.7597059359620876]





SCORE:                                                                          
0.7589768515705693                                                              
 29%|██     | 29/100 [00:44<01:43,  1.45s/trial, best loss: -0.7597059359620876]





SCORE:                                                                          
0.7555744577434839                                                              
 30%|██     | 30/100 [00:46<01:40,  1.44s/trial, best loss: -0.7597059359620876]





SCORE:                                                                          
0.7600704781578468                                                              
 31%|██▏    | 31/100 [00:47<01:36,  1.40s/trial, best loss: -0.7600704781578468]





SCORE:                                                                          
0.7601919922230999                                                              
 32%|██▏    | 32/100 [00:48<01:34,  1.38s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7587945804726897                                                              
 33%|██▎    | 33/100 [00:50<01:31,  1.37s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7601312351904733                                                              
 34%|██▍    | 34/100 [00:51<01:29,  1.35s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7586730664074367                                                              
 35%|██▍    | 35/100 [00:52<01:28,  1.37s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7586123093748102                                                              
 36%|██▌    | 36/100 [00:54<01:28,  1.38s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7598274500273406                                                              
 37%|██▌    | 37/100 [00:55<01:27,  1.39s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7581870101464244                                                              
 38%|██▋    | 38/100 [00:57<01:26,  1.39s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7589160945379427                                                              
 39%|██▋    | 39/100 [00:58<01:24,  1.39s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7587945804726897                                                              
 40%|██▊    | 40/100 [00:59<01:22,  1.38s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7595236648642081                                                              
 41%|██▊    | 41/100 [01:01<01:21,  1.38s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7579439820159184                                                              
 42%|██▉    | 42/100 [01:02<01:20,  1.39s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7568503554286409                                                              
 43%|███    | 43/100 [01:04<01:21,  1.43s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7580654960811714                                                              
 44%|███    | 44/100 [01:05<01:20,  1.44s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7581870101464244                                                              
 45%|███▏   | 45/100 [01:07<01:23,  1.51s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7592198797010754                                                              
 46%|███▏   | 46/100 [01:08<01:19,  1.47s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7585515523421836                                                              
 47%|███▎   | 47/100 [01:10<01:16,  1.44s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7577617109180388                                                              
 48%|███▎   | 48/100 [01:11<01:13,  1.41s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7586730664074367                                                              
 49%|███▍   | 49/100 [01:12<01:12,  1.42s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7556352147761103                                                              
 50%|███▌   | 50/100 [01:14<01:11,  1.42s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7571541405917736                                                              
 51%|███▌   | 51/100 [01:15<01:09,  1.43s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7584907953095571                                                              
 52%|███▋   | 52/100 [01:17<01:13,  1.53s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7590376086031958                                                              
 53%|███▋   | 53/100 [01:19<01:14,  1.58s/trial, best loss: -0.7601919922230999]





SCORE:                                                                          
0.7604350203536059                                                              
 54%|███▊   | 54/100 [01:20<01:12,  1.58s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7595844218968345                                                              
 55%|███▊   | 55/100 [01:22<01:08,  1.53s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581262531137979                                                              
 56%|███▉   | 56/100 [01:23<01:05,  1.50s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7590376086031958                                                              
 57%|███▉   | 57/100 [01:24<01:02,  1.45s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7585515523421836                                                              
 58%|████   | 58/100 [01:26<00:59,  1.41s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7601919922230999                                                              
 59%|████▏  | 59/100 [01:27<00:58,  1.42s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.756060514004496                                                               
 60%|████▏  | 60/100 [01:29<00:55,  1.39s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581870101464244                                                              
 61%|████▎  | 61/100 [01:30<00:55,  1.42s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7587945804726897                                                              
 62%|████▎  | 62/100 [01:32<00:56,  1.47s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581870101464244                                                              
 63%|████▍  | 63/100 [01:33<00:55,  1.50s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7559997569718695                                                              
 64%|████▍  | 64/100 [01:35<00:52,  1.45s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7603135062883529                                                              
 65%|████▌  | 65/100 [01:36<00:49,  1.43s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581870101464244                                                              
 66%|████▌  | 66/100 [01:37<00:49,  1.44s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7604350203536059                                                              
 67%|████▋  | 67/100 [01:39<00:46,  1.42s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7584907953095571                                                              
 68%|████▊  | 68/100 [01:40<00:45,  1.41s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7586123093748102                                                              
 69%|████▊  | 69/100 [01:42<00:43,  1.40s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7589160945379427                                                              
 70%|████▉  | 70/100 [01:43<00:41,  1.39s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7583085242116775                                                              
 71%|████▉  | 71/100 [01:44<00:39,  1.38s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7590983656358223                                                              
 72%|█████  | 72/100 [01:46<00:38,  1.39s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7585515523421836                                                              
 73%|█████  | 73/100 [01:47<00:37,  1.38s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7585515523421836                                                              
 74%|█████▏ | 74/100 [01:48<00:35,  1.37s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7590983656358223                                                              
 75%|█████▎ | 75/100 [01:50<00:34,  1.37s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7594629078315815                                                              
 76%|█████▎ | 76/100 [01:51<00:33,  1.40s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7592806367337019                                                              
 77%|█████▍ | 77/100 [01:53<00:33,  1.44s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7584300382769306                                                              
 78%|█████▍ | 78/100 [01:54<00:31,  1.44s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.758369281244304                                                               
 79%|█████▌ | 79/100 [01:56<00:30,  1.47s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7595844218968345                                                              
 80%|█████▌ | 80/100 [01:57<00:29,  1.49s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581262531137979                                                              
 81%|█████▋ | 81/100 [01:59<00:27,  1.47s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7598882070599672                                                              
 82%|█████▋ | 82/100 [02:00<00:26,  1.45s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7564858132328817                                                              
 83%|█████▊ | 83/100 [02:02<00:24,  1.45s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7584907953095571                                                              
 84%|█████▉ | 84/100 [02:03<00:22,  1.43s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7564250562002551                                                              
 85%|█████▉ | 85/100 [02:04<00:21,  1.41s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7589160945379427                                                              
 86%|██████ | 86/100 [02:06<00:19,  1.41s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7587945804726897                                                              
 87%|██████ | 87/100 [02:07<00:19,  1.47s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7592198797010754                                                              
 88%|██████▏| 88/100 [02:09<00:17,  1.45s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7601312351904733                                                              
 89%|██████▏| 89/100 [02:10<00:15,  1.42s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7599489640925937                                                              
 90%|██████▎| 90/100 [02:11<00:14,  1.40s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7589160945379427                                                              
 91%|██████▎| 91/100 [02:13<00:12,  1.38s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7584907953095571                                                              
 92%|██████▍| 92/100 [02:14<00:10,  1.37s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581870101464244                                                              
 93%|██████▌| 93/100 [02:15<00:09,  1.35s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7580047390485448                                                              
 94%|██████▌| 94/100 [02:17<00:08,  1.35s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7592806367337019                                                              
 95%|██████▋| 95/100 [02:18<00:06,  1.39s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7581870101464244                                                              
 96%|██████▋| 96/100 [02:20<00:05,  1.43s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7580654960811714                                                              
 97%|██████▊| 97/100 [02:21<00:04,  1.44s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7585515523421836                                                              
 98%|██████▊| 98/100 [02:23<00:02,  1.44s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7590983656358223                                                              
 99%|██████▉| 99/100 [02:24<00:01,  1.43s/trial, best loss: -0.7604350203536059]





SCORE:                                                                          
0.7586730664074367                                                              
100%|██████| 100/100 [02:26<00:00,  1.46s/trial, best loss: -0.7604350203536059]


In [43]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.6586642260514318, 'gamma': 6.369028839498013, 'max_depth': 12.0, 'min_child_weight': 3.0, 'reg_alpha': 77.0, 'reg_lambda': 0.8705184540772957}


In [46]:
# Test best parameters with different eta values
model_tester(XGBClassifier(seed=42, colsample_bytree = 0.6586642260514318, gamma = 6.369028839498013, max_depth = 12, min_child_weight = 3.0, reg_alpha = 77.0, reg_lambda = 0.8705184540772957), X, y)
model_tester(XGBClassifier(seed=42, eta=0.2, colsample_bytree = 0.6586642260514318, gamma = 6.369028839498013, max_depth = 12, min_child_weight = 3.0, reg_alpha = 77.0, reg_lambda = 0.8705184540772957), X, y)
model_tester(XGBClassifier(seed=42, eta=0.1, colsample_bytree = 0.6586642260514318, gamma = 6.369028839498013, max_depth = 12, min_child_weight = 3.0, reg_alpha = 77.0, reg_lambda = 0.8705184540772957), X, y)
model_tester(XGBClassifier(seed=42, eta=0.01, colsample_bytree = 0.6586642260514318, gamma = 6.369028839498013, max_depth = 12, min_child_weight = 3.0, reg_alpha = 77.0, reg_lambda = 0.8705184540772957), X, y)

              precision    recall  f1-score   support

         0.0       0.78      0.75      0.77      8355
         1.0       0.76      0.78      0.77      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7703135126377187
Testing Score: 0.7685764627255605
              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77      8355
         1.0       0.76      0.78      0.77      8104

    accuracy                           0.77     16459
   macro avg       0.77      0.77      0.77     16459
weighted avg       0.77      0.77      0.77     16459

Training Score: 0.7707590732339599
Testing Score: 0.7682119205298014
              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77      8355
         1.0       0.76      0.78      0.77      8104

    accuracy                           0.77 

With a testing accuracy of 0.76906 and recall of 0.76, the XGBoost Classifier slightly outperforms the ADABoost.