In [1]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tabulate import tabulate
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the diabetes dataset from a CSV file (path is relative, so it depends on the working directory)
df = pd.read_csv('../../datasets/diabetes.csv')

In [8]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [31]:
# Summary statistics per column (count, mean, std, min, quartiles, max),
# transposed so that each dataset column becomes one row of the table
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [32]:
# Export the transposed summary statistics as a LaTeX table (floats with two decimals)
# written to ./sup_diabetes.tex
x = df.describe().T
x.to_latex(buf='./sup_diabetes.tex', float_format="%.2f", caption="Descriptive statistics of the diabetes dataset.", label="tab:sup_diabetes")

In [33]:
# Pairwise correlation matrix over all columns (the features and the Outcome label)
df_corr = df.corr()
df_corr

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [34]:
# Export the correlation matrix as a LaTeX table (floats with two decimals) to ./corr.tex.
# The caption describes the correlation matrix; it previously repeated the caption of the
# descriptive-statistics table, which would mislabel this table in the rendered document.
df_corr.to_latex(buf='./corr.tex', float_format="%.2f", caption="Correlation matrix of the diabetes dataset.", label="tab:sup_diabetes_corr")

In [37]:
# Report the column names and the (rows, columns) shape of the dataset
column_names = df.columns.tolist()
print(f'Columns: {column_names}')
print(f'Shape: {df.shape}')

Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
Shape: (768, 9)


In [11]:
# Count missing values (null / NaN) per column
# NOTE(review): the describe() output above shows a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI; these zeros are presumably placeholders for missing
# measurements and are not detected by isna() — confirm against the dataset documentation.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [12]:
# Relative frequency of each class in the Outcome column
print(f'Outcome: {df["Outcome"].value_counts(normalize=True)}')

Outcome: Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [9]:
# Shape of the dataset
# NOTE(review): originally labelled "new" shape, but no visible cell modifies df before this
# point and the printed shape matches the earlier one; the step that changed the data appears
# to have been removed (the execution counts are also out of order).
print(f'Shape: {df.shape}')

Shape: (768, 9)


In [10]:
# Relative frequency of each class in the Outcome column
# NOTE(review): originally labelled "new" distribution, but no visible cell modifies df before
# this point and the printed proportions match the earlier ones; this cell looks like a
# leftover from a removed filtering/resampling step.
print(f'Outcome: {df["Outcome"].value_counts(normalize=True)}')

Outcome: Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [11]:
# Encoding
from sklearn.calibration import LabelEncoder


def encode_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a label-encoded copy of ``df``.

    Every column (numeric ones included) is passed through its own
    ``LabelEncoder``, so each value is replaced by the integer code the
    encoder assigns to it within that column.

    Args:
        df: Frame to encode. It is left untouched; the encoding is applied
            to a copy so that callers keeping a reference to the original
            data do not see it change underneath them.

    Returns:
        A new DataFrame with the same columns and index as ``df`` whose
        values are the per-column label codes.
    """
    encoded = df.copy()

    for col in encoded.columns:
        # A fresh encoder per column: the fitted classes of one column must
        # not be confused with those of another.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])

    return encoded


# Rebind df to its label-encoded version (all columns, including Outcome, are encoded)
df = encode_dataframe(df)

In [12]:
# Scale the feature columns only; the Outcome label must stay untouched.
# Scaling the label as well only kept the classes at 0/1 because its median happens to be 0
# and its interquartile range 1 (see the describe() output); with a different class balance
# the labels would silently be shifted or rescaled.
# NOTE(review): the scaler is still fitted on the full dataset before the train/test split
# performed later, so test-set statistics leak into the scaling — consider fitting it on the
# training portion of each split instead.
TARGET_COLUMN = 'Outcome'
feature_columns = [col for col in df.columns if col != TARGET_COLUMN]

scaler = RobustScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)
# Re-attach the unscaled label and restore the original column order
df = pd.concat([scaled_features, df[[TARGET_COLUMN]]], axis=1)[list(df.columns)]

In [13]:
# Metrics: Accuracy, Precision, Recall, F1
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


def get_metrics():
    """Return a dict mapping each metric name to its scoring function.

    The keys are 'accuracy', 'precision', 'recall', 'f1' and 'roc' (in that
    order); each value is called as ``metric(y_true, y_pred)`` by
    ``get_metrics_df``.
    """
    return dict(
        accuracy=accuracy_score,
        precision=precision_score,
        recall=recall_score,
        f1=f1_score,
        roc=roc_auc_score,
    )


def get_metrics_df(y_true, y_pred, name):
    """Score one set of predictions with every metric from ``get_metrics``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels. The same predictions are passed to every
            metric, so 'roc' is computed from hard class predictions rather
            than from probabilities or decision scores.
        name: Model name; accepted for the caller's convenience but not used.

    Returns:
        A single-row DataFrame with one column per metric name.
    """
    scores = {
        label: [scorer(y_true, y_pred)]
        for label, scorer in get_metrics().items()
    }
    return pd.DataFrame(scores)

In [14]:
# Random seeds for the repeated evaluation: each seed drives one train/test split
SEEDS = [24, 42, 206, 602, 412, 214, 754, 457, 2023, 3202]

In [15]:
run_counter = 0
results = dict()

# Separate the features from the target label
X = df.drop(columns='Outcome')
y = df['Outcome']

# One evaluation run per seed: split the data, fit every model on the training part and
# store its test-set metrics under "<estimator class name>_<seed>".
for run_counter, seed in enumerate(SEEDS, start=1):
    print(f'%{run_counter} - Running for seed: {seed}')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=seed)

    # Fresh estimators for every run. The tree-based models are stochastic, so they receive
    # the run's seed as well; otherwise the results recorded for a given seed would change
    # on every execution even though the split is fixed.
    # NOTE(review): these hard-coded hyperparameters differ from the GridSearchCV output kept
    # at the end of the notebook — confirm which search run they come from.
    models_with_best_params = {
        'GaussianNB': GaussianNB(var_smoothing=0.657933224657568),
        'DT': DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                                     min_samples_split=7, random_state=seed),
        'KNN': KNeighborsClassifier(metric='euclidean', n_neighbors=13, weights='distance'),
        'RF': RandomForestClassifier(criterion='gini', max_depth=15, min_samples_leaf=2,
                                     min_samples_split=2, n_estimators=100, random_state=seed),
    }

    # Training and testing models
    for name, model in models_with_best_params.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[f"{model.__class__.__name__}_{seed}"] = get_metrics_df(y_test, y_pred, name)

%1 - Running for seed: 24
%2 - Running for seed: 42
%3 - Running for seed: 206
%4 - Running for seed: 602
%5 - Running for seed: 412
%6 - Running for seed: 214
%7 - Running for seed: 754
%8 - Running for seed: 457
%9 - Running for seed: 2023
%10 - Running for seed: 3202


In [16]:
# Inspect the keys under which the per-model, per-seed metric frames were stored
results.keys()

dict_keys(['GaussianNB_24', 'DecisionTreeClassifier_24', 'KNeighborsClassifier_24', 'RandomForestClassifier_24', 'GaussianNB_42', 'DecisionTreeClassifier_42', 'KNeighborsClassifier_42', 'RandomForestClassifier_42', 'GaussianNB_206', 'DecisionTreeClassifier_206', 'KNeighborsClassifier_206', 'RandomForestClassifier_206', 'GaussianNB_602', 'DecisionTreeClassifier_602', 'KNeighborsClassifier_602', 'RandomForestClassifier_602', 'GaussianNB_412', 'DecisionTreeClassifier_412', 'KNeighborsClassifier_412', 'RandomForestClassifier_412', 'GaussianNB_214', 'DecisionTreeClassifier_214', 'KNeighborsClassifier_214', 'RandomForestClassifier_214', 'GaussianNB_754', 'DecisionTreeClassifier_754', 'KNeighborsClassifier_754', 'RandomForestClassifier_754', 'GaussianNB_457', 'DecisionTreeClassifier_457', 'KNeighborsClassifier_457', 'RandomForestClassifier_457', 'GaussianNB_2023', 'DecisionTreeClassifier_2023', 'KNeighborsClassifier_2023', 'RandomForestClassifier_2023', 'GaussianNB_3202', 'DecisionTreeClassif

In [17]:
def _keys_containing(fragment):
    """Return the keys of ``results`` that contain ``fragment``, in insertion order."""
    return [key for key in results.keys() if fragment in key]


# Result keys have the form "<estimator class name>_<seed>", so the runs of one model are
# selected by matching (part of) its class name.

# All GaussianNB runs
gaussian_keys = _keys_containing('Gaussian')

# All DecisionTreeClassifier runs
dt_keys = _keys_containing('DecisionTreeClassifier')

# All KNeighborsClassifier runs
knn_keys = _keys_containing('KNeighborsClassifier')

# All RandomForestClassifier runs
rf_keys = _keys_containing('RandomForestClassifier')

In [18]:
# Every model must have the same number of recorded runs; print that number
key_counts = [len(keys) for keys in (gaussian_keys, dt_keys, knn_keys, rf_keys)]
print(key_counts[0])
assert len(set(key_counts)) == 1

10


In [19]:
# Metrics stored in every result frame, in the order they appear in the summary table
METRIC_NAMES = ['accuracy', 'precision', 'recall', 'f1', 'roc']

SUMMARY_HEADERS = ['Model', 'Accuracy Mean', 'Accuracy Std',
                   'Precision Mean', 'Precision Std',
                   'Recall Mean', 'Recall Std',
                   'F1 Mean', 'F1 Std',
                   'ROC Mean', 'ROC Std']


def summarize_model(label, keys):
    """Aggregate one model's metrics over all of its runs and print them as a table.

    Args:
        label: Model name shown in the first column of the printed table.
        keys: Keys of ``results`` belonging to this model (one per seed).

    Returns:
        A flat list ``[accuracy_mean, accuracy_std, precision_mean, precision_std,
        recall_mean, recall_std, f1_mean, f1_std, roc_mean, roc_std]``. The standard
        deviations come from ``np.std`` with its default settings.
    """
    stats = []
    for metric_name in METRIC_NAMES:
        # Each result frame holds a single row, hence .values[0]
        scores = [results[key][metric_name].values[0] for key in keys]
        stats.extend([np.mean(scores), np.std(scores)])

    print(tabulate(
        [[label, *stats]],
        headers=SUMMARY_HEADERS,
        tablefmt='psql',
        floatfmt=".4",
    ))
    return stats


# One summary table per model; the individual statistics remain available under their
# original variable names.
(gaussian_accuracy_mean, gaussian_accuracy_std,
 gaussian_precision_mean, gaussian_precision_std,
 gaussian_recall_mean, gaussian_recall_std,
 gaussian_f1_mean, gaussian_f1_std,
 gaussian_roc_mean, gaussian_roc_std) = summarize_model('GaussianNB', gaussian_keys)

(dt_accuracy_mean, dt_accuracy_std,
 dt_precision_mean, dt_precision_std,
 dt_recall_mean, dt_recall_std,
 dt_f1_mean, dt_f1_std,
 dt_roc_mean, dt_roc_std) = summarize_model('DecisionTreeClassifier', dt_keys)

(knn_accuracy_mean, knn_accuracy_std,
 knn_precision_mean, knn_precision_std,
 knn_recall_mean, knn_recall_std,
 knn_f1_mean, knn_f1_std,
 knn_roc_mean, knn_roc_std) = summarize_model('KNeighborsClassifier', knn_keys)

(rf_accuracy_mean, rf_accuracy_std,
 rf_precision_mean, rf_precision_std,
 rf_recall_mean, rf_recall_std,
 rf_f1_mean, rf_f1_std,
 rf_roc_mean, rf_roc_std) = summarize_model('RandomForestClassifier', rf_keys)

+------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------+
| Model      |   Accuracy Mean |   Accuracy Std |   Precision Mean |   Precision Std |   Recall Mean |   Recall Std |   F1 Mean |   F1 Std |   ROC Mean |   ROC Std |
|------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------|
| GaussianNB |          0.7513 |        0.02816 |           0.7249 |         0.07586 |        0.4769 |      0.07457 |    0.5716 |  0.06458 |     0.6891 |   0.03731 |
+------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------+
+------------------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----

In [27]:
# Hyperparameter search with GridSearchCV for models: GaussianNB, Decision Tree, KNN and Random Forest
# from sklearn.model_selection import GridSearchCV


# # GaussianNB
# params_gnb = {'var_smoothing': np.logspace(0, -9, num=100)}
# grid_gnb = GridSearchCV(estimator=GaussianNB(), param_grid=params_gnb, cv=5, verbose=1, n_jobs=-1)
# grid_gnb.fit(X_train, y_train)
# print(f"Model: GaussianNB | Best params: {grid_gnb.best_params_}\n")

# # Decision Tree
# params_dt = {'criterion': ['gini', 'entropy'],
#                 'max_depth': [3, 5, 7, 9, 11, 13, 15],
#                 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
#                 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
# grid_dt = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params_dt, cv=5, verbose=1, n_jobs=-1)
# grid_dt.fit(X_train, y_train)
# print(f"Model: DT | Best params: {grid_dt.best_params_}\n")

# # KNN
# params_knn = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
#                 'weights': ['uniform', 'distance'],
#                 'metric': ['euclidean', 'manhattan', 'minkowski']}
# grid_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params_knn, cv=5, verbose=1, n_jobs=-1)
# grid_knn.fit(X_train, y_train)
# print(f"Model: KNN | Best params: {grid_knn.best_params_}\n")

# # Random Forest
# params_rf = {'n_estimators': [100, 200, 300, 400, 500],
#              'criterion': ['gini', 'entropy'],
#                 'max_depth': [3, 5, 7, 9, 11, 13, 15],
#                 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
#                 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
# grid_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_rf, cv=5, verbose=1, n_jobs=-1)
# grid_rf.fit(X_train, y_train)
# print(f"Model: RF | Best params: {grid_rf.best_params_}\n")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Model: GaussianNB | Best params: {'var_smoothing': 0.012328467394420659}

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits
Model: DT | Best params: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 6, 'min_samples_split': 5}

Fitting 5 folds for each of 42 candidates, totalling 210 fits
Model: KNN | Best params: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}

Fitting 5 folds for each of 5040 candidates, totalling 25200 fits


KeyboardInterrupt: 