# Checking wich is the best preprocessing methods combo - Binary Dataset

## Importing Packages

In [1]:
# Importing packages

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pprint import pprint

from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from collections import Counter

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras import Input

## Loading dataset

In [2]:
# loading dataset
path_2 = "datasets/diabetes_binary_health_indicators_BRFSS2015.csv"

df = pd.read_csv(path_2)

### Binary Dataset preprocessing methods

In [3]:
# Removing duplicates

df.drop_duplicates(inplace = True)
print(f"Remaining duplicates: {df.duplicated().sum()}.")

# Droping columns that are not relevant for the model

columns = ["PhysHlth","Veggies","NoDocbcCost"]
df = df.drop(columns=columns)

# PREPARING THE DATA BEFORE AND AFTER THE DATA SPLITTING

# Checking the class distribution before balancing
print("Before balancing:", Counter(df['Diabetes_binary']))

X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']


f1s_svm, precisions_svm, recalls_svm = [], [], []
f1s_rf, precisions_rf, recalls_rf = [], [], []
f1s_nn, precisions_nn, recalls_nn = [], [], []

n_runs = 5
for run in range(n_runs):

    # Spltting the data

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=run)

    # Doing Normalization after splitting to avoid data leakage

    #scaler = MinMaxScaler()
    #X_train_scaled = scaler.fit_transform(X_train)
    #X_test_scaled = scaler.transform(X_test)

    # Doing Standardization 

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Random Undersampling

    undersample = RandomUnderSampler(sampling_strategy={0.0:60000}, random_state=run)
    X_und, y_und = undersample.fit_resample(X_train_scaled, y_train)
    print("[Binary] After Undersampling:", Counter(y_und))

    # SMOTE ENN for oversampling/downsampling

    smote_enn = SMOTEENN(random_state=run, n_jobs=-1, sampling_strategy=0.7)
    X_und, y_und = smote_enn.fit_resample(X_und, y_und)
    print("[Binary] After SMOTE ENN:", Counter(y_und))


    # SMOTE for Oversampling

    #smote = SMOTE(random_state=run, sampling_strategy=0.7)
    #X_und, y_und = smote.fit_resample(X_und, y_und)
    #print("[Binary] After SMOTE:", Counter(y_und))

    # Tomek Links

    tomek = TomekLinks()
    X_und, y_und = tomek.fit_resample(X_und, y_und)
    print("[Binary] After TomekLinks:", Counter(y_und))

    # Using PCA 

    #pca = PCA(n_components=5) 
    #X_und = pca.fit_transform(X_und)
    #X_test = pca.transform(X_test_scaled)

    # Baseline models

    # SVM MODEL
    svm = LinearSVC(C=10, dual=False, class_weight="balanced")
    svm.fit(X_und, y_und)
    svm_pred = svm.predict(X_test)

    # RF MODEL
    rf = RandomForestClassifier(n_estimators=300, criterion="gini", max_depth=10, n_jobs=-1, class_weight="balanced")
    rf.fit(X_und, y_und)
    rf_pred = rf.predict(X_test)

    # NN MODEL
    model = Sequential([
    Input(shape=(X_und.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.5),  
    Dense(32, activation='relu'),
    Dropout(0.5),  
    Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy', 'Precision', 'Recall'])

    classes = np.unique(y_und)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_und)
    class_weights = dict(zip(classes, weights))

    model.fit(X_und, y_und, epochs=20, batch_size=32, validation_split=0.2, class_weight=class_weights)
    nn_pred_probs = model.predict(X_test)
    nn_pred = (nn_pred_probs > 0.65).astype(int)

    precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, svm_pred, average='binary')
    f1s_svm.append(f1_svm)
    precisions_svm.append(precision_svm)
    recalls_svm.append(recall_svm)

    precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, rf_pred, average='binary')
    f1s_rf.append(f1_rf)
    precisions_rf.append(precision_rf)
    recalls_rf.append(recall_rf)

    precision_nn, recall_nn, f1_nn, _ = precision_recall_fscore_support(y_test, nn_pred, average='binary')
    f1s_nn.append(f1_nn)
    precisions_nn.append(precision_nn)
    recalls_nn.append(recall_nn)

results = {
        "SVM": {"F1": round(np.mean(f1s_svm),2), "Precision": round(np.mean(precisions_svm),2), "Recall": round(np.mean(recalls_svm),2)},
        "RF": {"F1": round(np.mean(f1s_rf),2), "Precision": round(np.mean(precisions_rf),2), "Recall": round(np.mean(recalls_rf),2)},
        "NN": {"F1": round(np.mean(f1s_nn),2), "Precision": round(np.mean(precisions_nn),2), "Recall": round(np.mean(recalls_nn),2)}
    }
pprint(results)

Remaining duplicates: 0.
Before balancing: Counter({0.0: 194377, 1.0: 35097})
[Binary] After Undersampling: Counter({0.0: 60000, 1.0: 28076})
[Binary] After SMOTE ENN: Counter({0.0: 30929, 1.0: 17534})
[Binary] After TomekLinks: Counter({0.0: 30909, 1.0: 17534})




Epoch 1/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - Precision: 0.6589 - Recall: 0.7154 - accuracy: 0.8687 - loss: 0.3035 - val_Precision: 1.0000 - val_Recall: 0.8687 - val_accuracy: 0.8687 - val_loss: 0.3471
Epoch 2/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - Precision: 0.7987 - Recall: 0.8778 - accuracy: 0.9289 - loss: 0.1869 - val_Precision: 1.0000 - val_Recall: 0.8470 - val_accuracy: 0.8470 - val_loss: 0.3738
Epoch 3/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8063 - Recall: 0.8730 - accuracy: 0.9329 - loss: 0.1797 - val_Precision: 1.0000 - val_Recall: 0.8479 - val_accuracy: 0.8479 - val_loss: 0.3580
Epoch 4/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8034 - Recall: 0.8744 - accuracy: 0.9317 - loss: 0.1776 - val_Precision: 1.0000 - val_Recall: 0.8504 - val_accuracy: 0.8504 - val_loss: 0.3606
Epoch 5/20
[1m1



Epoch 1/20
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - Precision: 0.6791 - Recall: 0.6269 - accuracy: 0.8721 - loss: 0.3126 - val_Precision: 1.0000 - val_Recall: 0.8711 - val_accuracy: 0.8711 - val_loss: 0.3783
Epoch 2/20
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - Precision: 0.7877 - Recall: 0.8848 - accuracy: 0.9289 - loss: 0.1841 - val_Precision: 1.0000 - val_Recall: 0.8472 - val_accuracy: 0.8472 - val_loss: 0.3903
Epoch 3/20
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8024 - Recall: 0.8800 - accuracy: 0.9320 - loss: 0.1773 - val_Precision: 1.0000 - val_Recall: 0.8571 - val_accuracy: 0.8571 - val_loss: 0.3739
Epoch 4/20
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - Precision: 0.8127 - Recall: 0.8815 - accuracy: 0.9361 - loss: 0.1696 - val_Precision: 1.0000 - val_Recall: 0.8692 - val_accuracy: 0.8692 - val_loss: 0.3369
Epoch 5/20
[1m1



Epoch 1/20
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - Precision: 0.6258 - Recall: 0.7721 - accuracy: 0.8440 - loss: 0.3177 - val_Precision: 1.0000 - val_Recall: 0.8474 - val_accuracy: 0.8474 - val_loss: 0.3916
Epoch 2/20
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - Precision: 0.7970 - Recall: 0.8712 - accuracy: 0.9305 - loss: 0.1828 - val_Precision: 1.0000 - val_Recall: 0.8512 - val_accuracy: 0.8512 - val_loss: 0.3711
Epoch 3/20
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - Precision: 0.7979 - Recall: 0.8784 - accuracy: 0.9299 - loss: 0.1781 - val_Precision: 1.0000 - val_Recall: 0.8481 - val_accuracy: 0.8481 - val_loss: 0.3573
Epoch 4/20
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8124 - Recall: 0.8750 - accuracy: 0.9338 - loss: 0.1701 - val_Precision: 1.0000 - val_Recall: 0.8317 - val_accuracy: 0.8317 - val_loss: 0.3703
Epoch 5/20
[1m1



Epoch 1/20
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - Precision: 0.6527 - Recall: 0.7802 - accuracy: 0.8622 - loss: 0.3042 - val_Precision: 1.0000 - val_Recall: 0.8529 - val_accuracy: 0.8529 - val_loss: 0.4065
Epoch 2/20
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8033 - Recall: 0.8830 - accuracy: 0.9324 - loss: 0.1799 - val_Precision: 1.0000 - val_Recall: 0.8577 - val_accuracy: 0.8577 - val_loss: 0.3746
Epoch 3/20
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8029 - Recall: 0.8911 - accuracy: 0.9347 - loss: 0.1743 - val_Precision: 1.0000 - val_Recall: 0.8551 - val_accuracy: 0.8551 - val_loss: 0.3826
Epoch 4/20
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8286 - Recall: 0.8892 - accuracy: 0.9404 - loss: 0.1644 - val_Precision: 1.0000 - val_Recall: 0.8401 - val_accuracy: 0.8401 - val_loss: 0.4129
Epoch 5/20
[1m1



Epoch 1/20
[1m1224/1224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - Precision: 0.6625 - Recall: 0.7391 - accuracy: 0.8657 - loss: 0.3053 - val_Precision: 1.0000 - val_Recall: 0.8713 - val_accuracy: 0.8713 - val_loss: 0.3512
Epoch 2/20
[1m1224/1224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.7956 - Recall: 0.8856 - accuracy: 0.9291 - loss: 0.1881 - val_Precision: 1.0000 - val_Recall: 0.8656 - val_accuracy: 0.8656 - val_loss: 0.3427
Epoch 3/20
[1m1224/1224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8088 - Recall: 0.8810 - accuracy: 0.9329 - loss: 0.1831 - val_Precision: 1.0000 - val_Recall: 0.8589 - val_accuracy: 0.8589 - val_loss: 0.3498
Epoch 4/20
[1m1224/1224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - Precision: 0.8185 - Recall: 0.8821 - accuracy: 0.9365 - loss: 0.1725 - val_Precision: 1.0000 - val_Recall: 0.8461 - val_accuracy: 0.8461 - val_loss: 0.3822
Epoch 5/20
[1m1