# Checking which is the best combination of preprocessing methods
### Binary and Ternary Datasets

## Importing Packages

In [None]:
# Importing packages

from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

## Loading datasets

In [None]:
# Read the two BRFSS 2015 diabetes health-indicator CSV files:
# `df` receives the file with the binary target and `df_t` the file with the
# 0/1/2 target (the later cells use the 'Diabetes_binary' and 'Diabetes_012'
# columns respectively).
path_2 = "datasets/diabetes_binary_health_indicators_BRFSS2015.csv"
path_3 = "datasets/diabetes_012_health_indicators_BRFSS2015.csv"

df, df_t = (pd.read_csv(csv_path) for csv_path in (path_2, path_3))

## Performing the preprocessing methods combination

### Binary Dataset

In [None]:
# Binary dataset: evaluate one preprocessing combination
# (duplicate removal -> column dropping -> standardization -> SMOTE-ENN ->
# random undersampling -> PCA) with three baseline models (SVM, Random Forest,
# neural network) over several random train/test splits, then report the mean
# precision / recall / F1 of each model.

# Removing duplicates

df.drop_duplicates(inplace=True)
print(f"Remaining duplicates: {df.duplicated().sum()}.")

# Dropping columns that are not relevant for the model

columns = ["PhysHlth", "Veggies", "NoDocbcCost"]

# [WITHOUT FEATURE SELECTION if the drop below is commented out]

df = df.drop(columns=columns)

# PREPARING THE DATA BEFORE AND AFTER THE DATA SPLITTING

# Checking the class distribution before balancing
print("Before balancing:", Counter(df['Diabetes_binary']))

X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Per-run metric accumulators, one triple per model trained in this cell.
f1s_svm, precisions_svm, recalls_svm = [], [], []
f1s_rf, precisions_rf, recalls_rf = [], [], []
f1s_nn, precisions_nn, recalls_nn = [], [], []

n_runs = 10
for run in range(n_runs):

    # Splitting the data (the run index seeds the split, so every run uses a
    # different but reproducible partition)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=run)

    # Doing Normalization after splitting to avoid data leakage
    # (alternative to standardization; swap with the StandardScaler lines below)

    #scaler2 = MinMaxScaler()
    #X_train_scaled2 = scaler2.fit_transform(X_train)
    #X_test_scaled2 = scaler2.transform(X_test)

    # Doing Standardization after splitting to avoid data leakage:
    # the scaler is fitted on the training split only.

    scaler = StandardScaler()
    X_train_scaled2 = scaler.fit_transform(X_train)
    X_test_scaled2 = scaler.transform(X_test)

    # SMOTE ENN for Oversampling/undersampling (training split only)

    smote_enn2 = SMOTEENN(random_state=run, n_jobs=-1, sampling_strategy=0.6)
    X_train_resampled2, y_train_resampled2 = smote_enn2.fit_resample(X_train_scaled2, y_train)
    print("[Binary] After SMOTE ENN:", Counter(y_train_resampled2))

    # SMOTE for Oversampling (alternative to SMOTE ENN)

    #smote2 = SMOTE(random_state=run, n_jobs=-1, sampling_strategy=0.6)
    #X_train_resampled2, y_train_resampled2 = smote2.fit_resample(X_train_scaled2, y_train)
    #print("[Binary] After SMOTE:", Counter(y_train_resampled2))

    # Random Undersampling

    undersample2 = RandomUnderSampler(sampling_strategy="auto", random_state=run)
    X_resampled2, y_resampled2 = undersample2.fit_resample(X_train_resampled2, y_train_resampled2)

    print("[Binary] After Undersampling:", Counter(y_resampled2))

    # Using PCA: fitted on the resampled training data, then applied to the
    # (scaled, never resampled) test data.

    pca = PCA(n_components=17)
    X_train_pca2 = pca.fit_transform(X_resampled2)
    X_test_pca2 = pca.transform(X_test_scaled2)

    # Baseline models

    # SVM MODEL
    svm = SVC(kernel='linear', C=100)
    svm.fit(X_train_pca2, y_resampled2)
    svm_pred = svm.predict(X_test_pca2)

    # RF MODEL (seeded with the run index like the other random steps)
    rf = RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=10, n_jobs=-1, random_state=run)
    rf.fit(X_train_pca2, y_resampled2)
    rf_pred = rf.predict(X_test_pca2)

    # NN MODEL
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_pca2.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # For binary classification
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.fit(X_train_pca2, y_resampled2, epochs=20, batch_size=32, validation_split=0.2)
    nn_pred_probs = model.predict(X_test_pca2)
    # The sigmoid output has shape (n_samples, 1); flatten it to 1-D labels.
    nn_pred = (nn_pred_probs > 0.5).astype(int).ravel()

    precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, svm_pred, average='binary')
    f1s_svm.append(f1_svm)
    precisions_svm.append(precision_svm)
    recalls_svm.append(recall_svm)

    precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, rf_pred, average='binary')
    f1s_rf.append(f1_rf)
    precisions_rf.append(precision_rf)
    recalls_rf.append(recall_rf)

    precision_nn, recall_nn, f1_nn, _ = precision_recall_fscore_support(y_test, nn_pred, average='binary')
    f1s_nn.append(f1_nn)
    precisions_nn.append(precision_nn)
    recalls_nn.append(recall_nn)

# Mean of each metric over all runs, for the models actually trained above.
results = {
    "SVM": {"F1": round(np.mean(f1s_svm), 2), "Precision": round(np.mean(precisions_svm), 2), "Recall": round(np.mean(recalls_svm), 2)},
    "Random Forest": {"F1": round(np.mean(f1s_rf), 2), "Precision": round(np.mean(precisions_rf), 2), "Recall": round(np.mean(recalls_rf), 2)},
    "Neural Network": {"F1": round(np.mean(f1s_nn), 2), "Precision": round(np.mean(precisions_nn), 2), "Recall": round(np.mean(recalls_nn), 2)},
}
pprint(results)

### Ternary Dataset

In [None]:
# Ternary dataset: evaluate the same preprocessing combination
# (duplicate removal -> column dropping -> standardization -> SMOTE-ENN ->
# random undersampling -> PCA) with three baseline models (Decision Tree, KNN,
# Naive Bayes) over several random train/test splits, then report the mean
# macro-averaged precision / recall / F1 of each model.

# Removing duplicates

df_t.drop_duplicates(inplace=True)

# Dropping columns that are not relevant for the model

columns1 = ["PhysHlth", "Veggies", "NoDocbcCost"]

# [WITHOUT FEATURE SELECTION if the drop below is commented out]

df1 = df_t.drop(columns=columns1)


# Checking the class distribution before balancing
print("Before balancing:", Counter(df1['Diabetes_012']))

X = df1.drop(columns=['Diabetes_012'])
y = df1['Diabetes_012']

# NOTE: the earlier pre-split RandomUnderSampler(sampling_strategy=0.7) step
# was removed. Its output was never used, it depended on a `run` value left
# over from another cell, and a float ratio is only accepted by
# imbalanced-learn for binary targets. All balancing now happens inside the
# loop, on the training split only.

# Per-run metric accumulators, one triple per model trained in this cell.
f1s_dt, precisions_dt, recalls_dt = [], [], []
f1s_knn, precisions_knn, recalls_knn = [], [], []
f1s_nb, precisions_nb, recalls_nb = [], [], []

n_runs = 5
for run in range(n_runs):

    # Splitting the data (the run index seeds the split, so every run uses a
    # different but reproducible partition)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=run)

    # Doing Normalization after splitting to avoid data leakage
    # (alternative to standardization; swap with the StandardScaler lines below)

    #scaler3 = MinMaxScaler()
    #X_train_scaled3 = scaler3.fit_transform(X_train)
    #X_test_scaled3 = scaler3.transform(X_test)

    # Doing Standardization after splitting to avoid data leakage:
    # the scaler is fitted on the training split only.

    scaler = StandardScaler()
    X_train_scaled3 = scaler.fit_transform(X_train)
    X_test_scaled3 = scaler.transform(X_test)

    # SMOTE ENN for Oversampling/undersampling (training split only).
    # A float sampling_strategy is only valid for binary targets, so the
    # "0.6 of the majority class" ratio is expressed as an explicit
    # {class: target_count} dict covering every non-majority class.

    class_counts = Counter(y_train)
    majority_count = max(class_counts.values())
    minority_target = int(0.6 * majority_count)
    smote_targets = {
        label: max(count, minority_target)  # never request fewer samples than exist
        for label, count in class_counts.items()
        if count < majority_count
    }

    smote_enn3 = SMOTEENN(random_state=run, n_jobs=-1, sampling_strategy=smote_targets)
    X_train_resampled3, y_train_resampled3 = smote_enn3.fit_resample(X_train_scaled3, y_train)
    print("[Ternary] After SMOTE ENN:", Counter(y_train_resampled3))

    # SMOTE for Oversampling (alternative to SMOTE ENN)

    #smote3 = SMOTE(random_state=run, n_jobs=-1, sampling_strategy=smote_targets)
    #X_train_resampled3, y_train_resampled3 = smote3.fit_resample(X_train_scaled3, y_train)
    #print("[Ternary] After SMOTE:", Counter(y_train_resampled3))

    # Random Undersampling

    undersample3 = RandomUnderSampler(sampling_strategy="auto", random_state=run)
    X_resampled3, y_resampled3 = undersample3.fit_resample(X_train_resampled3, y_train_resampled3)

    print("[Ternary] After Undersampling:", Counter(y_resampled3))

    # Using PCA: fitted on the resampled training data, then applied to the
    # (scaled, never resampled) test data.

    pca3 = PCA(n_components=17)
    X_train_pca3 = pca3.fit_transform(X_resampled3)
    X_test_pca3 = pca3.transform(X_test_scaled3)

    # Baseline models

    # Decision tree seeded with the run index like the other random steps.
    dt = DecisionTreeClassifier(max_depth=10, random_state=run)
    dt.fit(X_train_pca3, y_resampled3)
    y_pred_dt = dt.predict(X_test_pca3)

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train_pca3, y_resampled3)
    y_pred_knn = knn.predict(X_test_pca3)

    nb = GaussianNB()
    nb.fit(X_train_pca3, y_resampled3)
    y_pred_nb = nb.predict(X_test_pca3)

    # average='binary' is rejected for a three-class target; macro averaging
    # gives every class equal weight regardless of its frequency.
    precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='macro')
    f1s_dt.append(f1_dt)
    precisions_dt.append(precision_dt)
    recalls_dt.append(recall_dt)

    precision_knn, recall_knn, f1_knn, _ = precision_recall_fscore_support(y_test, y_pred_knn, average='macro')
    f1s_knn.append(f1_knn)
    precisions_knn.append(precision_knn)
    recalls_knn.append(recall_knn)

    precision_nb, recall_nb, f1_nb, _ = precision_recall_fscore_support(y_test, y_pred_nb, average='macro')
    f1s_nb.append(f1_nb)
    precisions_nb.append(precision_nb)
    recalls_nb.append(recall_nb)

# Mean of each metric over all runs.
results = {
    "Decision Tree": {"F1": round(np.mean(f1s_dt), 2), "Precision": round(np.mean(precisions_dt), 2), "Recall": round(np.mean(recalls_dt), 2)},
    "KNN": {"F1": round(np.mean(f1s_knn), 2), "Precision": round(np.mean(precisions_knn), 2), "Recall": round(np.mean(recalls_knn), 2)},
    "Naive Bayes": {"F1": round(np.mean(f1s_nb), 2), "Precision": round(np.mean(precisions_nb), 2), "Recall": round(np.mean(recalls_nb), 2)},
}
pprint(results)