# Checking wich is the best preprocessing methods combo - Ternary Dataset

## Importing Packages

In [48]:
# Importing packages

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pprint import pprint

from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from collections import Counter

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall
from keras import Input

## Loading dataset

In [49]:
# loading dataset
path_3 = "datasets/diabetes_012_health_indicators_BRFSS2015.csv"

df = pd.read_csv(path_3)

### Ternary Dataset preprocessing methods

In [None]:
# Removing duplicates

df.drop_duplicates(inplace = True)
print(f"Remaining duplicates: {df.duplicated().sum()}.")

# Droping columns that are not relevant for the model

columns = ["PhysHlth","Veggies","NoDocbcCost"]
df = df.drop(columns=columns)

# PREPARING THE DATA BEFORE AND AFTER THE DATA SPLITTING

X = df.drop(columns=['Diabetes_012'])
y = df['Diabetes_012']


f1s_svm, precisions_svm, recalls_svm = [], [], []
f1s_rf, precisions_rf, recalls_rf = [], [], []
f1s_nn, precisions_nn, recalls_nn = [], [], []

n_runs = 5
for run in range(n_runs):

    # Spltting the data

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=run)

    # Doing Normalization after splitting to avoid data leakage

    #scaler = MinMaxScaler()
    #X_train_scaled = scaler.fit_transform(X_train)
    #X_test_scaled = scaler.transform(X_test)

    # Doing Standardization 

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Random Undersampling

    undersample = RandomUnderSampler(sampling_strategy={0.0:35000}, random_state=run)
    X_und, y_und = undersample.fit_resample(X_train_scaled, y_train)

    # SMOTE ENN for oversampling/downsampling

    smote_enn = SMOTEENN(random_state=run, n_jobs=-1, sampling_strategy="auto")
    X_und, y_und = smote_enn.fit_resample(X_und, y_und)


    # SMOTE for Oversampling

    #smote = SMOTE(random_state=run, sampling_strategy="auto")
    #X_und, y_und = smote.fit_resample(X_und, y_und)

    # Tomek Links

    tomek = TomekLinks()
    X_und, y_und = tomek.fit_resample(X_und, y_und)
    # Using PCA 

    pca = PCA(n_components=7) 
    X_und = pca.fit_transform(X_und)
    X_test = pca.transform(X_test_scaled)

    # Baseline models

    # SVM MODEL
    svm = LinearSVC(C=10, dual=False, class_weight="balanced", multi_class='ovr')
    svm.fit(X_und, y_und)
    svm_pred = svm.predict(X_test)

    #y_und_cat = to_categorical(y_und, num_classes=3)

    # RF MODEL
    rf = RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=10, n_jobs=-1, class_weight="balanced")
    rf.fit(X_und, y_und)
    rf_pred = rf.predict(X_test)

    # NN MODEL
    model = Sequential([
    Input(shape=(X_und.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.5),  
    Dense(32, activation='relu'),
    Dropout(0.5),  
    Dense(3, activation='softmax')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

    model.fit(X_und, y_und, epochs=20, batch_size=32, validation_split=0.2)
    nn_pred_probs = model.predict(X_test)
    nn_pred = np.argmax(nn_pred_probs, axis=1)

    precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, svm_pred, zero_division=0)
    f1s_svm.append(f1_svm)
    precisions_svm.append(precision_svm)
    recalls_svm.append(recall_svm)

    precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, rf_pred, zero_division=0)
    f1s_rf.append(f1_rf)
    precisions_rf.append(precision_rf)
    recalls_rf.append(recall_rf)

    precision_nn, recall_nn, f1_nn, _ = precision_recall_fscore_support(y_test, nn_pred, zero_division=0)
    f1s_nn.append(f1_nn)
    precisions_nn.append(precision_nn)
    recalls_nn.append(recall_nn)

results = {
        "SVM": {"F1": round(np.mean(f1s_svm),2), "Precision": round(np.mean(precisions_svm),2), "Recall": round(np.mean(recalls_svm),2)},
        "RF": {"F1": round(np.mean(f1s_rf),2), "Precision": round(np.mean(precisions_rf),2), "Recall": round(np.mean(recalls_rf),2)},
        "NN": {"F1": round(np.mean(f1s_nn),2), "Precision": round(np.mean(precisions_nn),2), "Recall": round(np.mean(recalls_nn),2)}
    }
pprint(results)

Remaining duplicates: 0.
Epoch 1/20
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7199 - loss: 0.6462 - val_accuracy: 0.0000e+00 - val_loss: 3.0398
Epoch 2/20
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7925 - loss: 0.5139 - val_accuracy: 0.0000e+00 - val_loss: 3.0676
Epoch 3/20
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8047 - loss: 0.4923 - val_accuracy: 0.0000e+00 - val_loss: 2.9236
Epoch 4/20
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8050 - loss: 0.4885 - val_accuracy: 0.0000e+00 - val_loss: 2.9113
Epoch 5/20
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8088 - loss: 0.4763 - val_accuracy: 0.0000e+00 - val_loss: 2.7954
Epoch 6/20
[1m1299/1299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8111 - loss: 0.4739 - val_accurac