# Checking which combination of preprocessing methods is the best
### Binary and Ternary Datasets

## Importing Packages

In [None]:
# Importing packages

import pandas as pd
import numpy as np

from pprint import pprint

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from itertools import combinations
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

## Loading datasets

In [None]:
# Read the two CSV exports into DataFrames:
#   df   <- path_2 (the "diabetes_binary" file)
#   df_t <- path_3 (the "diabetes_012" file)
path_2 = "datasets/diabetes_binary_health_indicators_BRFSS2015.csv"
path_3 = "datasets/diabetes_012_health_indicators_BRFSS2015.csv"

df, df_t = (pd.read_csv(csv_path) for csv_path in (path_2, path_3))

## Performing the preprocessing methods combination

### Binary Dataset

In [None]:
# Evaluating the preprocessing combo on the binary dataset:
# undersampling -> split -> LOO encoding -> Tomek Links -> scaling -> PCA,
# scored with three baseline classifiers averaged over several random splits.
#
# The previous version of this cell dropped columns and used a 'Severity'
# label that belong to a different (traffic-accident) dataset, so
# DataFrame.drop raised KeyError on the frame loaded from path_2.
# NOTE(review): 'Diabetes_binary' is the label column of the public
# BRFSS2015 binary CSV — confirm against the local file.

target_col = 'Diabetes_binary'

# Work on a copy so the frame loaded in the previous cell stays untouched.
df1 = df.copy()

# MISSING VALUES

# Percentage of missing values per column (only columns that have any)

missing_vals = df1.isna().sum().sort_values(ascending = False) / len(df1) * 100
print(missing_vals[missing_vals !=0]) 

# Fill gaps in numeric features with the column median
# (nothing changes when the data is already complete).

feature_cols = [col for col in df1.select_dtypes(include='number').columns if col != target_col]
df1[feature_cols] = df1[feature_cols].fillna(df1[feature_cols].median())

# Checking once again the existence of Missing values

missing_vals = df1.isna().sum().sort_values(ascending = False) / len(df1) * 100
print(df1.shape)
print(df1.dtypes)
print(missing_vals[missing_vals !=0])

# PREPARING THE DATA BEFORE AND AFTER THE DATA SPLITTING

# Checking the class distribution before balancing
print("Before balancing:", Counter(df1[target_col]))

X = df1.drop(columns=[target_col])
y = df1[target_col]

# Random Undersampling first to reduce dataset size
# (0.7 = minority / majority ratio after resampling; a float is only valid for binary targets)

undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=17)
X_resampled, y_resampled = undersample.fit_resample(X, y)

# Per-run scores of each baseline model
f1s_dt, precisions_dt, recalls_dt = [], [], []
f1s_knn, precisions_knn, recalls_knn = [], [], []
f1s_nb, precisions_nb, recalls_nb = [], [], []

n_runs = 5
for run in range(n_runs):

    # Splitting the data (the run index is the seed, so every run uses a different split)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=run)

    # LOO Encoding, fitted on the training split only
    # NOTE(review): LeaveOneOutEncoder is not imported in the visible cells
    # (presumably category_encoders.LeaveOneOutEncoder) — add the import,
    # otherwise this line raises NameError.

    loo_encoder = LeaveOneOutEncoder()
    X_train_encoded = loo_encoder.fit_transform(X_train, y_train)
    X_test_encoded = loo_encoder.transform(X_test)

    # Apply Tomek Links (training split only) to get better class separation.
    # To skip this step, use X_train_encoded instead of X_tomek in the
    # standardization and y_train instead of y_tomek in the model training.

    tomek = TomekLinks()
    X_tomek, y_tomek = tomek.fit_resample(X_train_encoded, y_train)  

    print("After Tomek Links:", Counter(y_tomek))

    # Doing Standardization after splitting to avoid data leakage

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_tomek)
    X_test_scaled = scaler.transform(X_test_encoded) 

    # Using PCA (at most 20 components; capped by the number of available features)

    pca = PCA(n_components=min(20, X_train_scaled.shape[1])) 

    X_train_scaled = pca.fit_transform(X_train_scaled)
    X_test_scaled = pca.transform(X_test_scaled)

    # Baseline models

    # Seeded like every other random step so the reported averages are reproducible
    dt = DecisionTreeClassifier(max_depth=10, random_state=run)
    dt.fit(X_train_scaled, y_tomek)
    y_pred_dt = dt.predict(X_test_scaled)

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train_scaled, y_tomek)
    y_pred_knn = knn.predict(X_test_scaled)

    nb = GaussianNB()
    nb.fit(X_train_scaled, y_tomek)
    y_pred_nb = nb.predict(X_test_scaled)

    # Scores of the positive class (label 1) on the untouched test split

    precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='binary')
    f1s_dt.append(f1_dt)
    precisions_dt.append(precision_dt)
    recalls_dt.append(recall_dt)

    precision_knn, recall_knn, f1_knn, _ = precision_recall_fscore_support(y_test, y_pred_knn, average='binary')
    f1s_knn.append(f1_knn)
    precisions_knn.append(precision_knn)
    recalls_knn.append(recall_knn)

    precision_nb, recall_nb, f1_nb, _ = precision_recall_fscore_support(y_test, y_pred_nb, average='binary')
    f1s_nb.append(f1_nb)
    precisions_nb.append(precision_nb)
    recalls_nb.append(recall_nb)

# Mean of each metric over the n_runs splits, rounded to two decimals
results = {
        "Decision Tree": {"F1": round(np.mean(f1s_dt),2), "Precision": round(np.mean(precisions_dt),2), "Recall": round(np.mean(recalls_dt),2)},
        "KNN": {"F1": round(np.mean(f1s_knn),2), "Precision": round(np.mean(precisions_knn),2), "Recall": round(np.mean(recalls_knn),2)},
        "Naive Bayes": {"F1": round(np.mean(f1s_nb),2), "Precision": round(np.mean(precisions_nb),2), "Recall": round(np.mean(recalls_nb),2)}
    }
pprint(results)

### Ternary Dataset

In [None]:
# Evaluating the preprocessing combo on the ternary (three-class) dataset:
# undersampling -> split -> LOO encoding -> Tomek Links -> scaling -> PCA,
# scored with three baseline classifiers averaged over several random splits.
#
# The previous version of this cell read `df` (the binary frame) instead of
# `df_t`, and dropped columns / used a 'Severity' label that belong to a
# different (traffic-accident) dataset, so DataFrame.drop raised KeyError.
# NOTE(review): 'Diabetes_012' is the label column of the public
# BRFSS2015 three-class CSV — confirm against the local file.

target_col = 'Diabetes_012'

# Work on a copy so the frame loaded in the loading cell stays untouched.
df1 = df_t.copy()

# MISSING VALUES

# Percentage of missing values per column (only columns that have any)

missing_vals = df1.isna().sum().sort_values(ascending = False) / len(df1) * 100
print(missing_vals[missing_vals !=0]) 

# Fill gaps in numeric features with the column median
# (nothing changes when the data is already complete).

feature_cols = [col for col in df1.select_dtypes(include='number').columns if col != target_col]
df1[feature_cols] = df1[feature_cols].fillna(df1[feature_cols].median())

# Checking once again the existence of Missing values

missing_vals = df1.isna().sum().sort_values(ascending = False) / len(df1) * 100
print(df1.shape)
print(df1.dtypes)
print(missing_vals[missing_vals !=0])

# PREPARING THE DATA BEFORE AND AFTER THE DATA SPLITTING

# Checking the class distribution before balancing
class_counts = Counter(df1[target_col])
print("Before balancing:", class_counts)

X = df1.drop(columns=[target_col])
y = df1[target_col]

# Random Undersampling first to reduce dataset size.
# A float sampling_strategy is only accepted for binary targets, so the same
# 0.7 minority/majority ratio is expressed as per-class sample counts:
# every class is capped at minority_count / 0.7 (never above its real size).

minority_count = min(class_counts.values())
max_per_class = int(minority_count / 0.7)
sampling_strategy = {label: min(count, max_per_class) for label, count in class_counts.items()}

undersample = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=17)
X_resampled, y_resampled = undersample.fit_resample(X, y)

# Per-run scores of each baseline model
f1s_dt, precisions_dt, recalls_dt = [], [], []
f1s_knn, precisions_knn, recalls_knn = [], [], []
f1s_nb, precisions_nb, recalls_nb = [], [], []

n_runs = 5
for run in range(n_runs):

    # Splitting the data (the run index is the seed, so every run uses a different split)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=run)

    # LOO Encoding, fitted on the training split only
    # NOTE(review): LeaveOneOutEncoder is not imported in the visible cells
    # (presumably category_encoders.LeaveOneOutEncoder) — add the import,
    # otherwise this line raises NameError.

    loo_encoder = LeaveOneOutEncoder()
    X_train_encoded = loo_encoder.fit_transform(X_train, y_train)
    X_test_encoded = loo_encoder.transform(X_test)

    # Apply Tomek Links (training split only) to get better class separation.
    # To skip this step, use X_train_encoded instead of X_tomek in the
    # standardization and y_train instead of y_tomek in the model training.

    tomek = TomekLinks()
    X_tomek, y_tomek = tomek.fit_resample(X_train_encoded, y_train)  

    print("After Tomek Links:", Counter(y_tomek))

    # Doing Standardization after splitting to avoid data leakage

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_tomek)
    X_test_scaled = scaler.transform(X_test_encoded) 

    # Using PCA (at most 20 components; capped by the number of available features)

    pca = PCA(n_components=min(20, X_train_scaled.shape[1])) 

    X_train_scaled = pca.fit_transform(X_train_scaled)
    X_test_scaled = pca.transform(X_test_scaled)

    # Baseline models

    # Seeded like every other random step so the reported averages are reproducible
    dt = DecisionTreeClassifier(max_depth=10, random_state=run)
    dt.fit(X_train_scaled, y_tomek)
    y_pred_dt = dt.predict(X_test_scaled)

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train_scaled, y_tomek)
    y_pred_knn = knn.predict(X_test_scaled)

    nb = GaussianNB()
    nb.fit(X_train_scaled, y_tomek)
    y_pred_nb = nb.predict(X_test_scaled)

    # average='binary' raises ValueError on a multi-class target; 'macro' is the
    # unweighted mean over the three classes, so every class counts equally.

    precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='macro')
    f1s_dt.append(f1_dt)
    precisions_dt.append(precision_dt)
    recalls_dt.append(recall_dt)

    precision_knn, recall_knn, f1_knn, _ = precision_recall_fscore_support(y_test, y_pred_knn, average='macro')
    f1s_knn.append(f1_knn)
    precisions_knn.append(precision_knn)
    recalls_knn.append(recall_knn)

    precision_nb, recall_nb, f1_nb, _ = precision_recall_fscore_support(y_test, y_pred_nb, average='macro')
    f1s_nb.append(f1_nb)
    precisions_nb.append(precision_nb)
    recalls_nb.append(recall_nb)

# Mean of each (macro-averaged) metric over the n_runs splits, rounded to two decimals
results = {
        "Decision Tree": {"F1": round(np.mean(f1s_dt),2), "Precision": round(np.mean(precisions_dt),2), "Recall": round(np.mean(recalls_dt),2)},
        "KNN": {"F1": round(np.mean(f1s_knn),2), "Precision": round(np.mean(precisions_knn),2), "Recall": round(np.mean(recalls_knn),2)},
        "Naive Bayes": {"F1": round(np.mean(f1s_nb),2), "Precision": round(np.mean(precisions_nb),2), "Recall": round(np.mean(recalls_nb),2)}
    }
pprint(results)