In [4]:
# Importing necessary libraries

import pandas as pd                       # For data manipulation and analysis
import numpy as np                        # For numerical computing
import time                               # For tracking time
import math                               # For mathematical operations
import warnings                           # For managing warnings

# import shap                               # For SHAP (SHapley Additive exPlanations) values

import imblearn                           # For dealing with imbalanced datasets
from imblearn.over_sampling import RandomOverSampler   # For oversampling
from imblearn.under_sampling import RandomUnderSampler # For undersampling
    

import seaborn as sns                     # For statistical data visualization
import matplotlib.pyplot as plt           # For creating visualizations
import matplotlib.patches as mpatches     # For drawing patches in plots
import matplotlib.colors as mcolors       # For defining custom colors in plots
import matplotlib.ticker as ticker        # For formatting tick marks on plots
from matplotlib.ticker import FuncFormatter         # For custom tick formatting
from matplotlib.ticker import MaxNLocator

from sklearn.preprocessing import StandardScaler      # For feature scaling
from sklearn.model_selection import (StratifiedKFold) # For splitting data into train and test sets

from sklearn.metrics import (roc_auc_score,           # For evaluating model performance
                             recall_score)   

from sklearn.svm import SVC                           # For Support Vector Classifier
from sklearn.linear_model import LogisticRegression   # For Logistic Regression Classifier
from sklearn.ensemble import (RandomForestClassifier, # For ensemble classifiers
                              GradientBoostingClassifier,
                              BaggingClassifier)

from sklearn.neural_network import MLPClassifier      # For Multi-layer Perceptron Classifier

from sklearn.impute import KNNImputer

import re
from sklearn.calibration import CalibratedClassifierCV


AttributeError: module 'numpy' has no attribute '_no_nep50_warning'

In [None]:
# Shared objects used by the evaluation helpers defined further down.

# K-nearest-neighbours imputation of missing values (3 neighbours)
imputer = KNNImputer(n_neighbors=3)

# Feature standardisation
preprocessing = StandardScaler()

# Stratified, shuffled 5-fold splitter with a fixed seed for reproducible folds
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=17,
)

In [None]:
# Classifiers compared by the evaluation helpers, keyed by the column name
# used in the result tables. Insertion order is the column order.

algorithms = {
    # Support Vector Classifier, linear kernel (probability estimates enabled)
    'svc_linear': SVC(kernel='linear', probability=True, random_state=0),

    # Support Vector Classifier, radial basis function (RBF) kernel
    'svc_rbf': SVC(kernel='rbf', probability=True, random_state=0),

    # Random Forest
    'random_forest': RandomForestClassifier(random_state=0),

    # Gradient Boosting
    'gradient_boosting': GradientBoostingClassifier(random_state=0),

    # Logistic Regression (library defaults)
    'logistic_regression': LogisticRegression(),

    # Bagging
    'bagging': BaggingClassifier(random_state=0),

    # Multi-layer Perceptron
    'mlp': MLPClassifier(random_state=0),
}

In [None]:
def data_sample(X, y):
    '''
    Rebalances a binary dataset (classes coded 0 and 1) according to how
    imbalanced it is.

    The imbalance is measured as ratio = minority count / majority count:

      * ratio > 0.6        : the data are returned unchanged;
      * 0.2 < ratio <= 0.6 : the majority class is randomly undersampled;
      * ratio <= 0.2       : the minority class is first randomly oversampled
                             towards 20% of the majority class, then the
                             majority class is randomly undersampled.

    Both samplers use random_state=1, so the result is reproducible.

    Parameters:
        X : array-like, shape (n_samples, n_features)
            The feature matrix.
        y : array-like, shape (n_samples,)
            The target variable. Must support ``(y == value).sum()``
            (e.g. a pandas Series or NumPy array).

    Returns:
        X_resampled : array-like, shape (n_samples_resampled, n_features)
            The resampled feature matrix (``X`` itself when ratio > 0.6).
        y_resampled : array-like, shape (n_samples_resampled,)
            The resampled target variable (``y`` itself when ratio > 0.6).

    Raises:
        ValueError: if ``y`` does not contain at least one sample of class 0
            and at least one sample of class 1.
    '''
    no_resampling_above = 0.6   # ratio above which the data are left alone
    oversample_target = 0.2     # minority/majority ratio aimed at by oversampling

    # Size of each class
    count_1 = (y == 1).sum()
    count_0 = (y == 0).sum()
    count_min = min(count_0, count_1)
    count_max = max(count_0, count_1)

    # Without both classes the ratio is 0 or undefined and no branch below is meaningful
    if count_min == 0:
        raise ValueError(
            "data_sample expects a binary target with at least one sample of "
            f"class 0 and class 1 (found {count_0} and {count_1})."
        )

    # Minority-to-majority ratio (not the share of the total)
    ratio = count_min / count_max

    # Balanced enough: do not apply any resampling technique
    if ratio > no_resampling_above:
        return X, y

    X_resampled, y_resampled = X, y

    if ratio <= oversample_target:
        # Number of minority samples needed to reach the oversampling target.
        # When the minority is already at (or within one sample of) the target
        # this is 0; oversampling is then skipped instead of asking the sampler
        # to generate nothing, a request it may reject.
        n_to_generate = int(oversample_target * count_max - count_min)
        if n_to_generate > 0:
            oversample = RandomOverSampler(sampling_strategy=oversample_target, random_state=1)
            X_resampled, y_resampled = oversample.fit_resample(X_resampled, y_resampled)

    # In every remaining case the majority class is undersampled
    undersample = RandomUnderSampler(sampling_strategy='majority', random_state=1)
    X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled


In [None]:
def impute_missing(data, n_neighbors=3):
    """
    Fill the missing values of a DataFrame with K-nearest-neighbours imputation.

    A fresh KNNImputer is fitted on ``data`` and applied to it; the result is
    wrapped back into a DataFrame carrying the input's column labels and index.

    Parameters:
        data (pd.DataFrame): Input DataFrame with missing values.
        n_neighbors (int, optional): Number of neighbors used for imputation. Defaults to 3.

    Returns:
        pd.DataFrame: Imputed copy of ``data`` with the same columns and index.
    """
    knn = KNNImputer(n_neighbors=n_neighbors)

    # fit_transform returns a plain array, so the labels are restored explicitly
    return pd.DataFrame(
        knn.fit_transform(data),
        index=data.index,
        columns=data.columns,
    )


In [None]:
from sklearn.metrics import precision_score

def evaluate_cv(X_train, y_train):
    '''
    Cross-validates every classifier in ``algorithms`` and summarises 5 metrics.

    The data are split with the module-level ``kf`` splitter. In each fold every
    classifier is fitted on the training part and scored on the held-out part.
    The features are used exactly as received: this function does NOT resample,
    impute or standardise them.

    NOTE(review): unlike evaluate_external, no sampling / imputation / scaling
    happens here, so the inputs presumably have to be preprocessed beforehand -
    confirm. If such steps are added, fit them on each fold's training part only.

    The estimators stored in ``algorithms`` are fitted in place (they are left
    fitted on the last fold). The table is displayed and the elapsed time printed.

    Parameters:
    X_train : DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y_train : Series
        Target variable aligned with ``X_train``. Metrics are computed for the
        labels 0 and 1, with 1 treated as the positive class.

    Returns:
    df : pandas Styler
        Captioned table with one column per algorithm and, for each metric, a
        '(mean)' and a '(stdev)' row over the folds. The metrics are auc,
        rcl_1 (sensitivity), rcl_0 (specificity), prc_1 (positive precision)
        and prc_0 (negative precision).
    '''
    # Record the start time
    start_time = time.time()

    # Row labels of the result table, in display order
    metric_names = ['auc', 'rcl_1', 'rcl_0', 'prc_1', 'prc_0']

    # scores[metric][algorithm] -> list with one value per fold
    scores = {metric: {name: [] for name in algorithms} for metric in metric_names}

    # Iterate through each round of the cross-validation
    for train, test in kf.split(X_train, y_train):
        # Allocate train and test data
        X_train_fold, X_test_fold = X_train.iloc[train], X_train.iloc[test]
        y_train_fold, y_test_fold = y_train.iloc[train], y_train.iloc[test]

        # Iterate through each algorithm
        for algorithm, clf in algorithms.items():
            clf.fit(X_train_fold, y_train_fold)

            # Make predictions for the test data
            y_pred = clf.predict(X_test_fold)

            # Per-class recall: index 1 is sensitivity, index 0 is specificity
            recall = recall_score(y_test_fold, y_pred, labels=[0, 1], average=None)
            scores['rcl_1'][algorithm].append(recall[1])
            scores['rcl_0'][algorithm].append(recall[0])

            # Per-class precision
            precision = precision_score(y_test_fold, y_pred, labels=[0, 1], average=None)
            scores['prc_0'][algorithm].append(precision[0])
            scores['prc_1'][algorithm].append(precision[1])

            # Area under the ROC curve, from the probability of the second class
            positive_proba = clf.predict_proba(X_test_fold)[:, 1]
            scores['auc'][algorithm].append(roc_auc_score(y_test_fold, positive_proba))

    # Mean and standard deviation per metric; columns and values both follow
    # `algorithms`, so the table stays aligned when algorithms are added or removed
    df = pd.DataFrame(columns=list(algorithms.keys()))
    for metric in metric_names:
        df.loc[f'{metric} (mean)'] = [np.mean(scores[metric][name]) for name in algorithms]
        df.loc[f'{metric} (stdev)'] = [np.std(scores[metric][name]) for name in algorithms]

    # Set caption for DataFrame (this turns it into a Styler)
    df = df.style.set_caption('Average performance and standard deviation among 5-fold cross-validation')

    # Record the end time
    end_time = time.time()

    # Calculate the time taken
    total_time = end_time - start_time

    # Display the table
    display(df)

    # Print the total time taken to run cross-validation
    print(f"Total time taken to run cross-validation: {total_time:.2f} seconds")

    return df

In [None]:
from sklearn.metrics import precision_score

def evaluate_external(data, data_test): 
    '''
    Trains every classifier in ``algorithms`` on ``data`` and scores it on
    ``data_test`` (external validation), using 5 metrics.

    Steps, in order:
      1. the last column of ``data`` is taken as the target; the column with the
         same name is taken as the target of ``data_test``;
      2. the training data are rebalanced with ``data_sample``;
      3. missing values are imputed with the module-level ``imputer``
         (fitted on the training data, applied to the test data);
      4. features are standardised with the module-level ``preprocessing``
         (fitted on the training data, applied to the test data);
      5. each classifier is fitted and evaluated on the test data.

    The module-level ``imputer``, ``preprocessing`` and the estimators stored in
    ``algorithms`` are fitted in place. The table is displayed and the elapsed
    time printed.

    Parameters:
    data : DataFrame
        The training dataset; features followed by the target as last column.
    data_test : DataFrame
        The test dataset; must contain the same target column name as ``data``.

    Returns:
    df : pandas Styler
        Captioned table with one column per algorithm and one row per metric:
        auc, rcl_1 (sensitivity), rcl_0 (specificity), prc_1 (positive
        precision) and prc_0 (negative precision).
    '''
    # Record the start time
    start_time = time.time()

    # Identify the target column
    target_feature = data.columns[-1]

    # Separate features (X) and target (y) for training data
    X = data.drop(columns=[target_feature])
    y = data[target_feature]

    # Separate features (X) and target (y) for test data
    X_test = data_test.drop(columns=[target_feature])
    y_test = data_test[target_feature]

    # Apply over-under sampling to training data only
    X_train, y_train = data_sample(X, y)

    # Impute missing values: fit on train, reuse the fitted imputer on test
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardise: fit on train, reuse the fitted scaler on test
    X_train = preprocessing.fit_transform(X_train)
    X_test = preprocessing.transform(X_test)

    # Row labels of the result table, in display order
    metric_names = ['auc', 'rcl_1', 'rcl_0', 'prc_1', 'prc_0']

    # scores[metric][algorithm] -> single value (there is only one test set)
    scores = {metric: {} for metric in metric_names}

    # For each algorithm
    for algorithm, clf in algorithms.items():
        # Train model
        clf.fit(X_train, y_train)

        # Make predictions for the test data
        y_pred = clf.predict(X_test)

        # Per-class recall: index 1 is sensitivity, index 0 is specificity
        recall = recall_score(y_test, y_pred, labels=[0, 1], average=None)
        scores['rcl_1'][algorithm] = recall[1]
        scores['rcl_0'][algorithm] = recall[0]

        # Per-class precision
        precision = precision_score(y_test, y_pred, labels=[0, 1], average=None)
        scores['prc_0'][algorithm] = precision[0]
        scores['prc_1'][algorithm] = precision[1]

        # Area under the ROC curve, from the probability of the second class
        scores['auc'][algorithm] = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    # Columns and values both follow `algorithms`, so the table stays aligned
    # when algorithms are added or removed
    df = pd.DataFrame(columns=list(algorithms.keys()))
    for metric in metric_names:
        df.loc[metric] = [scores[metric][name] for name in algorithms]

    # Set caption for DataFrame (this turns it into a Styler)
    df = df.style.set_caption('Performance for external validation')

    # Record the end time
    end_time = time.time()

    # Calculate the time taken
    total_time = end_time - start_time

    # Display the table
    display(df)

    # Print the total time taken to run external-validation
    print(f"Total time taken to run external-validation: {total_time:.2f} seconds")

    return df


In [None]:
# Training dataset; evaluate_external treats its last column as the target.
data = pd.read_csv('death_dengue_23.csv')


In [None]:
# evaluate_cv takes the features and the target separately. As in
# evaluate_external, the target is the last column of `data`.
# NOTE(review): evaluate_cv does not impute or scale its inputs - confirm that
# `data` needs no such preprocessing before this call.
evaluate_cv(data.iloc[:, :-1], data.iloc[:, -1])



Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc (mean),0.868875,0.863518,0.867892,0.856056,0.866408,0.828921,0.83882
auc (stdev),0.012221,0.012315,0.011804,0.015567,0.012723,0.007793,0.007771
rcl_1 (mean),0.811389,0.82223,0.836076,0.81294,0.802135,0.7465,0.795909
rcl_1 (stdev),0.017861,0.0261,0.034744,0.034288,0.021389,0.027162,0.040302
rcl_0 (mean),0.766919,0.744366,0.751807,0.761277,0.772597,0.763146,0.738706
rcl_0 (stdev),0.025435,0.026123,0.034013,0.041539,0.029652,0.028837,0.021904
prc_1 (mean),0.809419,0.796775,0.804427,0.806676,0.811704,0.793789,0.787572
prc_1 (stdev),0.013972,0.015099,0.018515,0.023945,0.016428,0.015772,0.011679
prc_0 (mean),0.770246,0.775702,0.79208,0.771261,0.763174,0.712966,0.750382
prc_0 (stdev),0.011471,0.023467,0.033217,0.027854,0.012732,0.017508,0.032924


Total time taken to run cross-validation: 109.09 seconds


Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc (mean),0.868875,0.863518,0.867892,0.856056,0.866408,0.828921,0.83882
auc (stdev),0.012221,0.012315,0.011804,0.015567,0.012723,0.007793,0.007771
rcl_1 (mean),0.811389,0.82223,0.836076,0.81294,0.802135,0.7465,0.795909
rcl_1 (stdev),0.017861,0.0261,0.034744,0.034288,0.021389,0.027162,0.040302
rcl_0 (mean),0.766919,0.744366,0.751807,0.761277,0.772597,0.763146,0.738706
rcl_0 (stdev),0.025435,0.026123,0.034013,0.041539,0.029652,0.028837,0.021904
prc_1 (mean),0.809419,0.796775,0.804427,0.806676,0.811704,0.793789,0.787572
prc_1 (stdev),0.013972,0.015099,0.018515,0.023945,0.016428,0.015772,0.011679
prc_0 (mean),0.770246,0.775702,0.79208,0.771261,0.763174,0.712966,0.750382
prc_0 (stdev),0.011471,0.023467,0.033217,0.027854,0.012732,0.017508,0.032924


In [None]:
# Held-out dataset for external validation; it must contain the target column
# of `data` (evaluate_external looks it up by that column's name).
data_test = pd.read_csv('death_dengue_24.csv')
# Fit on `data`, score on `data_test`; the captioned table is displayed inside
# the function and is also returned as the cell's value.
evaluate_external(data, data_test)



Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc,0.848675,0.838825,0.839433,0.840177,0.852099,0.7747,0.800839
rcl_1,0.768559,0.79476,0.799127,0.781659,0.746725,0.641921,0.729258
rcl_0,0.784091,0.732955,0.738636,0.761364,0.784091,0.761364,0.755682
prc_1,0.82243,0.79476,0.799127,0.809955,0.818182,0.777778,0.795238
prc_0,0.722513,0.732955,0.738636,0.728261,0.704082,0.62037,0.682051


Total time taken to run external-validation: 10.42 seconds


Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc,0.848675,0.838825,0.839433,0.840177,0.852099,0.7747,0.800839
rcl_1,0.768559,0.79476,0.799127,0.781659,0.746725,0.641921,0.729258
rcl_0,0.784091,0.732955,0.738636,0.761364,0.784091,0.761364,0.755682
prc_1,0.82243,0.79476,0.799127,0.809955,0.818182,0.777778,0.795238
prc_0,0.722513,0.732955,0.738636,0.728261,0.704082,0.62037,0.682051
