In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.stats import norm, ttest_ind

In [25]:
import kagglehub
import pandas as pd
import os

path = kagglehub.dataset_download("ishandutta/early-stage-diabetes-risk-prediction-dataset")
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        csv_file_name = filename
        break
csv_file_path = os.path.join(path, csv_file_name)
print("CSV file path:", csv_file_path)
data = pd.read_csv(csv_file_path)
data.head()

CSV file path: /root/.cache/kagglehub/datasets/ishandutta/early-stage-diabetes-risk-prediction-dataset/versions/1/diabetes_data_upload.csv


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [26]:
data = pd.get_dummies(data, drop_first=True)

X = data.drop(columns=['class_Positive'])
y = data['class_Positive']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
    results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'FPR': fpr, 'FNR': fnr}

results_df = pd.DataFrame(results).T
print(results_df)

best_model_fnr = results_df['FNR'].idxmin()
print(f'Model with lowest FNR (Type II Error): {best_model_fnr}')

                     Accuracy  Precision    Recall    FPR       FNR
Logistic Regression  0.942308   0.983333  0.921875  0.025  0.078125
Decision Tree        0.980769   1.000000  0.968750  0.000  0.031250
Random Forest        0.971154   0.984127  0.968750  0.025  0.031250
SVM                  0.980769   0.984375  0.984375  0.025  0.015625
KNN                  0.932692   0.983051  0.906250  0.025  0.093750
Gradient Boosting    0.990385   1.000000  0.984375  0.000  0.015625
Model with lowest FNR (Type II Error): SVM


In [28]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
misclassified = X_test[y_test != y_pred_log]
correctly_classified = X_test[y_test == y_pred_log]

age_misclassified = misclassified[:, 0]
age_correct = correctly_classified[:, 0]

z_stat, p_value = ttest_ind(age_correct, age_misclassified, equal_var=False)
print(f'Z-Test for mean age difference: Z-Statistic = {z_stat}, P-Value = {p_value}')

Z-Test for mean age difference: Z-Statistic = 1.4810358680954627, P-Value = 0.18603116427333352


In [29]:
rf_fpr = results_df.loc['Random Forest', 'FPR']
if rf_fpr > 0.2:
    pop_mean = 0.2
    n = len(y_test)
    se = np.sqrt((pop_mean * (1 - pop_mean)) / n)
    z_stat_fpr = (rf_fpr - pop_mean) / se
    p_value_fpr = 2 * (1 - norm.cdf(abs(z_stat_fpr)))
    print(f'Z-Test for FPR > 20%: Z-Statistic = {z_stat_fpr}, P-Value = {p_value_fpr}')

fnr_svm = results_df.loc['SVM', 'FNR']
fnr_knn = results_df.loc['KNN', 'FNR']
n_svm = y_test.shape[0]
n_knn = y_test.shape[0]
se_fnr = np.sqrt((fnr_svm * (1 - fnr_svm) / n_svm) + (fnr_knn * (1 - fnr_knn) / n_knn))
z_stat_fnr = (fnr_svm - fnr_knn) / se_fnr
p_value_fnr = 2 * (1 - norm.cdf(abs(z_stat_fnr)))
print(f'Z-Test for FNR (SVM vs. KNN): Z-Statistic = {z_stat_fnr}, P-Value = {p_value_fnr}')

Z-Test for FNR (SVM vs. KNN): Z-Statistic = -2.5151608424234704, P-Value = 0.011897809031118811
