In [13]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from scipy import stats

warnings.filterwarnings("ignore", category=RuntimeWarning)

data = pd.read_csv('/content/diabetes_data_upload.csv')


X = data.drop(columns=['class'])
y = data['class'].map({'Positive': 1, 'Negative': 0})

X_encoded = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }

# Display results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  Accuracy: {metrics['Accuracy']:.2f}")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  Confusion Matrix:\n{metrics['Confusion Matrix']}\n")

for model_name, metrics in results.items():
    tn, fp, fn, tp = metrics['Confusion Matrix'].ravel()
    type_ii_error_rate = fn / (fn + tp)
    print(f"{model_name} Type II Error Rate: {type_ii_error_rate:.2f}")

correctly_classified = data[data['class'] == 'Positive']
misclassified = data[data['class'] == 'Negative']

mean_age_correct = correctly_classified['Age'].mean()
mean_age_misclassified = misclassified['Age'].mean()

z_stat, p_value = stats.ttest_ind(correctly_classified['Age'], misclassified['Age'], equal_var=False)
print(f"Z-Test: Z-statistic = {z_stat:.2f}, p-value = {p_value:.4f}")

# Random Forest analysis for Type I error
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
type_i_error_rate = fp / (fp + tn)
print(f"Random Forest Type I Error Rate: {type_i_error_rate:.2f}")

if type_i_error_rate > 0.20:
    z_stat_i, p_value_i = stats.ztest([type_i_error_rate], value=0.20)
    print(f"One-Sample Z-Test for Type I Error: Z-statistic = {z_stat_i:.2f}, p-value = {p_value_i:.4f}")

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    type_ii_error_rate = fn / (fn + tp)
    print(f"{model_name} Type II Error Rate: {type_ii_error_rate:.2f}")

logistic_fn = confusion_matrix(y_test, models['Logistic Regression'].predict(X_test)).ravel()[2]
rf_fn = confusion_matrix(y_test, models['Random Forest'].predict(X_test)).ravel()[2]

z_stat_fn, p_value_fn = stats.ttest_ind([logistic_fn], [rf_fn], equal_var=False)
print(f"Z-Test for Type II Error Rates: Z-statistic = {z_stat_fn:.2f}, p-value = {p_value_fn:.4f}")


Logistic Regression:
  Accuracy: 0.92
  Precision: 0.93
  Recall: 0.96
  Confusion Matrix:
[[28  5]
 [ 3 68]]

Decision Tree:
  Accuracy: 0.95
  Precision: 1.00
  Recall: 0.93
  Confusion Matrix:
[[33  0]
 [ 5 66]]

Random Forest:
  Accuracy: 0.99
  Precision: 1.00
  Recall: 0.99
  Confusion Matrix:
[[33  0]
 [ 1 70]]

Gradient Boosting:
  Accuracy: 0.97
  Precision: 1.00
  Recall: 0.96
  Confusion Matrix:
[[33  0]
 [ 3 68]]

Logistic Regression Type II Error Rate: 0.04
Decision Tree Type II Error Rate: 0.07
Random Forest Type II Error Rate: 0.01
Gradient Boosting Type II Error Rate: 0.04
Z-Test: Z-statistic = 2.49, p-value = 0.0132
Random Forest Type I Error Rate: 0.00
Logistic Regression Type II Error Rate: 0.04
Decision Tree Type II Error Rate: 0.07
Random Forest Type II Error Rate: 0.01
Gradient Boosting Type II Error Rate: 0.04
Z-Test for Type II Error Rates: Z-statistic = nan, p-value = nan
