In [1]:
import zipfile
import os

# Path to your zip file
zip_path = "/content/drive/MyDrive/Early Stage Diabetes Risk Prediction Dataset.zip"

# Destination folder to extract files
extract_path = "/content/Early Stage Diabetes Risk Prediction Dataset"

# Ensure the extraction directory exists
os.makedirs(extract_path, exist_ok=True)

# Unzipping the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")


Dataset extracted successfully!


1.Train Logistic Regression, Decision Tree, and Random Forest models on the dataset. Compare their accuracy, precision, and recall.


In [2]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix


# Load dataset (assuming CSV file is extracted)
csv_path = os.path.join(extract_path, "/content/Early Stage Diabetes Risk Prediction Dataset/diabetes_data_upload.csv")  # Adjust filename if needed
df = pd.read_csv(csv_path)

# Encode categorical variables
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Splitting data
X = df.drop(columns=['class'])  # Assuming 'class' is the target column
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Train and evaluate models
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    fn_rate = cm[1][0] / (cm[1][0] + cm[1][1])  # False Negative Rate (Type II Error)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "False Negative Rate": fn_rate
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="Accuracy", ascending=False))


                 Model  Accuracy  Precision    Recall  False Negative Rate
2        Random Forest  0.990385   1.000000  0.985915             0.014085
1        Decision Tree  0.961538   1.000000  0.943662             0.056338
0  Logistic Regression  0.923077   0.931507  0.957746             0.042254


2.Train a Logistic Regression model and perform a Z-Test on the mean age of correctly classified vs. misclassified diabetic patients.

In [4]:
import pandas as pd
import numpy as np
import zipfile
import os
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


# Load dataset (assuming CSV file is extracted)
csv_path = os.path.join(extract_path, "/content/Early Stage Diabetes Risk Prediction Dataset/diabetes_data_upload.csv")  # Adjust filename if needed
df = pd.read_csv(csv_path)

# Encode categorical variables
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Age':  # Keep 'Age' as numeric
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Splitting data
X = df.drop(columns=['class'])  # Assuming 'class' is the target column
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardizing features (excluding Age for interpretation purposes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=["Age"]))
X_test_scaled = scaler.transform(X_test.drop(columns=["Age"]))

# Convert back to DataFrame and restore Age column
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:], index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:], index=X_test.index)

# Reattach the Age column
X_train.insert(0, "Age", df.loc[X_train.index, "Age"])
X_test.insert(0, "Age", df.loc[X_test.index, "Age"])

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Get correctly and incorrectly classified diabetic patients
test_results = pd.DataFrame({
    "Age": X_test["Age"].values,
    "True_Label": y_test.values,
    "Predicted_Label": y_pred
})

test_results_diabetic = test_results[test_results["True_Label"] == 1]
correctly_classified = test_results_diabetic[test_results_diabetic["True_Label"] == test_results_diabetic["Predicted_Label"]]
misclassified = test_results_diabetic[test_results_diabetic["True_Label"] != test_results_diabetic["Predicted_Label"]]

# Perform Z-Test on mean age of correctly vs. misclassified diabetic patients
z_stat, p_value = stats.ttest_ind(correctly_classified["Age"], misclassified["Age"], equal_var=False)

# Print results
print(f"Z-Statistic: {z_stat}, P-Value: {p_value}")
if p_value < 0.05:
    print("Significant difference in mean age. Age is likely an important feature.")
else:
    print("No significant difference in mean age. Age may not be a strong factor.")

Z-Statistic: 5.448854897317102, P-Value: 1.2770382292057548e-05
Significant difference in mean age. Age is likely an important feature.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3.Train a Random Forest model and analyze its false positive rate (Type I error).

In [5]:
import pandas as pd
import numpy as np
import zipfile
import os
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


# Load dataset (assuming CSV file is extracted)
csv_path = os.path.join(extract_path, "/content/Early Stage Diabetes Risk Prediction Dataset/diabetes_data_upload.csv")  # Adjust filename if needed
df = pd.read_csv(csv_path)

# Encode categorical variables
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Age':  # Keep 'Age' as numeric
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Splitting data
X = df.drop(columns=['class'])  # Assuming 'class' is the target column
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features (excluding Age for interpretation purposes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=["Age"]))
X_test_scaled = scaler.transform(X_test.drop(columns=["Age"]))

# Convert back to DataFrame and restore Age column
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:], index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:], index=X_test.index)
X_train.insert(0, "Age", df.loc[X_train.index, "Age"])
X_test.insert(0, "Age", df.loc[X_test.index, "Age"])

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Compute False Positive Rate (Type I Error)
cm_rf = confusion_matrix(y_test, y_pred_rf)
fp_rate_rf = cm_rf[0][1] / (cm_rf[0][0] + cm_rf[0][1])
print(f"Random Forest False Positive Rate: {fp_rate_rf:.4f}")

# If False Positive Rate > 20%, perform a One-Sample Z-Test
alpha = 0.05  # Significance level
threshold_fp_rate = 0.20
if fp_rate_rf > threshold_fp_rate:
    sample_size = cm_rf[0][0] + cm_rf[0][1]  # Total non-diabetic samples
    z_stat, p_value = stats.ttest_1samp([fp_rate_rf] * sample_size, threshold_fp_rate)
    print(f"Z-Statistic: {z_stat}, P-Value: {p_value}")
    if p_value < alpha:
        print("False positive rate is significantly different from 20%.")
    else:
        print("False positive rate is not significantly different from 20%.")

# Train Gradient Boosting Model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Compute False Positive Rate for Gradient Boosting
cm_gb = confusion_matrix(y_test, y_pred_gb)
fp_rate_gb = cm_gb[0][1] / (cm_gb[0][0] + cm_gb[0][1])
print(f"Gradient Boosting False Positive Rate: {fp_rate_gb:.4f}")

# Compare Models
if fp_rate_rf < fp_rate_gb:
    print("Random Forest has a lower Type I error rate.")
elif fp_rate_gb < fp_rate_rf:
    print("Gradient Boosting has a lower Type I error rate.")
else:
    print("Both models have the same Type I error rate.")

Random Forest False Positive Rate: 0.0000
Gradient Boosting False Positive Rate: 0.0000
Both models have the same Type I error rate.


4.Compare the false negative rates (Type II errors) of SVM, KNN, and Logistic Regression models.

In [6]:
import pandas as pd
import numpy as np
import zipfile
import os
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


# Load dataset (assuming CSV file is extracted)
csv_path = os.path.join(extract_path, "/content/Early Stage Diabetes Risk Prediction Dataset/diabetes_data_upload.csv")  # Adjust filename if needed
df = pd.read_csv(csv_path)

# Encode categorical variables
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Age':  # Keep 'Age' as numeric
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Splitting data
X = df.drop(columns=['class'])  # Assuming 'class' is the target column
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features (excluding Age for interpretation purposes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=["Age"]))
X_test_scaled = scaler.transform(X_test.drop(columns=["Age"]))

# Convert back to DataFrame and restore Age column
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:], index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:], index=X_test.index)
X_train.insert(0, "Age", df.loc[X_train.index, "Age"])
X_test.insert(0, "Age", df.loc[X_test.index, "Age"])

# Train SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Train KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Train Logistic Regression model
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

# Compute False Negative Rates (Type II Error)
def compute_false_negative_rate(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    fn_rate = cm[1][0] / (cm[1][0] + cm[1][1])
    return fn_rate, cm

fn_rate_svm, cm_svm = compute_false_negative_rate(y_test, y_pred_svm)
fn_rate_knn, cm_knn = compute_false_negative_rate(y_test, y_pred_knn)
fn_rate_logreg, cm_logreg = compute_false_negative_rate(y_test, y_pred_logreg)

print(f"SVM False Negative Rate: {fn_rate_svm:.4f}")
print(f"KNN False Negative Rate: {fn_rate_knn:.4f}")
print(f"Logistic Regression False Negative Rate: {fn_rate_logreg:.4f}")

# Perform Z-Test to compare false negative rates
alpha = 0.05  # Significance level
def z_test(fn_rate1, fn_rate2, n):
    prop_diff = fn_rate1 - fn_rate2
    std_error = np.sqrt((fn_rate1 * (1 - fn_rate1) / n) + (fn_rate2 * (1 - fn_rate2) / n))
    z_stat = prop_diff / std_error
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))  # Two-tailed test
    return z_stat, p_value

# Assuming equal sample size for both groups
n_samples = len(y_test) // 2  # Approximate number of diabetic cases

z_svm_knn, p_svm_knn = z_test(fn_rate_svm, fn_rate_knn, n_samples)
z_svm_logreg, p_svm_logreg = z_test(fn_rate_svm, fn_rate_logreg, n_samples)
z_knn_logreg, p_knn_logreg = z_test(fn_rate_knn, fn_rate_logreg, n_samples)

print(f"Z-Test (SVM vs KNN): Z-Statistic = {z_svm_knn:.4f}, P-Value = {p_svm_knn:.4f}")
print(f"Z-Test (SVM vs Logistic Regression): Z-Statistic = {z_svm_logreg:.4f}, P-Value = {p_svm_logreg:.4f}")
print(f"Z-Test (KNN vs Logistic Regression): Z-Statistic = {z_knn_logreg:.4f}, P-Value = {p_knn_logreg:.4f}")

# Determine the best model based on Type II error
best_model = min([(fn_rate_svm, "SVM"), (fn_rate_knn, "KNN"), (fn_rate_logreg, "Logistic Regression")])[1]
print(f"The model with the lowest False Negative Rate is: {best_model}")

# Recommendation for real-world deployment
if fn_rate_logreg <= fn_rate_svm and fn_rate_logreg <= fn_rate_knn:
    print("Logistic Regression is recommended for real-world deployment due to its lower False Negative Rate, reducing the risk of undiagnosed diabetes cases.")
else:
    print(f"{best_model} is recommended for real-world deployment due to its lower False Negative Rate.")


SVM False Negative Rate: 0.0845
KNN False Negative Rate: 0.1549
Logistic Regression False Negative Rate: 0.0423
Z-Test (SVM vs KNN): Z-Statistic = -1.1127, P-Value = 0.2658
Z-Test (SVM vs Logistic Regression): Z-Statistic = 0.8876, P-Value = 0.3747
Z-Test (KNN vs Logistic Regression): Z-Statistic = 1.9626, P-Value = 0.0497
The model with the lowest False Negative Rate is: Logistic Regression
Logistic Regression is recommended for real-world deployment due to its lower False Negative Rate, reducing the risk of undiagnosed diabetes cases.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


5.Train a Gradient Boosting Model and examine the misclassification of diabetic patients.


In [7]:
import pandas as pd
import numpy as np
import zipfile
import os
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load dataset (assuming CSV file is extracted)
csv_path = os.path.join(extract_path, "/content/Early Stage Diabetes Risk Prediction Dataset/diabetes_data_upload.csv")  # Adjust filename if needed
df = pd.read_csv(csv_path)

# Encode categorical variables
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Age':  # Keep 'Age' as numeric
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Splitting data
X = df.drop(columns=['class'])  # Assuming 'class' is the target column
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features (excluding Age for interpretation purposes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=["Age"]))
X_test_scaled = scaler.transform(X_test.drop(columns=["Age"]))

# Convert back to DataFrame and restore Age column
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:], index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:], index=X_test.index)
X_train.insert(0, "Age", df.loc[X_train.index, "Age"])
X_test.insert(0, "Age", df.loc[X_test.index, "Age"])

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Compute False Negative Rates (Type II Error)
def compute_false_negative_rate(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    fn_rate = cm[1][0] / (cm[1][0] + cm[1][1])
    return fn_rate, cm

fn_rate_gb, cm_gb = compute_false_negative_rate(y_test, y_pred_gb)
fn_rate_rf, cm_rf = compute_false_negative_rate(y_test, y_pred_rf)

print(f"Gradient Boosting False Negative Rate: {fn_rate_gb:.4f}")
print(f"Random Forest False Negative Rate: {fn_rate_rf:.4f}")

# Perform Z-Test to compare misclassified ages
misclassified_gb = X_test[(y_test != y_pred_gb)]['Age']
correctly_classified_gb = X_test[(y_test == y_pred_gb)]['Age']

z_stat, p_value = stats.ttest_ind(misclassified_gb, correctly_classified_gb, equal_var=False)

print(f"Z-Test for Age (Gradient Boosting Misclassified vs Correctly Classified): Z-Statistic = {z_stat:.4f}, P-Value = {p_value:.4f}")

# Recommendation based on Type II errors
best_model = "Gradient Boosting" if fn_rate_gb < fn_rate_rf else "Random Forest"
print(f"The model with the lowest False Negative Rate is: {best_model}")

if fn_rate_gb < fn_rate_rf:
    print("Gradient Boosting is recommended for medical use due to its lower False Negative Rate, reducing the risk of undiagnosed diabetes cases.")
else:
    print("Random Forest is recommended for medical use due to its lower False Negative Rate, reducing the risk of undiagnosed diabetes cases.")

Gradient Boosting False Negative Rate: 0.0423
Random Forest False Negative Rate: 0.0141
Z-Test for Age (Gradient Boosting Misclassified vs Correctly Classified): Z-Statistic = -1.8681, P-Value = 0.1928
The model with the lowest False Negative Rate is: Random Forest
Random Forest is recommended for medical use due to its lower False Negative Rate, reducing the risk of undiagnosed diabetes cases.


6.Train three different models (e.g., Logistic Regression, SVM, Random Forest) and compare their Type I and Type II error rates.

In [9]:
import pandas as pd
import numpy as np
import zipfile
import os
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


# Load dataset (assuming CSV file is extracted)
csv_path = os.path.join(extract_path, "/content/Early Stage Diabetes Risk Prediction Dataset/diabetes_data_upload.csv")  # Adjust filename if needed
df = pd.read_csv(csv_path)

# Encode categorical variables
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Age':  # Keep 'Age' as numeric
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Splitting data
X = df.drop(columns=['class'])  # Assuming 'class' is the target column
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features (excluding Age for interpretation purposes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=["Age"]))
X_test_scaled = scaler.transform(X_test.drop(columns=["Age"]))

# Convert back to DataFrame and restore Age column
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:], index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:], index=X_test.index)
X_train.insert(0, "Age", df.loc[X_train.index, "Age"])
X_test.insert(0, "Age", df.loc[X_test.index, "Age"])

# Train models
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM": SVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    type_i_error = cm[0][1] / (cm[0][0] + cm[0][1])  # False positive rate
    type_ii_error = cm[1][0] / (cm[1][0] + cm[1][1])  # False negative rate

    results[name] = {
        "Type I Error": type_i_error,
        "Type II Error": type_ii_error,
        "False Positives": cm[0][1],
        "False Negatives": cm[1][0],
        "Total Positives": cm[0][0] + cm[0][1],
        "Total Negatives": cm[1][0] + cm[1][1]
    }

    print(f"{name} - Type I Error: {type_i_error:.4f}, Type II Error: {type_ii_error:.4f}")

# Perform Two-Proportion Z-Test for statistical significance
model_pairs = [("Logistic Regression", "SVM"), ("Logistic Regression", "Random Forest"), ("SVM", "Random Forest")]

for model1, model2 in model_pairs:
    for error_type in ["Type I Error", "Type II Error"]:
        count = np.array([results[model1]["False Positives" if error_type == "Type I Error" else "False Negatives"],
                          results[model2]["False Positives" if error_type == "Type I Error" else "False Negatives"]])

        nobs = np.array([results[model1]["Total Positives" if error_type == "Type I Error" else "Total Negatives"],
                         results[model2]["Total Positives" if error_type == "Type I Error" else "Total Negatives"]])

        if min(nobs) > 0:  # Avoid divide by zero error
            z_stat, p_value = proportions_ztest(count, nobs)
            print(f"Z-Test for {error_type} ({model1} vs {model2}): Z-Statistic = {z_stat:.4f}, P-Value = {p_value:.4f}")
        else:
            print(f"Z-Test for {error_type} ({model1} vs {model2}): Not enough data for statistical test.")

# Select the best model for medical use (minimizing Type II Error)
best_model = min(results, key=lambda x: results[x]["Type II Error"])
print(f"Recommended model for medical use: {best_model} due to lowest Type II Error.")


Logistic Regression - Type I Error: 0.1515, Type II Error: 0.0423
SVM - Type I Error: 0.3030, Type II Error: 0.0704
Random Forest - Type I Error: 0.0000, Type II Error: 0.0141
Z-Test for Type I Error (Logistic Regression vs SVM): Z-Statistic = -1.4686, P-Value = 0.1419
Z-Test for Type II Error (Logistic Regression vs SVM): Z-Statistic = -0.7279, P-Value = 0.4667
Z-Test for Type I Error (Logistic Regression vs Random Forest): Z-Statistic = 2.3259, P-Value = 0.0200
Z-Test for Type II Error (Logistic Regression vs Random Forest): Z-Statistic = 1.0144, P-Value = 0.3104
Z-Test for Type I Error (SVM vs Random Forest): Z-Statistic = 3.4330, P-Value = 0.0006
Z-Test for Type II Error (SVM vs Random Forest): Z-Statistic = 1.6686, P-Value = 0.0952
Recommended model for medical use: Random Forest due to lowest Type II Error.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
