In [None]:
import pandas as pd

# Load the dataset from a CSV expected in the notebook's working directory
df = pd.read_csv("diabetes_result.csv")

# Print column dtypes / non-null counts, then show summary statistics.
# describe() is the last expression of the cell, so the notebook renders its result.
df.info()
df.describe()


# Rows: 768
# Target Variable: Outcome (1 = Diabetic, 0 = Non-Diabetic)
# Features: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age
# Issue Detected: The Glucose, BloodPressure, SkinThickness, Insulin, and BMI columns contain 0 values, which are biologically implausible and most likely represent missing data.


In [None]:
import matplotlib.pyplot as plt

# Gather the int64/float64 columns; the 'id' key column is left out when it exists
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_cols = numeric_cols.drop('id') if 'id' in numeric_cols else numeric_cols

# Grid geometry: three panels per row, as many rows as needed (ceiling division)
n_cols = 3
n_rows = -(-len(numeric_cols) // n_cols)

# One histogram panel per numeric feature
plt.figure(figsize=(18, 5 * n_rows))
for i, col in enumerate(numeric_cols, start=1):
    ax = plt.subplot(n_rows, n_cols, i)
    ax.hist(df[col], bins=20, edgecolor='black', color='skyblue')
    ax.set_title(f"{col} Distribution")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Handle missing value
import pandas as pd
import numpy as np

# Make a copy so the raw frame is left untouched
df_cleaned = df.copy()

# Columns to exclude from 0 → NaN replacement:
#   - Outcome:     0 is the non-diabetic class label
#   - Pregnancies: 0 is a legitimate count, not a missing reading
#   - id:          row identifier, excluded only when the column exists
exclude_cols = ['Outcome', 'Pregnancies']
if 'id' in df.columns:
    exclude_cols.append('id')

# Replace 0 with NaN in the remaining features, where 0 is treated as "not recorded"
cols_to_check = df_cleaned.columns.difference(exclude_cols)
df_cleaned[cols_to_check] = df_cleaned[cols_to_check].replace(0, np.nan)

# Count of missing values per column
missing_counts = df_cleaned.isnull().sum()

# Percentage of missing values per column
missing_percentage = (missing_counts / len(df_cleaned)) * 100

# Combine into one table
missing_summary = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing Percentage (%)': missing_percentage.round(2)
})

# Display the table
print("🔍 Missing Values Summary:")
print(missing_summary)



In [None]:
# Handle missing value

##############################
# The dataset has no missing values (Non-Null Count = 768 for all columns)
# However, biologically implausible values like 0 in Glucose, BloodPressure, SkinThickness, Insulin, and BMI may indicate missing data.

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset (only if not already loaded)
# df = pd.read_csv("diabetes_result.csv")

# Make a copy so the raw frame is left untouched
df_cleaned = df.copy()

# Define columns with biologically implausible zero values (to be treated as missing)
cols_with_invalid_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Replace 0 with NaN only in those columns (not in Pregnancies, where 0 is a valid count)
df_cleaned[cols_with_invalid_zeros] = df_cleaned[cols_with_invalid_zeros].replace(0, np.nan)

# Count missing values before imputation
missing_before = df_cleaned[cols_with_invalid_zeros].isnull().sum()

# Prepare data for imputation — exclude 'id' and 'Outcome'
# NOTE(review): this assumes an 'id' column exists, unlike the earlier cells that guard for it.
features_only = df_cleaned.drop(columns=['id', 'Outcome'])

# Standardize features so the KNN distance is not dominated by large-scale columns
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_only)

# Apply KNN Imputer
imputer = KNNImputer(n_neighbors=5)
features_imputed = imputer.fit_transform(features_scaled)

# Inverse transform to return to original scale
features_unscaled = scaler.inverse_transform(features_imputed)

# Only cells that were actually missing take the imputed value. Observed cells keep
# their original value exactly, instead of the scale -> inverse-scale round trip,
# which would add floating-point noise to data that was never missing
# (for example the integer-valued Pregnancies and Age columns).
was_missing = features_only.isna().to_numpy()
features_unscaled = np.where(was_missing, features_unscaled, features_only.to_numpy(dtype=float))
features_clean = pd.DataFrame(features_unscaled, columns=features_only.columns)

# Combine with 'id' and 'Outcome' (indexes are reset so rows line up positionally)
df_imputed_final = pd.concat([df_cleaned[['id', 'Outcome']].reset_index(drop=True), features_clean], axis=1)

# Recheck missing values after imputation
missing_after = df_imputed_final[cols_with_invalid_zeros].isnull().sum()

# Summary stats before and after imputation
# ("before" is taken from the raw frame, so the placeholder zeros are still included)
summary_before = df[cols_with_invalid_zeros].describe()
summary_after = df_imputed_final[cols_with_invalid_zeros].describe()

# Create a comparison DataFrame for missing values
missing_summary = pd.DataFrame({
    "Missing Before Imputation": missing_before,
    "Missing After Imputation": missing_after
})
print("Missing Value Summary:")
print(missing_summary)

print("\nSummary Statistics Before Imputation:")
print(summary_before)

print("\nSummary Statistics After Imputation:")
print(summary_after)

import matplotlib.pyplot as plt

# Relies on df and df_imputed_final produced by the preceding steps

# Every column except the identifier and the target gets its own panel
features_to_plot = df.columns.difference(['id', 'Outcome'])

# Grid geometry: three panels per row, rows by ceiling division
n_features = len(features_to_plot)
n_cols = 3
n_rows = -(-n_features // n_cols)

plt.figure(figsize=(18, 5 * n_rows))

# Overlay the raw ("Before") and imputed ("After") distribution of each feature.
# NOTE(review): "Before" is drawn from df, not df_cleaned, so .dropna() only matters
# if df itself contains NaN — confirm which frame was intended.
for i, feature in enumerate(features_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, i)
    ax.hist(df[feature].dropna(), bins=20, alpha=0.5, label='Before', color='orange', edgecolor='black')
    ax.hist(df_imputed_final[feature], bins=20, alpha=0.5, label='After', color='green', edgecolor='black')
    ax.set_title(f"{feature} Distribution")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Outlier Handling with median

########################################
# Outlier handling on the KNN-imputed data (df_imputed_final)
# Exclude 'id', 'Outcome', and 'Age' from processing
def handle_outliers_iqr(df, exclude_cols=None, factor=1.5):
    """Replace IQR outliers in every numeric column with that column's median.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. Quartiles and the median are computed from the
    input column as given, i.e. before any replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    exclude_cols : list of column labels, optional
        Columns to leave untouched. ``None`` means no exclusions.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with outliers replaced. Non-numeric and boolean
        columns are passed through unchanged.
    """
    df_out = df.copy()
    if exclude_cols is None:
        exclude_cols = []

    for col in df.columns:
        if col in exclude_cols:
            continue

        series = df[col]
        # Quantiles are not meaningful for text, categorical or boolean columns,
        # so skip them instead of raising.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        outliers = (series < lower_bound) | (series > upper_bound)
        if not outliers.any():
            continue

        median = series.median()
        # A fractional median cannot be stored in an integer column; upcast
        # explicitly rather than relying on implicit upcasting during assignment.
        if pd.api.types.is_integer_dtype(series) and not float(median).is_integer():
            df_out[col] = df_out[col].astype(float)
        df_out.loc[outliers, col] = median
    return df_out

# Replace IQR outliers on the imputed data; identifier, target and Age are left as-is
df_outliers_handled = handle_outliers_iqr(
    df_imputed_final,
    exclude_cols=['id', 'Outcome', 'Age'],
)

# Feature-only summary statistics, before and after the replacement
summary_before_outliers = df_imputed_final.drop(columns=['id', 'Outcome']).describe()
print("\nSummary Statistics Before Outlier Handling:")
print(summary_before_outliers)

summary_after_outliers = df_outliers_handled.drop(columns=['id', 'Outcome']).describe()
print("\nSummary Statistics After Outlier Handling:")
print(summary_after_outliers)

import matplotlib.pyplot as plt

# One panel per feature (identifier and target are not plotted)
features_to_plot = df_imputed_final.columns.difference(['id', 'Outcome'])

# Grid geometry: three panels per row, rows by ceiling division
n_features = len(features_to_plot)
n_cols = 3
n_rows = -(-n_features // n_cols)

plt.figure(figsize=(18, 5 * n_rows))

# Overlay each feature's distribution before and after outlier handling
for i, feature in enumerate(features_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, i)
    ax.hist(df_imputed_final[feature], bins=20, alpha=0.5, label='Before', color='orange', edgecolor='black')
    ax.hist(df_outliers_handled[feature], bins=20, alpha=0.5, label='After', color='green', edgecolor='black')
    ax.set_title(f"{feature} Distribution")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# EDA

#####################################
import seaborn as sns
import matplotlib.pyplot as plt

# EDA works on its own copy; Pregnancies is rounded and cast to int so it stays discrete
df_eda_ready = df_outliers_handled.assign(
    Pregnancies=lambda frame: frame['Pregnancies'].round().astype(int)
)

# Class balance of the target
plt.figure(figsize=(15, 10))
sns.countplot(x='Outcome', data=df_eda_ready)
plt.title("Class Distribution: Diabetes Outcome")
plt.show()

# Pairwise correlations of all columns except the identifier
plt.figure(figsize=(12, 8))
correlation_matrix = df_eda_ready.drop(columns=['id']).corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# Scatter matrix (lower triangle only), coloured by class
sns.pairplot(df_eda_ready.drop(columns='id'), hue='Outcome', corner=True)
plt.suptitle("Pairwise Feature Relationships", y=1.02)
plt.show()

# One boxplot per feature, split by class.
# The 5 x 2 grid is fixed, so it holds at most ten features.
features = df_eda_ready.columns.drop(['id', 'Outcome'])
plt.figure(figsize=(20, 25))
for i, col in enumerate(features):
    ax = plt.subplot(5, 2, i + 1)
    sns.boxplot(x='Outcome', y=col, data=df_eda_ready, ax=ax)
    ax.set_title(f"{col} by Diabetes Outcome")
plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import f_oneway
import pandas as pd

# Feature columns and the two outcome groups being compared
features = df_eda_ready.columns.drop(['id', 'Outcome'])
group0 = df_eda_ready[df_eda_ready['Outcome'] == 0]
group1 = df_eda_ready[df_eda_ready['Outcome'] == 1]

# One-way ANOVA per feature: non-diabetic vs. diabetic rows
results = []

for feature in features:
    stat, p = f_oneway(group0[feature], group1[feature])

    # Star notation: *** p<0.001, ** p<0.01, * p<0.05, otherwise not significant
    significance = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'

    results.append({
        'Feature': feature,
        'F-Statistic': round(stat, 3),
        'p-Value': round(p, 6),
        'Significance': significance
    })

# Order by (rounded) p-value ascending; ties are broken by the larger F-statistic
anova_df = pd.DataFrame(results).sort_values(
    by=['p-Value', 'F-Statistic'], ascending=[True, False]
)

# Display as a plain text table
print("=== Feature-wise ANOVA Results ===")
print(anova_df.to_string(index=False))


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Design matrix: all features plus an intercept column, which the VIF regressions need
X = add_constant(df_eda_ready.drop(columns=['id', 'Outcome']))

# One variance inflation factor per design-matrix column (the constant included)
vif_df = pd.DataFrame({
    "Feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})

print(vif_df.sort_values(by='VIF', ascending=False))


In [None]:
# Feature Engineering (intraction + categorization + log transform)

############################################################

# Start from the outlier-handled data. Note that this rebinds the name `df`,
# which until now referred to the raw CSV contents.
# assign() returns a new frame, so df_outliers_handled itself is not modified.
df = df_outliers_handled.assign(
    # Interaction features (pairwise products)
    Glucose_Insulin=lambda d: d['Glucose'] * d['Insulin'],
    Glucose_BMI=lambda d: d['Glucose'] * d['BMI'],
    Insulin_BMI=lambda d: d['Insulin'] * d['BMI'],
    # Discretization into ordinal bins. Intervals are right-closed (pandas default),
    # so values outside the outer edges become NaN.
    Glucose_Bin=lambda d: pd.cut(d['Glucose'], bins=[0, 90, 125, 150, 200], labels=[0, 1, 2, 3]),
    BMI_Bin=lambda d: pd.cut(d['BMI'], bins=[0, 25, 30, 35, 50], labels=[0, 1, 2, 3]),
    Age_Bin=lambda d: pd.cut(d['Age'], bins=[20, 30, 40, 50, 70, 90], labels=[0, 1, 2, 3, 4]),
)


df.head()



In [None]:
# Histogram plot before log transformation

###################################################################
import matplotlib.pyplot as plt
import seaborn as sns

# Columns to visualize: raw features, the target, interaction terms and the bins
features_to_plot = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
    'Glucose_Insulin', 'Glucose_BMI', 'Insulin_BMI',
    'Glucose_Bin', 'BMI_Bin', 'Age_Bin'
]

# Grid geometry: four panels per row, rows by ceiling division
num_features = len(features_to_plot)
cols = 4
rows = -(-num_features // cols)

plt.figure(figsize=(20, 5 * rows))

# One histogram per feature
for i, feature in enumerate(features_to_plot, start=1):
    ax = plt.subplot(rows, cols, i)
    column = df[feature]

    if column.dtype.name == 'category':
        # Binned features are plotted through their integer codes, one bar per category
        sns.histplot(column.cat.codes, bins=len(column.cat.categories), color='skyblue', ax=ax)
    else:
        sns.histplot(column, bins=30, kde=False, color='steelblue', ax=ax)

    ax.set_title(f'{feature}')
    ax.set_xlabel('')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Log Transformation

###################################
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# Log-transform skewed features with log1p, i.e. log(x + 1), so zero inputs stay finite.
# Each entry maps the new column name to its source column.
for log_col, source_col in {
    'Log_Insulin': 'Insulin',
    'Log_DPF': 'DiabetesPedigreeFunction',
    'Log_Age': 'Age',
    'Log_Glucose_Insulin': 'Glucose_Insulin',
    'Log_Insulin_BMI': 'Insulin_BMI',
}.items():
    df[log_col] = np.log1p(df[source_col])

# -------------------------------
# Columns to visualize: raw features, target, interactions, log features and bins
features_to_plot = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
    'Glucose_Insulin', 'Glucose_BMI', 'Insulin_BMI',
    'Log_Insulin', 'Log_DPF', 'Log_Age','Log_Glucose_Insulin', 'Log_Insulin_BMI',
    'Glucose_Bin', 'BMI_Bin', 'Age_Bin'
]

# -------------------------------
# Grid geometry: four panels per row, rows by ceiling division
num_features = len(features_to_plot)
cols = 4
rows = -(-num_features // cols)

plt.figure(figsize=(22, 5 * rows))

# One histogram per feature
for i, feature in enumerate(features_to_plot, start=1):
    ax = plt.subplot(rows, cols, i)
    column = df[feature]

    if column.dtype.name == 'category':
        # Binned features are plotted through their integer codes, one bar per category
        sns.histplot(column.cat.codes, bins=len(column.cat.categories), color='skyblue', ax=ax)
    else:
        sns.histplot(column, bins=30, kde=False, color='steelblue', ax=ax)

    ax.set_title(f'{feature}')
    ax.set_xlabel('')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import f_oneway
import pandas as pd

# Rows of each outcome class
group0 = df[df['Outcome'] == 0]
group1 = df[df['Outcome'] == 1]

# Numeric columns only (the categorical *_Bin columns drop out here),
# minus the target and, when present, the identifier
features_to_test = (
    df.select_dtypes(include='number')
    .drop(columns=['Outcome', 'id'], errors='ignore')
    .columns
)

# One-way ANOVA per feature: non-diabetic vs. diabetic rows
anova_results = []

for feature in features_to_test:
    stat, p = f_oneway(group0[feature], group1[feature])

    # Star notation: *** p<0.001, ** p<0.01, * p<0.05, otherwise not significant
    sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'

    anova_results.append({
        'Feature': feature,
        'F-Statistic': round(stat, 3),
        'p-Value': round(p, 6),
        'Significance': sig
    })

# Rows are ordered by F-statistic, largest first
# (the printed header below calls this "Sorted by Significance")
anova_df = pd.DataFrame(anova_results)
anova_df = anova_df.sort_values(by='F-Statistic', ascending=False)

# Display
print("=== ANOVA Test Results (Sorted by Significance) ===")
print(anova_df.to_string(index=False))


In [None]:
# Standardization

####################################################
from sklearn.preprocessing import StandardScaler

# Drop the identifier. errors='ignore' keeps the cell re-runnable: after the first
# run `df` no longer has an 'id' column, and a plain drop would raise KeyError.
df = df.drop(columns=['id'], errors='ignore')

# Define features to standardize (Outcome and the *_Bin columns are left unscaled)
features_to_standardize = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age', 'Glucose_Insulin', 'Glucose_BMI', 'Insulin_BMI',
    'Log_Insulin', 'Log_DPF', 'Log_Age', 'Log_Glucose_Insulin', 'Log_Insulin_BMI'
]

# Initialize scaler
scaler = StandardScaler()

# Fit and transform on a copy, so `df` keeps the unscaled values.
# NOTE(review): the scaler is fitted on all rows, before any train/test split; if
# df_standardized feeds a model, fit the scaler on the training rows only.
df_standardized = df.copy()
df_standardized[features_to_standardize] = scaler.fit_transform(df_standardized[features_to_standardize])

# Now df_standardized has scaled features

df_standardized.head()



In [None]:
from sklearn.model_selection import train_test_split

# y = target
y = df_standardized['Outcome']

# X = every remaining column except the three binned features
X = df_standardized.drop(columns=['Outcome', 'Glucose_Bin', 'BMI_Bin', 'Age_Bin'])

# Stratified 80/20 hold-out split, done before any feature selection
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)



In [None]:
# Random forest model
#
# Pipeline: stratified split -> top-10 feature selection (training rows only) ->
# SMOTE on the training rows -> randomized hyperparameter search -> train/test evaluation.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Prepare dataset (assumes df is preloaded)
df = df.drop(columns='id', errors='ignore')
X_full = df.drop(columns=['Outcome', 'Glucose_Bin', 'BMI_Bin', 'Age_Bin'], errors='ignore')
y = df['Outcome']

# Step 0: Train-test split (stratified) on the full feature set.
# Splitting first keeps the test rows out of feature selection. The row partition
# depends only on the number of rows, y and random_state, so it is the same
# partition as splitting after selecting columns.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, stratify=y, random_state=42
)

# Step 1: Feature importance selection using Random Forest, fitted on training rows only
rf_initial = RandomForestClassifier(random_state=42)
rf_initial.fit(X_train_full, y_train)

feature_importances = pd.Series(rf_initial.feature_importances_, index=X_full.columns)
selected_features = feature_importances.sort_values(ascending=False).head(10).index.tolist()

print("=== Selected Top 10 Features by Random Forest ===")
print(selected_features)

X = X_full[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

# Step 2: SMOTE to balance training data (the test set is never resampled)
# NOTE(review): the search below cross-validates on already-resampled data, so
# synthetic points can sit in validation folds; an imblearn Pipeline would avoid that.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 3: Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'min_samples_split': [4, 6, 8],
    'min_samples_leaf': [2, 4, 6],
    'max_features': ['sqrt', 0.5]
}

rf_model = RandomForestClassifier(random_state=42)
rf_search = RandomizedSearchCV(
    rf_model, param_distributions=param_grid,
    n_iter=25, scoring='f1', cv=15, n_jobs=-1, verbose=0, random_state=42
)
rf_search.fit(X_train_res, y_train_res)
best_rf = rf_search.best_estimator_

# Displaying the best parameters found by RandomizedSearchCV
best_parameters = rf_search.best_params_
print("=== Best Hyperparameters Found ===")
print(best_parameters)

# Step 4: Evaluation on the (resampled) training set and the untouched test set
results = {}
y_pred_train = best_rf.predict(X_train_res)
y_pred_test = best_rf.predict(X_test)
y_proba_train = best_rf.predict_proba(X_train_res)[:, 1]
y_proba_test = best_rf.predict_proba(X_test)[:, 1]

for label, y_data, y_pred, y_proba in [
    ('Train', y_train_res, y_pred_train, y_proba_train),
    ('Test', y_test, y_pred_test, y_proba_test)
]:
    results[label] = {
        'Accuracy': accuracy_score(y_data, y_pred),
        'Precision': precision_score(y_data, y_pred),
        'Recall': recall_score(y_data, y_pred),
        'F1 Score': f1_score(y_data, y_pred),
        # Mean of the per-class recalls
        'Average Recall': (recall_score(y_data, y_pred, pos_label=1) + recall_score(y_data, y_pred, pos_label=0)) / 2,
        'AUC': roc_auc_score(y_data, y_proba),
        'FPR_TPR': roc_curve(y_data, y_proba)
    }

# === Plotting ===
# ROC Curves
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (label, res) in zip(axes, results.items()):
    fpr, tpr, _ = res['FPR_TPR']
    ax.plot(fpr, tpr, label=f'Random Forest ({label} AUC = {res["AUC"]:.2f})')
    ax.plot([0, 1], [0, 1], linestyle='--', color='orange')
    ax.set_title(f'Random Forest ROC Curve ({label} Set)')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend()
plt.tight_layout()
plt.show()

# === Confusion Matrices ===
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (label, y_true, y_pred) in zip(axes, [
    ('Train', y_train_res, y_pred_train),
    ('Test', y_test, y_pred_test)
]):
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'Random Forest Confusion Matrix ({label} Set)')
plt.tight_layout()
plt.show()


# === Results Summary ===
# FPR_TPR holds the raw ROC arrays and is dropped from the printed table
results_summary = pd.DataFrame(results).T.drop(columns='FPR_TPR').round(3)

print("=== Train vs Test Performance ===")
print(results_summary)


In [None]:
# Random forest model with categorical features

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
from sklearn.calibration import calibration_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Assume df is already loaded, cleaned and feature-engineered

# Include categorical bin features into feature set
categorical_features = ['Glucose_Bin', 'BMI_Bin', 'Age_Bin']

# Make sure categorical features are integers
# NOTE(review): astype(int) fails if any value fell outside the bin edges (NaN bin) — confirm
# that the binning covers the full value range.
for cat in categorical_features:
    df[cat] = df[cat].astype(int)

# Step 0: Full feature set including numerical + categorical
X_full = df.drop(columns=['Outcome'], errors='ignore')
y = df['Outcome']

# Step 1: Train-Test Split (stratified) on the full feature set.
# Splitting first keeps the test rows out of feature selection. The row partition
# depends only on the number of rows, y and random_state, so it is the same
# partition as splitting after selecting columns.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, stratify=y, random_state=42
)

# Step 2: Feature importance selection using Random Forest, fitted on training rows only
rf_initial = RandomForestClassifier(random_state=42)
rf_initial.fit(X_train_full, y_train)

feature_importances = pd.Series(rf_initial.feature_importances_, index=X_full.columns)
selected_features = feature_importances.sort_values(ascending=False).head(10).index.tolist()

print("=== Selected Top 10 Features by Random Forest ===")
print(selected_features)

# Step 3: Select only top 10 features
X = X_full[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

# Step 4: Apply SMOTE to balance training set (the test set is never resampled)
# NOTE(review): the search below cross-validates on already-resampled data, so
# synthetic points can sit in validation folds; an imblearn Pipeline would avoid that.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 5: Random Forest with RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'min_samples_split': [4, 6, 8],
    'min_samples_leaf': [2, 4, 6],
    'max_features': ['sqrt', 0.5],
    #'class_weight': ['balanced']
}

rf_model = RandomForestClassifier(random_state=42)
rf_search = RandomizedSearchCV(
    rf_model, param_distributions=param_grid,
    n_iter=25, scoring='f1', cv=15, n_jobs=-1, random_state=42
)
rf_search.fit(X_train_res, y_train_res)
best_rf = rf_search.best_estimator_

# Step 6: Evaluation on the (resampled) training set and the untouched test set
results = {}
y_pred_train = best_rf.predict(X_train_res)
y_pred_test = best_rf.predict(X_test)
y_proba_train = best_rf.predict_proba(X_train_res)[:, 1]
y_proba_test = best_rf.predict_proba(X_test)[:, 1]

for label, y_true, y_pred, y_proba in [
    ('Train', y_train_res, y_pred_train, y_proba_train),
    ('Test', y_test, y_pred_test, y_proba_test)
]:
    results[label] = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        # Mean of the per-class recalls
        'Average Recall': (recall_score(y_true, y_pred, pos_label=1) + recall_score(y_true, y_pred, pos_label=0)) / 2,
        'AUC': roc_auc_score(y_true, y_proba),
        'FPR_TPR': roc_curve(y_true, y_proba)
    }

# === Plotting ===
# ROC Curves
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (label, res) in zip(axes, results.items()):
    fpr, tpr, _ = res['FPR_TPR']
    ax.plot(fpr, tpr, label=f'{label} ROC (AUC = {res["AUC"]:.2f})')
    ax.plot([0, 1], [0, 1], linestyle='--', color='orange')
    ax.set_title(f'{label} ROC Curve')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend()
plt.tight_layout()
plt.show()

# Confusion Matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Train
ConfusionMatrixDisplay(confusion_matrix(y_train_res, y_pred_train),
                       display_labels=["No Diabetes", "Diabetes"]).plot(ax=axes[0], cmap='Blues', values_format='d')
axes[0].set_title('Confusion Matrix - Train')
# Test
ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred_test),
                       display_labels=["No Diabetes", "Diabetes"]).plot(ax=axes[1], cmap='Blues', values_format='d')
axes[1].set_title('Confusion Matrix - Test')
plt.tight_layout()
plt.show()

# === Print results ===
# FPR_TPR holds the raw ROC arrays and is dropped from the printed table
print("=== Train vs Test Performance ===")
print(pd.DataFrame(results).T.drop(columns='FPR_TPR').round(3))


In [None]:
# Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Build the modelling inputs (df must already exist in the kernel)
df = df.drop(columns='id', errors='ignore')
y = df['Outcome']
X_full = df.drop(columns=['Outcome', 'Glucose_Bin', 'BMI_Bin', 'Age_Bin'], errors='ignore')

# Step 0: Feature selection with RFE, fitted on the training part of a stratified split
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
    X_full, y, stratify=y, test_size=0.2, random_state=42
)

lr_initial = LogisticRegression(penalty='l2', solver='liblinear', random_state=42)
lr_initial.fit(X_train_temp, y_train_temp)

rfe = RFE(estimator=lr_initial, n_features_to_select=10)
rfe.fit(X_train_temp, y_train_temp)

# Names of the columns RFE kept
selected_features = X_full.columns[rfe.get_support()].tolist()
print("=== Selected Features by RFE ===")
print(selected_features)

X = X_full[selected_features]

# Step 1: Stratified train-test split on the selected columns
# (same test_size / random_state as the split used for RFE)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Step 2: Oversample the minority class in the training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 3: Randomized hyperparameter search for Logistic Regression
param_grid = {
    'C': np.logspace(-4, 4, 10),
    'penalty': ['l1','l2'],
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300]
}

lr_model = LogisticRegression(random_state=42)
lr_search = RandomizedSearchCV(
    lr_model,
    param_distributions=param_grid,
    n_iter=25,
    scoring='f1',
    cv=15,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)
lr_search.fit(X_train_res, y_train_res)
best_lr = lr_search.best_estimator_

# Report the winning parameter combination
print("=== Best Hyperparameters Found ===")
print(lr_search.best_params_)

# Step 4: Evaluation on the resampled training set and the untouched test set
y_pred_train = best_lr.predict(X_train_res)
y_proba_train = best_lr.predict_proba(X_train_res)[:, 1]
y_pred_test = best_lr.predict(X_test)
y_proba_test = best_lr.predict_proba(X_test)[:, 1]

results = {}
for label, y_data, y_pred, y_proba in (
    ('Train', y_train_res, y_pred_train, y_proba_train),
    ('Test', y_test, y_pred_test, y_proba_test),
):
    # Per-class recalls; their mean is reported as "Average Recall"
    recall_positive = recall_score(y_data, y_pred, pos_label=1)
    recall_negative = recall_score(y_data, y_pred, pos_label=0)
    results[label] = {
        'Accuracy': accuracy_score(y_data, y_pred),
        'Precision': precision_score(y_data, y_pred),
        'Recall': recall_positive,
        'F1 Score': f1_score(y_data, y_pred),
        'Average Recall': (recall_positive + recall_negative) / 2,
        'AUC': roc_auc_score(y_data, y_proba),
        'FPR_TPR': roc_curve(y_data, y_proba)
    }

# === Plotting ===
# ROC curves: one panel per evaluated set, with the chance diagonal for reference
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, label in zip(axes, results):
    res = results[label]
    fpr, tpr, _ = res['FPR_TPR']
    ax.plot(fpr, tpr, label=f'Logistic Regression ({label} AUC = {res["AUC"]:.2f})')
    ax.plot([0, 1], [0, 1], linestyle='--', color='orange')
    ax.set_title(f'Logistic Regression ROC Curve ({label} Set)')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend()
plt.tight_layout()
plt.show()

# === Confusion Matrices ===
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
confusion_inputs = [
    ('Train', y_train_res, y_pred_train),
    ('Test', y_test, y_pred_test),
]
for ax, (label, y_true, y_pred) in zip(axes, confusion_inputs):
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'Logistic Regression Confusion Matrix ({label} Set)')
plt.tight_layout()
plt.show()

# === Results Summary ===
# FPR_TPR holds the raw ROC arrays and is dropped from the printed table
results_summary = pd.DataFrame(results).T.drop(columns='FPR_TPR').round(3)

print("=== Train vs Test Performance ===")
print(results_summary)




In [None]:
# Logistic Regression model with categorical features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Assume df is already loaded, cleaned and feature-engineered
categorical_features = ['Glucose_Bin', 'BMI_Bin', 'Age_Bin']
# Cast the binned columns to plain integers so the models can consume them
# NOTE(review): astype(int) fails if any value fell outside the bin edges (NaN bin) — confirm
# that the binning covers the full value range.
for cat in categorical_features:
    df[cat] = df[cat].astype(int)

# Step 0: Feature Set (numeric features plus the integer-coded bins)
X_full = df.drop(columns=['Outcome'], errors='ignore')
y = df['Outcome']

# Step 1: Train-Test Split (stratified) on the full feature set.
# Splitting first keeps the test rows out of feature selection. The row partition
# depends only on the number of rows, y and random_state, so it is the same
# partition as splitting after selecting columns.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, stratify=y, random_state=42
)

# Step 2: Feature importance using Random Forest, fitted on training rows only
from sklearn.ensemble import RandomForestClassifier
rf_initial = RandomForestClassifier(random_state=42)
rf_initial.fit(X_train_full, y_train)

feature_importances = pd.Series(rf_initial.feature_importances_, index=X_full.columns)
selected_features = feature_importances.sort_values(ascending=False).head(10).index.tolist()
print("=== Selected Top 10 Features by Random Forest ===")
print(selected_features)

# Step 3: Select Top 10 Features
X = X_full[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

# Step 4: SMOTE on the training rows only (the test set is never resampled)
# NOTE(review): the search below cross-validates on already-resampled data, so
# synthetic points can sit in validation folds; an imblearn Pipeline would avoid that.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 5: Logistic Regression with Hyperparameter Tuning
param_grid = {
    'C': np.logspace(-4, 4, 10),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300]
}
lr_model = LogisticRegression(random_state=42)
lr_search = RandomizedSearchCV(
    lr_model, param_distributions=param_grid,
    n_iter=25, scoring='f1', cv=15, n_jobs=-1, verbose=0, random_state=42
)
lr_search.fit(X_train_res, y_train_res)
best_lr = lr_search.best_estimator_

print("=== Best Hyperparameters Found ===")
print(lr_search.best_params_)

# Step 6: Evaluation on the (resampled) training set and the untouched test set
results = {}
y_pred_train = best_lr.predict(X_train_res)
y_pred_test = best_lr.predict(X_test)
y_proba_train = best_lr.predict_proba(X_train_res)[:, 1]
y_proba_test = best_lr.predict_proba(X_test)[:, 1]

for label, y_true, y_pred, y_proba in [
    ('Train', y_train_res, y_pred_train, y_proba_train),
    ('Test', y_test, y_pred_test, y_proba_test)
]:
    results[label] = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        # Mean of the per-class recalls
        'Average Recall': (recall_score(y_true, y_pred, pos_label=1) + recall_score(y_true, y_pred, pos_label=0)) / 2,
        'AUC': roc_auc_score(y_true, y_proba),
        'FPR_TPR': roc_curve(y_true, y_proba)
    }

# === Plotting ===
# ROC Curves
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (label, res) in zip(axes, results.items()):
    fpr, tpr, _ = res['FPR_TPR']
    ax.plot(fpr, tpr, label=f'{label} ROC (AUC = {res["AUC"]:.2f})')
    ax.plot([0, 1], [0, 1], linestyle='--', color='orange')
    ax.set_title(f'{label} ROC Curve')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend()
plt.tight_layout()
plt.show()

# Confusion Matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Train
ConfusionMatrixDisplay(confusion_matrix(y_train_res, y_pred_train),
                       display_labels=["No Diabetes", "Diabetes"]).plot(ax=axes[0], cmap='Blues', values_format='d')
axes[0].set_title('Confusion Matrix - Train')
# Test
ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred_test),
                       display_labels=["No Diabetes", "Diabetes"]).plot(ax=axes[1], cmap='Blues', values_format='d')
axes[1].set_title('Confusion Matrix - Test')
plt.tight_layout()
plt.show()

# Results Summary
# FPR_TPR holds the raw ROC arrays and is dropped from the printed table
print("=== Train vs Test Performance ===")
print(pd.DataFrame(results).T.drop(columns='FPR_TPR').round(3))


In [None]:
# SVM model

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# === Step 0: Dataset Preparation ===
# Drop the identifier and any binned helper columns if present; everything
# that remains besides 'Outcome' is a candidate feature.
df = df.drop(columns='id', errors='ignore')
X_full = df.drop(columns=['Outcome', 'Glucose_Bin', 'BMI_Bin', 'Age_Bin'], errors='ignore')
y = df['Outcome']

# === Step 1: Train-Test Split ===
# Split BEFORE ranking features so the test rows never influence which
# columns are kept. The split uses the same seed and stratification as before.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, stratify=y, random_state=42
)

# === Step 0.5: Feature Selection using Random Forest (training rows only) ===
rf_initial = RandomForestClassifier(random_state=42)
rf_initial.fit(X_train_full, y_train)
feature_importances = pd.Series(rf_initial.feature_importances_, index=X_train_full.columns)
# Keep the 10 highest-importance columns (all of them if there are fewer)
selected_features = feature_importances.sort_values(ascending=False).head(10).index.tolist()

print("=== Selected Top 10 Features by Random Forest ===")
print(selected_features)

# Restrict every view of the data to the selected columns
X = X_full[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

# === Step 2 + 3: SMOTE and SVM tuned together with RandomizedSearchCV (linear kernel) ===
# Imported here because this step is the only user of the imbalanced-learn pipeline.
from imblearn.pipeline import Pipeline as ImbPipeline

smote = SMOTE(random_state=42)
svm_model = SVC(probability=False, random_state=42)

# SMOTE is a pipeline step so that, within each CV split, it is fit on the
# training folds only and the validation fold contains only original rows.
# Oversampling before the search would put synthetic points derived from
# training-fold rows into the validation folds and inflate the CV F1.
svm_pipeline = ImbPipeline(steps=[('smote', smote), ('svc', svm_model)])

param_grid = {
    'svc__C': np.logspace(-3, 3, 10),
    'svc__kernel': ['linear'],
}

svm_search = RandomizedSearchCV(
    svm_pipeline, param_distributions=param_grid,
    n_iter=5, scoring='f1', cv=5, n_jobs=-1, verbose=0, random_state=42
)
svm_search.fit(X_train, y_train)

# The refit pipeline trained its SVC on the SMOTE-resampled training set;
# expose that fitted SVC under the name the evaluation step expects.
best_svm = svm_search.best_estimator_.named_steps['svc']

# Resampled training rows for the train-side metrics. This uses the same SMOTE
# configuration and seed as the pipeline step.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\n=== Best Hyperparameters Found ===")
# Strip the 'svc__' step prefix so the printed names are the plain SVC parameters
print({name.split('__', 1)[1]: value for name, value in svm_search.best_params_.items()})

# === Step 4: Evaluation ===
# Collect the same metrics for the resampled training rows and the test rows.
results = {}
y_pred_train = best_svm.predict(X_train_res)
y_pred_test = best_svm.predict(X_test)

# Decision function for ROC curve and AUC (the SVC is built with probability=False)
y_score_train = best_svm.decision_function(X_train_res)
y_score_test = best_svm.decision_function(X_test)

for label, y_data, y_pred, y_score in [
    ('Train', y_train_res, y_pred_train, y_score_train),
    ('Test', y_test, y_pred_test, y_score_test)
]:
    # Recall of each class, computed once and reused below
    recall_positive = recall_score(y_data, y_pred, pos_label=1)
    recall_negative = recall_score(y_data, y_pred, pos_label=0)
    results[label] = {
        'Accuracy': accuracy_score(y_data, y_pred),
        'Precision': precision_score(y_data, y_pred),
        'Recall': recall_positive,
        'F1 Score': f1_score(y_data, y_pred),
        # Unweighted mean of the two per-class recalls
        'Average Recall': (recall_positive + recall_negative) / 2,
        # Reported in the summary table so it lines up with the logistic-regression table
        'AUC': roc_auc_score(y_data, y_score)
    }

# === Step 5: Plotting ROC Curve (Separate Train and Test Figures) ===

# Curve coordinates and AUC for each split, from the decision-function scores
fpr_train, tpr_train, _ = roc_curve(y_train_res, y_score_train)
auc_train = roc_auc_score(y_train_res, y_score_train)
fpr_test, tpr_test, _ = roc_curve(y_test, y_score_test)
auc_test = roc_auc_score(y_test, y_score_test)

# (split name, fpr, tpr, auc, line style); one standalone figure per row
roc_panels = (
    ('Train', fpr_train, tpr_train, auc_train, '-'),
    ('Test', fpr_test, tpr_test, auc_test, '--'),
)

for split_name, split_fpr, split_tpr, split_auc, curve_style in roc_panels:
    plt.figure(figsize=(8, 6))
    plt.plot(
        split_fpr, split_tpr,
        label=f'{split_name} ROC (AUC = {split_auc:.2f})',
        linestyle=curve_style, linewidth=2,
    )
    # Dotted diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], linestyle=':', color='black')
    plt.title(f'ROC Curve - {split_name} Set (SVM Model)', fontsize=16)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# === Step 6: Confusion Matrix (Train and Test) ===
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Build both matrices and their displays first, then draw them side by side
cm_train = confusion_matrix(y_train_res, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)
disp_train = ConfusionMatrixDisplay(
    confusion_matrix=cm_train, display_labels=['No Diabetes', 'Diabetes']
)
disp_test = ConfusionMatrixDisplay(
    confusion_matrix=cm_test, display_labels=['No Diabetes', 'Diabetes']
)

for cm_ax, cm_display, cm_split in zip(axes, (disp_train, disp_test), ('Train', 'Test')):
    cm_display.plot(ax=cm_ax, cmap='Blues', values_format='d', colorbar=False)
    cm_ax.set_title(f'Confusion Matrix - {cm_split} Set', fontsize=14)
    cm_ax.grid(False)

plt.suptitle('SVM Model - Confusion Matrices (Train vs Test)', fontsize=16)
plt.tight_layout()
plt.show()

# === Step 7: Results Summary ===
# One row per split, one column per metric, rounded to three decimals
metrics_by_split = pd.DataFrame(results)
results_summary = metrics_by_split.transpose().round(3)

print("\n=== Train vs Test Performance Summary ===", results_summary, sep="\n")
