In [189]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve
)
import seaborn as sns
import matplotlib.pyplot as plt
import math
import numpy as np
from scipy.stats import zscore

In [190]:
# Read the raw dataset from the CSV file that sits next to the notebook.
DATA_PATH = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Leave the preview as the last expression so it renders as the cell output.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [191]:
# Schema overview: column dtypes, non-null counts and memory usage.
df.info()

# Number of missing values in each column (rendered as the cell output).
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [None]:
# Pairwise relationships between the columns of the raw frame.
pair_grid = sns.pairplot(df)
plt.show()

In [None]:
# A box plot summarises quartiles, so it is only defined for numeric data.
# Passing a string column ('gender', 'smoking_history') as the sole variable
# makes seaborn raise a TypeError in releases that require a numeric value
# axis, so the loop is restricted to the numeric columns (original column
# order is preserved by select_dtypes).
features = df.select_dtypes(include='number').columns.tolist()

for feature in features:
    # One standalone figure per column, drawn on an explicit Axes.
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.boxplot(x=df[feature], color='orange', ax=ax)
    ax.set_title(f"Boxplot of {feature.capitalize()}")
    ax.set_xlabel(feature.capitalize())
    ax.grid(True)
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

In [None]:
def remove_outliers_zscore(df, cols, threshold=3, target_col='diabetes'):
    """Drop rows that are z-score outliers within their own target class.

    The frame is split by the distinct values of ``target_col``. Inside each
    class, every column in ``cols`` is standardised (column-wise, population
    standard deviation) and a row is kept only if all of its absolute
    z-scores are below ``threshold``.

    A z-score is undefined (NaN) when a column has zero variance inside a
    class or when the cell itself is missing. An undefined score is not
    evidence of an outlier, so such cells never cause a row to be dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    cols : str or sequence of str
        Numeric column(s) to screen for outliers.
    threshold : float, default 3
        Rows with any ``|z| >= threshold`` in ``cols`` are removed.
    target_col : str, default 'diabetes'
        Column whose distinct values define the classes.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped by class in order of first appearance,
        with a fresh 0..n-1 index. An empty input yields an empty frame with
        the same columns.
    """
    # Accept a single column name as well as a list of names.
    cols = [cols] if isinstance(cols, str) else list(cols)
    frames = []

    for class_val in df[target_col].unique():
        class_df = df[df[target_col] == class_val]
        if class_df.empty:
            # e.g. a NaN target value never equals itself; nothing to score.
            continue

        values = class_df[cols].to_numpy(dtype=float)

        # nan_policy='omit' keeps one missing cell from turning the whole
        # column into NaN; errstate silences the 0/0 warning some SciPy
        # versions emit for zero-variance columns.
        with np.errstate(divide='ignore', invalid='ignore'):
            zscores = np.abs(zscore(values, axis=0, nan_policy='omit'))

        # NaN compares False against the threshold, so undefined scores must
        # be accepted explicitly or they would discard the row.
        undefined = np.isnan(zscores)
        mask = ((zscores < threshold) | undefined).all(axis=1)
        frames.append(class_df[mask])

    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(frames, axis=0).reset_index(drop=True)


In [None]:
# Continuous measurements screened for outliers; the filter works per
# diabetes class with its default threshold.
outlier_cols = ['bmi', 'HbA1c_level', 'blood_glucose_level', 'age']
df_z_cleaned = remove_outliers_zscore(df, outlier_cols)



In [None]:

# Box plot of every numeric column of the outlier-filtered frame, arranged
# in a grid that is three panels wide.
numeric_cols = df_z_cleaned.select_dtypes(include='number').columns.tolist()

n_cols = 3
n_rows = math.ceil(len(numeric_cols) / n_cols)

fig = plt.figure(figsize=(15, 8))
for i, col in enumerate(numeric_cols, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    sns.boxplot(y=df_z_cleaned[col], color='lightblue', ax=ax)
    ax.set_title(f'Boxplot of {col}')

# Compute the layout once, after all panels exist, rather than re-running
# it on every loop iteration.
fig.tight_layout()
plt.show()



In [None]:
# Summarise every column that went through the outlier filter; 'age' is
# included because it is one of the columns passed to remove_outliers_zscore.
summary_cols = ['bmi', 'HbA1c_level', 'blood_glucose_level', 'age']

# sep="\n" puts each label on its own line so the describe() header row
# stays aligned with its columns.
print("Before removal:", df[summary_cols].describe(), sep="\n")
print("\nAfter Z-score removal:", df_z_cleaned[summary_cols].describe(), sep="\n")


In [None]:
# Outlier filtering uses remove_outliers_zscore, the only outlier helper
# defined in this notebook. The name previously called here,
# `remove_outliers_selected`, does not exist, so the cell failed with
# NameError on a fresh kernel.
# NOTE(review): rows are filtered per diabetes class *before* the split, so
# the held-out set is also cleaned using its labels. Consider filtering the
# training portion only; confirm the intended evaluation protocol.
outlier_cols = ['bmi', 'age', 'HbA1c_level', 'blood_glucose_level']
df_cleaned = remove_outliers_zscore(df, cols=outlier_cols)

# One-hot encode the categorical columns, dropping the first level of each.
df_encoded = pd.get_dummies(df_cleaned, drop_first=True)

X = df_encoded.drop('diabetes', axis=1)
y = df_encoded['diabetes']

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Sanity check on the scaled training matrix: per-column mean and standard
# deviation, printed one statistic per line.
for label, statistic in (("Train set mean:", np.mean), ("Train set std:", np.std)):
    print(label, statistic(X_train_scaled, axis=0))


In [None]:
# Class-weighted logistic regression, fitted on the scaled training data.
# fit() returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)

# Hard labels for the held-out set ...
y_pred = log_reg.predict(X_test_scaled)

# ... and the second predict_proba column, i.e. the probability for class 1.
test_probabilities = log_reg.predict_proba(X_test_scaled)
y_prob = test_probabilities[:, 1]


In [None]:
# Confusion matrix of the default-threshold predictions on the test set.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Label-based scores use y_pred; ROC AUC is computed from y_prob.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_prob)

# Labels are pre-padded so the colons line up in the printed table.
metric_rows = (
    ("Accuracy  ", accuracy),
    ("Precision ", precision),
    ("Recall    ", recall),
    ("F1 Score  ", f1),
    ("ROC AUC   ", roc_auc),
)

print("\nEvaluation Metrics:")
for label, value in metric_rows:
    print(f"{label}: {value:.4f}")

In [None]:
# ROC curve points computed from the class-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Draw on an explicit Axes: the model's curve plus the diagonal reference
# line labelled "Random".
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, color='darkorange', label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--', label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Logistic Regression")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Sweep decision thresholds 0.10, 0.15, ..., 0.85 over the class-1
# probabilities (`y_prob`, produced in the model-fitting cell) and record
# precision / recall / F1 at each one.
#
# np.linspace gives an exact number of grid points; with a fractional step,
# the length of np.arange depends on floating-point rounding.
thresholds = np.linspace(0.10, 0.85, 16)
results = []

for thresh in thresholds:
    y_pred_thresh = (y_prob >= thresh).astype(int)

    # Append the scores directly instead of assigning them to `precision`,
    # `recall` and `f1`, which hold the default-threshold results from the
    # evaluation cell and must not be overwritten here.
    results.append((
        thresh,
        precision_score(y_test, y_pred_thresh, zero_division=0),
        recall_score(y_test, y_pred_thresh, zero_division=0),
        f1_score(y_test, y_pred_thresh, zero_division=0),
    ))

print("Threshold  Precision  Recall  F1-Score")
for thresh_value, precision_value, recall_value, f1_value in results:
    print(f"{thresh_value:.2f}       {precision_value:.3f}     {recall_value:.3f}   {f1_value:.3f}")


In [None]:
def _result_column(position):
    """Return one field of every (threshold, precision, recall, f1) tuple as an array."""
    return np.array([row[position] for row in results])


thresholds = _result_column(0)
precisions = _result_column(1)
recalls = _result_column(2)
f1s = _result_column(3)

# One line per metric, all plotted against the decision threshold.
fig, ax = plt.subplots(figsize=(8, 5))
for scores, label in ((precisions, 'Precision'), (recalls, 'Recall'), (f1s, 'F1 Score')):
    ax.plot(thresholds, scores, label=label)
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Metric Scores at Different Thresholds')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Decision threshold applied to the class-1 probabilities.
# NOTE(review): 0.85 appears to have been picked by hand from the sweep on
# the test set; confirm, and consider choosing it on a validation split.
optimal_thresh = 0.85
y_pred_opt = (y_prob >= optimal_thresh).astype(int)

# confusion_matrix is already imported in the notebook's first cell.
cm_opt = confusion_matrix(y_test, y_pred_opt)

# Build the label from the variable so it cannot drift from the threshold
# that was actually applied.
print(f"Confusion Matrix at {optimal_thresh} Threshold:\n", cm_opt)
