In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'pandas'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
# Load the dataset from the Excel workbook into a DataFrame, then preview
# the first five rows as a quick sanity check of the columns.
df = pd.read_excel(
    io="diabetes_binary_health_indicators_BRFSS2015.xlsx",
)
df.head(n=5)

In [None]:
# Dimensions of the loaded table as (number of rows, number of columns).
df.shape

In [None]:
# Class balance: count how many rows fall into each value of the
# Diabetes_binary target (people with vs. without diabetes).
df['Diabetes_binary'].value_counts()

In [None]:
# Separate the prediction target from the feature matrix: y is the
# Diabetes_binary column, X is every other column of the table.
y = df['Diabetes_binary']
X = df.drop(columns=['Diabetes_binary'])

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Unfitted feature scaler (default settings); it is fitted on the training
# split only, in the following cell.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the test split so that no information from the
# held-out rows leaks into preprocessing.
X_train_scaled = scaler.fit_transform(X=X_train)
X_test_scaled = scaler.transform(X=X_test)

In [None]:
# Logistic regression on the scaled training features.
# class_weight='balanced' reweights the classes to compensate for class
# imbalance; max_iter=1000 gives the solver more iterations to converge.
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Score the held-out rows with the fitted logistic regression.
# y_proba keeps only the second predict_proba column (index 1), used below
# as the predicted probability of diabetes; y_pred holds the model's
# default hard class predictions.
y_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = model.predict(X_test_scaled)

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report for
# the logistic regression's default predictions on the test split.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# Threshold-independent ranking quality of the logistic regression:
# area under the ROC curve computed from the predicted probabilities.
roc_auc_score(y_true=y_test, y_score=y_proba)

In [None]:
# Distribution of the logistic regression's predicted probabilities, split
# by the true class, to see how well the two groups separate.
# (matplotlib.pyplot is already imported as plt in the first cell, so it is
# not re-imported here.)

# y_proba is a positional numpy array while y_test is a pandas Series that
# keeps its original row labels; convert to numpy so the boolean masks are
# applied purely by position.
y_test_values = np.asarray(y_test)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(y_proba[y_test_values == 0], bins=50, alpha=0.6, label="No Diabetes")
ax.hist(y_proba[y_test_values == 1], bins=50, alpha=0.6, label="Diabetes")
ax.set_xlabel("Predicted probability of diabetes")
ax.set_ylabel("Number of people")
ax.set_title("Predicted probability by true class (Logistic Regression)")
ax.legend()
plt.show()

In [None]:
# Re-label the test rows with a lower decision threshold: a predicted
# probability of at least 0.3 counts as the positive class (1), anything
# below it as the negative class (0).
y_pred_03 = np.where(y_proba >= 0.3, 1, 0)

In [None]:
# Same evaluation as for the default predictions, but for the labels
# obtained with the 0.3 probability threshold.
print(
    confusion_matrix(y_test, y_pred_03),
    classification_report(y_test, y_pred_03),
    sep="\n",
)

In [None]:
# One row per feature with its fitted logistic-regression coefficient,
# largest coefficient first. The model was trained on the scaled features,
# so the coefficients are expressed on that scaled basis.
coeff_df = (
    pd.DataFrame(
        {
            "Feature": X.columns,
            "Coefficient": model.coef_[0],
        }
    )
    .sort_values("Coefficient", ascending=False)
)

coeff_df

In [None]:
# Exponentiate each coefficient to obtain an odds ratio and order the table
# by it, largest first. Because the model was fitted on scaled features,
# each ratio refers to a one-unit change of the *scaled* feature.
coeff_df = (
    coeff_df
    .assign(Odds_Ratio=np.exp(coeff_df["Coefficient"]))
    .sort_values(by="Odds_Ratio", ascending=False)
)

coeff_df


In [None]:
# Horizontal bar chart of the logistic-regression coefficients, one bar per
# feature, with the rows ordered by ascending coefficient.
plt.figure(figsize=(8, 6))
sns.barplot(
    data=coeff_df.sort_values(by='Coefficient'),
    y='Feature',
    x='Coefficient',
)
plt.title("Feature Importance (Logistic Regression)")
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 200 trees. class_weight='balanced' compensates for
# class imbalance, n_jobs=-1 uses all available CPU cores, and the fixed
# random_state keeps the fitted forest reproducible.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the forest on the unscaled training features (the scaled copies are
# only used by the logistic regression).
rf.fit(X=X_train, y=y_train)

In [None]:
# Random-forest scores for the held-out rows: keep only the second
# predict_proba column (index 1), used below as the predicted probability
# of diabetes.
rf_proba = rf.predict_proba(X_test)[:, 1]

In [None]:
# Area under the ROC curve for the random forest's predicted probabilities
# on the test split. roc_auc_score is already imported together with the
# other sklearn.metrics helpers near the top of the notebook, so it is not
# re-imported here.
roc_auc_rf = roc_auc_score(y_test, rf_proba)
roc_auc_rf

In [None]:
# Turn the forest's probabilities into labels with the 0.3 threshold:
# at least 0.3 becomes the positive class (1), anything lower becomes 0.
rf_pred_03 = np.where(rf_proba >= 0.3, 1, 0)

In [None]:
# Confusion matrix and per-class report for the random forest's labels at
# the 0.3 probability threshold.
print(
    confusion_matrix(y_test, rf_pred_03),
    classification_report(y_test, rf_pred_03),
    sep="\n",
)

In [None]:
# Overlaid histograms of the random forest's predicted probabilities, one
# per true class, drawn in the order: class 0 first, then class 1.
plt.figure(figsize=(8, 5))
for _class_value, _legend_label in ((0, "No Diabetes"), (1, "Diabetes")):
    plt.hist(
        rf_proba[y_test == _class_value],
        bins=50,
        alpha=0.6,
        label=_legend_label,
    )
plt.xlabel("Predicted probability (Random Forest)")
plt.ylabel("Number of people")
plt.legend()
plt.show()


In [None]:
# One row per feature with the fitted forest's feature_importances_ value,
# most important feature first.
importances = (
    pd.DataFrame(
        {
            "Feature": X.columns,
            "Importance": rf.feature_importances_,
        }
    )
    .sort_values("Importance", ascending=False)
)

importances

In [None]:
# Horizontal bar chart of the random-forest feature importances. The table
# is already sorted in descending order, so the y-axis is inverted to put
# the most important feature at the top of the chart.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(importances["Feature"], importances["Importance"])
ax.invert_yaxis()
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest Feature Importance for Diabetes Prediction")
plt.show()