In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# =========================================================
# 1. IMPORT LIBRARIES
# =========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

# Global plot styling: seaborn "whitegrid" theme first, then the default
# figure size (width, height in inches) applied on top of it
sns.set(style="whitegrid")
plt.rc('figure', figsize=(8, 5))


In [None]:
# =========================================================
# 2. LOAD DATA
# =========================================================
# Location of the CSV inside the Kaggle input mount
DATA_PATH = '/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_health_indicators_BRFSS2015.csv'
data = pd.read_csv(DATA_PATH)

# Report (rows, columns), then preview the first rows as the cell output
print("Shape:", data.shape)
data.head()


In [None]:
# =========================================================
# 2b. VALIDATE LOADED DATA
# =========================================================
# `data` is already in memory from the load step above, so the CSV is not
# read a second time here. Instead, confirm the frame has the structure the
# rest of the notebook relies on, and fail early with a clear message if not.
assert not data.empty, "Loaded dataset is empty"
assert 'Diabetes_binary' in data.columns, "Target column 'Diabetes_binary' is missing"
assert data['Diabetes_binary'].isin([0, 1]).all(), "Target column must contain only 0/1"

# Column dtypes, non-null counts and memory footprint
data.info()


In [None]:
# =========================================================
# 3. BASIC EXPLORATION
# =========================================================
# Total number of missing cells across the whole frame
print(data.isna().sum().sum(), "missing values")

# Summary statistics, transposed so each row is one column of `data`; the
# first 10 are shown. Jupyter only renders a bare expression when it is the
# LAST statement of a cell, so the table is passed to display() explicitly
# (display is provided by the IPython kernel); as a bare mid-cell expression
# it was computed and then silently discarded.
display(data.describe().T.head(10))

# Check class balance of the target
sns.countplot(x='Diabetes_binary', data=data)
plt.title("Class Distribution (0 = No Diabetes / 1 = Diabetes)")
plt.show()


In [None]:
# =========================================================
# 4. FEATURE SELECTION
# =========================================================
# Features: every column except the target
X = data.drop(columns='Diabetes_binary')

# Target: the binary diabetes indicator
y = data['Diabetes_binary']

# Learn per-column mean/std, then rescale to zero mean and unit variance.
# NOTE(review): the statistics here are computed over every row of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Feature matrix shape:", X_scaled.shape)


In [None]:
# =========================================================
# 5. TRAIN-TEST SPLIT
# =========================================================
# Split the UNSCALED features. Standardising the full matrix before splitting
# lets the held-out rows contribute to the mean/std used for scaling, which
# is a form of data leakage. The row assignment is determined by the seed and
# the stratification on y, so the same rows land in each partition as when
# splitting a pre-scaled copy of X.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions. `scaler` is rebound so that the object
# kept around afterwards is the leakage-free one.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])


In [None]:
# =========================================================
# 6. MODEL TRAINING (Random Forest)
# =========================================================
# 200 trees capped at depth 10, with a fixed seed for repeatability and
# n_jobs=-1 for parallel fitting. fit() returns the estimator itself, so
# construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Predicted class for every held-out row
y_pred = rf.predict(X_test)

# Share of held-out rows classified correctly, rounded to three decimals
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(test_accuracy, 3))


In [None]:
# =========================================================
# 7. MODEL EVALUATION
# =========================================================
# Per-class precision / recall / F1 on the held-out rows
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Confusion matrix drawn as an annotated heatmap with integer cell labels
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# =========================================================
# 8. FEATURE IMPORTANCE
# =========================================================
# Pair each importance score with its column name, highest score first
importances = (
    pd.Series(rf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Keep the ten highest-ranked features and draw them as horizontal bars
top10 = importances.iloc[:10]
fig, ax = plt.subplots()
sns.barplot(x=top10.values, y=top10.index, ax=ax)
ax.set_title("Top 10 Important Features Predicting Diabetes / Health Adherence")
plt.show()

# Show the scores themselves as the cell output
top10


In [None]:
# =========================================================
# 9. SIMPLE INTERPRETATION
# =========================================================
# Build the summary from the fitted results instead of hard-coding example
# feature names, so the text always matches what this run actually produced.
test_accuracy = accuracy_score(y_test, y_pred)

# Accuracy of always predicting the most frequent class; on an imbalanced
# target this is the number the model accuracy has to beat to be meaningful.
majority_baseline = y_test.value_counts(normalize=True).max()

# `importances` is sorted in descending order, so the head is the top ranks
top_features = ", ".join(importances.head(3).index)

print(f"""
Interpretation:
- Model accuracy ({test_accuracy:.3f} on the test set, versus {majority_baseline:.3f} for always predicting the majority class) gives an estimate of how well the lifestyle variables predict diabetes risk (proxy for adherence).
- Top features by importance in this run: {top_features}. Those that reflect lifestyle choices can be seen as behavior-related predictors.
- You can adapt this framework to predict adherence to digital interventions by substituting engagement-related variables.
""")
