## 1. Installing Dependencies

In [1]:
!pip install pandas scikit-learn xgboost --quiet

## 2. Data Loading & Exploration

In [11]:
import pandas as pd
import numpy as np

# Load dataset
print("Loading dataset...")
df = pd.read_csv('diabetes_binary_5050split_health_indicators_BRFSS2015.csv')

# rows and columns
print("\n=== Dataset Overview ===")
print(f"Shape: {df.shape}")

print("\nFirst 5 rows:")
display(df.head())

# Check for missing values
print("\n=== Missing Values Check ===")
print(df.isnull().sum())

Loading dataset...

=== Dataset Overview ===
Shape: (70692, 22)

First 5 rows:


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0



=== Missing Values Check ===
Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


## 3. Data Preprocessing

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.iloc[:, 1:]  # All columns except Diabetes_binary
y = df.iloc[:, 0]   # Diabetes_binary column

# Standardize features
print("\nStandardizing features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (80-20)
print("\nSplitting data into train/test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, 
    y, 
    test_size=0.2, 
    random_state=42,  
    stratify=df.iloc[:, 0].values      
)

print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")


Standardizing features...

Splitting data into train/test sets...

Training samples: 56553
Test samples: 14139


## 4. Random Forest Model

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize model
rf_model = RandomForestClassifier(
    n_estimators=300,        # Number of trees
    max_depth=15,            # Maximum tree depth
    min_samples_split=5,     # Minimum samples to split node
    min_samples_leaf=2,      # Minimum samples at leaf node
    class_weight="balanced", # Handle class imbalance
    random_state=42,         # Reproducibility
    n_jobs=-1                # Use all CPU cores
)

# Train model
print("\nTraining Random Forest model...")
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
print("\n=== Random Forest Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))



Training Random Forest model...


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.



=== Random Forest Results ===
Accuracy: 0.7491

Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.70      0.74      7070
         1.0       0.73      0.80      0.76      7069

    accuracy                           0.75     14139
   macro avg       0.75      0.75      0.75     14139
weighted avg       0.75      0.75      0.75     14139



## 5. XGBoost Model

In [14]:
import xgboost as xgb

# Initialize model with optimized parameters
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,        # Number of boosting rounds
    max_depth=6,             # Maximum tree depth
    learning_rate=0.01,      # Step size shrinkage
    subsample=0.8,           # Subsample ratio of training instances
    colsample_bytree=0.8,    # Subsample ratio of features
    gamma=0,                 # Minimum loss reduction
    random_state=42,         # Reproducibility
    n_jobs=-1,               # Use all CPU cores
    eval_metric='logloss'    # Evaluation metric
)

# Train model
print("\nTraining XGBoost model...")
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate performance
print("\n=== XGBoost Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <54A1AE05-1E14-3DA2-A8D0-062134694298> /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


## 6. Saving the models

In [32]:
import joblib
import pickle

# Create a directory to save models (if it doesn't exist)
import os
os.makedirs('saved_models', exist_ok=True)

# 1. Save Random Forest Model
joblib.dump(rf_model, 'saved_models/random_forest_model.pkl')

# 2. Save XGBoost Model
joblib.dump(xgb_model, 'saved_models/xgboost_model.pkl')

['saved_models/xgboost_model.pkl']

## 7. Key Findings & Next Steps

**Findings:**
- Both models achieved ~75% accuracy
- Random Forest performed slightly better (0.7530 vs 0.7507)
- Both models identified similar important features

**Next Steps to take:**
- Try hyperparameter tuning with GridSearchCV/RandomizedSearchCV
- Experiment with feature engineering
