In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.stats import skew
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")

In [2]:
df= pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [3]:
df.head(5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [5]:
all_features= [c for c in df.columns if c!='Diabetes_012']
X= df[all_features]
y= df['Diabetes_012']
X_train,X_test,y_train,y_test= train_test_split(
    X,y,test_size=0.2,stratify=y,random_state=42
)
print(f'Train data shape {X_train.shape}')
print(f'Test data shape {X_test.shape}')

Train data shape (202944, 21)
Test data shape (50736, 21)


In [6]:
# Applying scaling and converting from floating point to integer
scale_cols= ['BMI','MentHlth','PhysHlth','Age']
ordinal_cols= ['Education','Income','GenHlth']
binary_cols= [c for c in all_features if c not in scale_cols + ordinal_cols]
scaler= StandardScaler()
X_train[scale_cols]= scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols]= scaler.transform(X_test[scale_cols])
X_train[binary_cols+ordinal_cols]= X_train[binary_cols+ordinal_cols].astype('int64')
X_test[binary_cols+ordinal_cols]= X_test[binary_cols+ordinal_cols].astype('int64')
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 202944 entries, 185494 to 110153
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HighBP                202944 non-null  int64  
 1   HighChol              202944 non-null  int64  
 2   CholCheck             202944 non-null  int64  
 3   BMI                   202944 non-null  float64
 4   Smoker                202944 non-null  int64  
 5   Stroke                202944 non-null  int64  
 6   HeartDiseaseorAttack  202944 non-null  int64  
 7   PhysActivity          202944 non-null  int64  
 8   Fruits                202944 non-null  int64  
 9   Veggies               202944 non-null  int64  
 10  HvyAlcoholConsump     202944 non-null  int64  
 11  AnyHealthcare         202944 non-null  int64  
 12  NoDocbcCost           202944 non-null  int64  
 13  GenHlth               202944 non-null  int64  
 14  MentHlth              202944 non-null  float64
 15  

In [7]:
X_train.head(5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
185494,1,0,1,-0.512552,0,0,0,1,1,1,...,1,0,2,-0.42929,-0.485961,0,1,0.642861,4,4
217030,0,0,1,1.301115,1,0,0,1,1,1,...,1,0,1,-0.42929,-0.485961,0,0,-2.302154,5,5
641,1,1,1,0.998837,0,0,0,1,1,0,...,1,0,2,-0.42929,-0.485961,0,0,0.970085,5,3
217863,0,0,1,-0.210274,0,0,0,1,0,0,...,1,0,2,-0.42929,0.088138,0,1,-1.647706,4,6
53677,0,0,1,-0.059135,1,0,0,1,0,0,...,1,1,3,-0.023967,-0.485961,0,0,0.315637,4,4


In [8]:
# Sampling 
smote= SMOTE(random_state=42)
random_oversampler= RandomOverSampler(random_state=42)
random_undersampler= RandomUnderSampler(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_ros, y_train_ros = random_oversampler.fit_resample(X_train, y_train)
X_train_rus, y_train_rus = random_undersampler.fit_resample(X_train, y_train)
print(f"Original Train Shape: {X_train.shape}")
print(f"After SMOTE: {X_train_smote.shape}")
print(f"After Random Oversampling: {X_train_ros.shape}")
print(f"After Random Undersampling: {X_train_rus.shape}")

Original Train Shape: (202944, 21)
After SMOTE: (512886, 21)
After Random Oversampling: (512886, 21)
After Random Undersampling: (11115, 21)


In [22]:
# PCA for dimensionality reduction
# Apply PCA on training data (after scaling)
pca_smote = PCA(n_components=0.95, random_state=42)

# Fit PCA on the training set
X_train_smote_pca = pca_smote.fit_transform(X_train_smote)

# Apply the same PCA transformation to the test set
X_test_smote_pca = pca_smote.transform(X_test)
print(f"Original train shape: {X_train.shape}")
print(f"Original test shape: {X_test.shape}")
# Print the new shapes after PCA transformation
print(f"Training Data shape after PCA (SMOTE): {X_train_smote_pca.shape}")
print(f"Test Data shape after PCA (SMOTE): {X_test_smote_pca.shape}")

Original train shape: (202944, 21)
Original test shape: (50736, 21)
Training Data shape after PCA (SMOTE): (512886, 12)
Test Data shape after PCA (SMOTE): (50736, 12)


In [19]:
pca_ros = PCA(n_components=0.95, random_state=42)

# Fit PCA on the training set
X_train_ros_pca = pca_ros.fit_transform(X_train_ros)

# Apply the same PCA transformation to the test set
X_test_ros_pca = pca_ros.transform(X_test)
print(f"Original train shape: {X_train.shape}")
print(f"Original test shape: {X_test.shape}")
# Print the new shapes after PCA transformation
print(f"Training Data shape after PCA (ROS): {X_train_ros_pca.shape}")
print(f"Test Data shape after PCA (ROS): {X_test_ros_pca.shape}")

Original train shape: (202944, 21)
Original test shape: (50736, 21)
Training Data shape after PCA (ROS): (512886, 13)
Test Data shape after PCA (ROS): (50736, 13)


In [20]:
pca_rus = PCA(n_components=0.95, random_state=42)

# Fit PCA on the training set
X_train_rus_pca = pca_rus.fit_transform(X_train_rus)

# Apply the same PCA transformation to the test set
X_test_rus_pca = pca_rus.transform(X_test)
print(f"Original train shape: {X_train.shape}")
print(f"Original test shape: {X_test.shape}")
# Print the new shapes after PCA transformation
print(f"Training Data shape after PCA (RUS): {X_train_rus_pca.shape}")
print(f"Test Data shape after PCA (RUS): {X_test_rus_pca.shape}")

Original train shape: (202944, 21)
Original test shape: (50736, 21)
Training Data shape after PCA (RUS): (11115, 13)
Test Data shape after PCA (RUS): (50736, 13)


In [9]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [16]:
dtrain2 = xgb.DMatrix(X_train_smote, label=y_train_smote)

In [18]:
dtrain3 = xgb.DMatrix(X_train_ros, label=y_train_ros)

In [26]:
dtrain4 = xgb.DMatrix(X_train_smote_pca, label=y_train_smote)
dtest4 = xgb.DMatrix(X_test_smote_pca, label=y_test)

In [15]:
y_train.value_counts()

Diabetes_012
0.0    170962
2.0     28277
1.0      3705
Name: count, dtype: int64

In [10]:
params = {
    'objective': 'multi:softmax',  
    'num_class': 3,               
    'eta': 0.1,                  
    'max_depth': 6,              
    'eval_metric': 'mlogloss',    
    'seed': 42
}


num_round = 100
model = xgb.train(params, dtrain, num_round)

y_pred = model.predict(dtest)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.8497
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.56      0.18      0.28      7069

    accuracy                           0.85     50736
   macro avg       0.47      0.39      0.40     50736
weighted avg       0.81      0.85      0.81     50736



In [20]:
model3 = xgb.train(params, dtrain3, num_round)#ros

y_pred3 = model3.predict(dtest)
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred3))

Accuracy: 0.6317
              precision    recall  f1-score   support

         0.0       0.95      0.64      0.77     42741
         1.0       0.03      0.29      0.05       926
         2.0       0.34      0.62      0.44      7069

    accuracy                           0.63     50736
   macro avg       0.44      0.52      0.42     50736
weighted avg       0.85      0.63      0.71     50736



In [27]:
model4 = xgb.train(params, dtrain4, num_round)#smote+pca

y_pred4 = model4.predict(dtest4)
accuracy = accuracy_score(y_test, y_pred4)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred4))

Accuracy: 0.6115
              precision    recall  f1-score   support

         0.0       0.95      0.62      0.75     42741
         1.0       0.02      0.25      0.04       926
         2.0       0.34      0.63      0.44      7069

    accuracy                           0.61     50736
   macro avg       0.44      0.50      0.41     50736
weighted avg       0.85      0.61      0.69     50736



In [17]:
model2 = xgb.train(params, dtrain2, num_round)#smote

y_pred2 = model2.predict(dtest)
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred2))

Accuracy: 0.8187
              precision    recall  f1-score   support

         0.0       0.90      0.90      0.90     42741
         1.0       0.05      0.03      0.04       926
         2.0       0.43      0.45      0.44      7069

    accuracy                           0.82     50736
   macro avg       0.46      0.46      0.46     50736
weighted avg       0.82      0.82      0.82     50736



In [41]:
smote = SMOTE(random_state=42)
smote2 = SMOTE(sampling_strategy={2: 250000})
X_res, y_res = smote2.fit_resample(X_train, y_train)

weights_res = class_weight.compute_sample_weight('balanced', y_res)
dtrain_res = xgb.DMatrix(X_res, label=y_res, weight=weights_res)

model2_ = xgb.train(params, dtrain_res, num_round)

y_pred2 = model2_.predict(dtest)
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred2))

Accuracy: 0.5758
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     42741
         1.0       0.03      0.69      0.06       926
         2.0       0.52      0.05      0.09      7069

    accuracy                           0.58     50736
   macro avg       0.50      0.47      0.31     50736
weighted avg       0.87      0.58      0.67     50736



In [45]:
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
param_grid = {
    'max_depth': [5, 7],                
    'min_child_weight': [1, 3, 5],          
    'gamma': [0, 0.1, 0.2],                
    'learning_rate': [0.01, 0.1],                  
}
from sklearn.metrics import make_scorer, f1_score
def f1_score_2(y_true, y_pred):
    return f1_score(y_true, y_pred, labels=[2], average='macro')
scorer = make_scorer(f1_score_2, greater_is_better=True)

model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    n_estimators=100,
    random_state=42
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scorer,         
    cv=5,                   
    n_jobs=-1,               
    verbose=3
)

grid_search.fit(X_train_smote, y_train_smote)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
              precision    recall  f1-score   support

         0.0       0.89      0.91      0.90     42741
         1.0       0.04      0.02      0.03       926
         2.0       0.43      0.41      0.42      7069

    accuracy                           0.82     50736
   macro avg       0.46      0.45      0.45     50736
weighted avg       0.81      0.82      0.82     50736

Best Parameters: {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5}


From the result above: XGboost performs badly in the original dataset which is extremely imbalanced, predictions on class 1 and 2 almost all go wrong.
By using some sampling method including SMOTE and rus,ros, the performance slighly becomes better while still bad in class 1. PCA doesn't help significantly for XGboost so we did not further explore it. Among the sampling method SMOTE is best so we continue with it for grid search with params in XGboost.

For grid search part, after consideration of time and complexity we choose max depth, min child weight, gamma and lr as param grid. For better balance the metric here is macro f1 score. The best model gets a macro f1 0.45, increasing from 0.31 with params  {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5}.

The class 1 predition is always bad throughout the XGboost training, so it's possible that the features are not significant for classifying this class or this task is beyond the domain of XGboost.