In [1]:
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Load data 
outliers_removed_df = pd.read_csv('../indicators_bmi_filtered.csv')
outliers_removed_df.head()

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,Age 70 to 74,No,Yes,Yes,"$25,000 to < $35,000",No,Female
1,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,Age 70 to 74,No,Yes,Yes,"$15,000 to < $25,000",No,Female
2,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,Age 60 to 64,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
3,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,Age 75 to 79,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
4,No,White,No,No,24.37,Yes,Yes,Some College,Good,Age 80 or older,No,Yes,Yes,"$35,000 to < $50,000",No,Male


In [3]:
outliers_removed_df.dtypes

DIABETE4     object
_RACE        object
TOLDHI3      object
BPHIGH6      object
_BMI5       float64
SMOKE100     object
_RFBING5     object
EDUCA        object
GENHLTH      object
_AGEG5YR     object
EXERANY2     object
FRUIT2       object
VEGETAB2     object
_INCOMG1     object
MEDCOST1     object
_SEX         object
dtype: object

In [4]:
# Generate our categorical variable list
indicators_cat = outliers_removed_df.dtypes[outliers_removed_df.dtypes == "object"].index.tolist()

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(outliers_removed_df[indicators_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names_out(indicators_cat)
encode_df.head()

Unnamed: 0,DIABETE4_No,DIABETE4_Yes,_RACE_ Native Hawaiian or other Pacific Islander,_RACE_American Indian or Alaskan Native,_RACE_Asian,_RACE_Black,_RACE_Hispanic,_RACE_Multiracial,_RACE_Other,_RACE_White,...,"_INCOMG1_$100,000 to < $200,000","_INCOMG1_$15,000 to < $25,000","_INCOMG1_$25,000 to < $35,000","_INCOMG1_$35,000 to < $50,000","_INCOMG1_$50,000 to < $100,000","_INCOMG1_Less than $15,000",MEDCOST1_No,MEDCOST1_Yes,_SEX_Female,_SEX_Male
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals
outliers_removed_df = outliers_removed_df.merge(encode_df,left_index=True, right_index=True)
outliers_removed_df = outliers_removed_df.drop(indicators_cat,1)
outliers_removed_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,_BMI5,DIABETE4_No,DIABETE4_Yes,_RACE_ Native Hawaiian or other Pacific Islander,_RACE_American Indian or Alaskan Native,_RACE_Asian,_RACE_Black,_RACE_Hispanic,_RACE_Multiracial,_RACE_Other,...,"_INCOMG1_$100,000 to < $200,000","_INCOMG1_$15,000 to < $25,000","_INCOMG1_$25,000 to < $35,000","_INCOMG1_$35,000 to < $50,000","_INCOMG1_$50,000 to < $100,000","_INCOMG1_Less than $15,000",MEDCOST1_No,MEDCOST1_Yes,_SEX_Female,_SEX_Male
0,14.54,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,28.29,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,33.47,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,28.73,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,24.37,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [7]:
# Define features and targets
y = outliers_removed_df['DIABETE4_Yes']
X = outliers_removed_df.drop(columns=['DIABETE4_Yes', 'DIABETE4_No'])

In [8]:
# Split into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 78, stratify=y)
Counter(y_train)

Counter({0.0: 155232, 1.0: 26869})

In [9]:
# Create Standard Scaler
scaler = StandardScaler()

In [10]:
# Fit Data
X_scaler = scaler.fit(X_train)

# Scale Data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Undersample the data using `RandomUnderSampler`
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0.0: 26869, 1.0: 26869})

# Logistic Regression 

In [12]:
# Fit a Logistic regression model using random undersampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
clf = model.fit(X_resampled, y_resampled)

In [13]:
# Validate and make predictions with testing data 
predictions = model.predict(X_test_scaled)

In [14]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Acutal 1'], columns = ['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,35929,15815
Acutal 1,2325,6632


In [15]:
# Calculate the Balanced Accuracy Score
balanced_accuracy_score(y_test, predictions)

0.7173935904530192

In [16]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.69      0.74      0.80      0.72      0.51     51744
        1.0       0.30      0.74      0.69      0.42      0.72      0.52      8957

avg / total       0.84      0.70      0.73      0.74      0.72      0.51     60701



In [17]:
print(clf.coef_)

[[ 0.36395885  0.02203845  0.05498054  0.07476455  0.02451912  0.04670463
  -0.00580535 -0.01557259 -0.08802449 -0.12500346  0.12500346 -0.17123823
   0.17123823  0.01094892 -0.01094892 -0.08973359  0.08973359 -0.03008321
   0.0183826  -0.00239886  0.02314141  0.02494111  0.01280785 -0.30299938
   0.25095929  0.13598776  0.15680405 -0.12083455 -0.27795019 -0.20863009
  -0.18158498 -0.13211836 -0.07967754 -0.02715521  0.00693519  0.06769849
   0.07218795  0.1156407   0.147276    0.1318441   0.09474022  0.04224135
  -0.04224135 -0.0018568   0.0018568   0.006072   -0.006072   -0.04556188
  -0.04462828  0.03613118  0.03716921  0.01024069 -0.00982117  0.03445935
   0.00486272 -0.00486272 -0.02298987  0.02298987]]


In [18]:
coef = pd.DataFrame(zip(X_train, np.transpose(clf.coef_.tolist()[0])), columns=['features', 'coef'])
coef

Unnamed: 0,features,coef
0,_BMI5,0.363959
1,_RACE_ Native Hawaiian or other Pacific Islander,0.022038
2,_RACE_American Indian or Alaskan Native,0.054981
3,_RACE_Asian,0.074765
4,_RACE_Black,0.024519
5,_RACE_Hispanic,0.046705
6,_RACE_Multiracial,-0.005805
7,_RACE_Other,-0.015573
8,_RACE_White,-0.088024
9,TOLDHI3_No,-0.125003


In [19]:
coef.sort_values('coef')

Unnamed: 0,features,coef
23,GENHLTH_Excellent,-0.302999
28,_AGEG5YR_Age 18 to 24,-0.27795
29,_AGEG5YR_Age 25 to 29,-0.20863
30,_AGEG5YR_Age 30 to 34,-0.181585
11,BPHIGH6_No,-0.171238
31,_AGEG5YR_Age 35 to 39,-0.132118
9,TOLDHI3_No,-0.125003
27,GENHLTH_Very Good,-0.120835
15,_RFBING5_No,-0.089734
8,_RACE_White,-0.088024


In [20]:
# p value
import statsmodels.api as sm
logit_model=sm.Logit(y_train, X_train)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.344711
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:           DIABETE4_Yes   No. Observations:               182101
Model:                          Logit   Df Residuals:                   182056
Method:                           MLE   Df Model:                           44
Date:                Sat, 22 Apr 2023   Pseudo R-squ.:                  0.1762
Time:                        14:51:51   Log-Likelihood:                -62772.
converged:                       True   LL-Null:                       -76198.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------
_BMI5                           

# Random Forest

In [21]:
# Create Random Forest Classifier 
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [22]:
# Fit Model
rf_model = rf_model.fit(X_resampled, y_resampled)

In [23]:
# Validate and make predictions with testing data 
predictions = rf_model.predict(X_test_scaled)

In [24]:
# Evaluate performance with accuracy score
balanced_accuracy_score(y_test, predictions)

0.6834672029798248

In [25]:
# Generate confusion matrix
cm = confusion_matrix(y_test, predictions)

In [26]:
# Create df from confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Acutal 1'], columns = ['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,34879,16865
Acutal 1,2751,6206


In [27]:
# Classification Report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.93      0.67      0.69      0.78      0.68      0.47     51744
        1.0       0.27      0.69      0.67      0.39      0.68      0.47      8957

avg / total       0.83      0.68      0.69      0.72      0.68      0.47     60701



In [28]:
# Rank importance of features
importances = rf_model.feature_importances_
importances

array([0.29270122, 0.00121488, 0.00422111, 0.00440718, 0.01004135,
       0.00922931, 0.00479545, 0.00197802, 0.01678296, 0.01961648,
       0.02204555, 0.04131766, 0.03927015, 0.01740053, 0.01722334,
       0.00784493, 0.00744747, 0.01777683, 0.00321024, 0.01701203,
       0.00029942, 0.01825642, 0.00634818, 0.0247488 , 0.0180229 ,
       0.01324706, 0.00772904, 0.02008384, 0.00575299, 0.0055537 ,
       0.00683141, 0.00831797, 0.00986691, 0.01092514, 0.0125354 ,
       0.01400681, 0.01587827, 0.01618365, 0.01496524, 0.01309533,
       0.01236676, 0.01125635, 0.01137243, 0.00511628, 0.0050464 ,
       0.00413783, 0.00416143, 0.00735614, 0.01489442, 0.01310114,
       0.01516733, 0.01652713, 0.02000118, 0.00948904, 0.00762877,
       0.00745455, 0.01845062, 0.01828704])

In [29]:
# Sort features 
sorted(zip(rf_model.feature_importances_,X.columns), reverse=True)

[(0.2927012211117811, '_BMI5'),
 (0.04131766340355101, 'BPHIGH6_No'),
 (0.03927014636332566, 'BPHIGH6_Yes'),
 (0.02474879988591775, 'GENHLTH_Excellent'),
 (0.022045551626891667, 'TOLDHI3_Yes'),
 (0.020083842332557007, 'GENHLTH_Very Good'),
 (0.020001176010426544, '_INCOMG1_$50,000 to < $100,000'),
 (0.01961647522986096, 'TOLDHI3_No'),
 (0.018450615915654847, '_SEX_Female'),
 (0.018287043608729844, '_SEX_Male'),
 (0.018256416083623613, 'EDUCA_Some College'),
 (0.018022904795953334, 'GENHLTH_Fair'),
 (0.017776830194129963, 'EDUCA_College Grad'),
 (0.01740052844419337, 'SMOKE100_No'),
 (0.01722333955424829, 'SMOKE100_Yes'),
 (0.01701203289322486, 'EDUCA_High School Grad'),
 (0.016782963623211546, '_RACE_White'),
 (0.016527134777388132, '_INCOMG1_$35,000 to < $50,000'),
 (0.01618365076365703, '_AGEG5YR_Age 65 to 69 '),
 (0.015878268425791865, '_AGEG5YR_Age 60 to 64 '),
 (0.01516733473530592, '_INCOMG1_$25,000 to < $35,000'),
 (0.014965236287295867, '_AGEG5YR_Age 70 to 74 '),
 (0.0148944247