In [5]:
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [6]:
# Read the BMI-filtered indicators CSV into a DataFrame and preview its first rows
outliers_removed_df = pd.read_csv(filepath_or_buffer='indicators_bmi_filtered.csv')
outliers_removed_df.head(n=5)

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,Age 70 to 74,No,Yes,Yes,"$25,000 to < $35,000",No,Female
1,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,Age 70 to 74,No,Yes,Yes,"$15,000 to < $25,000",No,Female
2,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,Age 60 to 64,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
3,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,Age 75 to 79,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
4,No,White,No,No,24.37,Yes,Yes,Some College,Good,Age 80 or older,No,Yes,Yes,"$35,000 to < $50,000",No,Male


In [7]:
# Inspect each column's dtype; the object-typed columns are the ones that get
# collected into the categorical list and one-hot encoded in the cells below.
outliers_removed_df.dtypes

DIABETE4     object
_RACE        object
TOLDHI3      object
BPHIGH6      object
_BMI5       float64
SMOKE100     object
_RFBING5     object
EDUCA        object
GENHLTH      object
_AGEG5YR     object
EXERANY2     object
FRUIT2       object
VEGETAB2     object
_INCOMG1     object
MEDCOST1     object
_SEX         object
dtype: object

In [8]:
# Collect the names of every object-dtype (categorical) column
column_dtypes = outliers_removed_df.dtypes
is_categorical = column_dtypes == "object"
indicators_cat = column_dtypes[is_categorical].index.tolist()

In [9]:
# Create a OneHotEncoder instance.
# The encoder is left on its default (sparse) output and the result is
# densified below with .toarray(): the `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so avoiding the
# keyword altogether keeps this cell working on both old and new releases.
enc = OneHotEncoder()

# Fit and transform the categorical columns. The source frame's index is
# reused so the index-based merge that follows lines rows up by construction,
# and the encoded column names are attached at construction time.
encode_df = pd.DataFrame(
    enc.fit_transform(outliers_removed_df[indicators_cat]).toarray(),
    index=outliers_removed_df.index,
    columns=enc.get_feature_names_out(indicators_cat),
)
encode_df.head()

Unnamed: 0,DIABETE4_No,DIABETE4_Yes,_RACE_ Native Hawaiian or other Pacific Islander,_RACE_American Indian or Alaskan Native,_RACE_Asian,_RACE_Black,_RACE_Hispanic,_RACE_Multiracial,_RACE_Other,_RACE_White,...,"_INCOMG1_$100,000 to < $200,000","_INCOMG1_$15,000 to < $25,000","_INCOMG1_$25,000 to < $35,000","_INCOMG1_$35,000 to < $50,000","_INCOMG1_$50,000 to < $100,000","_INCOMG1_Less than $15,000",MEDCOST1_No,MEDCOST1_Yes,_SEX_Female,_SEX_Male
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [10]:
# Merge one-hot encoded features (aligned on the row index) and drop the
# original categorical columns.
# `columns=` is used instead of the positional axis argument (`drop(cols, 1)`),
# which pandas deprecated and later removed.
# NOTE: this rebinds `outliers_removed_df`; re-running the cell without first
# re-running the load cell fails because the original columns are already gone.
outliers_removed_df = (
    outliers_removed_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=indicators_cat)
)
outliers_removed_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,_BMI5,DIABETE4_No,DIABETE4_Yes,_RACE_ Native Hawaiian or other Pacific Islander,_RACE_American Indian or Alaskan Native,_RACE_Asian,_RACE_Black,_RACE_Hispanic,_RACE_Multiracial,_RACE_Other,...,"_INCOMG1_$100,000 to < $200,000","_INCOMG1_$15,000 to < $25,000","_INCOMG1_$25,000 to < $35,000","_INCOMG1_$35,000 to < $50,000","_INCOMG1_$50,000 to < $100,000","_INCOMG1_Less than $15,000",MEDCOST1_No,MEDCOST1_Yes,_SEX_Female,_SEX_Male
0,14.54,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,28.29,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,33.47,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,28.73,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,24.37,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [11]:
# Separate the target from the predictors: both one-hot DIABETE4 columns are
# removed from the feature matrix, and the "Yes" indicator is the label.
target_columns = ['DIABETE4_Yes', 'DIABETE4_No']
X = outliers_removed_df.drop(columns=target_columns)
y = outliers_removed_df[target_columns[0]]

In [12]:
# Hold out a test partition (default split size), stratified on the label so
# both partitions keep the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=78,
)

In [13]:
# Instantiate the standardizer (defaults spelled out: center and scale to unit variance)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
# Learn the scaling statistics from the training partition only
X_scaler = scaler.fit(X_train)

# Apply that same fitted scaler to the training and testing partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [15]:
# Build a 128-tree random forest with a fixed seed for repeatable results
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=128,
)

In [16]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [17]:
# Predict labels for the held-out (scaled) test features
predictions = rf_model.predict(X=X_test_scaled)

In [18]:
# Fraction of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.8298710070674289

In [19]:
# Tabulate true labels against predicted labels for the test partition
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

In [20]:
# Wrap the confusion matrix in a labeled DataFrame: rows are the actual
# classes, columns are the predicted classes.
# (Row label corrected from the misspelled 'Acutal 1' to 'Actual 1'.)
cm_df = pd.DataFrame(
    cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,48626,3118
Acutal 1,7209,1748


In [21]:
# Per-class precision / recall / F1 summary; printed because the report is a
# multi-line string
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     51744
         1.0       0.36      0.20      0.25      8957

    accuracy                           0.83     60701
   macro avg       0.62      0.57      0.58     60701
weighted avg       0.80      0.83      0.81     60701



In [22]:
# Rank importance of features
# Pull the fitted forest's per-feature importance scores; the values are in
# the same order as X.columns, which the following cell pairs them with.
importances = rf_model.feature_importances_
importances

array([0.38843556, 0.00178339, 0.0053167 , 0.00444539, 0.01036091,
       0.00989113, 0.0048679 , 0.00232427, 0.01554271, 0.01164075,
       0.01360384, 0.0228531 , 0.02158659, 0.01719636, 0.01727718,
       0.00533247, 0.00498662, 0.01749484, 0.00430733, 0.01915485,
       0.00051645, 0.02001613, 0.00793557, 0.01168089, 0.01522769,
       0.00986332, 0.00840853, 0.01351752, 0.00211003, 0.00290433,
       0.00433078, 0.00594201, 0.00790939, 0.00961986, 0.01240483,
       0.01427625, 0.01576241, 0.01548321, 0.01400421, 0.01258864,
       0.01145803, 0.00938227, 0.00915602, 0.00559755, 0.00572441,
       0.00499078, 0.00499888, 0.00581364, 0.01333076, 0.01384808,
       0.01611666, 0.01690387, 0.0196254 , 0.01076229, 0.00794118,
       0.00797395, 0.01668691, 0.01678538])

In [23]:
# Pair every importance score with its column name, then order the pairs from
# most to least important
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.388435562989011, '_BMI5'),
 (0.022853097295732025, 'BPHIGH6_No'),
 (0.02158659328808731, 'BPHIGH6_Yes'),
 (0.02001613209753124, 'EDUCA_Some College'),
 (0.019625398916940588, '_INCOMG1_$50,000 to < $100,000'),
 (0.0191548538116438, 'EDUCA_High School Grad'),
 (0.017494837069310168, 'EDUCA_College Grad'),
 (0.017277183236023667, 'SMOKE100_Yes'),
 (0.017196360567853605, 'SMOKE100_No'),
 (0.016903874874496686, '_INCOMG1_$35,000 to < $50,000'),
 (0.016785376052353078, '_SEX_Male'),
 (0.016686911415112356, '_SEX_Female'),
 (0.01611665806964043, '_INCOMG1_$25,000 to < $35,000'),
 (0.015762409261791847, '_AGEG5YR_Age 60 to 64 '),
 (0.01554271324926969, '_RACE_White'),
 (0.015483208188816976, '_AGEG5YR_Age 65 to 69 '),
 (0.015227691322697277, 'GENHLTH_Fair'),
 (0.014276254043985023, '_AGEG5YR_Age 55 to 59 '),
 (0.014004214834929947, '_AGEG5YR_Age 70 to 74 '),
 (0.01384808215390898, '_INCOMG1_$15,000 to < $25,000'),
 (0.013603836161029231, 'TOLDHI3_Yes'),
 (0.013517519519600906, 'GENHLTH_Ve