In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import psycopg2
import config as creds

In [2]:
#Set up connection to database
def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor


In [3]:
conn, cursor = connect()

Connected!


In [4]:
# Load the data

file_path = ('diabetes_012_health_indicators_BRFSS2015.csv')
df = pd.read_csv(file_path)
#print(df.shape)

#Convert diabetes to 1 and prediabetes to 0
def diabetes_binary(x, r):
    if x == r:
        return('1')
    if x < r:
        return('0')
    
df['Diabetes_012'] = df["Diabetes_012"].apply(diabetes_binary, args =[2])



df.head(20)


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0,1.0,0.0,1.0,30.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1,1.0,1.0,1.0,30.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


In [5]:
#Drops NA rows
df.dropna()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,1,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [6]:
#Split the data into features(x) and Target(y)
y = df["Diabetes_012"]
X = df.drop(columns="Diabetes_012")

In [7]:
#Check data types
df.dtypes

Diabetes_012             object
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

In [8]:
df['Diabetes_012'] = df['Diabetes_012'] .astype('int64')

In [9]:
y.value_counts()

0    218334
1     35346
Name: Diabetes_012, dtype: int64

In [10]:
X.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,0.81142,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,0.391175,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,)




In [12]:
X_test

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
235899,1.0,0.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,13.0,6.0,6.0
74852,0.0,0.0,1.0,22.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,5.0,0.0,1.0,9.0,6.0,8.0
8205,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,4.0,6.0
127632,1.0,0.0,1.0,39.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,4.0,0.0,0.0,0.0,1.0,11.0,6.0,5.0
32021,0.0,0.0,1.0,22.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,6.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108360,0.0,1.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,1.0,7.0,0.0,0.0,0.0,6.0,5.0,6.0
8531,1.0,0.0,1.0,24.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,30.0,1.0,0.0,9.0,4.0,1.0
183429,1.0,1.0,1.0,31.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,4.0,0.0,30.0,0.0,0.0,10.0,4.0,7.0
52932,0.0,0.0,1.0,32.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,0.0,5.0,1.0,0.0,10.0,6.0,3.0


In [13]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
                                   
b_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
b_model = b_model.fit(X_train, y_train)






b_model


BalancedRandomForestClassifier(random_state=1)

In [14]:
# Calculated the balanced accuracy score
predictions = b_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

0.7390906595635285

In [15]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
cm


array([[38233, 16318],
       [ 1975,  6894]], dtype=int64)

In [16]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))


                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.70      0.78      0.81      0.74      0.54     54551
          1       0.30      0.78      0.70      0.43      0.74      0.55      8869

avg / total       0.86      0.71      0.77      0.75      0.74      0.54     63420



In [17]:
# List the features sorted in descending order by feature importance
sorted(zip(b_model.feature_importances_, X.columns), reverse=True)

[(0.17000124811539444, 'BMI'),
 (0.12575158527645164, 'Age'),
 (0.10749147974153575, 'GenHlth'),
 (0.08336544416804584, 'Income'),
 (0.07753186746999935, 'HighBP'),
 (0.06937583674498776, 'PhysHlth'),
 (0.05758725891674523, 'Education'),
 (0.052364785165043004, 'MentHlth'),
 (0.03933350971705517, 'HighChol'),
 (0.029250587632398198, 'Smoker'),
 (0.028374368434229892, 'Fruits'),
 (0.026364942711488707, 'Sex'),
 (0.024579523901965092, 'DiffWalk'),
 (0.023965927250993477, 'PhysActivity'),
 (0.021826009539610627, 'Veggies'),
 (0.018120857242927918, 'HeartDiseaseorAttack'),
 (0.011876087749694089, 'NoDocbcCost'),
 (0.01015508439558031, 'Stroke'),
 (0.009297580843359127, 'HvyAlcoholConsump'),
 (0.007479284399379615, 'AnyHealthcare'),
 (0.005906730583114808, 'CholCheck')]

### Removing weak indicators

In [35]:
dropped_df = df[['Diabetes_012', 'BMI','Age','GenHlth','Income','HighBP','PhysHlth','Education','Education','MentHlth','HighChol','Smoker','Fruits','Sex','DiffWalk','PhysActivity','Veggies']]
dropped_df

Unnamed: 0,Diabetes_012,BMI,Age,GenHlth,Income,HighBP,PhysHlth,Education,Education.1,MentHlth,HighChol,Smoker,Fruits,Sex,DiffWalk,PhysActivity,Veggies
0,0,40.0,9.0,5.0,3.0,1.0,15.0,4.0,4.0,18.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0,25.0,7.0,3.0,1.0,0.0,0.0,6.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0,28.0,9.0,5.0,8.0,1.0,30.0,4.0,4.0,30.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,27.0,11.0,2.0,6.0,1.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,0,24.0,11.0,2.0,4.0,1.0,0.0,5.0,5.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0,45.0,5.0,3.0,7.0,1.0,5.0,6.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
253676,1,18.0,11.0,4.0,4.0,1.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
253677,0,28.0,2.0,1.0,2.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
253678,0,23.0,7.0,3.0,1.0,1.0,0.0,5.0,5.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [36]:
df.dtypes

Diabetes_012              int64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

In [37]:
#Split the data into features(x) and Target(y)
y = dropped_df["Diabetes_012"]
X = dropped_df.drop(columns="Diabetes_012")


In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,)

In [39]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
                                   
dropped_b_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
dropped_b_model = dropped_b_model.fit(X_train, y_train)






dropped_b_model

BalancedRandomForestClassifier(random_state=1)

In [40]:
# Calculated the balanced accuracy score
predictions = dropped_b_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

0.7344684597949853

In [41]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
cm


array([[38067, 16484],
       [ 2030,  6839]], dtype=int64)

In [42]:
# List the features sorted in descending order by feature importance
sorted(zip(dropped_b_model.feature_importances_, X.columns), reverse=True)

[(0.17778843911064407, 'BMI'),
 (0.13221629203812557, 'Age'),
 (0.11454791360583164, 'GenHlth'),
 (0.08978778419075584, 'Income'),
 (0.08595401943975732, 'HighBP'),
 (0.07290079639657086, 'PhysHlth'),
 (0.05609491928605742, 'MentHlth'),
 (0.03719098261141204, 'HighChol'),
 (0.033141038806314846, 'Education'),
 (0.03313093335288378, 'Education'),
 (0.03194627017144877, 'Smoker'),
 (0.030784377527843524, 'Fruits'),
 (0.028220097336876932, 'Sex'),
 (0.02713323787333767, 'DiffWalk'),
 (0.025207871801623337, 'PhysActivity'),
 (0.02395502645051642, 'Veggies')]

In [43]:
# Execute SQL Command and commit to DB
conn.commit()