In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import config as creds
import psycopg2

In [2]:

#Set up connection to database
def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

In [3]:
conn, cursor = connect()

Connected!


In [5]:
# Load the data

file_path = ('diabetes_012_health_indicators_BRFSS2015.csv')
df = pd.read_csv(file_path)
#print(df.shape)

#Convert diabetes to 1 and prediabetes to 0
def diabetes_binary(x, r):
    if x == r:
        return('1')
    if x < r:
        return('0')
    
df['Diabetes_012'] = df["Diabetes_012"].apply(diabetes_binary, args =[2])

#Convert BMI to binary
def bmi_binary(x, r):
    if x > r:
        return('1')
    if x <= r:
        return('0')
    
df['BMI'] = df["BMI"].apply(bmi_binary, args =[25])
df['BMI'] = df['BMI'].astype('float')



df.head(20)

                      

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


In [6]:
#Create the lifestyle df
health_indicators_df = df[['Diabetes_012','HighBP','HighChol','BMI','Fruits','Stroke','HeartDiseaseorAttack','DiffWalk']]

#Drops NA rows
health_indicators_df.dropna()

Unnamed: 0,Diabetes_012,HighBP,HighChol,BMI,Fruits,Stroke,HeartDiseaseorAttack,DiffWalk
0,0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
253675,0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
253676,1,1.0,1.0,0.0,0.0,0.0,0.0,1.0
253677,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
253678,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [7]:
#Split the data into features(x) and Target(y)
y = health_indicators_df["Diabetes_012"]
X = health_indicators_df.drop(['Diabetes_012'], axis = 1)

In [8]:
#Check data types
health_indicators_df.dtypes

Diabetes_012             object
HighBP                  float64
HighChol                float64
BMI                     float64
Fruits                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
DiffWalk                float64
dtype: object

In [30]:
health_indicators_df['Diabetes_012'] = health_indicators_df['Diabetes_012'] .astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
health_indicators_df.dtypes

Diabetes_012              int64
HighBP                  float64
HighChol                float64
BMI                     float64
Fruits                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
DiffWalk                float64
dtype: object

In [32]:
X.describe()

Unnamed: 0,HighBP,HighChol,BMI,Fruits,Stroke,HeartDiseaseorAttack,DiffWalk
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.429001,0.424121,0.648273,0.634256,0.040571,0.094186,0.168224
std,0.494934,0.49421,0.47751,0.481639,0.197294,0.292087,0.374066
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
y.value_counts()

0    218334
1     35346
Name: Diabetes_012, dtype: int64

In [34]:
#Split the data into test and training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50,random_state=1, stratify=y)
Counter(y_train)

Counter({'0': 109167, '1': 17673})

In [35]:
X_test

Unnamed: 0,HighBP,HighChol,BMI,Fruits,Stroke,HeartDiseaseorAttack,DiffWalk
5404,1.0,1.0,0.0,0.0,0.0,0.0,0.0
134844,0.0,0.0,1.0,1.0,0.0,0.0,0.0
129255,1.0,0.0,0.0,1.0,0.0,0.0,0.0
224524,0.0,0.0,1.0,1.0,0.0,0.0,0.0
92752,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
103802,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7476,0.0,1.0,1.0,1.0,0.0,0.0,0.0
96653,1.0,0.0,1.0,1.0,0.0,0.0,0.0
87326,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [36]:
#Initialize Logistic Regression Model
model = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)



In [37]:
#Fit the model
model.fit(X_train,y_train)

LogisticRegression(max_iter=200, random_state=1)

In [38]:
predictions = model.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results.head(10)



Unnamed: 0,Prediction,Actual
5404,0,0
134844,0,0
129255,0,0
224524,0,0
92752,0,0
151708,0,0
55470,0,1
30688,0,0
94938,0,0
245034,0,0


In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.8615894039735099

In [40]:
from imblearn.metrics import classification_report_imbalanced
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[107849   1318]
 [ 16238   1435]]


In [41]:
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.99      0.92    109167
           1       0.52      0.08      0.14     17673

    accuracy                           0.86    126840
   macro avg       0.70      0.53      0.53    126840
weighted avg       0.82      0.86      0.82    126840



   ### Oversampling

In [42]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)



In [43]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)



LogisticRegression(random_state=1)

In [44]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7082688329529998

In [45]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[80076, 29091],
       [ 5602, 12071]], dtype=int64)

In [46]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.73      0.68      0.82      0.71      0.50    109167
          1       0.29      0.68      0.73      0.41      0.71      0.50     17673

avg / total       0.85      0.73      0.69      0.76      0.71      0.50    126840



In [51]:
create_table = """
    CREATE TABLE IF NOT EXISTS health_indicators_df(
Diabetes_012            INTEGER,
HighBP                  FLOAT,
HighChol                FLOAT,
BMI                     FLOAT,
Fruits                  FLOAT,
Stroke                  FLOAT,
HeartDiseaseorAttack    FLOAT,
DiffWalk                FLOAT
    )
    """

In [52]:
# Execute SQL Command and commit to DB
cursor.execute(create_table)
conn.commit()