In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import config as creds

In [None]:

#Set up connection to database
def connect(port="5432"):
    """Open a PostgreSQL connection using the credentials in ``config``.

    The connection parameters are passed to ``psycopg2.connect`` as keyword
    arguments rather than being concatenated into a DSN string, so values
    containing spaces, quotes or backslashes (typically the password) do not
    corrupt the connection string.

    Parameters
    ----------
    port : str or int, optional
        Server port. Defaults to "5432", the value that was previously
        hard-coded.

    Returns
    -------
    tuple
        ``(conn, cursor)`` - the open connection and a cursor created on it.
        The caller is responsible for closing both.
    """
    # Set up a connection to the postgres server.
    conn = psycopg2.connect(
        host=creds.PGHOST,
        port=port,
        dbname=creds.PGDATABASE,
        user=creds.PGUSER,
        password=creds.PGPASSWORD,
    )
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()

    return conn, cursor

In [None]:
# Open the database connection once; `conn` is committed in the final cell.
conn, cursor = connect()

In [None]:
# Load the data
# The CSV is read from the notebook's working directory into the raw frame `df`.
file_path = 'diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(file_path)

# Collapse the diabetes level to a binary string label:
# the level equal to `r` becomes '1', every level below `r` becomes '0'.
def diabetes_binary(x, r):
    """Return a binary string label for a diabetes level.

    Parameters
    ----------
    x : number
        The value to classify.
    r : number
        The level that counts as the positive class.

    Returns
    -------
    str or None
        '1' when ``x == r``, '0' when ``x < r``; ``None`` when neither
        comparison holds (``x > r`` or ``x`` is NaN).
    """
    if x == r:
        return '1'
    return '0' if x < r else None
    
# Relabel the target in place: level 2 -> '1', lower levels -> '0' (string labels).
# Not idempotent: re-running this cell on the already-converted string labels
# raises TypeError in diabetes_binary (str compared with int).
df['Diabetes_012'] = df["Diabetes_012"].apply(diabetes_binary, args=(2,))

# Threshold BMI into a binary string flag.
def bmi_binary(x, r):
    """Return '1' when ``x`` is above the threshold ``r``, otherwise '0'.

    Parameters
    ----------
    x : number
        The value to classify.
    r : number
        The threshold; values strictly greater than it are flagged.

    Returns
    -------
    str or None
        '1' when ``x > r``, '0' when ``x <= r``; ``None`` when neither
        comparison holds (``x`` is NaN).
    """
    return '1' if x > r else ('0' if x <= r else None)
    
# Flag BMI above 25 and store the flag as a float (1.0 / 0.0) so it can be
# used directly as a numeric model feature.
df['BMI'] = df["BMI"].apply(bmi_binary, args=(25,)).astype('float')

# Preview the transformed frame.
df.head(20)

                      

In [None]:
#Create the health-indicators df: the binary target plus the selected predictors.
# dropna() returns a new frame rather than modifying in place, so its result
# must be assigned; previously it was discarded and rows with missing values
# were kept in the modelling data.
health_indicators_df = df[[
    'Diabetes_012',
    'HighBP',
    'HighChol',
    'BMI',
    'Fruits',
    'Stroke',
    'HeartDiseaseorAttack',
    'DiffWalk',
]].dropna()

# Display the cleaned frame.
health_indicators_df

In [None]:
#Split the data into features(x) and Target(y)
# X keeps every column except the label; y is the label column itself.
X = health_indicators_df.drop(columns=['Diabetes_012'])
y = health_indicators_df["Diabetes_012"]

In [None]:
#Check data types
# Diabetes_012 was filled with the string labels returned by diabetes_binary,
# so it is presumably reported as object dtype here - confirm in the output.
health_indicators_df.dtypes

In [None]:
# Summary statistics for the feature columns.
X.describe()

In [None]:
# Class counts of the target before splitting (shows how imbalanced the labels are).
y.value_counts()

In [None]:
#Split the data into test and training
# 50/50 split with a fixed seed; stratify=y keeps the class ratio the same in
# both halves. train_test_split is already imported in the imports cell, so the
# redundant re-import that used to live here has been dropped.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, random_state=1, stratify=y
)

# Class counts in the training half.
Counter(y_train)

In [None]:
# Display the held-out feature rows.
X_test

In [None]:
#Initialize Logistic Regression Model
# Baseline classifier: L-BFGS solver, at most 200 iterations, fixed seed.
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)



In [None]:
#Fit the model
# Trains on the original (not resampled) training half.
model.fit(X_train,y_train)

In [None]:
# Predict on the held-out half and line the predictions up with the true labels.
predictions = model.predict(X_test)
results = pd.DataFrame(dict(Prediction=predictions, Actual=y_test))

# Preview the first ten prediction/actual pairs.
results.head(10)



In [None]:
# Plain accuracy of the baseline model on the held-out half.
# (accuracy_score is imported in the imports cell at the top of the notebook.)
accuracy_score(y_test, predictions)

In [None]:
# Confusion matrix for the baseline model
# (scikit-learn convention: rows = actual class, columns = predicted class).
# The classification_report_imbalanced import that used to sit in this cell was
# not used here; it now lives in the imports cell at the top of the notebook.
matrix = confusion_matrix(y_test, predictions)
print(matrix)

In [None]:
# Text classification report for the baseline model on the held-out half.
report = classification_report(y_test, predictions)
print(report)

   ### Oversampling

In [None]:
# Resample the training data with the RandomOverSampler
# Only the training half is resampled; the test half stays untouched so the
# evaluation still reflects the original class distribution.
# (RandomOverSampler is imported in the imports cell at the top of the notebook.)
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)



In [None]:
# Train the Logistic Regression model using the resampled data
# Uses the same hyper-parameters as the baseline model (including max_iter=200)
# so that the only difference between the two runs is the oversampling.
# Note: this rebinds `model`, replacing the baseline estimator.
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
model.fit(X_resampled, y_resampled)



In [None]:
# Calculate the balanced accuracy score of the model trained on resampled data.
# (balanced_accuracy_score is imported in the imports cell at the top of the notebook.)
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the predictions in `y_pred`
# (scikit-learn convention: rows = actual class, columns = predicted class).
confusion_matrix(y_test, y_pred)

In [None]:
# Imbalanced-learn classification report for the predictions in `y_pred`.
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Commit the current transaction on the database connection.
# NOTE(review): no SQL statement is executed through `cursor` in the cells
# visible here, so this commit presumably has nothing to persist - confirm.
# The connection and cursor are also never closed in the visible cells.
conn.commit()