# Create the binary classifier model

In [1]:
# Import the NumPy and Pandas dependencies
import numpy as np
import pandas as pd

# Import the Path module from the pathlib library
from pathlib import Path

# Import SKLearn dependencies
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

In [5]:
# Point at the CSV inside the resources folder, then load it into a Pandas DataFrame
heart_data_path = Path('resources/heart_data_2015.csv')
heart_data_df = pd.read_csv(filepath_or_buffer=heart_data_path)

# Preview the first five rows of the loaded DataFrame
heart_data_df.head(n=5)

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENUM,...,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3
0,1.0,1.0,b'01292015',b'01',b'29',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
1,1.0,1.0,b'01202015',b'01',b'20',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,2.0,2.0,3.0,3.0,4.0,2.0,2.0,,,2.0
2,1.0,1.0,b'02012015',b'02',b'01',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,
3,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,9.0
4,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0


In [7]:
# Get the shape of the raw DataFrame as a (rows, columns) tuple
heart_data_df.shape

(441456, 330)

In [14]:
# Name the subset of columns to carry forward from the raw DataFrame
columns_to_keep = [
    '_MICHD',
    'GENHLTH',
    'BPMEDS',
    'BLOODCHO',
    'CHOLCHK',
    '_RFHYPE5',
    'TOLDHI2',
    '_BMI5',
    'SMOKE100',
    'CVDSTRK3',
    'DIABETE3',
    '_TOTINDA',
    '_RFDRHV5',
    'PHYSHLTH',
    'DIFFWALK',
    'SEX',
    '_AGEG5YR',
]

# Keep only those columns; the result's column order follows the list above
heart_data_cols_df = heart_data_df[columns_to_keep]

# Preview the first rows of the reduced DataFrame
heart_data_cols_df.head()

Unnamed: 0,_MICHD,GENHLTH,BPMEDS,BLOODCHO,CHOLCHK,_RFHYPE5,TOLDHI2,_BMI5,SMOKE100,CVDSTRK3,DIABETE3,_TOTINDA,_RFDRHV5,PHYSHLTH,DIFFWALK,SEX,_AGEG5YR
0,2.0,5.0,1.0,1.0,1.0,2.0,1.0,4018.0,1.0,2.0,3.0,2.0,1.0,15.0,1.0,2.0,9.0
1,2.0,3.0,,1.0,4.0,1.0,2.0,2509.0,1.0,2.0,3.0,1.0,1.0,88.0,2.0,2.0,7.0
2,,4.0,,1.0,1.0,1.0,1.0,2204.0,,1.0,3.0,9.0,9.0,15.0,,2.0,11.0
3,2.0,5.0,1.0,1.0,1.0,2.0,1.0,2819.0,2.0,2.0,3.0,2.0,1.0,30.0,1.0,2.0,9.0
4,2.0,5.0,,1.0,1.0,1.0,2.0,2437.0,2.0,2.0,3.0,2.0,1.0,20.0,2.0,2.0,9.0


In [16]:
# Get the shape of the column-reduced DataFrame as a (rows, columns) tuple

heart_data_cols_df.shape    

(441456, 17)

In [None]:
# Rename the raw column codes to readable, consistently styled names.
#
# The mapping holds one entry for each of the 17 columns selected into
# heart_data_cols_df, keyed by the column's original name.
# NOTE(review): the first six new names were already chosen in this cell; the
# remaining eleven follow the same Title_Snake style -- confirm they match what
# later cells expect.
column_renames = {
    '_MICHD': 'Heart_Disease',
    'GENHLTH': 'Gen_Health',
    'BPMEDS': 'BP_Meds',
    'BLOODCHO': 'Bld_Chol',
    'CHOLCHK': 'Chol_Check',
    '_RFHYPE5': 'High_BP',
    'TOLDHI2': 'High_Chol',
    '_BMI5': 'BMI',
    'SMOKE100': 'Smoker',
    'CVDSTRK3': 'Stroke',
    'DIABETE3': 'Diabetes',
    '_TOTINDA': 'Phys_Activity',
    '_RFDRHV5': 'Hvy_Alcohol_Consump',
    'PHYSHLTH': 'Phys_Health',
    'DIFFWALK': 'Diff_Walk',
    'SEX': 'Sex',
    '_AGEG5YR': 'Age',
}

# rename() returns a new DataFrame and, by default, silently ignores keys that
# match no column, so re-running this cell is harmless.
heart_data_cols_df = heart_data_cols_df.rename(columns=column_renames)

# Because unmatched keys are ignored, a misspelled key would leave a column
# un-renamed without any error; verify every new name actually exists.
missing_columns = set(column_renames.values()) - set(heart_data_cols_df.columns)
assert not missing_columns, f"Columns missing after rename: {sorted(missing_columns)}"

# Review the renamed DataFrame
heart_data_cols_df.head()