### SC1015 Mini Project

In [None]:
#### The dataset used is the 2020 Behavioral Risk Factor Surveillance System (BRFSS) data provided by the Centers for Disease Control and Prevention (CDC), a national public health agency of the United States. 
#### The chosen dataset contains 401,958 survey records collected through landline and cellular telephone interviews. According to the CDC, the BRFSS aims to collect data about chronic diseases and conditions, health risk behaviors, access to health care facilities, and the use of health services related to causes of disability and death.
#### Dataset source url: https://www.cdc.gov/brfss/annual_data/annual_2020.html 

In [None]:
# Import basic libraries
import numpy as np
import pandas as pd


In [None]:
# Load the survey extract (CSV format) into a DataFrame.
# The displayed head contains leftover 'Unnamed: 0' / 'Unnamed: 0.1' columns;
# they are ignored later because the analysis picks its columns explicitly by name.
csv_path = 'Selected_Data_2020.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

Unnamed: 0.1,Unnamed: 0,X_AGE_G,X_SEX,HTM4,WTKG3,X_BMI5,MARITAL,X_INCOMG,X_EDUCAG,GENHLTH,...,CVDSTRK3,ADDEPEV3,CHCKDNY2,CHCOCNCR,X_DRDXAR2,CHECKUP1,X_RFDRHV7,PDIABTST,X_TOTINDA,DIABETE4
0,0,6,0,0.0168,1.0,0.0,1.0,4,2,3.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0,0.0,1
1,1,3,0,0.018,1.0,0.0,2.0,4,4,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,1
2,2,6,0,0.0168,1.0,0.0,1.0,5,4,4.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,0.0,1
3,3,6,0,0.0155,1.0,0.0,2.0,2,1,4.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0,2.0,1
4,4,6,1,0.0183,1.0,0.0,2.0,1,2,3.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1,0.0,1


In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144917 entries, 0 to 144916
Data columns (total 25 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  144917 non-null  int64  
 1   X_AGE_G     144917 non-null  int64  
 2   X_SEX       144917 non-null  int64  
 3   HTM4        144917 non-null  float64
 4   WTKG3       144917 non-null  float64
 5   X_BMI5      144917 non-null  float64
 6   MARITAL     144917 non-null  float64
 7   X_INCOMG    144917 non-null  int64  
 8   X_EDUCAG    144917 non-null  int64  
 9   GENHLTH     144917 non-null  float64
 10  PHYSHLTH    144917 non-null  float64
 11  MENTHLTH    144917 non-null  float64
 12  SLEPTIM1    144917 non-null  float64
 13  X_SMOKER3   144917 non-null  int64  
 14  X_MICHD     144917 non-null  float64
 15  CVDSTRK3    144917 non-null  float64
 16  ADDEPEV3    144917 non-null  float64
 17  CHCKDNY2    144917 non-null  float64
 18  CHCOCNCR    144917 non-null  float64
 19  X_

#### There are too many variables in the dataset. Hence, we decided to filter the dataset and select the variables that are more important for the diseases we are studying. The chosen variables and their description along with their redefined variable name which is assigned to improve readability are listed in the "variable.xlsx" file. *(To prevent overloading the notebook with content)*

In [None]:
# Keep only the variables chosen for the study (order defines the column order
# of selected_data).
chosen_columns = [
    # demographics and body measurements
    'X_AGE_G', 'X_SEX', 'HTM4', 'WTKG3', 'X_BMI5', 'MARITAL',
    'X_INCOMG', 'X_EDUCAG',
    # self-reported health status and sleep
    'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'SLEPTIM1',
    # behaviour and diagnosed conditions
    'X_SMOKER3', 'X_MICHD', 'CVDSTRK3', 'ADDEPEV3', 'CHCKDNY2', 'CHCOCNCR',
    'X_DRDXAR2', 'CHECKUP1', 'PDIABTST', 'X_TOTINDA', 'DIABETE4',
    'X_RFDRHV7',
]
selected_data = raw_data[chosen_columns]

In [None]:
# Discard every row that has a missing value in at least one selected column,
# then summarise what is left.
selected_data = selected_data.dropna(axis=0, how='any')
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144917 entries, 0 to 144916
Data columns (total 24 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   X_AGE_G    144917 non-null  int64  
 1   X_SEX      144917 non-null  int64  
 2   HTM4       144917 non-null  float64
 3   WTKG3      144917 non-null  float64
 4   X_BMI5     144917 non-null  float64
 5   MARITAL    144917 non-null  float64
 6   X_INCOMG   144917 non-null  int64  
 7   X_EDUCAG   144917 non-null  int64  
 8   GENHLTH    144917 non-null  float64
 9   PHYSHLTH   144917 non-null  float64
 10  MENTHLTH   144917 non-null  float64
 11  SLEPTIM1   144917 non-null  float64
 12  X_SMOKER3  144917 non-null  int64  
 13  X_MICHD    144917 non-null  float64
 14  CVDSTRK3   144917 non-null  float64
 15  ADDEPEV3   144917 non-null  float64
 16  CHCKDNY2   144917 non-null  float64
 17  CHCOCNCR   144917 non-null  float64
 18  X_DRDXAR2  144917 non-null  float64
 19  CHECKUP1   144917 non-n

#### The variables are explored individually and renamed to be more readable. 
#### The values of the categorical variables are also listed here.

#### X_AGE_G

In [None]:
# Age group of the respondent (categorical code)
#
# Code -> meaning:
#   1 -> Age 18 to 24
#   2 -> Age 25 to 34
#   3 -> Age 35 to 44
#   4 -> Age 45 to 54
#   5 -> Age 55 to 64
#   6 -> Age 65 or older
#
# Nothing is recoded here; the cell only lists the distinct codes present.
selected_data['X_AGE_G'].unique()

array([6, 3, 5, 4, 1, 2], dtype=int64)

#### X_SEX

In [None]:
# Sex of the respondent

# Values after recoding:
# 0: Female
# 1: Male
#
# NOTE(review): the BRFSS codebook lists this variable as 1 = Male, 2 = Female.
# Replacing 2 with 0 therefore yields 0 = Female and 1 = Male; the legend that
# was here before listed the labels the other way round. Confirm against the
# codebook and against any downstream code that interprets this column.
selected_data['X_SEX'] = selected_data['X_SEX'].replace({2: 0})
selected_data['X_SEX'].unique()

array([0, 1], dtype=int64)

#### HTM4

In [None]:
# Height of respondent in meters
#
# The scaled column is always computed from raw_data rather than from the
# column already stored in selected_data, so re-running this cell does not
# divide the values by 100 a second time. Column assignment aligns on the row
# index, so rows removed from selected_data earlier stay removed.
#
# NOTE(review): raw_data.head() shows HTM4 values around 0.017, which suggests
# the CSV may already have been scaled by an earlier run - confirm that the
# input file holds the unscaled values.
selected_data['HTM4'] = raw_data['HTM4'].div(100)
selected_data['HTM4'].unique()

array([1.68e-04, 1.80e-04, 1.55e-04, 1.83e-04, 1.52e-04, 1.63e-04,
       1.50e-04, 1.85e-04, 1.70e-04, 1.57e-04, 1.60e-04, 1.73e-04,
       1.78e-04, 1.88e-04, 1.75e-04, 1.65e-04, 1.91e-04, 2.01e-04,
       1.96e-04, 1.98e-04, 1.93e-04, 2.06e-04, 1.47e-04, 1.42e-04,
       1.45e-04, 1.37e-04, 1.32e-04, 1.22e-04, 1.24e-04, 1.40e-04,
       1.62e-04, 1.61e-04, 1.53e-04, 1.90e-04, 1.05e-04, 1.06e-04,
       1.69e-04, 1.76e-04, 1.54e-04, 2.03e-04, 1.67e-04, 1.82e-04,
       1.27e-04, 2.08e-04, 2.11e-04, 2.13e-04, 1.19e-04, 1.35e-04,
       2.16e-04, 1.56e-04, 1.72e-04, 1.58e-04, 1.66e-04, 1.04e-04,
       1.59e-04, 1.14e-04, 1.25e-04, 1.71e-04, 1.01e-04, 1.44e-04,
       1.64e-04, 1.38e-04, 1.20e-04, 1.30e-04, 1.74e-04, 1.07e-04,
       1.48e-04, 2.29e-04, 9.10e-05, 1.86e-04, 1.77e-04, 2.18e-04,
       1.49e-04, 1.17e-04, 1.51e-04, 1.09e-04, 1.03e-04, 1.95e-04,
       1.79e-04, 1.23e-04, 1.00e-04, 2.09e-04, 1.39e-04])

#### WTKG3

In [None]:
# Weight of respondent in kilograms (rounded to whole kilograms)
#
# Computed from raw_data instead of the column already stored in selected_data,
# so the cell is idempotent: re-running it no longer divides by 100 again.
# Column assignment aligns on the row index, so only the rows still present in
# selected_data are filled.
#
# NOTE(review): raw_data.head() shows WTKG3 values of 1.0, which would all
# round to 0 here - confirm that the input file holds the unscaled values.
selected_data['WTKG3'] = raw_data['WTKG3'].div(100).round(0)
selected_data['WTKG3'].unique()

array([0.])

#### X_BMI5 

In [None]:
# Body Mass Index (BMI) of respondent (rounded to a whole number)
#
# Computed from raw_data instead of the column already stored in selected_data,
# so the cell is idempotent: re-running it no longer divides by 100 again.
# Column assignment aligns on the row index, so only the rows still present in
# selected_data are filled.
#
# NOTE(review): raw_data.head() shows X_BMI5 values of 0.0 - confirm that the
# input file holds the unscaled values.
selected_data['X_BMI5'] = raw_data['X_BMI5'].div(100).round(0)
selected_data['X_BMI5'].unique()

array([0.])

#### MARITAL

In [None]:
# Marital status of respondent
#
# Rows coded 9 are removed (presumably the "refused" code - confirm against
# the codebook); the remaining distinct codes are then listed.
marital_answered = selected_data['MARITAL'].ne(9)
selected_data = selected_data[marital_answered]
selected_data['MARITAL'].unique()

array([1., 2., 3., 5., 4., 6.])

#### X_INCOMG 

In [None]:
# Income category of respondent
#
# Rows coded 9 are removed (presumably "don't know / missing" - confirm
# against the codebook); the remaining distinct codes are then listed.
income_known = selected_data['X_INCOMG'].ne(9)
selected_data = selected_data[income_known]
selected_data['X_INCOMG'].unique()

array([4, 5, 2, 1, 3], dtype=int64)

#### X_EDUCAG 

In [None]:
# Level of education completed by respondent
#
# Rows coded 9 are removed (presumably "don't know / missing" - confirm
# against the codebook); the remaining distinct codes are then listed.
education_known = selected_data['X_EDUCAG'].ne(9)
selected_data = selected_data[education_known]
selected_data['X_EDUCAG'].unique()

array([2, 4, 1, 3], dtype=int64)

#### GENHLTH 

In [None]:
# General health of respondent (Self-opinion)
#
# Rows coded 7 or 9 are removed in a single pass (presumably "don't know" and
# "refused" - confirm against the codebook).
genhlth_unusable = selected_data['GENHLTH'].isin([7, 9])
selected_data = selected_data[~genhlth_unusable]
selected_data['GENHLTH'].unique()

array([3., 2., 4., 5., 1.])

#### MENTHLTH 

In [None]:
# Number of days when mental health is poor (In past 30 days)
#
# Codes 77 and 99 are dropped and code 88 is stored as 0 days (presumably
# "don't know", "refused" and "none" respectively - confirm against the
# codebook). Dropping both codes first gives the same rows as dropping them
# around the recode, because the recode only touches the value 88.
menthlth_unusable = selected_data['MENTHLTH'].isin([77, 99])
selected_data = selected_data[~menthlth_unusable]
selected_data['MENTHLTH'] = selected_data['MENTHLTH'].replace(88, 0)
selected_data['MENTHLTH'].unique()

array([ 0., 30.,  2., 15.,  8.,  4.,  5., 10.,  3., 14., 20.,  1.,  7.,
       24., 28., 16., 25., 17.,  9., 12., 18., 29.,  6., 23., 21., 22.,
       13., 26., 27., 11., 19.])

#### PHYSHLTH

In [None]:
# Number of days when physical health is poor (In past 30 days)
#
# Codes 77 and 99 are dropped and code 88 is stored as 0 days (presumably
# "don't know", "refused" and "none" respectively - confirm against the
# codebook). Dropping both codes first gives the same rows as dropping them
# around the recode, because the recode only touches the value 88.
physhlth_unusable = selected_data['PHYSHLTH'].isin([77, 99])
selected_data = selected_data[~physhlth_unusable]
selected_data['PHYSHLTH'] = selected_data['PHYSHLTH'].replace(88, 0)
selected_data['PHYSHLTH'].unique()

array([ 0., 28., 15.,  7.,  1.,  5., 30., 21.,  4.,  3.,  2., 20., 14.,
       18.,  6., 10., 25., 27., 17.,  8., 12., 23., 16., 19., 29.,  9.,
       26., 22., 13., 24., 11.])

#### SLEPTIM1

In [None]:
# Hours of sleep in 24-hour period
#
# Both special codes are removed: 77 and 99 (presumably "don't know" and
# "refused" - confirm against the codebook). Previously only 77 was dropped,
# which left 99 in the column as if it were a number of hours; the sibling
# PHYSHLTH / MENTHLTH columns already drop both codes.
sleep_unusable = selected_data['SLEPTIM1'].isin([77, 99])
selected_data = selected_data[~sleep_unusable]
selected_data['SLEPTIM1'].unique()

array([ 6.,  8.,  4.,  5., 10.,  7.,  9., 12.,  3.,  2., 15.,  1., 16.,
       18., 99., 14., 11., 20., 13., 24., 22., 17., 19.])

#### X_MICHD

In [None]:
# Coronary heart disease (CHD) / Myocardial infarction (MI)
#
# Values after recoding:
#   0 -> Did not report having MI or CHD
#   1 -> Reported having MI or CHD
#
# Code 2 is rewritten as 0 so the column becomes a 0/1 indicator.
michd_recoded = selected_data['X_MICHD'].replace(2, 0)
selected_data['X_MICHD'] = michd_recoded
selected_data['X_MICHD'].unique()

array([0., 1.])

#### CVDSTRK3

In [None]:
# Stroke
#
# Values after recoding:
#   0 -> No
#   1 -> Yes
#
# Rows coded 7 or 9 are dropped first, then code 2 is rewritten as 0.
stroke_unusable = selected_data['CVDSTRK3'].isin([7, 9])
selected_data = selected_data[~stroke_unusable]
selected_data['CVDSTRK3'] = selected_data['CVDSTRK3'].replace(2, 0)
selected_data['CVDSTRK3'].unique()

array([0., 1.])

#### ADDEPEV3

In [None]:
# Depressive disorder
#
# Values after recoding:
#   0 -> No
#   1 -> Yes
#
# Rows coded 7 or 9 are dropped first, then code 2 is rewritten as 0.
depression_unusable = selected_data['ADDEPEV3'].isin([7, 9])
selected_data = selected_data[~depression_unusable]
selected_data['ADDEPEV3'] = selected_data['ADDEPEV3'].replace(2, 0)
selected_data['ADDEPEV3'].unique()

array([0., 1.])

#### CHCKDNY2

In [None]:
# Kidney disease
#
# Values after recoding:
#   0 -> No
#   1 -> Yes
#
# Rows coded 7 or 9 are dropped first, then code 2 is rewritten as 0.
kidney_unusable = selected_data['CHCKDNY2'].isin([7, 9])
selected_data = selected_data[~kidney_unusable]
selected_data['CHCKDNY2'] = selected_data['CHCKDNY2'].replace(2, 0)
selected_data['CHCKDNY2'].unique()

array([0., 1.])

#### DIABETE4

In [None]:
# Diabetes

# Values after recoding:
# 0: No
# 1: Pre-diabetes/Borderline diabetes
# 2: Yes

# The recoding swaps codes (1 -> 2, 4 -> 1, 2 -> 0), so applying it to a column
# that was already recoded scrambles the categories (a second pass turns
# "Yes" into "No" and "Pre-diabetes" into "Yes"). It is therefore always
# computed from raw_data, which makes the cell safe to re-run. Column
# assignment aligns on the row index, so rows removed from selected_data
# earlier stay removed.
#
# NOTE(review): the displayed result contains only 2 and 0 (no pre-diabetes
# class), which is what a double application of this recode would produce -
# confirm that the input CSV still holds the original codes.
diabetes_recode = {2: 0, 3: 0, 1: 2, 4: 1}
selected_data['DIABETE4'] = raw_data['DIABETE4'].replace(diabetes_recode)

# Codes 7 and 9 are not part of the recode and are dropped
diabetes_unusable = selected_data['DIABETE4'].isin([7, 9])
selected_data = selected_data[~diabetes_unusable]
selected_data['DIABETE4'].unique()


array([2, 0], dtype=int64)

#### X_DRDXAR2

In [None]:
# Arthritis
#
# Values after recoding:
#   0 -> Not diagnosed with arthritis
#   1 -> Diagnosed with arthritis
#
# Code 2 is rewritten as 0 so the column becomes a 0/1 indicator.
arthritis_recoded = selected_data['X_DRDXAR2'].replace(2, 0)
selected_data['X_DRDXAR2'] = arthritis_recoded
selected_data['X_DRDXAR2'].unique()

array([0., 1.])

#### CHECKUP1

In [None]:
# How long since last routine checkup
#
# Values kept:
#   1 -> Less than 12 months
#   2 -> 1 to 2 years
#   3 -> 2 to 5 years
#   4 -> 5 or more years
#
# Rows coded 7, 8 or 9 fall outside the scale above and are removed in one pass.
checkup_unusable = selected_data['CHECKUP1'].isin([7, 8, 9])
selected_data = selected_data[~checkup_unusable]
selected_data['CHECKUP1'].unique()

array([1., 3., 2., 4.])

#### PDIABTST

In [None]:
# High blood sugar/diabetes test (In the past 3 years)
#
# Values after recoding:
#   0 -> No
#   1 -> Yes
#
# Code 2 is rewritten as 0, then rows coded 7 or 9 are dropped.
diabetes_test_recoded = selected_data['PDIABTST'].replace(2, 0)
selected_data['PDIABTST'] = diabetes_test_recoded
selected_data = selected_data[~selected_data['PDIABTST'].isin([7, 9])]
selected_data['PDIABTST'].unique()

array([0, 1], dtype=int64)

#### X_TOTINDA

In [None]:
# Exercise in the past 30 days
#
# Values after recoding:
#   0 -> No exercise (in last 30 days)
#   1 -> Had exercise
#
# Code 2 is rewritten as 0, then rows coded 9 are dropped.
# NOTE(review): raw_data.head() shows only 0.0 and 2.0 in this column, so after
# this recode every value may end up 0 - confirm the input file still uses the
# original coding.
exercise_recoded = selected_data['X_TOTINDA'].replace(2, 0)
selected_data['X_TOTINDA'] = exercise_recoded
selected_data = selected_data[selected_data['X_TOTINDA'].ne(9)]
selected_data['X_TOTINDA'].unique()

array([0.])

#### X_SMOKER3

In [None]:
# Smoker status
#
# Values kept:
#   1 -> Current smoker (Every day)
#   2 -> Current smoker (Some days)
#   3 -> Former smoker
#   4 -> Never smoked
#
# Rows coded 9 fall outside the scale above and are removed.
smoker_known = selected_data['X_SMOKER3'].ne(9)
selected_data = selected_data[smoker_known]
selected_data['X_SMOKER3'].unique()

array([4, 3, 1, 2], dtype=int64)

#### X_RFDRHV7

In [None]:
# Heavy drinker
#
# Values after recoding (note the direction: 0 means heavy drinker):
#   0 -> Yes
#   1 -> No
#
# Code 2 is rewritten as 0, then rows coded 9 are dropped.
heavy_drinker_recoded = selected_data['X_RFDRHV7'].replace(2, 0)
selected_data['X_RFDRHV7'] = heavy_drinker_recoded
selected_data = selected_data[selected_data['X_RFDRHV7'].ne(9)]
selected_data['X_RFDRHV7'].unique()

array([1., 0.])

#### CHCOCNCR

In [None]:
# (Did the respondent have any type of) Cancer
#
# Values after recoding:
#   0 -> No
#   1 -> Yes
#
# Code 2 is rewritten as 0, then rows coded 7 or 9 are dropped.
cancer_recoded = selected_data['CHCOCNCR'].replace(2, 0)
selected_data['CHCOCNCR'] = cancer_recoded
selected_data = selected_data[~selected_data['CHCOCNCR'].isin([7, 9])]
selected_data['CHCOCNCR'].unique()

array([1., 0.])

### Renaming the variables

In [None]:
# Map every survey variable code to a human-readable column name.
# rename() leaves the frame unchanged for keys that are not existing columns,
# so the entries for 'INCOME2', 'COLNSCPY' and 'X_URBSTAT' (not among the
# columns selected earlier in this notebook) have no effect.
# NOTE(review): the label for 'MENTHLTH' ends with a trailing space. It is kept
# exactly as it was because the exported CSV may be consumed under that name.
readable_names = {
    'X_AGE_G': 'Age Category',
    'X_SEX': 'Sex',
    'HTM4': 'Height',
    'WTKG3': 'Weight',
    'X_BMI5': 'BMI',
    'INCOME2': 'Income Level',
    'X_INCOMG': 'Income Category',
    'X_EDUCAG': "Education Level",
    'GENHLTH': 'General Health',
    'PHYSHLTH': 'No of Days of Poor Physical Health',
    'MENTHLTH': 'No of Days of Poor Mental Health ',
    'SLEPTIM1': 'Total Sleeping Time',
    'MARITAL': 'Marital Status',
    'X_MICHD': 'Coronary Heart Disease/Myocardial Infarction',
    'CVDSTRK3': 'Stroke',
    'ADDEPEV3': 'Depressive Disorder',
    'CHCKDNY2': 'Kidney Disease',
    'DIABETE4': 'Diabetes',
    'X_DRDXAR2': 'Arthritis',
    'CHECKUP1': 'Time since last routine checkup',
    'COLNSCPY': 'Colonoscopy Check',
    'PDIABTST': 'High blood sugar/Diabetes Test',
    'X_TOTINDA': 'Workout',
    'X_SMOKER3': 'Smoker Status',
    'X_RFDRHV7': 'Heavy Drinker',
    'X_URBSTAT': 'Urban/Rural',
    'CHCOCNCR': 'Cancer',
}
selected_data = selected_data.rename(columns=readable_names)

In [None]:
# Check the renamed columns, their non-null counts and dtypes
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144917 entries, 0 to 144916
Data columns (total 24 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Age Category                                  144917 non-null  int64  
 1   Sex                                           144917 non-null  int64  
 2   Height                                        144917 non-null  float64
 3   Weight                                        144917 non-null  float64
 4   BMI                                           144917 non-null  float64
 5   Marital Status                                144917 non-null  float64
 6   Income Category                               144917 non-null  int64  
 7   Education Level                               144917 non-null  int64  
 8   General Health                                144917 non-null  float64
 9   No of Days of Poor Physical Health            14

In [None]:
# Export the cleaned, renamed data.
# index=False keeps the row index out of the file; writing it would add an
# unnamed column that comes back as 'Unnamed: 0' (and 'Unnamed: 0.1' after a
# second round trip) when the CSV is read again with pd.read_csv.
selected_data.to_csv("New_Selected_Data_2020.csv", index=False)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Softmax
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


In [None]:

# Build the training and test sets for the heart-disease classifier.
#
# Steps:
#   1. separate the target column from the predictors,
#   2. hold out 20% of the rows as the test set,
#   3. upsample the minority (positive) class in the TRAINING rows only.
#
# The test set is taken before any resampling and is never split again, so it
# contains no duplicated copies of training rows. selected_data itself is not
# modified, which keeps the cell safe to re-run.

# Name of the target column (0 = not reported, 1 = reported)
TARGET = 'Coronary Heart Disease/Myocardial Infarction'

# Separating the target variable (Heart) from predictor variables
features = selected_data.drop(columns=TARGET)
labels = selected_data[TARGET]

# Split the train/test set (seeded so the split is reproducible)
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=27)

# Re-merge the training features and labels for upsampling
train_data = pd.concat([train_features, train_labels], axis=1)

# Detecting negative and positive rows for upsampling
negative = train_data.loc[train_data[TARGET] == 0]
positive = train_data.loc[train_data[TARGET] == 1]

# Upsample the positive class with replacement until it matches the number of
# negative rows
pos_upsampled = resample(positive,
                         replace=True,
                         n_samples=len(negative),
                         random_state=27)

# Balanced training set: all negatives plus the upsampled positives
train_upsampled = pd.concat([negative, pos_upsampled])

# features / labels (and their train_* aliases) now refer to the balanced
# training data only; test_features / test_labels remain the untouched hold-out
# rows from the split above.
features = train_upsampled.drop(columns=TARGET)
labels = train_upsampled[TARGET]
train_features, train_labels = features, labels

train_upsampled.shape


In [None]:
# Fully connected binary classifier: three hidden ReLU layers (100, 50 and 50
# units) followed by a single sigmoid output unit.
hidden_units = [100, 50, 50]

model = Sequential()
for units in hidden_units:
    model.add(Dense(units, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Metrics are reported in this order after the loss: recall, accuracy, precision
tracked_metrics = [
    tf.keras.metrics.Recall(),
    'accuracy',
    tf.keras.metrics.Precision(),
]
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=tracked_metrics)

In [None]:
# Train on the training split only. features / labels are the frames that
# test_features / test_labels were split from, so fitting on them would let
# the model see the rows it is evaluated on afterwards.
model.fit(train_features, train_labels, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x176006963d0>

In [None]:
# Evaluate on the held-out rows.
# batch_size=32 matches training; a batch size of 1 runs one forward pass per
# row, which is far slower. return_dict=True labels each number with its
# metric name instead of returning a bare list in the order
# [loss, recall, accuracy, precision].
model.evaluate(test_features, test_labels, verbose=1, batch_size=32, return_dict=True)



[0.47178518772125244,
 0.7734864354133606,
 0.7327145934104919,
 0.16850483417510986]