
## Heart 2020 Naive Bayes


In [2]:
# pandas provides the data frame used throughout the notebook
import pandas as pd

# Location of the survey extract
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
heart_csv_path = r'C:\Users\roryq\Downloads\heart2020 (1).csv'

# Load the CSV into a data frame
Heart2020 = pd.read_csv(heart_csv_path)

# Display (rows, columns) of the loaded frame
Heart2020.shape

(319795, 18)

In [3]:
# Preview the first five rows of the raw data (head() defaults to 5 rows)
Heart2020.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


To simplify the modeling steps that follow, we first convert the two ordinal variables, `GenHealth` and `AgeCategory`, to integer codes that preserve their natural order.

In [22]:
# General-health labels listed from worst to best; codes run 1..5 in that order
GH_mapper = dict(zip(['Poor', 'Fair', 'Good', 'Very good', 'Excellent'],
                     range(1, 6)))

# Swap each label for its integer code, then write the column back
GH = Heart2020['GenHealth'].replace(GH_mapper)
Heart2020['GenHealth'] = GH

In [27]:
# Age bands listed from youngest to oldest; codes run 1..13 in that order
age_mapper = {
    band: code
    for code, band in enumerate(
        ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
         '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older'],
        start=1)
}

# Swap each age band for its integer code, then write the column back
Age = Heart2020['AgeCategory'].replace(age_mapper)
Heart2020['AgeCategory'] = Age

In [25]:
# Preview the first five rows again to inspect the re-coded ordinal columns
Heart2020.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,8,White,Yes,Yes,4,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,13,White,No,Yes,4,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,10,White,Yes,Yes,2,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,12,White,No,No,3,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,5,White,No,Yes,4,8.0,No,No,No


###  Gaussian Naive Bayes 
Create a naive Bayes model to predict the chance of having heart disease using the numerical variables only (including the integer-coded ordinal variables `AgeCategory` and `GenHealth`). 


In [16]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Numeric predictors (the two ordinal columns are already integer-coded)
num_cols = ['PhysicalHealth', 'MentalHealth', 'SleepTime', 'AgeCategory', 'GenHealth', 'BMI']
data_num = Heart2020[num_cols]

# Categorical predictors; not used by this model
cat_cols = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'Race',
            'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
data_cat = Heart2020[cat_cols]

# Target column
y = Heart2020['HeartDisease']

# Stratified split: 75% training, 25% test, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    data_num, y, test_size=.25, stratify=y, random_state=100)

# Gaussian naive Bayes fitted on the training split
classifier_NB = GaussianNB()
model_NB = classifier_NB.fit(X_train, y_train)

# Accuracy on the training split
model_NB.score(X_train, y_train)

0.8721054343203556

In [17]:
# Score the Gaussian NB model on the 25% test split (X_test, y_test) produced
# by train_test_split in the previous cell
model_NB.score(X_test, y_test)

0.8700046279503183

### Multinomial Naive Bayes 
Create a naive Bayes model to predict the chance of having heart disease using categorical variables only. 

In [39]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Specify feature data: one-hot encode every categorical predictor so each
# category level becomes its own indicator column
data_cat1 = pd.get_dummies(data_cat, columns = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'Race','Diabetic','PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer'])

# Specify training and test data (stratified 75/25 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    data_cat1, y, random_state=100, stratify=y, test_size=.25)

# Select model type. The classifier is created here so the cell runs on a
# fresh kernel; it was previously referenced without ever being defined.
classifier_MN = MultinomialNB()

# Create and train model on the training split ONLY. Fitting on the full data
# set would let the model see the test rows and make the test score below
# meaningless as a held-out estimate.
model_MNB = classifier_MN.fit(X_train, y_train)

# Score model with test set
# NOTE: outputs recorded from earlier runs were produced by a model fitted on
# all rows, so the score may differ slightly after re-running this cell.
model_MNB.score(X_test, y_test)

0.9049018749452776

In [40]:
# Score the multinomial NB model on the training split (X_train, y_train)
# produced by train_test_split in the previous cell
model_MNB.score(X_train, y_train)

0.9049640185785879

###  Naive Bayes Using All Variables 
Transform all categorical variables to dummy variables or integers and use Gaussian naive Bayes.

In [28]:
# Every column except the target becomes a candidate feature
d = Heart2020.drop(columns='HeartDisease', errors='ignore')

# One-hot encode the remaining string-valued columns
data_dummy = pd.get_dummies(d)

In [72]:
# Target and the full (dummy-encoded) feature matrix
y = Heart2020['HeartDisease']
features = data_dummy

# Stratified split: 75% training, 25% test, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=.25, stratify=y, random_state=100)

# Gaussian naive Bayes fitted on the training split
classifier_NB1 = GaussianNB()
model_NB1 = classifier_NB1.fit(X_train, y_train)

# Accuracy on the training split
model_NB1.score(X_train, y_train)

0.8235868015309824

In [69]:
# Score the all-variables Gaussian NB model on the 25% test split
# (X_test, y_test) produced by train_test_split in the previous cell
model_NB1.score(X_test, y_test)

0.822311723723874

## Conclusion

+ All models were fairly accurate
+ The multinomial NB using only categorical variables has the best accuracy
+ The Gaussian NB with all variables performed the worst
    + Even though it was the worst, it still achieved an accuracy of over 82% in predicting heart disease

