# ML Project

## 1. Importing Required Libraries

In [84]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from itertools import combinations
import collections
from matplotlib.figure import Figure
from matplotlib.ticker import MaxNLocator

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

## 2. Exploratory Data Analysis

In [85]:
heart_2020_cleaned_df = pd.read_csv("data/heart_2020_cleaned.csv")
print("\n Heart 2020 Cleaned \n")
print(heart_2020_cleaned_df.info())


 Heart 2020 Cleaned 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 1

In [86]:
print("\n NA Values \n")
print(heart_2020_cleaned_df.isna().sum())


 NA Values 

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64


In [87]:
heart_2020_cleaned_df.head().T

Unnamed: 0,0,1,2,3,4
HeartDisease,No,No,No,No,No
BMI,16.6,20.34,26.58,24.21,23.71
Smoking,Yes,No,Yes,No,No
AlcoholDrinking,No,No,No,No,No
Stroke,No,Yes,No,No,No
PhysicalHealth,3.0,0.0,20.0,0.0,28.0
MentalHealth,30.0,0.0,30.0,0.0,0.0
DiffWalking,No,No,No,No,Yes
Sex,Female,Female,Male,Female,Female
AgeCategory,55-59,80 or older,65-69,75-79,40-44


## 3. Data Preprocessing

In [88]:
# Make a copy of the original DataFrame to perform edits on
df_tmp = heart_2020_cleaned_df.copy()

### 3.1 Converting strings (object) to categories

In [89]:
print("Labels for which data is string:")
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

Labels for which data is string:
HeartDisease
Smoking
AlcoholDrinking
Stroke
DiffWalking
Sex
AgeCategory
Race
Diabetic
PhysicalActivity
GenHealth
Asthma
KidneyDisease
SkinCancer


In [90]:
# This will turn all of the string values into category values
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.as_ordered()
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   HeartDisease      319795 non-null  category
 1   BMI               319795 non-null  float64 
 2   Smoking           319795 non-null  category
 3   AlcoholDrinking   319795 non-null  category
 4   Stroke            319795 non-null  category
 5   PhysicalHealth    319795 non-null  float64 
 6   MentalHealth      319795 non-null  float64 
 7   DiffWalking       319795 non-null  category
 8   Sex               319795 non-null  category
 9   AgeCategory       319795 non-null  category
 10  Race              319795 non-null  category
 11  Diabetic          319795 non-null  category
 12  PhysicalActivity  319795 non-null  category
 13  GenHealth         319795 non-null  category
 14  SleepTime         319795 non-null  float64 
 15  Asthma            319795 non-null  category
 16  Ki

In [91]:
df_tmp.GenHealth.cat.categories

Index(['Excellent', 'Fair', 'Good', 'Poor', 'Very good'], dtype='object')

In [92]:
df_tmp.GenHealth.cat.codes

0         4
1         4
2         1
3         2
4         4
         ..
319790    1
319791    4
319792    2
319793    2
319794    2
Length: 319795, dtype: int8

#### All of our data is categorical and thus we can now turn the categories into numbers.

### 3.2 Saving Processed Data

In [93]:
# Saving
df_tmp.to_csv("data/heart_2020_cleaned_preprocessed.csv", index=False)

In [94]:
# Importing
df_tmp = pd.read_csv("data/heart_2020_cleaned_preprocessed.csv")
df_tmp.head().T

Unnamed: 0,0,1,2,3,4
HeartDisease,No,No,No,No,No
BMI,16.6,20.34,26.58,24.21,23.71
Smoking,Yes,No,Yes,No,No
AlcoholDrinking,No,No,No,No,No
Stroke,No,Yes,No,No,No
PhysicalHealth,3.0,0.0,20.0,0.0,28.0
MentalHealth,30.0,0.0,30.0,0.0,0.0
DiffWalking,No,No,No,No,Yes
Sex,Female,Female,Male,Female,Female
AgeCategory,55-59,80 or older,65-69,75-79,40-44


### 3.3 Turning categorical values to numbers

In [95]:
# Check columns which *aren't* numeric
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

HeartDisease
Smoking
AlcoholDrinking
Stroke
DiffWalking
Sex
AgeCategory
Race
Diabetic
PhysicalActivity
GenHealth
Asthma
KidneyDisease
SkinCancer


In [96]:
# Turn categorical variables into numbers and fill missing
for label, content in df_tmp.items():
    # Check columns which *aren't* numeric
    if not pd.api.types.is_numeric_dtype(content):
        df_tmp[label] = pd.Categorical(content).codes

In [97]:
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  int8   
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  int8   
 3   AlcoholDrinking   319795 non-null  int8   
 4   Stroke            319795 non-null  int8   
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  int8   
 8   Sex               319795 non-null  int8   
 9   AgeCategory       319795 non-null  int8   
 10  Race              319795 non-null  int8   
 11  Diabetic          319795 non-null  int8   
 12  PhysicalActivity  319795 non-null  int8   
 13  GenHealth         319795 non-null  int8   
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  int8   
 16  KidneyDisease     31

In [98]:
df_tmp.head().T

Unnamed: 0,0,1,2,3,4
HeartDisease,0.0,0.0,0.0,0.0,0.0
BMI,16.6,20.34,26.58,24.21,23.71
Smoking,1.0,0.0,1.0,0.0,0.0
AlcoholDrinking,0.0,0.0,0.0,0.0,0.0
Stroke,0.0,1.0,0.0,0.0,0.0
PhysicalHealth,3.0,0.0,20.0,0.0,28.0
MentalHealth,30.0,0.0,30.0,0.0,0.0
DiffWalking,0.0,0.0,0.0,0.0,1.0
Sex,0.0,0.0,1.0,0.0,0.0
AgeCategory,7.0,12.0,9.0,11.0,4.0


### 3.4 Train - Test Split

In [99]:
# Split data into X and y
# Split data into X and y
X = df_tmp.drop("HeartDisease", axis=1)

y = df_tmp["HeartDisease"]

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [101]:
X.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,16.6,1,0,0,3.0,30.0,0,0,7,5,2,1,4,5.0,1,0,1
1,20.34,0,0,1,0.0,0.0,0,0,12,5,0,1,4,7.0,0,0,0
2,26.58,1,0,0,20.0,30.0,0,1,9,5,2,1,1,8.0,1,0,0
3,24.21,0,0,0,0.0,0.0,0,0,11,5,0,0,2,6.0,0,0,1
4,23.71,0,0,0,28.0,0.0,1,0,4,5,0,1,4,8.0,0,0,0


In [102]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: HeartDisease, dtype: int8

In [103]:
# Split into train & test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [104]:
X_train.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
94577,52.13,0,0,0,0.0,0.0,0,0,4,5,0,1,2,6.0,0,0,0
167034,24.8,0,0,0,0.0,0.0,1,0,10,5,0,1,2,6.0,0,0,0
104959,34.21,0,1,0,0.0,2.0,0,0,9,5,0,1,2,8.0,1,0,1
196047,32.12,0,0,0,0.0,0.0,0,0,8,5,0,1,4,7.0,0,0,0
280900,48.36,0,0,0,5.0,20.0,0,0,5,5,0,0,2,3.0,0,0,0


In [105]:
y_train.head()

94577     0
167034    0
104959    1
196047    0
280900    0
Name: HeartDisease, dtype: int8