In [16]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [4]:
df.shape

(918, 12)

In [5]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


### Treat Outliers:
Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 or Z score < -3. In general, a Z-score of -3.0 to 3.0 suggests that a data point is within three standard deviations of its mean.

In this case, since all numeric values in dataset are positive, we only need to consider Z score > 3 as outliers.

In [6]:
numeric_col = ['RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
class_col = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [7]:
# remove outliers
df_update = df 
for col in numeric_col:
    df_update = df_update[df_update[col] <= (df_update[col].mean() + 3*df_update[col].std())]
    print('After removing outliers of column {}, df shape is {}'.format(col, df_update.shape))

After removing outliers of column RestingBP, df shape is (911, 12)
After removing outliers of column Cholesterol, df shape is (908, 12)
After removing outliers of column FastingBS, df shape is (908, 12)
After removing outliers of column MaxHR, df shape is (908, 12)
After removing outliers of column Oldpeak, df shape is (902, 12)


In [8]:
# determine unique values of text columns 
for col in class_col:
    unique = df[col].unique()
    print('For column {}, there are {} unique values.'.format(col, unique))

For column Sex, there are ['M' 'F'] unique values.
For column ChestPainType, there are ['ATA' 'NAP' 'ASY' 'TA'] unique values.
For column RestingECG, there are ['Normal' 'ST' 'LVH'] unique values.
For column ExerciseAngina, there are ['N' 'Y'] unique values.
For column ST_Slope, there are ['Up' 'Flat' 'Down'] unique values.


In [9]:
df_final = df_update.copy()
df_final.shape

(902, 12)

In [10]:
df_final = pd.get_dummies(df_final, drop_first=True)
df_final.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [22]:
X = df_final.drop('HeartDisease', axis = 1)
X.shape

(902, 15)

In [12]:
y = df_final.HeartDisease
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 30)

In [14]:
X_train.shape

(721, 15)

In [18]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

model_params = {
    'SVC':{
        'model':SVC(),
        'params':{
            'C':[1, 10, 20],
            'kernel':['linear', 'rbf', 'sigmoid'],
            'gamma':['auto', 'scale']
        }
    },
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1, 10, 20, 30]
        }
    },
    'logistic_regression':{
        'model':LogisticRegression(),
        'params':{
            'C':[1, 5, 10],
            'max_iter':[100, 150, 200]
        }
    },
    'gaussianNB':{
        'model':GaussianNB(),
        'params':{
            'var_smoothing':[1e-08, 1e-09]
        }
    },
    'decision_tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'splitter':['random', 'best'],
        }
    }
}

from sklearn.model_selection import GridSearchCV

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
hyperparameters = pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])
hyperparameters


Unnamed: 0,model,best_score,best_params
0,SVC,0.871015,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}"
1,random_forest,0.869626,{'n_estimators': 20}
2,logistic_regression,0.865469,"{'C': 1, 'max_iter': 100}"
3,gaussianNB,0.865441,{'var_smoothing': 1e-08}
4,decision_tree,0.809933,{'splitter': 'random'}


### Use PCA to reduce dimensions 

In [19]:
from sklearn.decomposition import PCA
pca = PCA(0.95) # preserve features such that 95% of variance is retained
# we can also use n_component 
X_pca = pca.fit_transform(X_scaled)
X_pca.shape

(902, 13)

In [23]:
pca.explained_variance_ratio_ # features ordered by its importance in descending order

array([0.22805968, 0.11019144, 0.0947933 , 0.08197633, 0.07465653,
       0.07115659, 0.06264869, 0.05493807, 0.0511539 , 0.04360192,
       0.04016294, 0.0301208 , 0.02841237])

In [24]:
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(X_pca, y, test_size = 0.2)
model = SVC()
model.fit(X_pca_train, y_pca_train)
model.score(X_pca_test, y_pca_test)

0.8729281767955801

### Even with the reduced dimentionality of features, the model performance did not get impacted much. However, reduced dimentionality allows us to better visualize the dataset, and investigate the relationship.