In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
df = pd.read_csv('heart.csv')
df.shape

(918, 12)

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
age_outliers = df[(df['Age'] > df['Age'].mean() + 3 * df['Age'].std()) | (df['Age'] < df['Age'].mean() - 3 * df['Age'].std())]
age_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [5]:
restingbp_outliers = df[(df['RestingBP'] > df['RestingBP'].mean() + 3 * df['RestingBP'].std()) | (df['RestingBP'] < df['RestingBP'].mean() - 3 * df['RestingBP'].std())]
restingbp_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
449,55,M,NAP,0,0,0,Normal,155,N,1.5,Flat,1
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1


In [6]:
df.drop(restingbp_outliers.index, axis='index', inplace=True)

In [7]:
cholesterol_outliers = df[(df['Cholesterol'] > df['Cholesterol'].mean() + 3 * df['Cholesterol'].std()) | (df['Cholesterol'] < df['Cholesterol'].mean() - 3 * df['Cholesterol'].std())]
cholesterol_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


In [8]:
df.drop(cholesterol_outliers.index, axis='index', inplace=True)

In [9]:
maxhr_outliers = df[(df['MaxHR'] > df['MaxHR'].mean() + 3 * df['MaxHR'].std()) | (df['MaxHR'] < df['MaxHR'].mean() - 3 * df['MaxHR'].std())]
maxhr_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
390,51,M,ASY,140,0,0,Normal,60,N,0.0,Flat,1


In [10]:
df.drop(maxhr_outliers.index, axis='index', inplace=True)

In [11]:
oldpeak_outliers = df[(df['Oldpeak'] > df['Oldpeak'].mean() + 3 * df['Oldpeak'].std()) | (df['Oldpeak'] < df['Oldpeak'].mean() - 3 * df['Oldpeak'].std())]
oldpeak_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
324,46,M,ASY,100,0,1,ST,133,N,-2.6,Flat,1
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1


In [12]:
df.drop(oldpeak_outliers.index, axis='index', inplace=True)

In [13]:
df.shape

(899, 12)

In [14]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['ChestPainType'] = le.fit_transform(df['ChestPainType'])
df['RestingECG'] = le.fit_transform(df['RestingECG'])
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])
df['ST_Slope'] = le.fit_transform(df['ST_Slope'])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [15]:
input_data = df.drop('HeartDisease', axis='columns')
target = df['HeartDisease']

In [16]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(input_data)
scaled_data

array([[-1.42815446,  0.515943  ,  0.2245723 , ..., -0.8229452 ,
        -0.85546862,  1.04249607],
       [-0.47585532, -1.93819859,  1.27063705, ..., -0.8229452 ,
         0.13751561, -0.62216462],
       [-1.7455875 ,  0.515943  ,  0.2245723 , ..., -0.8229452 ,
        -0.85546862,  1.04249607],
       ...,
       [ 0.3706328 ,  0.515943  , -0.82149245, ...,  1.21514774,
         0.33611246, -0.62216462],
       [ 0.3706328 , -1.93819859,  0.2245723 , ..., -0.8229452 ,
        -0.85546862, -0.62216462],
       [-1.63977649,  0.515943  ,  1.27063705, ..., -0.8229452 ,
        -0.85546862,  1.04249607]])

In [17]:
model_params = {
    'svm': {
        'model': svm.SVC(),
        'params': {
            'C': [1, 5, 10, 15, 20, 25, 30],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['auto', 'scale']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 125, 150, 175, 200]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [1, 5, 10, 15, 20, 25, 30],
            'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
        }
    },
    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {}
    },
    'decision_tree_classifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'splitter': ['best', 'random']
        }
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 18, 20]
        }
    }
}

In [18]:
scores = []

for mn, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False)
    clf.fit(scaled_data, target)

    scores.append({
        'model': mn,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

Unnamed: 0,model,best_score,best_params
0,svm,0.853121,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}"
1,random_forest,0.85422,{'n_estimators': 175}
2,logistic_regression,0.828639,"{'C': 1, 'solver': 'liblinear'}"
3,gaussian_nb,0.843071,{}
4,decision_tree_classifier,0.797553,"{'criterion': 'log_loss', 'splitter': 'random'}"
5,knn,0.849713,{'n_neighbors': 5}


In [19]:
pca = PCA(0.95)
pca_data = pca.fit_transform(scaled_data)
pca_data.shape

(899, 10)

In [20]:
scores = []

for mn, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False)
    clf.fit(pca_data, target)

    scores.append({
        'model': mn,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

Unnamed: 0,model,best_score,best_params
0,svm,0.865331,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}"
1,random_forest,0.833084,{'n_estimators': 200}
2,logistic_regression,0.833096,"{'C': 1, 'solver': 'lbfgs'}"
3,gaussian_nb,0.834207,{}
4,decision_tree_classifier,0.771823,"{'criterion': 'entropy', 'splitter': 'best'}"
5,knn,0.85422,{'n_neighbors': 5}
