In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df = pd.read_csv('heart.csv')

In [7]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [8]:
df['Sex'] = pd.get_dummies(df['Sex']).astype('int32').drop('F', axis = 1)

In [9]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [10]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [11]:
Chestpainle = LabelEncoder()

In [12]:
df['ChestPainType'] = Chestpainle.fit_transform(df['ChestPainType'])

In [13]:
RestingECGle = LabelEncoder()

In [14]:
df['RestingECG'] = RestingECGle.fit_transform(df['RestingECG'])

In [15]:
df.drop(['RestingECGle'], axis = 1, inplace = True)

KeyError: "['RestingECGle'] not found in axis"

In [17]:
ExerciseAnginale = LabelEncoder()

In [18]:
df['ExerciseAngina'] = ExerciseAnginale.fit_transform(df['ExerciseAngina'])

In [19]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [20]:
ST_Slopele = LabelEncoder()

In [21]:
df['ST_Slope'] = ST_Slopele.fit_transform(df['ST_Slope'])

In [22]:
df.head(6)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
5,39,1,2,120,339,0,1,170,0,0.0,2,0


# outlier detection 

In [23]:
df['zRestingBP'] = (df['RestingBP'] - df['RestingBP'].mean())/df['RestingBP'].std()

In [24]:
df.shape

(918, 13)

In [25]:
df = df[~((df['zRestingBP'] > 3) | (df['zRestingBP'] < - 3))] 

In [26]:
df['zCholesterol'] = (df['Cholesterol'] - df['Cholesterol'].mean())/ df['Cholesterol'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['zCholesterol'] = (df['Cholesterol'] - df['Cholesterol'].mean())/ df['Cholesterol'].std()


In [27]:
df = df[~((df['zCholesterol'] > 3) | (df['zCholesterol'] < - 3))]

In [28]:
df['zMaxHR'] = (df['MaxHR'] - df['MaxHR'].mean())/df['MaxHR'].std()

In [29]:
df = df[~((df['zMaxHR'] > 3) | (df['zMaxHR'] < - 3))]

In [30]:
df['zOldpeak'] = (df['Oldpeak'] - df['Oldpeak'].mean())/df['Oldpeak'].std()

In [31]:
df = df[~((df['zOldpeak'] > 3) | (df['zOldpeak'] < - 3))]

In [32]:
df.drop(['zRestingBP', 'zCholesterol', 'zMaxHR', 'zOldpeak'], axis = 1, inplace = True)

In [33]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


# Data Decleration

In [34]:
X = df.drop('HeartDisease', axis = 1)

In [35]:
y = df['HeartDisease']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# random forest

In [37]:
model1 = RandomForestClassifier(verbose = 1)

In [38]:
model1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.5s


In [39]:
model1.score(X_test, y_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.8444444444444444

# SVM

In [40]:
model2 = SVC(verbose = 3)

In [41]:
model2.fit(X_train, y_train)

[LibSVM]

In [42]:
model2.score(X_test, y_test)

0.6777777777777778

# Logistic Regression

In [43]:
model3 = LogisticRegression(max_iter = 712)

In [44]:
model3.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
model3.score(X_test, y_test)

0.8444444444444444

# Using PCA

In [46]:
pca = PCA(5)

In [47]:
datapca = pca.fit_transform(df.drop('HeartDisease',axis = 1))

In [49]:
datapca.shape

(899, 5)

In [189]:
X_train, X_test, y_train, y_test = train_test_split(datapca, y, test_size = 0.2)

# Random Forest

In [190]:
model4 = RandomForestClassifier(verbose = 1)

In [191]:
model4.fit(X_train, y_train)
model4.score(X_test, y_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.8

# SVM

In [193]:
model5 =SVC(verbose = 1)

In [194]:
model5.fit(X_train, y_train)
model5.score(X_test, y_test)

[LibSVM]

0.7388888888888889

# Logistic Regression

In [201]:
model6 =LogisticRegression(verbose = 3, max_iter = 712)

In [202]:
model6.fit(X_train, y_train)
model6.score(X_test, y_test)

0.8