In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the heart-disease table from a fixed path and preview the first rows.
# NOTE(review): "/content/..." looks like a Colab session path — confirm before
# running elsewhere.
HEART_CSV_PATH = "/content/Heart.csv"
data = pd.read_csv(HEART_CSV_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [4]:
# Row and column counts of the frame as loaded.
data.shape

(303, 15)

In [5]:
# prompt: drop the Unnamed: 0 column permanently

# Remove the 'Unnamed: 0' column by name (presumably a row index saved into the
# CSV — verify). errors='ignore' keeps the cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')


In [6]:
# Preview the frame to confirm the 'Unnamed: 0' column is gone.
data.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [7]:
# prompt: change the ChestPain and Thal column values from strings to numbers using LabelEncoder

from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# discard the ChestPain mapping as soon as Thal is fitted, leaving no way to
# translate ChestPain codes back to their original labels.
# NOTE(review): LabelEncoder is documented for target labels; the integer codes
# it produces impose an arbitrary order on these features — consider
# OrdinalEncoder or one-hot encoding. Also confirm neither column contains
# missing values before encoding: depending on the scikit-learn version a NaN
# may raise or may silently receive its own integer code — TODO verify.
label_encoders = {}
for column in ['ChestPain', 'Thal']:
  # After the loop `le` is bound to the encoder fitted on Thal, as before.
  le = LabelEncoder()
  data[column] = le.fit_transform(data[column])
  label_encoders[column] = le


In [8]:
# Preview the frame: ChestPain and Thal should now hold integer codes.
data.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0,No
1,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1,Yes
2,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,Yes
3,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1,No
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1,No



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [9]:
# prompt: convert the name of column AHD to name Target permanently

# Rebind the result instead of using inplace=True: the rename is then an
# ordinary expression that produces a new frame, which is the form pandas
# recommends and the one that composes with method chaining.
data = data.rename(columns={'AHD': 'Target'})


In [10]:
# prompt: now convert the values of target to numerical ones using Label Encoder

# A fresh encoder is fitted on the Target labels and the column is replaced by
# the resulting integer codes; `le` stays bound to that fitted encoder.
le = LabelEncoder()
data = data.assign(Target=le.fit_transform(data['Target']))


In [11]:
# Preview the frame: the label column is now named 'Target' and holds integers.
data.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,Target
0,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0,0
1,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1,1
2,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,1
3,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1,0


In [12]:
# Count missing values per column.
# NOTE(review): ChestPain, Thal and Target were label-encoded before this check,
# so a NaN originally present in those columns may already have been turned into
# an integer code and would not show up here — verify on the raw frame.
data.isnull().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           4
Thal         0
Target       0
dtype: int64

In [13]:
# prompt: column "Ca" has some null values; use the mean to fix it permanently

# Fill the missing 'Ca' entries with the mean of the observed 'Ca' values.
# NOTE(review): the mean is taken over every row, i.e. before the train/test
# split, so held-out rows contribute to the fill value — consider imputing
# after the split.
ca_mean = data['Ca'].mean()
data['Ca'] = data['Ca'].fillna(ca_mean)


In [14]:
# Re-count missing values per column; 'Ca' should now report 0.
data.isnull().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           0
Thal         0
Target       0
dtype: int64

In [16]:
# Number of rows per Target value, to check how balanced the classes are.
data['Target'].value_counts()

Target
0    164
1    139
Name: count, dtype: int64

In [17]:
# Separate the label column (y) from the predictor columns (X), then show X.
y = data['Target']
X = data.drop(columns=['Target'])
print(X)

     Age  Sex  ChestPain  RestBP  Chol  Fbs  RestECG  MaxHR  ExAng  Oldpeak  \
0     63    1          3     145   233    1        2    150      0      2.3   
1     67    1          0     160   286    0        2    108      1      1.5   
2     67    1          0     120   229    0        2    129      1      2.6   
3     37    1          1     130   250    0        0    187      0      3.5   
4     41    0          2     130   204    0        2    172      0      1.4   
..   ...  ...        ...     ...   ...  ...      ...    ...    ...      ...   
298   45    1          3     110   264    0        0    132      0      1.2   
299   68    1          0     144   193    1        0    141      0      3.4   
300   57    1          0     130   131    0        0    115      1      1.2   
301   57    0          2     130   236    0        2    174      0      0.0   
302   38    1          1     138   175    0        0    173      0      0.0   

     Slope        Ca  Thal  
0        3  0.000000  

In [18]:
# Show the target vector.
print(y)

0      0
1      1
2      1
3      0
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Name: Target, Length: 303, dtype: int64


In [19]:
# Hold out 20% of the rows for testing; the split is stratified on y and uses a
# fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [20]:
# Shapes of the full feature matrix, the training part and the test part.
print(X.shape, X_train.shape, X_test.shape)

(303, 13) (242, 13) (61, 13)


In [21]:
# Candidate classifiers to compare.
# RandomForestClassifier is given a fixed random_state: without one every run
# grows a different forest, so its reported accuracy cannot be reproduced.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42),
]

In [22]:
def compare_models():
  """Fit every estimator in ``models`` and print its accuracy on the test split.

  Reads the notebook-level names ``models``, ``X_train``, ``y_train``,
  ``X_test`` and ``y_test``. Each estimator is fitted in place on the training
  split, used to predict the test split, and one line with its accuracy score
  is printed. Nothing is returned.
  """
  for estimator in models:
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    predictions = estimator.fit(X_train, y_train).predict(X_test)
    print("Accuracy score of the : ", estimator, " = ", accuracy_score(y_test, predictions))

In [23]:
# Fit each candidate on the training split and print its test-split accuracy.
compare_models()

Accuracy score of the :  LogisticRegression(max_iter=1000)  =  0.8688524590163934
Accuracy score of the :  SVC(kernel='linear')  =  0.8688524590163934
Accuracy score of the :  KNeighborsClassifier()  =  0.7377049180327869
Accuracy score of the :  RandomForestClassifier()  =  0.9016393442622951


In [24]:
# 5-fold cross-validation scores for logistic regression on the full X, y.
cv_score_lr = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
print(cv_score_lr)

# Average of the fold scores, expressed as a percentage with two decimals.
mean_acc = round(sum(cv_score_lr) / len(cv_score_lr) * 100, 2)
print(mean_acc)

[0.86885246 0.8852459  0.7704918  0.85       0.81666667]
83.83


In [25]:
# 5-fold cross-validation scores for a linear-kernel SVC on the full X, y.
cv_score_svc = cross_val_score(SVC(kernel='linear'), X, y, cv=5)
print(cv_score_svc)

# Average of the fold scores, expressed as a percentage with two decimals.
mean_acc = round(sum(cv_score_svc) / len(cv_score_svc) * 100, 2)
print(mean_acc)

[0.80327869 0.86885246 0.7704918  0.8        0.83333333]
81.52


In [26]:
# Fresh, unfitted candidates for the cross-validation comparison.
# RandomForestClassifier is given a fixed random_state: without one the forest
# differs on every run and its cross-validation scores cannot be reproduced.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42),
]

In [29]:
def compare_models_cross_validation():
  """Print 5-fold cross-validation results for every estimator in ``models``.

  Reads the notebook-level names ``models``, ``X`` and ``y``. For each
  estimator it prints the per-fold scores, then the mean of those scores as a
  percentage rounded to two decimals, then a separator line. Nothing is
  returned.
  """
  for estimator in models:
    fold_scores = cross_val_score(estimator, X, y, cv=5)
    mean_accuracy = round(sum(fold_scores) / len(fold_scores) * 100, 2)

    print('Cross Validation accuracies for ', estimator, '=  ', fold_scores)
    print('Accuracy % of the ', estimator, mean_accuracy)
    print('----------------------------------------------')
compare_models_cross_validation()


Cross Validation accuracies for  LogisticRegression(max_iter=1000) =   [0.86885246 0.8852459  0.7704918  0.85       0.81666667]
Accuracy % of the  LogisticRegression(max_iter=1000) 83.83
----------------------------------------------
Cross Validation accuracies for  SVC(kernel='linear') =   [0.80327869 0.86885246 0.7704918  0.8        0.83333333]
Accuracy % of the  SVC(kernel='linear') 81.52
----------------------------------------------
Cross Validation accuracies for  KNeighborsClassifier() =   [0.60655738 0.6557377  0.57377049 0.75       0.63333333]
Accuracy % of the  KNeighborsClassifier() 64.39
----------------------------------------------
Cross Validation accuracies for  RandomForestClassifier() =   [0.85245902 0.8852459  0.83606557 0.76666667 0.8       ]
Accuracy % of the  RandomForestClassifier() 82.81
----------------------------------------------
