In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### Importing the models

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

### Data Collection and Processing

In [3]:
heart_data = pd.read_csv('Fichiers_csv/heart.csv')

In [4]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
heart_data.shape

(1025, 14)

In [6]:
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
heart_data['target'].value_counts()

target
1    526
0    499
Name: count, dtype: int64

In [8]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [9]:
print(X)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  
0         2   2     3  
1         0   0     3  
2  

In [10]:
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=3)

In [12]:
print(X.shape, X_train.shape, X_test.shape)

(1025, 13) (820, 13) (205, 13)


### Comparing the performance of the models

In [13]:
# list of models
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [14]:
def compare_models_train_test():

    for model in models:
        model.fit(X_train, Y_train)
        test_data_precision = model.predict(X_test)
        accuracy = accuracy_score(Y_test, test_data_precision)
        print('Accuracy score of the ',model, ' = ', accuracy)

In [15]:
compare_models_train_test()

Accuracy score of the  LogisticRegression(max_iter=1000)  =  0.8390243902439024
Accuracy score of the  SVC(kernel='linear')  =  0.824390243902439
Accuracy score of the  KNeighborsClassifier()  =  0.7560975609756098
Accuracy score of the  RandomForestClassifier()  =  1.0


### Cross Validation

#### Logistic Regression

In [16]:
cv_score_lr = cross_val_score(LogisticRegression(max_iter=1000), X, Y, cv=5)
print(cv_score_lr)
mean_accuracy_lr = round((sum(cv_score_lr)/len(cv_score_lr))*100, 2)
print(mean_accuracy_lr)

[0.88292683 0.85853659 0.87804878 0.82439024 0.80487805]
84.98


#### Support Vector Classifier

In [17]:
cv_score_svc = cross_val_score(SVC(kernel='linear'), X, Y, cv=5)
print(cv_score_svc)
mean_accuracy_svc = round((sum(cv_score_svc)/len(cv_score_svc))*100, 2)
print(mean_accuracy_svc)

[0.88292683 0.86829268 0.84390244 0.81463415 0.80487805]
84.29


In [18]:
def compare_models_cross_validation():
    for model in models:
        cv_score = cross_val_score(model, X, Y, cv=5)
        mean_accuracy = round((sum(cv_score)/len(cv_score))*100,2)
        print('Cross Validation accuracy for the ', model, ' = ', cv_score)
        print('Accuracy % of the ', model, mean_accuracy)
        print('-----------------------------------------------------------')

In [19]:
compare_models_cross_validation()

Cross Validation accuracy for the  LogisticRegression(max_iter=1000)  =  [0.88292683 0.85853659 0.87804878 0.82439024 0.80487805]
Accuracy % of the  LogisticRegression(max_iter=1000) 84.98
-----------------------------------------------------------
Cross Validation accuracy for the  SVC(kernel='linear')  =  [0.88292683 0.86829268 0.84390244 0.81463415 0.80487805]
Accuracy % of the  SVC(kernel='linear') 84.29
-----------------------------------------------------------
Cross Validation accuracy for the  KNeighborsClassifier()  =  [0.76585366 0.74634146 0.76097561 0.71219512 0.75121951]
Accuracy % of the  KNeighborsClassifier() 74.73
-----------------------------------------------------------
Cross Validation accuracy for the  RandomForestClassifier()  =  [1.         1.         1.         1.         0.98536585]
Accuracy % of the  RandomForestClassifier() 99.71
-----------------------------------------------------------


### Precision

In [25]:
def compare_precision_models():

    for model in models:
        model.fit(X_train, Y_train)
        X_train_prediction = model.predict(X_train)
        X_test_prediction  = model.predict(X_test)
        precision_train    = precision_score(Y_train, X_train_prediction)
        precision_test     = precision_score(Y_test, X_test_prediction)
        print('Training data Precision of ',model, ' = ', precision_train)
        print('Test data Precision of ',model, ' = ', precision_test)
        print('-----------------------------------------------------------')

In [26]:
compare_precision_models()

Training data Precision of  LogisticRegression(max_iter=1000)  =  0.8322440087145969
Test data Precision of  LogisticRegression(max_iter=1000)  =  0.8272727272727273
-----------------------------------------------------------
Training data Precision of  SVC(kernel='linear')  =  0.8183760683760684
Test data Precision of  SVC(kernel='linear')  =  0.8108108108108109
-----------------------------------------------------------
Training data Precision of  KNeighborsClassifier()  =  0.9376558603491272
Test data Precision of  KNeighborsClassifier()  =  0.8021978021978022
-----------------------------------------------------------
Training data Precision of  RandomForestClassifier()  =  1.0
Test data Precision of  RandomForestClassifier()  =  1.0
-----------------------------------------------------------


### Recall

In [30]:
def compare_recall_models():

    for model in models:
        model.fit(X_train, Y_train)
        X_train_prediction = model.predict(X_train)
        X_test_prediction  = model.predict(X_test)
        recall_train       = recall_score(Y_train, X_train_prediction)
        recall_test        = recall_score(Y_test, X_test_prediction)
        print('Training data Recall of ',model, ' = ', recall_train)
        print('Test data Recall of ',model, ' = ', recall_test)
        print('-----------------------------------------------------------')

In [31]:
compare_recall_models()

Training data Recall of  LogisticRegression(max_iter=1000)  =  0.9073634204275535
Test data Recall of  LogisticRegression(max_iter=1000)  =  0.8666666666666667
-----------------------------------------------------------
Training data Recall of  SVC(kernel='linear')  =  0.9097387173396675
Test data Recall of  SVC(kernel='linear')  =  0.8571428571428571
-----------------------------------------------------------
Training data Recall of  KNeighborsClassifier()  =  0.8931116389548693
Test data Recall of  KNeighborsClassifier()  =  0.6952380952380952
-----------------------------------------------------------
Training data Recall of  RandomForestClassifier()  =  1.0
Test data Recall of  RandomForestClassifier()  =  1.0
-----------------------------------------------------------


In [35]:
def compare_f1_score_models():

    for model in models:
        model.fit(X_train, Y_train)
        X_train_prediction = model.predict(X_train)
        X_test_prediction  = model.predict(X_test)
        f1_score_train     = f1_score(Y_train, X_train_prediction)
        f1_score_test      = f1_score(Y_test, X_test_prediction)
        print('Training data F1 score of ',model, ' = ', f1_score_train)
        print('Test data F1 score of ',model, ' = ', f1_score_test)
        print('-----------------------------------------------------------')

In [36]:
compare_f1_score_models()

Training data F1 score of  LogisticRegression(max_iter=1000)  =  0.8681818181818182
Test data F1 score of  LogisticRegression(max_iter=1000)  =  0.8465116279069769
-----------------------------------------------------------
Training data F1 score of  SVC(kernel='linear')  =  0.8616422947131609
Test data F1 score of  SVC(kernel='linear')  =  0.8333333333333334
-----------------------------------------------------------
Training data F1 score of  KNeighborsClassifier()  =  0.9148418491484184
Test data F1 score of  KNeighborsClassifier()  =  0.7448979591836735
-----------------------------------------------------------
Training data F1 score of  RandomForestClassifier()  =  1.0
Test data F1 score of  RandomForestClassifier()  =  1.0
-----------------------------------------------------------


### Precision, Recall & F1 Score - functions

In [38]:
def precision_recall_f1_score(true_labels, pred_labels):

    precision_value = precision_score(true_labels, pred_labels)
    recall_value    = recall_score(true_labels, pred_labels)
    f1_score_value  = f1_score(true_labels, pred_labels)

    print('Precision = ', precision_value)
    print('Recall = ', recall_value)
    print('Precision = ', f1_score_value)

In [41]:
for model in models:
    model.fit(X_train, Y_train)
    X_train_prediction = model.predict(X_train)
    X_test_prediction  = model.predict(X_test)
    print(model)
    precision_recall_f1_score(Y_test, X_test_prediction)
    print('-----------------------------------------------------------')

LogisticRegression(max_iter=1000)
Precision =  0.8272727272727273
Recall =  0.8666666666666667
Precision =  0.8465116279069769
-----------------------------------------------------------
SVC(kernel='linear')
Precision =  0.8108108108108109
Recall =  0.8571428571428571
Precision =  0.8333333333333334
-----------------------------------------------------------
KNeighborsClassifier()
Precision =  0.8021978021978022
Recall =  0.6952380952380952
Precision =  0.7448979591836735
-----------------------------------------------------------
RandomForestClassifier()
Precision =  1.0
Recall =  1.0
Precision =  1.0
-----------------------------------------------------------
