In [2]:
# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import seaborn as sb
import urllib.request
# Importing correlation coefficents
import scipy
from scipy.stats import pearsonr
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Importing sklearn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report

In [3]:
address = '/Users/sohamkakra/Desktop/F21DL_Portfolio/Datasets/heart.csv'
data = pd.read_csv(address)

In [4]:
# computes the standard correlation coefficient (Pearson’s r) of every feature with the output feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.output)

PearsonR Correlation Coefficent Matrix
age        -0.225439
sex        -0.280937
cp          0.433798
trtbps     -0.144931
chol       -0.085239
fbs        -0.028046
restecg     0.137230
thalachh    0.421741
exng       -0.436757
oldpeak    -0.430696
slp         0.345877
caa        -0.391724
thall      -0.344029
output      1.000000
Name: output, dtype: float64


In [5]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.exng)

PearsonR Correlation Coefficent Matrix
age         0.096801
sex         0.141664
cp         -0.394280
trtbps      0.067616
chol        0.067023
fbs         0.025665
restecg    -0.070733
thalachh   -0.378812
exng        1.000000
oldpeak     0.288223
slp        -0.257748
caa         0.115739
thall       0.206754
output     -0.436757
Name: exng, dtype: float64


In [10]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.cp)

PearsonR Correlation Coefficent Matrix
age        -0.068653
sex        -0.049353
cp          1.000000
trtbps      0.047608
chol       -0.076904
fbs         0.094444
restecg     0.044421
thalachh    0.295762
exng       -0.394280
oldpeak    -0.149230
slp         0.119717
caa        -0.181053
thall      -0.161736
output      0.433798
Name: cp, dtype: float64


In [11]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.thalachh)

PearsonR Correlation Coefficent Matrix
age        -0.398522
sex        -0.044020
cp          0.295762
trtbps     -0.046698
chol       -0.009940
fbs        -0.008567
restecg     0.044123
thalachh    1.000000
exng       -0.378812
oldpeak    -0.344187
slp         0.386784
caa        -0.213177
thall      -0.096439
output      0.421741
Name: thalachh, dtype: float64


In [12]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.age)

PearsonR Correlation Coefficent Matrix
age         1.000000
sex        -0.098447
cp         -0.068653
trtbps      0.279351
chol        0.213678
fbs         0.121308
restecg    -0.116211
thalachh   -0.398522
exng        0.096801
oldpeak     0.210013
slp        -0.168814
caa         0.276326
thall       0.068001
output     -0.225439
Name: age, dtype: float64


In [13]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.trtbps)

PearsonR Correlation Coefficent Matrix
age         0.279351
sex        -0.056769
cp          0.047608
trtbps      1.000000
chol        0.123174
fbs         0.177531
restecg    -0.114103
thalachh   -0.046698
exng        0.067616
oldpeak     0.193216
slp        -0.121475
caa         0.101389
thall       0.062210
output     -0.144931
Name: trtbps, dtype: float64


In [14]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.oldpeak)

PearsonR Correlation Coefficent Matrix
age         0.210013
sex         0.096093
cp         -0.149230
trtbps      0.193216
chol        0.053952
fbs         0.005747
restecg    -0.058770
thalachh   -0.344187
exng        0.288223
oldpeak     1.000000
slp        -0.577537
caa         0.222682
thall       0.210244
output     -0.430696
Name: oldpeak, dtype: float64


In [15]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.slp)

PearsonR Correlation Coefficent Matrix
age        -0.168814
sex        -0.030711
cp          0.119717
trtbps     -0.121475
chol       -0.004038
fbs        -0.059894
restecg     0.093045
thalachh    0.386784
exng       -0.257748
oldpeak    -0.577537
slp         1.000000
caa        -0.080155
thall      -0.104764
output      0.345877
Name: slp, dtype: float64


In [16]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.thall)

PearsonR Correlation Coefficent Matrix
age         0.068001
sex         0.210041
cp         -0.161736
trtbps      0.062210
chol        0.098803
fbs        -0.032019
restecg    -0.011981
thalachh   -0.096439
exng        0.206754
oldpeak     0.210244
slp        -0.104764
caa         0.151832
thall       1.000000
output     -0.344029
Name: thall, dtype: float64


In [17]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.sex)

PearsonR Correlation Coefficent Matrix
age        -0.098447
sex         1.000000
cp         -0.049353
trtbps     -0.056769
chol       -0.197912
fbs         0.045032
restecg    -0.058196
thalachh   -0.044020
exng        0.141664
oldpeak     0.096093
slp        -0.030711
caa         0.118261
thall       0.210041
output     -0.280937
Name: sex, dtype: float64


In [18]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.chol)

PearsonR Correlation Coefficent Matrix
age         0.213678
sex        -0.197912
cp         -0.076904
trtbps      0.123174
chol        1.000000
fbs         0.013294
restecg    -0.151040
thalachh   -0.009940
exng        0.067023
oldpeak     0.053952
slp        -0.004038
caa         0.070511
thall       0.098803
output     -0.085239
Name: chol, dtype: float64


In [19]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.restecg)

PearsonR Correlation Coefficent Matrix
age        -0.116211
sex        -0.058196
cp          0.044421
trtbps     -0.114103
chol       -0.151040
fbs        -0.084189
restecg     1.000000
thalachh    0.044123
exng       -0.070733
oldpeak    -0.058770
slp         0.093045
caa        -0.072042
thall      -0.011981
output      0.137230
Name: restecg, dtype: float64


In [20]:
# computes the standard correlation coefficient (Pearson’s r) of every feature 
corr_pearson = data.corr(method='pearson')
print('PearsonR Correlation Coefficent Matrix')
print(corr_pearson.fbs)

PearsonR Correlation Coefficent Matrix
age         0.121308
sex         0.045032
cp          0.094444
trtbps      0.177531
chol        0.013294
fbs         1.000000
restecg    -0.084189
thalachh   -0.008567
exng        0.025665
oldpeak     0.005747
slp        -0.059894
caa         0.137979
thall      -0.032019
output     -0.028046
Name: fbs, dtype: float64


In [21]:
top_2_db = data.drop(['thalachh','age','trtbps','oldpeak','slp','thall','sex','restecg','fbs','caa','chol'], axis = 1)
top_2_db.head()

Unnamed: 0,cp,exng,output
0,3,0,1
1,2,0,1
2,1,0,1
3,1,0,1
4,0,1,1


In [22]:
top_5_db = data.drop(['oldpeak','slp','thall','sex','restecg','fbs','caa','chol'], axis = 1)
top_5_db.head()

Unnamed: 0,age,cp,trtbps,thalachh,exng,output
0,63,3,145,150,0,1
1,37,2,130,187,0,1
2,41,1,130,172,0,1
3,56,1,120,178,0,1
4,57,0,120,163,1,1


In [23]:
top_10_db = data.drop(['restecg','fbs','caa'], axis = 1)
top_10_db.head()

Unnamed: 0,age,sex,cp,trtbps,chol,thalachh,exng,oldpeak,slp,thall,output
0,63,1,3,145,233,150,0,2.3,0,1,1
1,37,1,2,130,250,187,0,3.5,0,2,1
2,41,0,1,130,204,172,0,1.4,2,2,1
3,56,1,1,120,236,178,0,0.8,2,2,1
4,57,0,0,120,354,163,1,0.6,2,2,1


In [44]:
# Dividing dataset into training and testing models for Top 2 features of the dataset
X_train, X_test, y_train, y_test = train_test_split(top_2_db.drop(['output'], axis=1),
                                                    top_2_db['output'], test_size=0.2)

# Getting the shape of the training data
print(X_train.shape)
print(y_train.shape)

# Printing the first 5 values of the training data
X_train[0:5]

(242, 2)
(242,)


Unnamed: 0,cp,exng
196,2,0
51,0,0
85,2,0
182,0,0
161,1,0


In [45]:
# Creating a logistic regression function modeled on the training data
LogReg = LogisticRegression(solver = 'liblinear')
LogReg.fit(X_train, y_train)

y_pred = LogReg.predict(X_test)

print("For the top 2 features of the dataset \n")
print("\nClassification Report :\n",classification_report(y_test, y_pred))

y_train_pred = cross_val_predict(LogReg, X_train, y_train, cv = 5)
print("\nConfusion Matrix :\n", confusion_matrix(y_train, y_train_pred))

print("\nPrecision Score : ", precision_score(y_train, y_train_pred))


Classification Report :
               precision    recall  f1-score   support

           0       0.73      0.83      0.77        29
           1       0.82      0.72      0.77        32

    accuracy                           0.77        61
   macro avg       0.77      0.77      0.77        61
weighted avg       0.78      0.77      0.77        61


Confusion Matrix :
 [[ 71  38]
 [ 28 105]]

Precision Score :  0.7342657342657343


In [42]:
# Dividing dataset into training and testing models for Top 5 features of the dataset
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(top_5_db.drop(['output'], axis=1),
                                                            top_5_db['output'], test_size=0.2)

# Getting the shape of the training data
print(X_train_2.shape)
print(y_train_2.shape)

# Printing the first 5 values of the training data
X_train_2[0:5]

(242, 5)
(242,)


Unnamed: 0,age,cp,trtbps,thalachh,exng
296,63,0,124,136,1
226,62,1,120,103,0
178,43,0,120,120,1
45,52,1,120,172,0
218,65,0,135,127,0


In [43]:
LogReg = LogisticRegression(solver = 'liblinear')
LogReg.fit(X_train_2, y_train_2)

y_pred_2 = LogReg.predict(X_test_2)

print("For the top 5 features of the dataset \n")
print("\nClassification Report :\n",classification_report(y_test_2, y_pred_2))

y_train_pred_2 = cross_val_predict(LogReg, X_train_2, y_train_2, cv = 5)
print("\nConfusion Matrix :\n", confusion_matrix(y_train_2, y_train_pred_2))

print("\nPrecision Score : ", precision_score(y_train_2, y_train_pred_2))

For the top 5 features of the dataset 


Classification Report :
               precision    recall  f1-score   support

           0       0.61      0.67      0.64        21
           1       0.82      0.78      0.79        40

    accuracy                           0.74        61
   macro avg       0.71      0.72      0.72        61
weighted avg       0.74      0.74      0.74        61


Confusion Matrix :
 [[ 81  36]
 [ 25 100]]

Precision Score :  0.7352941176470589


In [50]:
# Dividing dataset into training and testing models for Top 10 features of the dataset
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(top_10_db.drop(['output'], axis = 1), top_10_db['output'], test_size = 0.2, random_state = 200)

# Getting the shape of the training data
print(X_train_3.shape)
print(y_train_3.shape)

# Printing the first 5 values of the training data
X_train_3[0:5]

(242, 10)
(242,)


Unnamed: 0,age,sex,cp,trtbps,chol,thalachh,exng,oldpeak,slp,thall
167,62,0,0,140,268,160,0,3.6,0,2
271,61,1,3,134,234,145,0,2.6,1,2
270,46,1,0,120,249,144,0,0.8,2,3
60,71,0,2,110,265,130,0,0.0,2,2
94,45,0,1,112,160,138,0,0.0,1,2


In [51]:
LogReg = LogisticRegression(solver = 'liblinear')
LogReg.fit(X_train_3, y_train_3)

y_pred_3 = LogReg.predict(X_test_3)

print("For the top 10 features of the dataset \n")
print("\nClassification Report :\n",classification_report(y_test_3, y_pred_3))

y_train_pred_3 = cross_val_predict(LogReg, X_train_3, y_train_3, cv = 5)
print("\nConfusion Matrix :\n", confusion_matrix(y_train_3, y_train_pred_3))

print("\nPrecision Score : ", precision_score(y_train_3, y_train_pred_3))

For the top 10 features of the dataset 


Classification Report :
               precision    recall  f1-score   support

           0       0.93      0.81      0.87        32
           1       0.82      0.93      0.87        29

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61


Confusion Matrix :
 [[ 77  29]
 [ 19 117]]

Precision Score :  0.8013698630136986


In [52]:
# Dividing dataset into training and testing models for all features of the dataset
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(data.drop(['output'], axis=1),
                                                            data['output'], test_size=0.2, random_state=200)

# Getting the shape of the training data
print(X_train_4.shape)
print(y_train_4.shape)

# Printing the first 5 values of the training data
X_train_4[0:5]

(242, 13)
(242,)


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
167,62,0,0,140,268,0,0,160,0,3.6,0,2,2
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
94,45,0,1,112,160,0,1,138,0,0.0,1,0,2


In [53]:
LogReg = LogisticRegression(solver = 'liblinear')
LogReg.fit(X_train_4, y_train_4)

y_pred_4 = LogReg.predict(X_test_4)

print("For all the features of the dataset \n")
print("\nClassification Report :\n",classification_report(y_test_4, y_pred_4))

y_train_pred_4 = cross_val_predict(LogReg, X_train_4, y_train_4, cv = 5)
print("\nConfusion Matrix :\n", confusion_matrix(y_train_4, y_train_pred_4))

print("\nPrecision Score : ", precision_score(y_train_4, y_train_pred_4))

For all the features of the dataset 


Classification Report :
               precision    recall  f1-score   support

           0       0.89      0.78      0.83        32
           1       0.79      0.90      0.84        29

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61


Confusion Matrix :
 [[ 81  25]
 [ 12 124]]

Precision Score :  0.8322147651006712
