In [1]:
# Import statements

import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd  

# To run regressions, we need the package Statsmodels
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import Lasso

  import pandas.util.testing as tm


In [2]:
# Make Google Drive available inside the Colab runtime so that files stored
# there can be read through the local path /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Read the heart CSV from the mounted Drive folder into a DataFrame and
# preview its first five rows.
Data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/data/heart.csv')
Data.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Remove bad Data: keep only rows with a strictly positive Cholesterol value.
# NOTE(review): a Cholesterol of 0 is presumably a placeholder for a missing
# measurement -- confirm against the dataset description.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
Data = Data[Data['Cholesterol'] > 0].copy()
Data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Encode the categorical predictors as numbers.
#
# Two-level columns become a single 0/1 indicator:
#   Sex            -> 1 when the value is "F", otherwise 0
#   ExerciseAngina -> 1 when the value is "Y", otherwise 0
# The dtype guard keeps the cell safe to re-run: comparing an already-encoded
# integer column against "F"/"Y" would silently overwrite it with all zeros.
for column, positive_label in (("Sex", "F"), ("ExerciseAngina", "Y")):
    if not pd.api.types.is_numeric_dtype(Data[column]):
        Data[column] = (Data[column] == positive_label).astype("int")

# Multi-level columns (ChestPainType, RestingECG, ST_Slope) are expanded into
# one indicator column per level. ExerciseAngina is already numeric here, so
# get_dummies passes it through unchanged; it stays in the selection so that
# `dummies` keeps the same set of columns as before.
# dtype=int forces 0/1 integers on every pandas version (newer releases
# default to booleans), keeping the frame purely numeric for the regression.
dummies = pd.get_dummies(
    Data[["ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]],
    dtype=int,
)
dummies.head()

Unnamed: 0,ExerciseAngina,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0,0,1,0,0,0,1,0,0,0,1
1,0,0,0,1,0,0,1,0,0,1,0
2,0,0,1,0,0,0,0,1,0,0,1
3,1,1,0,0,0,0,1,0,0,1,0
4,0,0,0,1,0,0,1,0,0,0,1


In [None]:
# Drop the original text-valued categorical columns; their information is now
# carried by the indicator columns held in `dummies`.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that
# earlier cells have already displayed.
Data = Data.drop(columns=["ChestPainType", "RestingECG", "ST_Slope"])

In [None]:
# Attach every indicator column from `dummies` to the main table, one column
# at a time (rows are matched on the shared index), then preview the result.
for indicator_name, indicator_values in dummies.items():
    Data[indicator_name] = indicator_values
Data.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,0,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,1,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,0,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,1,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,0,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1


In [None]:
# List the column labels of the table after encoding, to see which names are
# available as regression inputs.
Data.columns

Index(['Age', 'Sex', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR',
       'ExerciseAngina', 'Oldpeak', 'HeartDisease', 'ChestPainType_ASY',
       'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA',
       'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ST_Slope_Down',
       'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

In [None]:
# Show the first 100 rows of the encoded table (positional slice).
Data.iloc[:100]

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,0,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,1,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,0,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,1,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,0,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,58,0,130,263,0,140,1,2.0,1,1,0,0,0,0,1,0,0,1,0
96,43,0,142,207,0,138,0,0.0,0,0,1,0,0,0,1,0,0,0,1
97,39,0,160,147,1,160,0,0.0,0,0,0,1,0,0,1,0,0,0,1
98,56,0,120,85,0,140,0,0.0,0,1,0,0,0,0,1,0,0,0,1


In [None]:
# Pairwise Pearson correlations between all columns; the HeartDisease row
# shows how strongly each column moves with the outcome.
Data.corr(method="pearson")

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
Age,1.0,-0.040917,0.259865,0.058758,0.241338,-0.382112,0.245908,0.286006,0.298617,0.186737,-0.22982,-0.022271,0.050671,0.179061,-0.238592,0.109853,0.148987,0.202548,-0.272296
Sex,-0.040917,1.0,-0.034363,0.107045,-0.096075,0.157002,-0.192579,-0.125743,-0.292779,-0.176468,0.138835,0.080302,-0.013732,0.029859,0.002763,-0.037574,-0.073531,-0.102288,0.136713
RestingBP,0.259865,-0.034363,1.0,0.095939,0.173765,-0.125774,0.161035,0.198575,0.173242,0.079102,-0.076092,-0.054299,0.065069,0.023214,-0.086085,0.08669,0.096915,0.081746,-0.127077
Cholesterol,0.058758,0.107045,0.095939,1.0,0.054012,-0.019856,0.086768,0.058488,0.103866,0.09347,-0.015945,-0.068441,-0.050253,0.079589,-0.04695,-0.028806,-0.010734,0.103996,-0.099063
FastingBS,0.241338,-0.096075,0.173765,0.054012,1.0,-0.10271,0.109995,0.055568,0.160594,0.05027,-0.093325,0.00585,0.049295,0.071935,-0.165081,0.135072,0.089243,0.105542,-0.147309
MaxHR,-0.382112,0.157002,-0.125774,-0.019856,-0.10271,1.0,-0.396289,-0.259533,-0.377212,-0.371491,0.243329,0.152321,0.091131,0.095477,0.039154,-0.159973,-0.077855,-0.346152,0.382786
ExerciseAngina,0.245908,-0.192579,0.161035,0.086768,0.109995,-0.396289,1.0,0.465491,0.551834,0.483025,-0.317049,-0.197576,-0.118155,-0.0111,-0.090978,0.132127,0.135442,0.440354,-0.50396
Oldpeak,0.286006,-0.125743,0.198575,0.058488,0.055568,-0.259533,0.465491,1.0,0.495696,0.346352,-0.3032,-0.135819,0.042978,0.09605,-0.130778,0.062598,0.390171,0.370642,-0.553175
HeartDisease,0.298617,-0.292779,0.173242,0.103866,0.160594,-0.377212,0.551834,0.495696,1.0,0.522982,-0.375634,-0.222146,-0.053765,0.069599,-0.133255,0.095921,0.132193,0.591554,-0.653759
ChestPainType_ASY,0.186737,-0.176468,0.079102,0.09347,0.05027,-0.371491,0.483025,0.346352,0.522982,1.0,-0.530697,-0.536862,-0.239224,0.054987,-0.107715,0.078984,0.111273,0.345895,-0.398138


In [None]:
# Regression analysis: ordinary least squares of HeartDisease on six
# predictors plus an intercept term.
# The outcome is a 0/1 indicator, so the fitted values are not constrained to
# the [0, 1] interval.
Y = Data.loc[:, "HeartDisease"]
X = Data.loc[
    :,
    ["Age", "Sex", "ChestPainType_ASY", "ExerciseAngina", "ST_Slope_Flat", "Oldpeak"],
]
# add_constant adds the "const" column that serves as the intercept.
X2 = sm.add_constant(X)
fit1 = sm.OLS(endog=Y, exog=X2).fit()
fit1.summary()

0,1,2,3
Dep. Variable:,HeartDisease,R-squared:,0.568
Model:,OLS,Adj. R-squared:,0.565
Method:,Least Squares,F-statistic:,161.9
Date:,"Sat, 08 Jan 2022",Prob (F-statistic):,4.53e-131
Time:,15:46:05,Log-Likelihood:,-227.59
No. Observations:,746,AIC:,469.2
Df Residuals:,739,BIC:,501.5
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.1430,0.070,-2.051,0.041,-0.280,-0.006
Age,0.0049,0.001,3.632,0.000,0.002,0.008
Sex,-0.1890,0.029,-6.559,0.000,-0.246,-0.132
ChestPainType_ASY,0.2213,0.028,7.803,0.000,0.166,0.277
ExerciseAngina,0.1635,0.032,5.147,0.000,0.101,0.226
ST_Slope_Flat,0.3467,0.028,12.437,0.000,0.292,0.401
Oldpeak,0.0789,0.013,5.920,0.000,0.053,0.105

0,1,2,3
Omnibus:,15.828,Durbin-Watson:,2.209
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.195
Skew:,-0.174,Prob(JB):,5.57e-06
Kurtosis:,3.81,Cond. No.,311.0


In [None]:
# Keep the model's fitted values next to the observed outcome in a new
# "Predicted" column (matched on the row index), then preview the table.
Data = Data.assign(Predicted=fit1.fittedvalues)
Data.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Predicted
0,40,0,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1,0.052139
1,49,1,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0,0.332718
2,37,0,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1,0.037503
3,48,1,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0,0.752063
4,54,0,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1,0.120442


In [None]:
# Cross-check the R-squared reported by the regression summary: for an OLS fit
# with an intercept, R-squared equals the squared correlation between the
# observed outcome and the fitted values.
CorrTable = Data.loc[:, ["HeartDisease", "Predicted"]].corr()
print(CorrTable)
CorrTable.loc["HeartDisease", "Predicted"] ** 2

              HeartDisease  Predicted
HeartDisease      1.000000   0.753664
Predicted         0.753664   1.000000


0.568009496097745