In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("heart.csv")
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
explanation_row = ["age: age in years", "sex: 1 is male", "cp: chest pain type-- Value 0: asymptomatic-- Value 1: atypical angina-- Value 2: non-anginal pain-- Value 3: typical angina", "trestbps: resting blood pressure", "chol: serum cholestoral in mg/dl", "fbs:  (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) ", "restecg: resting electrocardiographic results-- Value 0: showing probable or definite left ventricular hypertrophy by Estes' criteria-- Value 1: normal-- Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)", "thalach: maximum heart rate achieved ", " 38 exang: exercise induced angina (1 = yes; 0 = no) ", " 40 oldpeak = ST depression induced by exercise relative to rest ", "slope: the slope of the peak exercise ST segment0: downsloping; 1: flat; 2: upsloping", " 44 ca: number of major vessels (0-3) colored by flourosopy ", "thal: 1 = fixed defect; 2 = normal; 7 = reversable defect ", "target: 0 = disease, 1 = no disease"]


In [4]:
for i in explanation_row:
    print(i,"\n")

age: age in years 

sex: 1 is male 

cp: chest pain type-- Value 0: asymptomatic-- Value 1: atypical angina-- Value 2: non-anginal pain-- Value 3: typical angina 

trestbps: resting blood pressure 

chol: serum cholestoral in mg/dl 

fbs:  (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)  

restecg: resting electrocardiographic results-- Value 0: showing probable or definite left ventricular hypertrophy by Estes' criteria-- Value 1: normal-- Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 

thalach: maximum heart rate achieved  

 38 exang: exercise induced angina (1 = yes; 0 = no)  

 40 oldpeak = ST depression induced by exercise relative to rest  

slope: the slope of the peak exercise ST segment0: downsloping; 1: flat; 2: upsloping 

 44 ca: number of major vessels (0-3) colored by flourosopy  

thal: 1 = fixed defect; 2 = normal; 7 = reversable defect  

target: 0 = disease, 1 = no disease 



In [5]:
### Conversion of columns into dummy variable cols

## Convert angina to dummy variables

df["atypical_angina"] = np.where(df["cp"]==1, 1, 0)
df["non-anginal_pain"] = np.where(df["cp"]==2, 1, 0)
df["typical_angina"] = np.where(df["cp"]==3, 1, 0)


## ecg

df["left_ventricular_hypertrophy"] = np.where(df["restecg"]==0, 1, 0)
df["stt_wave_abnormality"] = np.where(df["restecg"]==2, 1, 0)


## slope: 
# In general, the occurrence of horizontal or down-sloping ST-segment 
# depression at a lower workload (calculated in METs) or heart rate indicates 
# a worse prognosis and higher likelihood of multi-vessel disease. 

df["slope_risk"] = np.where((df.slope == 1)|(df.slope == 2), 1, 0)


## thal:
# --Nuclear stress testing requires the injection of a tracer, commonly 
# technicium 99M (Myoview or Cardiolyte), which is then taken up by healthy, 
# viable myocardial cells. A camera (detector) is used afterwards to image 
# the heart and compare segments. A coronary stenosis is detected when a 
# myocardial segment takes up the nuclear tracer at rest, but not during 
# cardiac stress. This is called a "reversible defect." Scarred myocardium 
# from prior infarct will not take up tracer at all and is referred to as a "fixed defect." --

df["coronary_stenosis"] = np.where(df["thal"]==3, 1, 0)
df["prior_infarct"] = np.where(df["thal"]==1, 1, 0)


## target:
# if target == 0, disease = yes

df["heart_disease"] = np.where(df["target"]==0, 1, 0)


# Fix nan values

df["thal"] = np.where(df.thal==0, np.nan, df.thal)
df["ca"] = np.where(df.ca == 4, np.nan, df["ca"])


# Dropnan values

df = df.dropna()

In [6]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,target,atypical_angina,non-anginal_pain,typical_angina,left_ventricular_hypertrophy,stt_wave_abnormality,slope_risk,coronary_stenosis,prior_infarct,heart_disease
0,63,1,3,145,233,1,0,150,0,2.3,...,1,0,0,1,1,0,0,0,1,0
1,37,1,2,130,250,0,1,187,0,3.5,...,1,0,1,0,0,0,0,0,0,0
2,41,0,1,130,204,0,0,172,0,1.4,...,1,1,0,0,1,0,1,0,0,0
3,56,1,1,120,236,0,1,178,0,0.8,...,1,1,0,0,0,0,1,0,0,0
4,57,0,0,120,354,0,1,163,1,0.6,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,...,0,0,0,0,0,0,1,1,0,1
299,45,1,3,110,264,0,1,132,0,1.2,...,0,0,0,1,0,0,1,1,0,1
300,68,1,0,144,193,1,1,141,0,3.4,...,0,0,0,0,0,0,1,1,0,1
301,57,1,0,130,131,0,1,115,1,1.2,...,0,0,0,0,0,0,1,1,0,1


In [7]:
# Fisher's exact test for all dummy variables

# list of dummy variables:

dummylist = df.columns[14:-1]

pd.crosstab(df.atypical_angina, df.heart_disease)

heart_disease,0,1
atypical_angina,Unnamed: 1_level_1,Unnamed: 2_level_1
0,120,127
1,40,9


In [8]:
# Import fisher exact test

from scipy.stats import fisher_exact


In [9]:
for i in dummylist:
    print( i, "-  associated with heart disease? fishers exact:", fisher_exact(pd.crosstab(df[i], df.heart_disease)), "\n")

atypical_angina -  associated with heart disease? fishers exact: (0.2125984251968504, 1.647271468248216e-05) 

non-anginal_pain -  associated with heart disease? fishers exact: (0.22294654498044328, 1.384416565270507e-07) 

typical_angina -  associated with heart disease? fishers exact: (0.4883720930232558, 0.13282277424360076) 

left_ventricular_hypertrophy -  associated with heart disease? fishers exact: (1.8667009778692742, 0.0101836075512629) 

stt_wave_abnormality -  associated with heart disease? fishers exact: (3.5864661654135337, 0.33672657360980474) 

slope_risk -  associated with heart disease? fishers exact: (0.6158940397350994, 0.36467078016251575) 

coronary_stenosis -  associated with heart disease? fishers exact: (9.030864197530864, 2.351332358267592e-17) 

prior_infarct -  associated with heart disease? fishers exact: (2.4838709677419355, 0.08784772918542935) 



In [10]:
## ANOVA ols 

import statsmodels.api as sm
from statsmodels.formula.api import ols

In [11]:
## OLS forst continuous quantitative measures, association with heart diseaseu

X = df[['trestbps', 'chol', 'thalach', 'oldpeak']]
X = sm.add_constant(X) # We must add the intercept using the add_constant function
Y = df.heart_disease

model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print_model

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,heart_disease,R-squared:,0.28
Model:,OLS,Adj. R-squared:,0.27
Method:,Least Squares,F-statistic:,28.25
Date:,"Sun, 02 Feb 2020",Prob (F-statistic):,7.71e-20
Time:,20:03:44,Log-Likelihood:,-165.3
No. Observations:,296,AIC:,340.6
Df Residuals:,291,BIC:,359.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9815,0.274,3.582,0.000,0.442,1.521
trestbps,0.0019,0.001,1.287,0.199,-0.001,0.005
chol,0.0005,0.000,1.092,0.276,-0.000,0.001
thalach,-0.0069,0.001,-5.993,0.000,-0.009,-0.005
oldpeak,0.1298,0.023,5.611,0.000,0.084,0.175

0,1,2,3
Omnibus:,37.94,Durbin-Watson:,0.547
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14.019
Skew:,0.288,Prob(JB):,0.000903
Kurtosis:,2.103,Cond. No.,3540.0


In [12]:
## Add age and sex

X = df[['trestbps', 'chol', 'thalach', 'oldpeak', 'age', 'sex']]
X = sm.add_constant(X) # We must add the intercept using the add_constant function
Y = df.heart_disease

model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print_model

0,1,2,3
Dep. Variable:,heart_disease,R-squared:,0.347
Model:,OLS,Adj. R-squared:,0.333
Method:,Least Squares,F-statistic:,25.57
Date:,"Sun, 02 Feb 2020",Prob (F-statistic):,2.47e-24
Time:,20:03:45,Log-Likelihood:,-150.83
No. Observations:,296,AIC:,315.7
Df Residuals:,289,BIC:,341.5
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4987,0.318,1.567,0.118,-0.128,1.125
trestbps,0.0020,0.001,1.435,0.153,-0.001,0.005
chol,0.0010,0.000,2.012,0.045,2.07e-05,0.002
thalach,-0.0065,0.001,-5.438,0.000,-0.009,-0.004
oldpeak,0.1165,0.022,5.239,0.000,0.073,0.160
age,0.0020,0.003,0.649,0.517,-0.004,0.008
sex,0.2849,0.052,5.444,0.000,0.182,0.388

0,1,2,3
Omnibus:,17.894,Durbin-Watson:,0.69
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8.706
Skew:,0.208,Prob(JB):,0.0129
Kurtosis:,2.27,Cond. No.,4370.0


In [13]:
df["sexchol"] = df["sex"]*df["chol"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
df["invsexchol"] = np.where(df["sex"] == 1, 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
df["invsexchol"] = df["invsexchol"] * df["chol"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
## Check for interaction between sex and chol. Because sex is binary, check for male = 1 and female = 1 each.
# Not sure if this makes a difference? 
# Checked, it just reverses the beta. No difference. No statistical significance.

In [17]:
# Your code here:

X = df[['trestbps', 'chol', 'thalach', 'oldpeak', "age", 'sex', 'sexchol', "ca"]]
X = sm.add_constant(X) # We must add the intercept using the add_constant function
Y = df.heart_disease

model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print_model

0,1,2,3
Dep. Variable:,heart_disease,R-squared:,0.427
Model:,OLS,Adj. R-squared:,0.411
Method:,Least Squares,F-statistic:,26.73
Date:,"Sun, 02 Feb 2020",Prob (F-statistic):,8.270000000000001e-31
Time:,20:03:45,Log-Likelihood:,-131.46
No. Observations:,296,AIC:,280.9
Df Residuals:,287,BIC:,314.1
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7245,0.315,2.302,0.022,0.105,1.344
trestbps,0.0023,0.001,1.722,0.086,-0.000,0.005
chol,0.0004,0.001,0.726,0.468,-0.001,0.002
thalach,-0.0059,0.001,-5.272,0.000,-0.008,-0.004
oldpeak,0.0903,0.021,4.228,0.000,0.048,0.132
age,-0.0031,0.003,-1.025,0.306,-0.009,0.003
sex,0.1015,0.226,0.450,0.653,-0.343,0.545
sexchol,0.0006,0.001,0.676,0.500,-0.001,0.002
ca,0.1657,0.027,6.210,0.000,0.113,0.218

0,1,2,3
Omnibus:,8.556,Durbin-Watson:,0.823
Prob(Omnibus):,0.014,Jarque-Bera (JB):,7.53
Skew:,0.32,Prob(JB):,0.0232
Kurtosis:,2.552,Cond. No.,5460.0


In [18]:
# Your code here:

X = df[['trestbps', 'chol', 'thalach', 'oldpeak', "age", 'sex', 'invsexchol', "ca"]]
X = sm.add_constant(X) # We must add the intercept using the add_constant function
Y = df.heart_disease

model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print_model

0,1,2,3
Dep. Variable:,heart_disease,R-squared:,0.427
Model:,OLS,Adj. R-squared:,0.411
Method:,Least Squares,F-statistic:,26.73
Date:,"Sun, 02 Feb 2020",Prob (F-statistic):,8.270000000000001e-31
Time:,20:03:45,Log-Likelihood:,-131.46
No. Observations:,296,AIC:,280.9
Df Residuals:,287,BIC:,314.1
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7245,0.315,2.302,0.022,0.105,1.344
trestbps,0.0023,0.001,1.722,0.086,-0.000,0.005
chol,0.0010,0.001,1.624,0.106,-0.000,0.002
thalach,-0.0059,0.001,-5.272,0.000,-0.008,-0.004
oldpeak,0.0903,0.021,4.228,0.000,0.048,0.132
age,-0.0031,0.003,-1.025,0.306,-0.009,0.003
sex,0.1015,0.226,0.450,0.653,-0.343,0.545
invsexchol,-0.0006,0.001,-0.676,0.500,-0.002,0.001
ca,0.1657,0.027,6.210,0.000,0.113,0.218

0,1,2,3
Omnibus:,8.556,Durbin-Watson:,0.823
Prob(Omnibus):,0.014,Jarque-Bera (JB):,7.53
Skew:,0.32,Prob(JB):,0.0232
Kurtosis:,2.552,Cond. No.,5040.0


In [19]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,non-anginal_pain,typical_angina,left_ventricular_hypertrophy,stt_wave_abnormality,slope_risk,coronary_stenosis,prior_infarct,heart_disease,sexchol,invsexchol
count,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,...,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0
mean,54.523649,0.679054,0.959459,131.60473,247.155405,0.14527,0.523649,149.560811,0.327703,1.059122,...,0.280405,0.077703,0.489865,0.013514,0.929054,0.388514,0.060811,0.459459,163.138514,84.016892
std,9.059471,0.467631,1.034184,17.72662,51.977011,0.35297,0.526692,22.970792,0.470171,1.166474,...,0.449958,0.268157,0.500744,0.115655,0.257169,0.488238,0.239388,0.499198,117.747119,127.839477
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,1.0,130.0,242.5,0.0,1.0,152.5,0.0,0.8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,211.5,0.0
75%,61.0,1.0,2.0,140.0,275.25,0.0,1.0,166.0,1.0,1.65,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,254.0,210.25
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,353.0,564.0


In [20]:
df.to_csv("heart_edited.csv")