In [1]:
#Import the required libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import matplotlib.mlab as mlab

#Suppress warnings
import warnings
warnings.filterwarnings("ignore")





In [2]:
# Load the Framingham heart-disease CSV and discard the 'education' column,
# which is not used anywhere in this analysis.
heart_df = pd.read_csv("../input/framingham_heart_disease.csv").drop(columns=['education'])
heart_df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Rename the 'male' column to 'Sex_male' so the name states what the flag encodes.
heart_df = heart_df.rename(columns={'male': 'Sex_male'})
heart_df.head()

Unnamed: 0,Sex_male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Number of missing values in each column
heart_df.isna().sum()

Sex_male             0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [5]:
# Count the rows that have at least one missing value and report their share
# of the dataset (the rows themselves are removed in a later cell, not here).
has_missing = heart_df.isnull().any(axis=1)
count = int(has_missing.sum())
missing_percent = round((count / len(heart_df.index)) * 100)
print('Total number of rows with missing values is ', count)
print('since it is only', missing_percent, 'percent of the entire dataset the rows with missing values are excluded.')

Total number of rows with missing values is  489
since it is only 12 percent of the entire dataset the rows with missing values are excluded.


In [6]:
# Remove every row (axis=0) that contains a missing value, then summarise
# the remaining data.
heart_df = heart_df.dropna(axis=0)
heart_df.describe()

Unnamed: 0,Sex_male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,0.445185,49.578821,0.488397,9.005335,0.030408,0.005601,0.311816,0.027207,236.952787,132.365964,82.933716,25.809651,75.703921,81.883169,0.152574
std,0.497053,8.569322,0.499932,11.92244,0.17173,0.074643,0.463297,0.162709,44.610417,22.051951,11.933321,4.065894,11.957763,23.888039,0.359624
min,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.09,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.41,75.0,78.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,1.0,0.0,264.0,144.0,90.0,28.06,82.0,87.0,0.0
max,1.0,70.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [7]:
# Confirm that no missing values remain after the rows were dropped
heart_df.isna().sum()


Sex_male           0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [8]:
# Logistic regression - add an intercept column ('const') to the design matrix,
# because sm.Logit does not add one on its own.
from statsmodels.tools import add_constant
heart_df_constant = add_constant(data=heart_df)
heart_df_constant.head()

Unnamed: 0,const,Sex_male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1.0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,1.0,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1.0,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,1.0,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,1.0,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [9]:
# Logistic regression - fit the full model on every predictor

# Compatibility shim: publish `chisqprob` on scipy.stats as the chi-square
# survival function.
# NOTE(review): presumably needed because the installed statsmodels still looks
# up scipy.stats.chisqprob when building the summary - confirm, and drop the
# shim if the environment no longer needs it.
def _chisqprob(chisq, df):
    return st.chi2.sf(chisq, df)

st.chisqprob = _chisqprob

# Every column except the last one; the last column of heart_df_constant is
# the outcome, TenYearCHD.
cols = heart_df_constant.columns[:-1]
model = sm.Logit(heart_df['TenYearCHD'], heart_df_constant[cols])
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.377199
         Iterations 7


0,1,2,3
Dep. Variable:,TenYearCHD,No. Observations:,3749.0
Model:,Logit,Df Residuals:,3734.0
Method:,MLE,Df Model:,14.0
Date:,"Thu, 11 Feb 2021",Pseudo R-squ.:,0.1169
Time:,17:23:20,Log-Likelihood:,-1414.1
converged:,True,LL-Null:,-1601.4
Covariance Type:,nonrobust,LLR p-value:,2.922e-71

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-8.6463,0.687,-12.577,0.000,-9.994,-7.299
Sex_male,0.5740,0.107,5.343,0.000,0.363,0.785
age,0.0640,0.007,9.787,0.000,0.051,0.077
currentSmoker,0.0732,0.155,0.473,0.636,-0.230,0.376
cigsPerDay,0.0184,0.006,3.003,0.003,0.006,0.030
BPMeds,0.1446,0.232,0.622,0.534,-0.311,0.600
prevalentStroke,0.7191,0.489,1.471,0.141,-0.239,1.677
prevalentHyp,0.2146,0.136,1.574,0.116,-0.053,0.482
diabetes,0.0025,0.312,0.008,0.994,-0.609,0.614


In [10]:

def back_feature_elem(data_frame, dep_var, col_list, threshold=0.05):
    """Backward feature elimination driven by logistic-regression p-values.

    Fits ``sm.Logit(dep_var, data_frame[col_list])``; while the largest
    p-value (rounded to 3 decimals) is not below ``threshold``, the column
    carrying that p-value is removed and the model is refitted.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the candidate predictor columns.
    dep_var : array-like
        Outcome handed to ``sm.Logit`` as the dependent variable.
    col_list : pandas.Index or list-like of column labels
        Columns to start the elimination from.
    threshold : float, default 0.05
        A model is accepted once its largest rounded p-value is strictly
        below this value.

    Returns
    -------
    The fitted result of the first accepted model, or ``None`` when every
    column has been eliminated without any model being accepted.
    """
    # Normalise to an Index so plain lists are accepted and .drop() is available.
    remaining = pd.Index(col_list)
    while len(remaining) > 0:
        result = sm.Logit(dep_var, data_frame[remaining]).fit(disp=0)
        # One-element Series: label = least significant column, value = its p-value.
        worst = result.pvalues.round(3).nlargest(1)
        # .iloc[0] reads the value by position; plain [0] on a label-indexed
        # Series is deprecated positional access.
        if worst.iloc[0] < threshold:
            return result
        remaining = remaining.drop(worst.index)
    # Nothing left to fit: make the "no acceptable model" outcome explicit.
    return None

# Run the backward elimination starting from all candidate columns and show
# the summary of the model that survives.
result = back_feature_elem(
    data_frame=heart_df_constant,
    dep_var=heart_df['TenYearCHD'],
    col_list=cols,
)
result.summary()



0,1,2,3
Dep. Variable:,TenYearCHD,No. Observations:,3749.0
Model:,Logit,Df Residuals:,3742.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 11 Feb 2021",Pseudo R-squ.:,0.1148
Time:,17:23:20,Log-Likelihood:,-1417.6
converged:,True,LL-Null:,-1601.4
Covariance Type:,nonrobust,LLR p-value:,2.548e-76

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-9.1211,0.468,-19.491,0.000,-10.038,-8.204
Sex_male,0.5813,0.105,5.521,0.000,0.375,0.788
age,0.0654,0.006,10.330,0.000,0.053,0.078
cigsPerDay,0.0197,0.004,4.803,0.000,0.012,0.028
totChol,0.0023,0.001,2.099,0.036,0.000,0.004
sysBP,0.0174,0.002,8.166,0.000,0.013,0.022
glucose,0.0076,0.002,4.573,0.000,0.004,0.011


In [11]:
#Interpreting the results: Odds Ratio, Confidence Intervals and Pvalues
# The coefficients are on the log-odds scale; exponentiating them (and their
# confidence bounds) turns them into odds ratios.
params = np.exp(result.params)
pvalue = result.pvalues.round(3)

conf = np.exp(result.conf_int())
conf.columns = ['CI 95%(2.5%)', 'CI 95%(97.5%)']
conf = conf.assign(**{'Odds Ratio': params, 'pvalue': pvalue})
print(conf)


            CI 95%(2.5%)  CI 95%(97.5%)  Odds Ratio  pvalue
const           0.000044       0.000274    0.000109   0.000
Sex_male        1.454877       2.198166    1.788313   0.000
age             1.054409       1.080897    1.067571   0.000
cigsPerDay      1.011730       1.028128    1.019896   0.000
totChol         1.000150       1.004386    1.002266   0.036
sysBP           1.013299       1.021791    1.017536   0.000
glucose         1.004343       1.010895    1.007614   0.000


In [12]:
#Splitting data to train and test split
import sklearn
# train_test_split is provided by sklearn.model_selection; the older
# sklearn.cross_validation module is no longer part of current scikit-learn.
from sklearn.model_selection import train_test_split

# Predictors kept by the backward elimination, plus the outcome column.
predictor_names = ['age','Sex_male','cigsPerDay','totChol','sysBP','glucose']
new_features = heart_df[predictor_names + ['TenYearCHD']]
x = new_features[predictor_names]
y = new_features['TenYearCHD']

# 80/20 split; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.20, random_state=5
)

In [13]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the training split and
# predict the outcome for the held-out rows.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)

In [14]:
#Confusion matrix
from sklearn import metrics
# NOTE(review): this assignment rebinds the name `confusion_matrix`, which the
# first cell imported as a function from sklearn.metrics. After this cell the
# name refers to the resulting array, so calling confusion_matrix(...) later
# would fail; the name is kept here only so existing references keep working.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_matrix)

[[645   2]
 [ 95   8]]


In [15]:
#Model Accuracy
# Read this number together with the confusion matrix: 647 of the 750 test
# rows are negatives, so always predicting "no CHD" would already score about
# 0.863, and only 8 of the 103 positive cases are detected by this model.
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.8706666666666667