###### Coronary Heart Disease (CHD) Using Logistic Regression

In [25]:
import numpy as np
import pandas as pd

In [26]:
import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.mlab as mlab
%matplotlib inline

In [27]:
# Load the CHD dataset; the relative path means the CSV must sit in the
# notebook's working directory.
chd = pd.read_csv('CHDdataP.csv')

In [28]:
chd.head()

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,1,39,4,0,0,0,0,0,0,195,106.0,70.0,26,80,77,0
1,0,46,2,0,0,0,0,0,0,250,121.0,81.0,28,95,76,0
2,1,48,1,1,20,0,0,0,0,245,127.5,80.0,25,75,70,0
3,0,61,3,1,30,0,0,1,0,225,150.0,95.0,28,65,103,1
4,0,46,3,1,23,0,0,0,0,285,130.0,84.0,23,85,85,0


In [29]:
#chd.info()

In [31]:
from statsmodels.tools import add_constant

# statsmodels' Logit does not add an intercept on its own, so a constant
# column is added to the frame; prepend=True (the default) puts it first.
chd_constant = add_constant(chd, prepend=True)
chd_constant.head(5)

Unnamed: 0,const,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,1.0,1,39,4,0,0,0,0,0,0,195,106.0,70.0,26,80,77,0
1,1.0,0,46,2,0,0,0,0,0,0,250,121.0,81.0,28,95,76,0
2,1.0,1,48,1,1,20,0,0,0,0,245,127.5,80.0,25,75,70,0
3,1.0,0,61,3,1,30,0,0,1,0,225,150.0,95.0,28,65,103,1
4,1.0,0,46,3,1,23,0,0,0,0,285,130.0,84.0,23,85,85,0


In [32]:
def _chisqprob(chisq, df):
    """Survival function of the chi-squared distribution (upper-tail probability)."""
    return st.chi2.sf(chisq, df)


# Compatibility shim: attach a `chisqprob` attribute to scipy.stats,
# implemented with the chi-squared survival function.
st.chisqprob = _chisqprob

# Every column except the last one is used as a regressor.
cols = chd_constant.columns[:-1]

# Fit the full logistic regression on all candidate regressors.
model = sm.Logit(endog=chd.CHD, exog=chd_constant[cols])
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.379958
         Iterations 7


0,1,2,3
Dep. Variable:,CHD,No. Observations:,4240.0
Model:,Logit,Df Residuals:,4224.0
Method:,MLE,Df Model:,15.0
Date:,"Thu, 14 Mar 2019",Pseudo R-squ.:,0.108
Time:,20:36:01,Log-Likelihood:,-1611.0
converged:,True,LL-Null:,-1806.1
,,LLR p-value:,8.041e-74

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-7.7207,0.644,-11.990,0.000,-8.983,-6.459
sex,0.5382,0.100,5.369,0.000,0.342,0.735
age,0.0620,0.006,10.006,0.000,0.050,0.074
education,-0.0149,0.045,-0.328,0.743,-0.104,0.074
currentSmoker,0.0519,0.145,0.359,0.720,-0.232,0.335
cigsPerDay,0.0179,0.006,3.171,0.002,0.007,0.029
BPMeds,0.2445,0.219,1.117,0.264,-0.185,0.674
prevalentStroke,0.9932,0.441,2.253,0.024,0.129,1.857
prevalentHyp,0.2261,0.128,1.763,0.078,-0.025,0.477


###### Automated Feature selection using back_feature_elem method

In [33]:
def back_feature_elem(data_frame, dep_var, col_list, alpha=0.05):
    """Backward feature elimination for a logistic regression.

    Repeatedly fits ``sm.Logit(dep_var, data_frame[col_list])``, removes the
    single column with the highest p-value, and stops as soon as every
    remaining p-value (rounded to 3 decimals) is below ``alpha``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the candidate regressor columns.
    dep_var : array-like
        Dependent variable passed to ``sm.Logit`` as ``endog``.
    col_list : pandas.Index or sequence of column labels
        Candidate columns of ``data_frame`` to start from.
    alpha : float, default 0.05
        Significance threshold; a column is dropped while the largest
        rounded p-value is not strictly below this value.

    Returns
    -------
    The fitted result object of the first model whose p-values are all below
    ``alpha``, or ``None`` if every column was eliminated before that happened.
    """
    # Accept plain lists/tuples as well as an Index: `.drop` below needs an Index.
    col_list = pd.Index(col_list)

    while len(col_list) > 0:
        model = sm.Logit(dep_var, data_frame[col_list])
        result = model.fit(disp=0)

        # p-values are rounded to 3 decimals before ranking and comparison
        # (kept from the original so the selected feature set is unchanged).
        largest_pvalue = result.pvalues.round(3).nlargest(1)

        # The Series is indexed by column label, so the value must be read
        # positionally with .iloc; integer `[0]` lookup on a label index is a
        # deprecated fallback in pandas.
        if largest_pvalue.iloc[0] < alpha:
            return result

        col_list = col_list.drop(largest_pvalue.index)

    # Every column was eliminated without reaching the threshold.
    return None

# Run backward elimination on the constant-augmented frame. This rebinds
# `result`, replacing the full-model fit produced by the earlier cell.
result=back_feature_elem(chd_constant,chd.CHD,cols)

In [34]:
result.summary()

0,1,2,3
Dep. Variable:,CHD,No. Observations:,4240.0
Model:,Logit,Df Residuals:,4233.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 14 Mar 2019",Pseudo R-squ.:,0.1054
Time:,20:36:05,Log-Likelihood:,-1615.7
converged:,True,LL-Null:,-1806.1
,,LLR p-value:,3.664e-79

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-7.8720,0.372,-21.175,0.000,-8.601,-7.143
sex,0.4983,0.097,5.157,0.000,0.309,0.688
age,0.0646,0.006,10.925,0.000,0.053,0.076
cigsPerDay,0.0196,0.004,5.207,0.000,0.012,0.027
prevalentStroke,1.0665,0.436,2.446,0.014,0.212,1.921
diabetes,0.7932,0.217,3.658,0.000,0.368,1.218
sysBP,0.0173,0.002,8.668,0.000,0.013,0.021


###### Selected variables after feature selection (backward elimination)
sex
age
cigsPerDay
prevalentStroke
diabetes
sysBP

###### Splitting data to train and test split

In [37]:
import sklearn
# train_test_split is imported from sklearn.model_selection
# (the earlier attempt used sklearn.cross_validation).
from sklearn.model_selection import train_test_split

# Predictors kept by the backward elimination step, followed by the target.
feature_names = ['sex', 'age', 'cigsPerDay', 'prevalentStroke', 'sysBP', 'diabetes']
target_name = 'CHD'
new_features = chd[feature_names + [target_name]]

# Predictors are every column but the target; the target is the last column.
x = new_features.drop(columns=target_name)
y = new_features[target_name]

# 70/30 split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=5
)

In [63]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training split.
logreg = LogisticRegression()
logreg.fit(X=x_train, y=y_train)

# Predict the held-out rows and echo the predicted labels.
y_pred = logreg.predict(x_test)
print(y_pred)

# Write the predictions to CSV under a single column named "key".
pdict = {"key": y_pred}
pdf = pd.DataFrame(data=pdict)
pdf.to_csv("predict1.csv")

[0 0 0 ... 0 0 0]


In [65]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1076,    2],
       [ 184,   10]], dtype=int64)

###### The confusion matrix shows 1076+10 = 1086 correct predictions and 184+2 = 186 incorrect ones

In [13]:
#logreg Model Using Joblib And Pickle

In [66]:
import pickle
from sklearn.metrics import classification_report

In [67]:
# Serialize the fitted model to disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling fails.
with open("logreg.pkl", "wb") as model_file:
    pickle.dump(logreg, model_file)

In [68]:
# Reload the model from disk; the `with` block closes the file handle.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open("logreg.pkl", "rb") as model_file:
    my_logit = pickle.load(model_file)

In [69]:
my_logit

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
# Score the reloaded model on the held-out split.
reloaded_pred = my_logit.predict(x_test)
print(classification_report(y_test, reloaded_pred))

             precision    recall  f1-score   support

          0       0.85      1.00      0.92      1078
          1       0.83      0.05      0.10       194

avg / total       0.85      0.85      0.79      1272



In [19]:
# Next we start a Flask service, defined in 'chdpr.ipynb' or chdpr.py.

# Once it's started, we can use the code below to call it.

In [88]:
import json
import requests
import pandas as pd

# Shell hint kept from the original (presumably for checking open connections
# to the service): Netstat -an |find/i "established";
url = "http://localhost:5000/api"

# files = {'file': ('CHDtest.csv', open('./CHDtest.csv', 'rb'), 'application/vnd.ms-excel', {'Expires': '0'})}

# Columns removed before sending; what remains are the six model inputs.
unused_columns = [
    'Sno', 'education', 'currentSmoker', 'BPMeds', 'prevalentHyp',
    'totChol', 'diaBP', 'BMI', 'heartRate', 'glucose',
]

# utf-8-sig strips a leading byte-order mark if the file has one.
df = pd.read_csv('./CHDtest.csv', encoding="utf-8-sig")
df = df.drop(columns=unused_columns)

# One JSON object per row, serialized as a JSON array string.
data = df.to_json(orient='records')

print(data)


[{"sex":0,"age":66,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":166.0},{"sex":0,"age":54,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":143.5},{"sex":0,"age":39,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":161.0},{"sex":1,"age":44,"cigsPerDay":20,"prevalentStroke":0,"diabetes":0,"sysBP":135.0},{"sex":0,"age":64,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":185.0},{"sex":0,"age":44,"cigsPerDay":9,"prevalentStroke":0,"diabetes":0,"sysBP":128.0},{"sex":0,"age":63,"cigsPerDay":20,"prevalentStroke":0,"diabetes":0,"sysBP":136.0},{"sex":0,"age":42,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":129.0},{"sex":1,"age":55,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":134.0},{"sex":1,"age":43,"cigsPerDay":0,"prevalentStroke":0,"diabetes":0,"sysBP":115.0},{"sex":1,"age":66,"cigsPerDay":18,"prevalentStroke":0,"diabetes":0,"sysBP":142.0},{"sex":0,"age":40,"cigsPerDay":1,"prevalentStroke":0,"diabetes":0,"sysBP":135.0},{"sex":1,"age":60,"cigsP

In [93]:
header = {'Content-Type': 'application/json', 'Accept': 'application/json'}

# NOTE(review): `data` is already a JSON string (it comes from DataFrame.to_json),
# so json.dumps encodes it a second time and the body is a JSON string literal.
# Presumably the service in chdpr.py expects exactly that - confirm before changing.
# The timeout stops the cell from hanging forever if the service is not reachable.
res = requests.post(url, data = json.dumps(data), headers= header, timeout=30)

# Fail here with a clear HTTP error instead of a confusing JSON decode error later.
res.raise_for_status()
#print(res.text)



In [94]:
# Decode the JSON body returned by the service. The variable is deliberately
# not called `dict`, which would shadow the builtin for the rest of the session.
response_payload = json.loads(res.text)

print(type(response_payload))

# index=[0] gives the frame a single row; pandas requires an explicit index
# when a dict holds only scalar values (which the payload appears to do).
df = pd.DataFrame(response_payload, index=[0])

print(df.head())

df.to_csv('result4.csv')

<class 'dict'>
                                         predictions
0  [{"age":66,"cigsPerDay":0,"diabetes":0,"preval...
