In [1]:
import os
import json
import math
import numpy as np
import pandas as pd
import time
from scipy.stats import zscore
from scipy.stats import multivariate_normal
import sklearn
from scipy.special import softmax as softmaxy
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
torch.manual_seed(57)

# Torchvision
import torchvision
from torchvision import transforms

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [2]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir "models"

Mounted at /content/drive


In [3]:
dataset_path = "/content/drive/MyDrive/CSCI5525Project/heartdata.csv"
heart_data = pd.read_csv(os.path.abspath(dataset_path))
np.random.seed(11037)

In [4]:
print(heart_data)

       HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0                No  16.60     Yes              No     No             3.0   
1                No  20.34      No              No    Yes             0.0   
2                No  26.58     Yes              No     No            20.0   
3                No  24.21      No              No     No             0.0   
4                No  23.71      No              No     No            28.0   
...             ...    ...     ...             ...    ...             ...   
319790          Yes  27.41     Yes              No     No             7.0   
319791           No  29.84     Yes              No     No             0.0   
319792           No  24.24      No              No     No             0.0   
319793           No  32.81      No              No     No             0.0   
319794           No  46.56      No              No     No             0.0   

        MentalHealth DiffWalking     Sex  AgeCategory      Race Diabetic  \

In [5]:
from sklearn import model_selection
heart_train,heart_test=model_selection.train_test_split(heart_data,test_size=0.3,random_state=57,stratify=heart_data["HeartDisease"])

In [6]:
tr_g=heart_train.groupby('HeartDisease')
priors=tr_g.size()/len(heart_train)
print(priors)
sm=tr_g['Smoking'].value_counts()/tr_g['Smoking'].size()
ad=tr_g['AlcoholDrinking'].value_counts()/tr_g['AlcoholDrinking'].size()
st=tr_g['Stroke'].value_counts()/tr_g['Stroke'].size()
dw=tr_g['DiffWalking'].value_counts()/tr_g['DiffWalking'].size()
pa=tr_g['PhysicalActivity'].value_counts()/tr_g['PhysicalActivity'].size()
am=tr_g['Asthma'].value_counts()/tr_g['Asthma'].size()
kd=tr_g['KidneyDisease'].value_counts()/tr_g['KidneyDisease'].size()
sc=tr_g['SkinCancer'].value_counts()/tr_g['SkinCancer'].size()

sex_g=tr_g['Sex'].value_counts()/tr_g['Sex'].size()
age_g=tr_g['AgeCategory'].value_counts()/tr_g['AgeCategory'].size()
rac_g=tr_g['Race'].value_counts()/tr_g['Race'].size()
dia_g=tr_g['Diabetic'].value_counts()/tr_g['Diabetic'].size()
gnh_g=tr_g['GenHealth'].value_counts()/tr_g['GenHealth'].size()

HeartDisease
No     0.914405
Yes    0.085595
dtype: float64


In [7]:
tr_no=heart_train.copy()
tr_yes=heart_train.copy()
te_no=heart_test.copy()
te_yes=heart_test.copy()

In [8]:
tr_no['Smoking']=tr_no['Smoking'].map(sm.loc['No'])
tr_no['AlcoholDrinking']=tr_no['AlcoholDrinking'].map(ad.loc['No'])
tr_no['Stroke']=tr_no['Stroke'].map(st.loc['No'])
tr_no['DiffWalking']=tr_no['DiffWalking'].map(dw.loc['No'])
tr_no['PhysicalActivity']=tr_no['PhysicalActivity'].map(pa.loc['No'])
tr_no['Asthma']=tr_no['Asthma'].map(am.loc['No'])
tr_no['KidneyDisease']=tr_no['KidneyDisease'].map(kd.loc['No'])
tr_no['SkinCancer']=tr_no['SkinCancer'].map(sc.loc['No'])
tr_no['Sex']=tr_no['Sex'].map(sex_g.loc['No'])
tr_no['AgeCategory']=tr_no['AgeCategory'].map(age_g.loc['No'])
tr_no['Race']=tr_no['Race'].map(rac_g.loc['No'])
tr_no['Diabetic']=tr_no['Diabetic'].map(dia_g.loc['No'])
tr_no['GenHealth']=tr_no['GenHealth'].map(gnh_g.loc['No'])

tr_yes['Smoking']=tr_yes['Smoking'].map(sm.loc['Yes'])
tr_yes['AlcoholDrinking']=tr_yes['AlcoholDrinking'].map(ad.loc['Yes'])
tr_yes['Stroke']=tr_yes['Stroke'].map(st.loc['Yes'])
tr_yes['DiffWalking']=tr_yes['DiffWalking'].map(dw.loc['Yes'])
tr_yes['PhysicalActivity']=tr_yes['PhysicalActivity'].map(pa.loc['Yes'])
tr_yes['Asthma']=tr_yes['Asthma'].map(am.loc['Yes'])
tr_yes['KidneyDisease']=tr_yes['KidneyDisease'].map(kd.loc['Yes'])
tr_yes['SkinCancer']=tr_yes['SkinCancer'].map(sc.loc['Yes'])
tr_yes['Sex']=tr_yes['Sex'].map(sex_g.loc['Yes'])
tr_yes['AgeCategory']=tr_yes['AgeCategory'].map(age_g.loc['Yes'])
tr_yes['Race']=tr_yes['Race'].map(rac_g.loc['Yes'])
tr_yes['Diabetic']=tr_yes['Diabetic'].map(dia_g.loc['Yes'])
tr_yes['GenHealth']=tr_yes['GenHealth'].map(gnh_g.loc['Yes'])

no_vals=tr_g.get_group('No')
yes_vals=tr_g.get_group('Yes')

BMI_mno=no_vals['BMI'].mean()
BMI_sno=no_vals['BMI'].std()
BMI_myes=yes_vals['BMI'].mean()
BMI_syes=yes_vals['BMI'].std()

tr_no['BMI']=(tr_no['BMI']-BMI_mno)/BMI_sno
tr_no['BMI']=(np.exp(-0.5*(tr_no['BMI']**2)))
tr_yes['BMI']=(tr_yes['BMI']-BMI_myes)/BMI_syes
tr_yes['BMI']=(np.exp(-0.5*(tr_yes['BMI']**2)))


PH_mno=no_vals['PhysicalHealth'].mean()
PH_sno=no_vals['PhysicalHealth'].std()
PH_myes=yes_vals['PhysicalHealth'].mean()
PH_syes=yes_vals['PhysicalHealth'].std()

tr_no['PhysicalHealth']=(tr_no['PhysicalHealth']-PH_mno)/PH_sno
tr_no['PhysicalHealth']=(np.exp(-0.5*(tr_no['PhysicalHealth']**2)))
tr_yes['PhysicalHealth']=(tr_yes['PhysicalHealth']-PH_myes)/PH_syes
tr_yes['PhysicalHealth']=(np.exp(-0.5*(tr_yes['PhysicalHealth']**2)))


MH_mno=no_vals['MentalHealth'].mean()
MH_sno=no_vals['MentalHealth'].std()
MH_myes=yes_vals['MentalHealth'].mean()
MH_syes=yes_vals['MentalHealth'].std()

tr_no['MentalHealth']=(tr_no['MentalHealth']-MH_mno)/MH_sno
tr_no['MentalHealth']=(np.exp(-0.5*(tr_no['MentalHealth']**2)))
tr_yes['MentalHealth']=(tr_yes['MentalHealth']-MH_myes)/MH_syes
tr_yes['MentalHealth']=(np.exp(-0.5*(tr_yes['MentalHealth']**2)))


ST_mno=no_vals['SleepTime'].mean()
ST_sno=no_vals['SleepTime'].std()
ST_myes=yes_vals['SleepTime'].mean()
ST_syes=yes_vals['SleepTime'].std()

tr_no['SleepTime']=(tr_no['SleepTime']-ST_mno)/ST_sno
tr_no['SleepTime']=(np.exp(-0.5*(tr_no['SleepTime']**2)))
tr_yes['SleepTime']=(tr_yes['SleepTime']-ST_myes)/ST_syes
tr_yes['SleepTime']=(np.exp(-0.5*(tr_yes['SleepTime']**2)))

#Get Test Probs
te_no['Smoking']=te_no['Smoking'].map(sm.loc['No'])
te_no['AlcoholDrinking']=te_no['AlcoholDrinking'].map(ad.loc['No'])
te_no['Stroke']=te_no['Stroke'].map(st.loc['No'])
te_no['DiffWalking']=te_no['DiffWalking'].map(dw.loc['No'])
te_no['PhysicalActivity']=te_no['PhysicalActivity'].map(pa.loc['No'])
te_no['Asthma']=te_no['Asthma'].map(am.loc['No'])
te_no['KidneyDisease']=te_no['KidneyDisease'].map(kd.loc['No'])
te_no['SkinCancer']=te_no['SkinCancer'].map(sc.loc['No'])
te_no['Sex']=te_no['Sex'].map(sex_g.loc['No'])
te_no['AgeCategory']=te_no['AgeCategory'].map(age_g.loc['No'])
te_no['Race']=te_no['Race'].map(rac_g.loc['No'])
te_no['Diabetic']=te_no['Diabetic'].map(dia_g.loc['No'])
te_no['GenHealth']=te_no['GenHealth'].map(gnh_g.loc['No'])

te_yes['Smoking']=te_yes['Smoking'].map(sm.loc['Yes'])
te_yes['AlcoholDrinking']=te_yes['AlcoholDrinking'].map(ad.loc['Yes'])
te_yes['Stroke']=te_yes['Stroke'].map(st.loc['Yes'])
te_yes['DiffWalking']=te_yes['DiffWalking'].map(dw.loc['Yes'])
te_yes['PhysicalActivity']=te_yes['PhysicalActivity'].map(pa.loc['Yes'])
te_yes['Asthma']=te_yes['Asthma'].map(am.loc['Yes'])
te_yes['KidneyDisease']=te_yes['KidneyDisease'].map(kd.loc['Yes'])
te_yes['SkinCancer']=te_yes['SkinCancer'].map(sc.loc['Yes'])
te_yes['Sex']=te_yes['Sex'].map(sex_g.loc['Yes'])
te_yes['AgeCategory']=te_yes['AgeCategory'].map(age_g.loc['Yes'])
te_yes['Race']=te_yes['Race'].map(rac_g.loc['Yes'])
te_yes['Diabetic']=te_yes['Diabetic'].map(dia_g.loc['Yes'])
te_yes['GenHealth']=te_yes['GenHealth'].map(gnh_g.loc['Yes'])

te_no['BMI']=(te_no['BMI']-BMI_mno)/BMI_sno
te_no['BMI']=(np.exp(-0.5*(te_no['BMI']**2)))
te_yes['BMI']=(te_yes['BMI']-BMI_myes)/BMI_syes
te_yes['BMI']=(np.exp(-0.5*(te_yes['BMI']**2)))

te_no['PhysicalHealth']=(te_no['PhysicalHealth']-PH_mno)/PH_sno
te_no['PhysicalHealth']=(np.exp(-0.5*(te_no['PhysicalHealth']**2)))
te_yes['PhysicalHealth']=(te_yes['PhysicalHealth']-PH_myes)/PH_syes
te_yes['PhysicalHealth']=(np.exp(-0.5*(te_yes['PhysicalHealth']**2)))

te_no['MentalHealth']=(te_no['MentalHealth']-MH_mno)/MH_sno
te_no['MentalHealth']=(np.exp(-0.5*(te_no['MentalHealth']**2)))
te_yes['MentalHealth']=(te_yes['MentalHealth']-MH_myes)/MH_syes
te_yes['MentalHealth']=(np.exp(-0.5*(te_yes['MentalHealth']**2)))

te_no['SleepTime']=(te_no['SleepTime']-ST_mno)/ST_sno
te_no['SleepTime']=(np.exp(-0.5*(te_no['SleepTime']**2)))
te_yes['SleepTime']=(te_yes['SleepTime']-ST_myes)/ST_syes
te_yes['SleepTime']=(np.exp(-0.5*(te_yes['SleepTime']**2)))

In [9]:
tr_no["HeartDisease"] = tr_no["HeartDisease"].map({'No':0,'Yes':1})
te_no["HeartDisease"] = te_no["HeartDisease"].map({'No':0,'Yes':1})

train_labels = tr_no["HeartDisease"].copy()
test_labels = te_no["HeartDisease"].copy()

tr_no=tr_no.drop(["HeartDisease"], axis=1)
tr_yes=tr_yes.drop(["HeartDisease"], axis=1)
te_no=te_no.drop(["HeartDisease"], axis=1)
te_yes=te_yes.drop(["HeartDisease"], axis=1)


In [10]:
tr_no=tr_no.prod(axis=1).to_numpy().reshape((-1,1))
tr_no=tr_no*priors[0]
tr_yes=tr_yes.prod(axis=1).to_numpy().reshape((-1,1))
tr_yes=tr_yes*priors[1]
te_no=te_no.prod(axis=1).to_numpy().reshape((-1,1))
te_no=te_no*priors[0]
te_yes=te_yes.prod(axis=1).to_numpy().reshape((-1,1))
te_yes=te_yes*priors[1]

In [11]:
print(tr_no)
print(tr_yes)
print(te_no)
print(te_yes)

[[1.76451321e-05]
 [9.99761170e-04]
 [1.55018855e-04]
 ...
 [3.91311973e-05]
 [3.36613572e-05]
 [8.64457230e-07]]
[[2.39955757e-07]
 [1.56135518e-06]
 [2.20476888e-07]
 ...
 [1.38816316e-08]
 [1.87557871e-05]
 [8.42037186e-07]]
[[5.04983986e-06]
 [9.18558255e-10]
 [4.73285628e-06]
 ...
 [2.71355592e-05]
 [9.43658999e-07]
 [2.18606435e-06]]
[[1.81747376e-07]
 [8.74587713e-09]
 [1.83520419e-06]
 ...
 [1.06188415e-06]
 [8.73912588e-09]
 [2.97059740e-07]]


In [12]:
tr_out=np.hstack((tr_no,tr_yes))
te_out=np.hstack((te_no,te_yes))
print(tr_out)
print(te_out)

[[1.76451321e-05 2.39955757e-07]
 [9.99761170e-04 1.56135518e-06]
 [1.55018855e-04 2.20476888e-07]
 ...
 [3.91311973e-05 1.38816316e-08]
 [3.36613572e-05 1.87557871e-05]
 [8.64457230e-07 8.42037186e-07]]
[[5.04983986e-06 1.81747376e-07]
 [9.18558255e-10 8.74587713e-09]
 [4.73285628e-06 1.83520419e-06]
 ...
 [2.71355592e-05 1.06188415e-06]
 [9.43658999e-07 8.73912588e-09]
 [2.18606435e-06 2.97059740e-07]]


In [13]:
tr_sum=np.sum(tr_out,axis=1)
te_sum=np.sum(te_out,axis=1)
tr_out[:,0]=tr_out[:,0]/tr_sum
tr_out[:,1]=tr_out[:,1]/tr_sum
te_out[:,0]=te_out[:,0]/te_sum
te_out[:,1]=te_out[:,1]/te_sum
print(tr_out)
print(te_out)

[[9.86583473e-01 1.34165266e-02]
 [9.98440707e-01 1.55929297e-03]
 [9.98579761e-01 1.42023858e-03]
 ...
 [9.99645380e-01 3.54620095e-04]
 [6.42182202e-01 3.57817798e-01]
 [5.06569035e-01 4.93430965e-01]]
[[0.96525961 0.03474039]
 [0.09504521 0.90495479]
 [0.72058659 0.27941341]
 ...
 [0.96234112 0.03765888]
 [0.99082408 0.00917592]
 [0.88036855 0.11963145]]


In [51]:
out_probs = te_out

score_list=[]
threshy=0
while threshy<1:
  out_classes = np.ceil(out_probs[:,1]-threshy)
  out_classes[out_classes<0]=0
  out_classes=out_classes.astype(int)
  r_s=recall_score(test_labels,out_classes,average=None)[1]
  p_s=precision_score(test_labels,out_classes,average=None)[1]
  score_list.append([r_s,p_s])
  threshy+=0.01
np.savetxt('wumbo.csv',np.array(score_list),delimiter=",")

  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
threshy=0.999999999999999
out_classes = np.ceil(out_probs[:,1]-threshy)
out_classes[out_classes<0]=0
out_classes=out_classes.astype(int)
print(recall_score(test_labels,out_classes,average=None)[1])
print(precision_score(test_labels,out_classes,average=None)[1])

0.0
0.0


  _warn_prf(average, modifier, msg_start, len(result))
