In [1]:
import os
import json
import math
import numpy as np
import pandas as pd
import time
from scipy.stats import zscore
import sklearn
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
torch.manual_seed(57)

# Torchvision
import torchvision
from torchvision import transforms

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [2]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir "models"

Mounted at /content/drive


In [3]:
dataset_path = "/content/drive/MyDrive/CSCI5525Project/heartdata.csv"
heart_data = pd.read_csv(os.path.abspath(dataset_path))
np.random.seed(11037)

In [4]:
print(heart_data)

       HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0                No  16.60     Yes              No     No             3.0   
1                No  20.34      No              No    Yes             0.0   
2                No  26.58     Yes              No     No            20.0   
3                No  24.21      No              No     No             0.0   
4                No  23.71      No              No     No            28.0   
...             ...    ...     ...             ...    ...             ...   
319790          Yes  27.41     Yes              No     No             7.0   
319791           No  29.84     Yes              No     No             0.0   
319792           No  24.24      No              No     No             0.0   
319793           No  32.81      No              No     No             0.0   
319794           No  46.56      No              No     No             0.0   

        MentalHealth DiffWalking     Sex  AgeCategory      Race Diabetic  \

In [5]:
#Big Mess of Preprocessing

#Turn Booleans into 0-1.
heart_data["HeartDisease"] = heart_data["HeartDisease"].map({'No':0,'Yes':1})
heart_data["Smoking"] = heart_data["Smoking"].map({'No':0,'Yes':1})
heart_data["AlcoholDrinking"] = heart_data["AlcoholDrinking"].map({'No':0,'Yes':1})
heart_data["Stroke"] = heart_data["Stroke"].map({'No':0,'Yes':1})
heart_data["DiffWalking"] = heart_data["DiffWalking"].map({'No':0,'Yes':1})
heart_data["SkinCancer"] = heart_data["SkinCancer"].map({'No':0,'Yes':1})
heart_data["KidneyDisease"] = heart_data["KidneyDisease"].map({'No':0,'Yes':1})
heart_data["Asthma"] = heart_data["Asthma"].map({'No':0,'Yes':1})
heart_data["PhysicalActivity"] = heart_data["PhysicalActivity"].map({'No':0,'Yes':1})

#Turn Ordinal into Numbers
heart_data["AgeCategory"] = heart_data["AgeCategory"].map({'18-24':0,'25-29':1,'30-34':2,'35-39':3,'40-44':4,'45-49':5,'50-54':6,'55-59':7,'60-64':8,'65-69':9,'70-74':10,'75-79':11,'80 or older':12})

In [6]:
from sklearn import model_selection
heart_train,heart_test=model_selection.train_test_split(heart_data,test_size=0.3,random_state=57,stratify=heart_data["HeartDisease"])

In [7]:
#Target Encoding for Categorical
sex_means=heart_train.groupby('Sex')['HeartDisease'].mean()
print(sex_means)
heart_train['Sex']=heart_train['Sex'].map(sex_means)
heart_test['Sex']=heart_test['Sex'].map(sex_means)

race_means=heart_train.groupby('Race')['HeartDisease'].mean()
print(race_means)
heart_train['Race']=heart_train['Race'].map(race_means)
heart_test['Race']=heart_test['Race'].map(race_means)

diab_means=heart_train.groupby('Diabetic')['HeartDisease'].mean()
print(diab_means)
heart_train['Diabetic']=heart_train['Diabetic'].map(diab_means)
heart_test['Diabetic']=heart_test['Diabetic'].map(diab_means)

heal_means=heart_train.groupby('GenHealth')['HeartDisease'].mean()
print(heal_means)
heart_train['GenHealth']=heart_train['GenHealth'].map(heal_means)
heart_test['GenHealth']=heart_test['GenHealth'].map(heal_means)

Sex
Female    0.066746
Male      0.106360
Name: HeartDisease, dtype: float64
Race
American Indian/Alaskan Native    0.103346
Asian                             0.033539
Black                             0.075844
Hispanic                          0.051902
Other                             0.079411
White                             0.091883
Name: HeartDisease, dtype: float64
Diabetic
No                         0.065140
No, borderline diabetes    0.117399
Yes                        0.218024
Yes (during pregnancy)     0.043256
Name: HeartDisease, dtype: float64
GenHealth
Excellent    0.022899
Fair         0.203098
Good         0.102773
Poor         0.341380
Very good    0.047275
Name: HeartDisease, dtype: float64


In [8]:
BMI_mean=heart_train['BMI'].mean()
BMI_std=heart_train['BMI'].std()
heart_train['BMI']=(heart_train['BMI']-BMI_mean)/BMI_std
heart_test['BMI']=(heart_test['BMI']-BMI_mean)/BMI_std

PH_mean=heart_train['PhysicalHealth'].mean()
PH_std=heart_train['PhysicalHealth'].std()
heart_train['PhysicalHealth']=(heart_train['PhysicalHealth']-PH_mean)/PH_std
heart_test['PhysicalHealth']=(heart_test['PhysicalHealth']-PH_mean)/PH_std

MH_mean=heart_train['MentalHealth'].mean()
MH_std=heart_train['MentalHealth'].std()
heart_train['MentalHealth']=(heart_train['MentalHealth']-MH_mean)/MH_std
heart_test['MentalHealth']=(heart_test['MentalHealth']-MH_mean)/MH_std

ST_mean=heart_train['SleepTime'].mean()
ST_std=heart_train['SleepTime'].std()
heart_train['SleepTime']=(heart_train['SleepTime']-ST_mean)/ST_std
heart_test['SleepTime']=(heart_test['SleepTime']-ST_mean)/ST_std

In [9]:
y_vals=heart_train["HeartDisease"].to_numpy()
class_weights=sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=np.unique(y_vals),y=y_vals)
class_weights[1]*=1
class_weights=torch.tensor(class_weights,dtype=torch.float)
 
print(class_weights)

tensor([0.5468, 5.8414])


In [10]:
print(heart_train)
print(heart_test)

        HeartDisease       BMI  Smoking  AlcoholDrinking  Stroke  \
115863             0 -0.556194        1                1       0   
238218             0 -1.007267        0                0       0   
210531             0 -0.138126        0                0       0   
98699              0 -0.513759        1                0       0   
210045             0 -0.521617        1                0       0   
...              ...       ...      ...              ...     ...   
198519             0 -1.291742        1                0       0   
21189              0 -0.515330        1                1       0   
318794             0  1.348686        0                0       0   
77394              0 -0.812379        1                0       0   
222956             0  0.078766        0                0       0   

        PhysicalHealth  MentalHealth  DiffWalking       Sex  AgeCategory  \
115863       -0.424071     -0.490704            0  0.066746           11   
238218       -0.424071     -0.4

In [11]:
labels = heart_data["HeartDisease"]

In [12]:
train_labels = heart_train["HeartDisease"].copy()
test_labels = heart_test["HeartDisease"].copy()
train_data=heart_train
test_data=heart_test

In [13]:
train_data=train_data.drop(["HeartDisease"], axis=1)
feature_names = list(train_data.columns)
test_data=test_data.drop(["HeartDisease"], axis=1)

train_features=train_data.to_numpy()
test_features=test_data.to_numpy()
train_labels=train_labels.to_numpy()
test_labels=test_labels.to_numpy()

train_features=train_features.astype(float)
test_features=test_features.astype(float)

In [16]:
class Net(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin=nn.Linear(17,2)
    self.sm=nn.Softmax(dim=1)
    torch.nn.init.xavier_uniform_(self.lin.weight)

  def forward(self,x):
    val=self.lin(x)
    val=self.sm(val)

    return val

net=Net()
net.to(device)
criterion=nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer=torch.optim.Adam(net.parameters(),lr=0.1,weight_decay=0.01)
scheduler=torch.optim.lr_scheduler.StepLR(optimizer,step_size=20,gamma=0.8)



In [17]:
num_epochs=100
input_tensor = torch.from_numpy(train_features).type(torch.FloatTensor)
input_tensor = input_tensor.to(device)
label_tensor = torch.from_numpy(train_labels)
label_tensor = label_tensor.to(device)

net.train()
for epoch in range(num_epochs):
  #Figure out how to shuffle inputs on epoch
  idx = torch.randperm(input_tensor.shape[0])
  input_tensor = input_tensor[idx]
  label_tensor = label_tensor[idx]

  optimizer.zero_grad()
  output=net(input_tensor)
  loss=criterion(output,label_tensor)
  loss.backward()
  optimizer.step()
  scheduler.step()
  if epoch % 20 == 0:
    print ('Epoch {}/{} => Loss: {:.2f}'.format(epoch+1, num_epochs, loss.item()))
torch.save(net.state_dict(), 'models/HeartDisease1.pt')

Epoch 1/100 => Loss: 0.76
Epoch 21/100 => Loss: 0.58
Epoch 41/100 => Loss: 0.58
Epoch 61/100 => Loss: 0.58
Epoch 81/100 => Loss: 0.58


In [18]:
test_input_tensor = torch.from_numpy(test_features).type(torch.FloatTensor)
out_probs = net(test_input_tensor.to(device)).cpu().detach().numpy()

In [19]:
score_list=[]
threshy=0
while threshy<1:
  out_classes = np.ceil(out_probs[:,1]-threshy)
  out_classes[out_classes<0]=0
  out_classes=out_classes.astype(int)
  r_s=recall_score(test_labels,out_classes,average=None)[1]
  p_s=precision_score(test_labels,out_classes,average=None)[1]
  score_list.append([r_s,p_s])
  threshy+=0.01
np.savetxt('wumbo.csv',np.array(score_list),delimiter=",")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
threshy=0.9985
out_classes = np.ceil(out_probs[:,1]-threshy)
out_classes[out_classes<0]=0
out_classes=out_classes.astype(int)
print(recall_score(test_labels,out_classes,average=None)[1])
print(precision_score(test_labels,out_classes,average=None)[1])

0.00012177301509985388
1.0
