In [10]:
import numpy as np
import pandas as pd


def fix_age(age) -> np.int8:
    if age == '[0-10)':
        return 5
    elif age == '[10-20)':
        return 15
    elif age == '[20-30)':
        return 25
    elif age == '[30-40)':
        return 35
    elif age == '[40-50)':
        return 45
    elif age == '[50-60)':
        return 55
    elif age == '[60-70)':
        return 65
    elif age == '[70-80)':
        return 75
    elif age == '[80-90)':
        return 85
    elif age == '[90-100)':
        return 95


def fix_demographics(row) -> np.int8:
    if row['gender'] == 'Female':
        if row['race'] == 'Caucasian':
            return 1
        elif row['race'] == 'Asian':
            return 2
        elif row['race'] == ' AfricanAmerican':
            return 3
        elif row['race'] == 'Hispanic':
            return 4
        elif row['race'] == 'Other':
            return 5
        else:
            return 6
    elif row['gender'] == 'Male':
        if row['race'] == 'Caucasian':
            return 7
        elif row['race'] == 'Asian':
            return 8
        elif row['race'] == ' AfricanAmerican':
            return 9
        elif row['race'] == 'Hispanic':
            return 10
        elif row['race'] == 'Other':
            return 11
        else:
            return 12
    else:
        if row['race'] == 'Caucasian':
            return 13
        elif row['race'] == 'Asian':
            return 14
        elif row['race'] == ' AfricanAmerican':
            return 15
        elif row['race'] == 'Hispanic':
            return 16
        elif row['race'] == 'Other':
            return 17
        else:
            return 18


def data_normalization(item) -> np.int8:
    # Values: '>200,' '>300','normal', and 'none' if not measured
    if item == 'None' or item == 'No' or item == 'NO':
        return 0
    elif item == 'Yes' or item == 'Ch' or item == 'Steady' or item == '<30' or item == 'Norm':
        return 1
    elif item == '>30' or item == 'Up' or item == '>200' or item == '>7':
        return 2
    elif item == '>300' or item == 'Down' or item == '>8':
        return 3
    elif item == '?':
        return None
    else:
        return item


def main():

    # read file to dataframe
    df = pd.read_csv("diabetic_data.csv")

    # remove data that is irrelevant
    df = df.drop(['encounter_id', 'weight', 'patient_nbr', 'discharge_disposition_id', 'admission_source_id',
                'payer_code', 'number_outpatient', 'number_emergency', 'number_inpatient', 'medical_specialty'], axis=1)
   
    # prepare data for a Classification or Clustering machine learning 
    # algorithm by converting data to numeric or nominal format
    df['age'] = df['age'].apply(fix_age)
    df['demographics'] = df.apply(fix_demographics, axis=1)
    df = df.applymap(lambda item: data_normalization(item))
    # df['max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide,glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'] = df['max_glu_serum',

    # df.info()
    # df.dropna(axis=1, thresh=10)
    # remove data that has null values
    df  = df.drop(['gender', 'race'], axis=1)
    df.dropna(axis=0)

    # divide 80/20 for training and testing
    df1 = df[df.index % 5 != 0]  # Excludes every 5th row starting from 0
    df2 = df[df.index % 5 == 0]  # Selects every 5th raw starting from 0

    
    df1.to_csv("diabetic_data_training.csv")
    df2.to_csv("diabetic_data_testing.csv")

if __name__=="__main__":
    main()

In [12]:
import torch
# x = torch.rand(5, 3)
# print(x)
# !nvidia-smi
torch.cuda.is_available()
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# Count number of devices
torch.cuda.device_count()

# Create tensor (default on CPU)
tensor = torch.tensor([1, 2, 3])

# Tensor not on GPU
print(tensor, tensor.device)

# Move tensor to GPU (if available)
tensor_on_gpu = tensor.to(device)
print(tensor_on_gpu)

# If tensor is on GPU, can't transform it to NumPy (this will error)
# tensor_on_gpu.numpy()

# Instead, copy the tensor back to cpu
tensor_back_on_cpu = tensor_on_gpu.cpu().numpy()
tensor_back_on_cpu

cuda
tensor([1, 2, 3]) cpu
tensor([1, 2, 3], device='cuda:0')


array([1, 2, 3], dtype=int64)

In [18]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime

In [19]:
# DEVICE UTILITIES
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)
    
device = get_default_device()
device

device(type='cuda')

In [20]:
class DiabeticPatientDataset(Dataset):
    def __init__(self, data, labels):
        # stuff
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return # stuff
    

# Model
class DiabeticPatientModel(nn.Module):
    def __init__(self, ):
        super().__init__()

        

    def forward(self,):

        return # stuff

In [26]:

# read file to dataframe
df = pd.read_csv("diabetic_data.csv")
# for readibility on id operations only
# df['ID'] = df['encounter_id']
# divide 80/20 for training and testing
df1 = df[df.index % 5 != 0]  # Excludes every 5th row starting from 0
df2 = df[df.index % 5 == 0]  # Selects every 5th raw starting from 0
# remove data that is irrelevant
train_df = df1.drop(['encounter_id', 'weight', 'patient_nbr', 'discharge_disposition_id', 'admission_source_id',
                        'payer_code', 'number_outpatient', 'number_emergency', 'number_inpatient', 'medical_specialty'], axis=1)
test_df = df2.drop(['encounter_id', 'weight', 'patient_nbr', 'discharge_disposition_id', 'admission_source_id',
                        'payer_code', 'number_outpatient', 'number_emergency', 'number_inpatient', 'medical_specialty'], axis=1)

df=df.drop(['encounter_id', 'weight', 'patient_nbr', 'discharge_disposition_id', 'admission_source_id',
                        'payer_code', 'number_outpatient', 'number_emergency', 'number_inpatient', 'medical_specialty'], axis=1)

NUMERIC_COLUMNS = [ 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_diagnoses']
# remove numeric columns
df=df.drop(columns=NUMERIC_COLUMNS)
# DATA PREPROCESSING
Y = df1['readmitted']
# Stacking train and test set so that they undergo the same preprocessing
# df = train_df.append(test_df.drop(columns=['ID']))
# remove data that has null values
df.dropna(axis=0)

# label encoding
for col in df.columns:
    # if df.dtypes[col] == "object":
    #     df[col] = df[col].fillna("NA")
    # else:
    #     df[col] = df[col].fillna(0)
    df[col] = LabelEncoder().fit_transform(df[col])

df.head()
# making all variables categorical
for col in df.columns:
    df[col] = df[col].astype('category')

# df.info()
# splitting back train and test
train_df = df[df.index % 5 != 0]
test_df = df[df.index % 5 == 0]

# check if shape[0] matches original
print("train shape: ", train_df.shape, "original: ", df1.shape)
print("test shape: ", test_df.shape, "original: ", df2.shape)

# Encoding Target
Y = LabelEncoder().fit_transform(Y)
# sanity check to see numbers match and matching with previous counter to create target dictionary
print(Counter(df1['readmitted']))
print(Counter(Y))
target_dict = {
    'NO': 2,
    '>30': 1,
    '<30': 0,
}

train_df.to_csv("diabetic_data_training.csv")
test_df.to_csv("diabetic_data_testing.csv")


train shape:  (81412, 35) original:  (81412, 50)
test shape:  (20354, 35) original:  (20354, 50)
Counter({'NO': 43877, '>30': 28461, '<30': 9074})
Counter({2: 43877, 1: 28461, 0: 9074})


In [27]:
X_train, X_val, y_train, y_val = train_test_split(train_df, Y, test_size=0.10, random_state=0)
X_train.head()

# Choosing columns for embedding
#categorical embedding for columns having more than three values
embedded_cols = {n: len(col.cat.categories) for n,col in train_df.items() if len(col.cat.categories) > 3}
print("Embedded columns:", embedded_cols)

embedded_col_names = embedded_cols.keys()
len(train_df.columns) - len(embedded_cols) #number of numerical columns

# Determining size of embedding
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
print(embedding_sizes)
print("Embedding sizes:", embedding_sizes)

#creating train and valid datasets
train_ds = ShelterOutcomeDataset(X_train, y_train, embedded_col_names)
valid_ds = ShelterOutcomeDataset(X_val, y_val, embedded_col_names)

# creating model and sending it to gpu if possible
model = ShelterOutcomeModel(embedding_sizes, 1)
to_device(model, device)

# Training 
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=True)

train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

Embedded columns: {'race': 6, 'age': 10, 'admission_type_id': 8, 'diag_1': 717, 'diag_2': 749, 'diag_3': 790, 'max_glu_serum': 4, 'A1Cresult': 4, 'metformin': 4, 'repaglinide': 4, 'nateglinide': 4, 'chlorpropamide': 4, 'glimepiride': 4, 'glipizide': 4, 'glyburide': 4, 'pioglitazone': 4, 'rosiglitazone': 4, 'acarbose': 4, 'miglitol': 4, 'insulin': 4, 'glyburide-metformin': 4}
[(6, 3), (10, 5), (8, 4), (717, 50), (749, 50), (790, 50), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2)]
Embedding sizes: [(6, 3), (10, 5), (8, 4), (717, 50), (749, 50), (790, 50), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2), (4, 2)]


In [28]:
# Optimizer 
def get_optimizer(model, lr = 0.001, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

# Training function
def train_model(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x1, x2, y in train_dl:
        batch = y.shape[0]
        output = model(x1, x2)
        loss = F.cross_entropy(output, y)   
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

# Evaluation function
def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x1, x2, y in valid_dl:
        current_batch_size = y.shape[0]
        out = model(x1, x2)
        loss = F.cross_entropy(out, y)
        sum_loss += current_batch_size*(loss.item())
        total += current_batch_size
        pred = torch.max(out, 1)[1]
        correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs): 
        loss = train_model(model, optim, train_dl)
        print("training loss: ", loss)
        val_loss(model, valid_dl)

In [29]:
train_loop(model, epochs=8, lr=0.05, wd=0.00001)

# Test Output
test_ds = DiabeticPatientDataset(test_df, np.zeros(len(test_df)), embedded_col_names)
test_dl = DataLoader(test_ds, batch_size=batch_size)

preds = []
with torch.no_grad():
    for x1,x2,y in test_dl:
        out = model(x1, x2)
        prob = F.softmax(out, dim=1)
        preds.append(prob)
final_probs = [item for sublist in preds for item in sublist]

sample_df=pd.from_dict(target_dict)
sample_df['NO']=[float(t[2]) for t in final_probs]
sample_df['>30']=[float(t[1]) for t in final_probs]
sample_df['<30']=[float(t[0]) for t in final_probs]
sample_df.head()
sample_df.to_csv('diabetic_trained_model_probabilties.csv', index=False)

RuntimeError: running_mean should contain 14 elements not 1