# Opioid Data - LSTM
HW #2 Part 2 - Timeseries.  
Use all rows per patient from about 30 consecutive days.
Standardize all of it.
Train LSTM classifier.

## Load data structures and scale features

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Masking
from keras.preprocessing.sequence import pad_sequences

## From data file to Pandas dataframe

In [2]:
try:
    from google.colab import drive
    IN_COLAB = True
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
except:
    IN_COLAB = False
    DATAPATH='data/' 
pathR=DATAPATH+'ChunkedData_R/'
pathN=DATAPATH+'chunkedData_NR/'
print("Data at ",pathR,pathN)

Mounted at /content/drive/
Data at  /content/drive/My Drive/data/ChunkedData_R/ /content/drive/My Drive/data/chunkedData_NR/


In [3]:
def initialize_globals():
    global patient_list, labels_list, features_df,UNIFORM_NUM_DAYS
    UNIFORM_NUM_DAYS = 29  # most daily records of any patient
    patient_list = [] # list of 40 (cohort,patient_ID,first_row,last_row+1)
    labels_list = [] # list of 1004 (cohort,patient_ID,date)
    features_df = pd.DataFrame()  # 1004 rows of 259 numeric values

In [4]:
# Read one CSV file. 
# Load global lists
def load_patient (filepath,cohort,patient_ID):
    global patient_list
    global labels_df
    global features_df
    one_patient = pd.read_csv(filepath)
    rows,cols = one_patient.shape
    features_df = features_df.append(one_patient)
    first_insert = len(labels_list)
    last_insert = first_insert + rows
    for rec in range(0,rows):
        one_label=(cohort,patient_ID,one_patient.loc[rec]['Date'])
        labels_list.append(one_label)
    patient_list.append([cohort,patient_ID,first_insert,last_insert])

# Read directory of CSV files (R or NR). 
# Given directory, load all the patients in that directory.
# We use filenames as patient names.
def load_cohort (cohort,directory):
    file_names = listdir(directory)
    for fp in file_names:
        dfp = directory+fp
        one_name = fp.split('.')[0]  # strip away .csv suffix
        one_name = one_name[6:]    # strip away Daily_ prefix
        one_patient = load_patient(dfp,cohort,one_name)

# Standardize features by shifting the mean to zero and scaling to unit variance.
# Subtract the mean and divide by the std.dev: z = (x - u) / s
def scale_features(df):
    scaled = StandardScaler().fit_transform(df.values)
    scaled_df = pd.DataFrame(scaled, index=df.index, columns=df.columns)
    return scaled_df

In [5]:
initialize_globals()
load_cohort('R',pathR)
load_cohort('N',pathN)
features_df = features_df.drop('Date',axis=1) 
scaled_features = scale_features(features_df)
features_df = None
print("Patients:",len(patient_list))
print("Label rows:",len(labels_list))
print("Feature rows:",len(scaled_features))

Patients: 40
Label rows: 1004
Feature rows: 1004


## Prepare numpy arrays as required by tensorflow

In [6]:
# Create numpy array of 40 1 or 0
def make_labels():
    # rely on global patient_list:
    # list of 40 (cohort,patient_ID,first_row,last_row+1)
    labels=[]
    for p in patient_list:
        cohort = p[0]
        y = 0
        if cohort=='R':
            y = 1
        labels.append(y)
        ary = np.array(labels).astype(np.float32)
    return ary

def make_data():
    patients=[]
    for p in patient_list:
        days=[]
        first = p[2]
        last = p[3]
        for r in range(first,last):
            day = scaled_features.iloc[r].values.tolist()
            days.append(day)
        patients.append(days)
    # The tensor data structure has fixed size for each dimension.
    # Thus we pad every patient to the maximum = UNIFORM_NUM_DAYS = 29.
    # The alternative is RaggedTensor which is slow and not fully supported.
    padded = pad_sequences(patients, padding="post")  
    ary = np.array(padded).astype(np.float32)
    return ary
        
X_all = make_data()
y_all = make_labels()
X_all,y_all = shuffle(X_all,y_all) # avoid training on all the R, then all the NR
print("y values, X values:",len(y_all),len(X_all))
UNIFORM_NUM_PATIENTS=len(y_all)
UNIFORM_NUM_DAYS=len(X_all[0])
UNIFORM_NUM_FEATURES=len(X_all[0][0])
print("Patients, Days, Features:",UNIFORM_NUM_PATIENTS,UNIFORM_NUM_DAYS,UNIFORM_NUM_FEATURES)

y values, X values: 40 40
Patients, Days, Features: 40 29 259


## LSTM

In [11]:
def build_model():
    # We padded each patient to 29 days. Pad value was zero. Here, mask thoses days.
    embed_layer = Masking(mask_value=0.,input_shape=(UNIFORM_NUM_DAYS,UNIFORM_NUM_FEATURES))
    rnn1_layer = LSTM(259, return_sequences=True) 
    rnn2_layer = LSTM(40, return_sequences=False)
    dense1_layer = Dense(30)
    dense2_layer = Dense(20)
    output_layer = Dense(1, activation="sigmoid")
    ann = keras.models.Sequential()
    ann.add(embed_layer)
    ann.add(rnn1_layer)
    ann.add(rnn2_layer)
    ann.add(dense1_layer)
    ann.add(dense2_layer)
    ann.add(output_layer)
    compiled = compile_model(ann)
    return compiled

# Default weight initializers.
# Keras Dense default = Glorot aka Xavier uniform initializer

def compile_model(model):
    bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(loss=bc, optimizer="adam", metrics=["accuracy"])
    return model

In [12]:
lstm = build_model()
lstm = compile_model(lstm)
lstm.summary()

Model: "sequential_41"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_41 (Masking)         (None, 29, 259)           0         
_________________________________________________________________
lstm_82 (LSTM)               (None, 29, 259)           537684    
_________________________________________________________________
lstm_83 (LSTM)               (None, 40)                48000     
_________________________________________________________________
dense_123 (Dense)            (None, 30)                1230      
_________________________________________________________________
dense_124 (Dense)            (None, 20)                620       
_________________________________________________________________
dense_125 (Dense)            (None, 1)                 21        
Total params: 587,555
Trainable params: 587,555
Non-trainable params: 0
_______________________________________________

In [13]:
#lstm.fit(X_all,y_all)
splitter=LeaveOneOut()
EPOCHS=5
BATCH = UNIFORM_NUM_PATIENTS//4
print("Num splits:",splitter.get_n_splits(X_all))
truth = []
predicted = []
cv_iter = 0
for train_index,test_index in splitter.split(X_all,y_all):
    lstm = build_model()
    lstm = compile_model(lstm)
    X_train = X_all[train_index]
    X_test = X_all[test_index]
    y_train = y_all[train_index]
    y_test = y_all[test_index]
    X=tf.convert_to_tensor(X_train, dtype=tf.float32)
    y=tf.convert_to_tensor(y_train, dtype=tf.float32)
    lstm.fit(X, y, batch_size=BATCH, epochs=EPOCHS)
    truth.append(int(y_test[0]))
    score = lstm.predict(X_test)[0][0]
    print("CV:",cv_iter,"Truth:",y_test,"Score:",score)
    predicted.append(int(score+0.49999))  # use less than 1/2 to avoid exceeding 1
    cv_iter += 1
print("truth\n",truth)
print("predicted\n",predicted)

Num splits: 40
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 0 Truth: [1.] Score: 0.074603885
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 1 Truth: [0.] Score: 0.00039882495
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 2 Truth: [0.] Score: 0.0020905838
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 3 Truth: [0.] Score: 0.003132457
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 4 Truth: [1.] Score: 0.97562295
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 5 Truth: [1.] Score: 0.018055504
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 6 Truth: [0.] Score: 0.99743134
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 7 Truth: [0.] Score: 0.0014561032
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 8 Truth: [0.] Score: 0.0014199803
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 9 Truth: [1.] Score: 0.9782825
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CV: 10 Truth: [0.] Score: 0.99118185
Epoch 1/5
Epoch 2/5
Epoch 3

In [14]:
confusion = confusion_matrix(truth,predicted)
confusion

array([[22,  4],
       [ 5,  9]])

## Useful links
Masking and Padding with Keras  
https://www.tensorflow.org/guide/keras/masking_and_padding

Train and test with Keras  
https://www.tensorflow.org/guide/keras/train_and_evaluate
