# Opioid Data - LSTM
HW #2 Part 2 - Timeseries.  
Use all rows per patient (about 30 consecutive days of daily records).
Standardize every feature column to zero mean and unit variance.
Train an LSTM classifier to separate the R and NR cohorts.

## Load data structures and scale features

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM

In [2]:
# Directories holding one CSV file per patient, one directory per cohort.
# Both end with '/' because load_cohort builds file paths by plain string
# concatenation (directory + filename).
# NOTE(review): the two names differ in case ('ChunkedData_R' vs
# 'chunkedData_NR'); confirm both match the on-disk names exactly, since
# this matters on case-sensitive filesystems.
pathR='data/ChunkedData_R/'
pathN='data/chunkedData_NR/'

In [3]:
# Notebook-wide state filled in by load_patient / load_cohort.
# labels_list : one (cohort, patient_name, date) tuple per loaded CSV row.
# features_df : all loaded CSV rows stacked in load order; rows are expected
#               to line up positionally with labels_list.
# NUM_PATIENTS: patient counter updated by load_cohort.
labels_list = []
features_df = pd.DataFrame()
NUM_PATIENTS=0

In [4]:
# Read one CSV file.
# Load global lists
def load_patient (filepath,cohort,patient_name):
    """Read one patient's CSV and add its rows to the notebook globals.

    Parameters
    ----------
    filepath : str
        Path of the CSV file; it must contain a 'Date' column.
    cohort : str
        Cohort code stored with every row of this patient.
    patient_name : str
        Patient identifier stored with every row of this patient.

    Side effects
    ------------
    * Rebinds the global ``features_df`` to the old frame with this
      patient's rows added at the end (original row index values are kept).
    * Appends one ``(cohort, patient_name, date)`` tuple per CSV row to the
      global ``labels_list``, in file order, so the two stay row-aligned.

    Raises
    ------
    KeyError
        If the CSV has no 'Date' column.
    """
    # labels_list is only mutated in place, so only features_df (which is
    # rebound) needs a global declaration.
    global features_df
    one_patient = pd.read_csv(filepath)
    if features_df.shape == (0, 0):
        # First patient: nothing to concatenate with yet.
        features_df = one_patient.copy()
    else:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and keeps each frame's index values.
        features_df = pd.concat([features_df, one_patient])
    # Iterate the column directly rather than looking rows up by label;
    # this is independent of the frame's index.
    labels_list.extend(
        (cohort, patient_name, date) for date in one_patient['Date'])

# Read directory of CSV files (R or NR).
# Given directory, load all the patients in that directory.
# We use filenames as patient names.
def load_cohort (cohort,directory):
    """Load every patient CSV found in ``directory`` into the globals.

    Parameters
    ----------
    cohort : str
        Cohort code passed on to load_patient for every file.
    directory : str
        Directory path; it must end with a path separator because file
        paths are built as ``directory + filename``.

    Side effects
    ------------
    Calls load_patient once per CSV file and adds the number of files
    loaded to the global ``NUM_PATIENTS``. The counter is not reset here,
    so after loading several cohorts it holds the total across all of them.
    """
    global NUM_PATIENTS
    # listdir returns entries in arbitrary order; sorting makes the patient
    # numbering reproducible from run to run.
    for fp in sorted(listdir(directory)):
        # Skip anything that is not a CSV (hidden files, checkpoints, ...).
        if not fp.lower().endswith('.csv'):
            continue
        one_name = fp.split('.')[0]  # strip away .csv suffix
        one_name = one_name[6:]    # strip away Daily_ prefix
        load_patient(directory+fp,cohort,one_name)
        NUM_PATIENTS += 1

In [5]:
# Load both cohorts into the globals, then discard the 'Date' column so that
# features_df holds feature columns only (the dates remain in labels_list).
# Re-running this cell without first re-running the globals cell loads the
# same files a second time, because the loaders append to existing state.
load_cohort('R', pathR)
load_cohort('N', pathN)
features_df = features_df.drop(columns=['Date'])

In [6]:
# Standardize features by shifting the mean to zero and scaling to unit variance.
# Subtract the mean and divide by the std.dev: z = (x - u) / s
def scale_features(df):
    """Return a standardized copy of ``df``.

    A fresh StandardScaler is fitted on all rows of ``df`` and applied to
    them. The result is a new DataFrame that keeps the index and column
    labels of ``df``; the input frame is not modified.
    """
    standardized = StandardScaler().fit_transform(df.to_numpy())
    return pd.DataFrame(data=standardized, columns=df.columns, index=df.index)
# Standardized copy of every loaded feature row (scaler fitted on all rows).
# NOTE(review): no later cell visible in this notebook reads scaled_features;
# the per-patient helper slices the unscaled features_df. Confirm which of
# the two frames the model is meant to consume.
scaled_features = scale_features(features_df)

In [7]:
# Returns 3-tuple (cohort,name,date)
def patient_by_index(ndx):
    """Return the label of the first record of patient number ``ndx``.

    Patients are numbered from 0 in the order their names appear while
    scanning the global labels_list; a new patient starts whenever the
    name differs from the name on the previous record.

    Returns the (cohort, name, date) tuple of that patient's first record,
    or None when no patient has that number.
    """
    last_seen = 'XXX'
    position = -1
    for cohort, name, date in labels_list:
        if name != last_seen:
            position += 1
            last_seen = name
        if position == ndx:
            return (cohort, name, date)
    return None
    
# Returns dataframe for one patient [n_dates,n_features]
def features_by_patient_index(ndx):
    """Return the rows of the global features_df that belong to one patient.

    Patients are numbered from 0 in the order their names appear while
    scanning the global labels_list (a new patient starts whenever the
    name differs from the previous record's name). The rows are selected
    by position, so features_df must be row-aligned with labels_list.

    Parameters
    ----------
    ndx : int
        Zero-based patient number.

    Returns
    -------
    pandas.DataFrame
        The patient's rows in their original order, or an empty frame with
        the same columns when no patient has that number.
    """
    # NOTE(review): this slices the unscaled features_df; confirm whether
    # the standardized frame should be used instead.
    prev_name='XXX'
    name_index=-1
    first_row=None   # position of the patient's first record, once found
    last_row=None    # position of the patient's last record seen so far
    for i,(cohort,name,date) in enumerate(labels_list):
        if not name == prev_name:
            prev_name = name
            name_index = name_index+1
        if name_index == ndx:
            if first_row is None:
                first_row = i
            last_row = i
        elif name_index > ndx:
            # Patient numbers only increase, so nothing further can match.
            break
    if first_row is None:
        return features_df.iloc[0:0]
    return features_df.iloc[first_row:last_row+1]

In [8]:
# Returns a list of {1=R,0=NR}
def get_all_cohorts():
    """Return one class label per patient, in patient order.

    Scans the global labels_list once. A new patient starts whenever the
    name differs from the name on the previous record; the cohort of that
    patient's first record decides the label.

    Returns
    -------
    list of int
        1 for cohort 'R', 0 for any other cohort; empty when nothing has
        been loaded.
    """
    # Iterating patient numbers up to len(labels_list) (the number of
    # *records*) would run past the last patient, so patients are detected
    # directly from the record stream instead.
    cohorts = []
    prev_name = None
    for cohort, name, date in labels_list:
        if name != prev_name:
            prev_name = name
            cohorts.append(1 if cohort == 'R' else 0)
    return cohorts

# Returns list of list of list:
# list of all patients,
# where each patient is a list of daily records,
# where record is a list of 259 feature values.
def get_all_patients():
    """Return every patient's daily feature records as nested lists.

    Rows of the global features_df are grouped into patients using the
    global labels_list: a new patient starts whenever the name differs
    from the name on the previous record. The two globals must be
    row-aligned.

    Returns
    -------
    list of list of list
        ``result[p][d][f]`` is feature ``f`` of daily record ``d`` of
        patient ``p``; empty when nothing has been loaded.

    Raises
    ------
    ValueError
        If features_df and labels_list do not have the same number of rows.
    """
    # NOTE(review): values come from the unscaled features_df; confirm
    # whether the standardized frame should be used instead.
    rows = features_df.values.tolist()
    if len(rows) != len(labels_list):
        raise ValueError(
            "features_df has %d rows but labels_list has %d entries"
            % (len(rows), len(labels_list)))
    patients = []
    prev_name = None
    for (cohort, name, date), row in zip(labels_list, rows):
        if name != prev_name:
            prev_name = name
            patients.append([])
        patients[-1].append(row)
    return patients

In [9]:
# Demo: look up one patient by number and report the size of its data.
ndx = 1
my_feat = features_by_patient_index(ndx)
print("Patient number:", ndx)
print("Patient cohort, name, start date:", patient_by_index(ndx))
print("Num records:", my_feat.shape[0])        # rows = daily records
print("Num columns:", features_df.shape[1])    # columns = features

Patient number: 1
Patient cohort, name, start date: ('R', '2027_S2', '2020-03-12')
Num records: 29
Num columns: 259


## LSTM

In [10]:
EMBED_DIM=32   # only relevant to the disabled Embedding layer mentioned below
NEURONS=32     # default width of every hidden layer
def build_model(n_features=259, n_timesteps=None, neurons=None):
    """Build and compile the two-layer LSTM binary classifier.

    Architecture: LSTM (returns sequences) -> LSTM -> Dense -> Dense ->
    Dense(1, sigmoid). The model is passed through compile_model before it
    is returned, so callers do not need to compile it again.

    Parameters
    ----------
    n_features : int, default 259
        Number of feature values per timestep (last input dimension).
    n_timesteps : int or None, default None
        Sequence length; None lets the model accept any number of
        timesteps.
    neurons : int or None, default None
        Units in each hidden layer; None means use the module-level
        NEURONS value at call time.

    Returns
    -------
    The compiled Keras Sequential model.
    """
    if neurons is None:
        neurons = NEURONS
    ann = keras.models.Sequential()
    # An Embedding front layer (mask_zero=True) was tried here and is
    # disabled; the feature vectors are fed to the first LSTM directly.
    # input_shape excludes the batch axis: (timesteps, features).
    ann.add(keras.layers.LSTM(neurons, return_sequences=True,
          input_shape=(n_timesteps, n_features)))
    ann.add(keras.layers.LSTM(neurons, return_sequences=False))
    # NOTE(review): these two Dense layers are created without an explicit
    # activation; confirm that is intended.
    ann.add(keras.layers.Dense(neurons))
    ann.add(keras.layers.Dense(neurons))
    ann.add(keras.layers.Dense(1, activation="sigmoid"))
    return compile_model(ann)

# Default weight initializers.
# Keras Dense default = Glorot aka Xavier uniform initializer

def compile_model(model):
    """Compile ``model`` for binary classification and return it.

    Uses binary cross-entropy on probabilities (from_logits=False), the
    "adam" optimizer, and reports accuracy. The same model object that was
    passed in is returned.
    """
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer="adam",
        metrics=["accuracy"],
    )
    return model

In [11]:
# Demo (disabled): the `if False:` guard means none of this ever runs.
# It sketches the input layout the model expects using a toy batch of
# 2 samples x 4 timesteps x 3 features.
if False:
    # One list per timestep, three feature values each.
    t1=[4,2,6]
    t2=[7,1,3]
    t3=[5,5,0]
    t4=[9,3,6]
    p1=[t1,t2,t3,t4]   # one sample = its list of timesteps
    data=[p1,p1]       # batch made of the same sample twice
    data
    # samples, timesteps, features
    # for this demo, set rnn layer 1 input shape to       input_shape=[4,3,]
    INPUT_DIM=3
    lstm = build_model()
    # build_model already returns a compiled model, so this second compile
    # is redundant.
    lstm = compile_model(lstm)
    lstm.summary()
    lstm.predict(data)

In [12]:
# Number of feature columns actually loaded.
INPUT_DIM=len(features_df.columns)
# build_model returns an already compiled model, so no second
# compile_model call is needed here.
lstm = build_model()
# Fail early, with a clear message, if the loaded data does not have the
# number of features the model's input layer was built for.
assert lstm.input_shape[-1] == INPUT_DIM, (
    "model expects %s features per timestep but the data has %d columns"
    % (lstm.input_shape[-1], INPUT_DIM))
lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 32)          37376     
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 47,841
Trainable params: 47,841
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Demo: push one patient's records (duplicated to form a batch of two)
# through the untrained model.
ndx=1
my_feat = features_by_patient_index(ndx)
print("Patient number:",ndx)
print("Patient cohort, name, start date:",patient_by_index(ndx))
print("Num records:",len(my_feat))
print("Num columns:",INPUT_DIM)
# Use ndx rather than a second hard-coded 1 so the printed patient and the
# predicted patient cannot drift apart.
p1=features_by_patient_index(ndx)
list1=p1.values.tolist()
print(type(list1))
print(len(list1))
print(type(list1[0]))
print(len(list1[0]))
data=[list1,list1]
# Keras interprets a plain Python list as "one entry per model input", so
# convert the nested lists to a single (samples, timesteps, features) array
# at the call site. `data` itself stays a list for the later cells.
lstm.predict(np.asarray(data))
# When n_dates = None, prediction is 0.54342705

Patient number: 1
Patient cohort, name, start date: ('R', '2027_S2', '2020-03-12')
Num records: 29
Num columns: 259
<class 'list'>
29
<class 'list'>
259


array([[0.48673052],
       [0.48673052]], dtype=float32)

In [14]:
# Toy training step: one label per sample in `data`.
labels=[0,1]
# Convert both inputs and targets to arrays at the call site; mixing nested
# Python lists with Keras' input handling is ambiguous (a list is treated
# as multiple model inputs). `labels` stays a list for later cells.
lstm.fit(np.asarray(data), np.asarray(labels))



<tensorflow.python.keras.callbacks.History at 0x7fec872e6190>

In [15]:
def convert_predictions(model_outs):
    """Threshold model outputs into hard class labels.

    ``model_outs`` is an iterable of rows whose first element is the
    predicted probability. Returns a list with 1 where that value is at
    least 0.5 and 0 otherwise, in the same order.
    """
    return [1 if row[0] >= 0.5 else 0 for row in model_outs]

In [16]:
# Predict on the same toy batch after the training step. The nested lists
# are converted to one array because Keras treats a plain Python list as
# multiple model inputs.
model_output=lstm.predict(np.asarray(data))
predicted=convert_predictions(model_output)
predicted

[1, 1]

In [17]:
# Compare the true labels with the thresholded predictions.
confusion = confusion_matrix(y_true=labels, y_pred=predicted)
confusion

array([[0, 1],
       [0, 1]])