# Opioid Data - LSTM
HW #2 Part 2 - Timeseries.  
Use all rows per patient from about 30 consecutive days.
Standardize all of it.
Train LSTM classifier.

## Load data structures and scale features

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM

In [2]:
pathR='data/ChunkedData_R/'
pathN='data/chunkedData_NR/'
CLASS_SEPARATOR=13  # data[:13] vs data[13:]
WITH_VARIANCE_COLUMNS=True   # Use mean and variance per patient

In [3]:
COL_LIST = ['cohort','patient_name','date']
labels_list = []
features_df = pd.DataFrame()
# Read one CSV file. 
# Load global lists
def load_patient (filepath,cohort,patient_name):
    global labels_df
    global features_df
    one_patient = pd.read_csv(filepath)
    rows,cols = one_patient.shape
    features_df = features_df.append(one_patient)
    for rec in range(0,rows):
        one_label=(cohort,patient_name,one_patient.loc[rec]['Date'])
        labels_list.append(one_label)

In [4]:
# Read directory of CSV files (R or NR). 
# Given directory, load all the patients in that directory.
# We use filenames as patient names.
def load_cohort (cohort,directory):
    file_names = listdir(directory)
    for fp in file_names:
        dfp = directory+fp
        one_name = fp.split('.')[0]  # strip away .csv suffix
        one_name = one_name[6:]    # strip away Daily_ prefix
        one_patient = load_patient(dfp,cohort,one_name)

In [5]:
load_cohort('R',pathR)
load_cohort('N',pathN)
features_df = features_df.drop('Date',axis=1) 

In [6]:
# Standardize features by shifting the mean to zero and scaling to unit variance.
# Subtract the mean and divide by the std.dev: z = (x - u) / s
def scale_features(df):
    scaled = StandardScaler().fit_transform(df.values)
    scaled_df = pd.DataFrame(scaled, index=df.index, columns=df.columns)
    return scaled_df
scaled_features = scale_features(features_df)

In [7]:
# Returns dataframe.
def patient_by_index(ndx):
    prev_name='XXX'
    name_index=-1
    for i in range(0,len(labels_list)):
        (cohort,name,date)=labels_list[i]
        if not name == prev_name:
            prev_name = name
            name_index = name_index+1
        if name_index == ndx:
            return (cohort,name,date)
    return None
    
def features_by_patient_index(ndx):
    prev_name='XXX'
    name_index=-1
    min=1000000
    max=-1
    for i in range(0,len(labels_list)):
        (cohort,name,date)=labels_list[i]
        if not name == prev_name:
            prev_name = name
            name_index = name_index+1
        if name_index == ndx:
            if i<min:
                min=i
            if i>max:
                max=i
    one_p = features_df.iloc[min:max+1]
    return (one_p)

In [8]:
# Demo
ndx=1
my_feat = features_by_patient_index(ndx)
print("Patient number:",ndx)
print("Patient cohort, name, start date:",patient_by_index(ndx))
print("Num records:",len(my_feat))

Patient number: 1
Patient cohort, name, start date: ('R', '2027_S2', '2020-03-12')
Num records: 29


## LSTM

In [11]:
INPUT_DIM=259
INPUT_DIM=3
EMBED_DIM=32
NEURONS=32
def build_model():
    embed_layer = keras.layers.Embedding(
        input_dim=INPUT_DIM, output_dim=EMBED_DIM, mask_zero=True)
    rnn1_layer = keras.layers.LSTM(NEURONS, return_sequences=True, 
          input_shape=[4,3,])
    rnn2_layer = keras.layers.LSTM(NEURONS, return_sequences=False)
    dense1_layer = keras.layers.Dense(NEURONS)
    dense2_layer = keras.layers.Dense(NEURONS)
    output_layer = keras.layers.Dense(1, activation="sigmoid")
    ann = keras.models.Sequential()
    #ann.add(embed_layer)
    ann.add(rnn1_layer)
    ann.add(rnn2_layer)
    ann.add(dense1_layer)
    ann.add(dense2_layer)
    ann.add(output_layer)
    compiled = compile_model(ann)
    return compiled

def compile_model(model):
    bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(loss=bc, optimizer="adam", metrics=["accuracy"])
    return model

In [12]:
lstm = build_model()
lstm = compile_model(lstm)
lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 4, 32)             4608      
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 15,073
Trainable params: 15,073
Non-trainable params: 0
_________________________________________________________________


In [13]:
t1=[4,2,6]
t2=[7,1,3]
t3=[5,5,0]
t4=[9,3,6]
p1=[t1,t2,t3,t4]
data=[p1]
data
# samples, timesteps, features

[[[4, 2, 6], [7, 1, 3], [5, 5, 0], [9, 3, 6]]]

In [14]:
lstm.predict(data)

array([[0.47582704]], dtype=float32)