# Opioid Data - LSTM
HW #2 Part 2 - Timeseries.  
Use all rows per patient from about 30 consecutive days.
Standardize all of it.
Train LSTM classifier.

## Load data structures and scale features

In [1]:
from os import listdir
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Masking
from keras.preprocessing.sequence import pad_sequences

## From data file to Pandas dataframe

In [2]:
pathR='data/ChunkedData_R/'
pathN='data/chunkedData_NR/'

In [3]:
def initialize_globals():
    global patient_list, labels_list, features_df,UNIFORM_NUM_DAYS
    UNIFORM_NUM_DAYS = 29  # most daily records of any patient
    patient_list = [] # list of 40 (cohort,patient_ID,first_row,last_row+1)
    labels_list = [] # list of 1004 (cohort,patient_ID,date)
    features_df = pd.DataFrame()  # 1004 rows of 259 numeric values

In [4]:
# Read one CSV file. 
# Load global lists
def load_patient (filepath,cohort,patient_ID):
    global patient_list
    global labels_df
    global features_df
    one_patient = pd.read_csv(filepath)
    rows,cols = one_patient.shape
    features_df = features_df.append(one_patient)
    first_insert = len(labels_list)
    last_insert = first_insert + rows
    for rec in range(0,rows):
        one_label=(cohort,patient_ID,one_patient.loc[rec]['Date'])
        labels_list.append(one_label)
    patient_list.append([cohort,patient_ID,first_insert,last_insert])

# Read directory of CSV files (R or NR). 
# Given directory, load all the patients in that directory.
# We use filenames as patient names.
def load_cohort (cohort,directory):
    file_names = listdir(directory)
    for fp in file_names:
        dfp = directory+fp
        one_name = fp.split('.')[0]  # strip away .csv suffix
        one_name = one_name[6:]    # strip away Daily_ prefix
        one_patient = load_patient(dfp,cohort,one_name)

# Standardize features by shifting the mean to zero and scaling to unit variance.
# Subtract the mean and divide by the std.dev: z = (x - u) / s
def scale_features(df):
    scaled = StandardScaler().fit_transform(df.values)
    scaled_df = pd.DataFrame(scaled, index=df.index, columns=df.columns)
    return scaled_df

In [5]:
initialize_globals()
load_cohort('R',pathR)
load_cohort('N',pathN)
features_df = features_df.drop('Date',axis=1) 
scaled_features = scale_features(features_df)
features_df = None
print("Patients:",len(patient_list))
print("Label rows:",len(labels_list))
print("Feature rows:",len(scaled_features))

Patients: 40
Label rows: 1004
Feature rows: 1004


## Prepare numpy arrays as required by tensorflow

In [6]:
# Create numpy array of 40 1 or 0
def make_labels():
    # rely on global patient_list:
    # list of 40 (cohort,patient_ID,first_row,last_row+1)
    labels=[]
    for p in patient_list:
        cohort = p[0]
        y = 0
        if cohort=='R':
            y = 1
        labels.append(y)
        ary = np.array(labels).astype(np.float32)
    return ary

def make_data():
    patients=[]
    for p in patient_list:
        days=[]
        first = p[2]
        last = p[3]
        for r in range(first,last):
            day = scaled_features.iloc[r].values.tolist()
            days.append(day)
        patients.append(days)
    # The tensor data structure has fixed size for each dimension.
    # Thus we pad every patient to the maximum = UNIFORM_NUM_DAYS = 29.
    # The alternative is RaggedTensor which is slow and not fully supported.
    padded = pad_sequences(patients, padding="post")  
    ary = np.array(padded).astype(np.float32)
    return ary
        
y_all = make_labels()
X_all = make_data()
print("y values, X values:",len(y_all),len(X_all))
UNIFORM_NUM_FEATURES=len(X_all[0][0])
UNIFORM_NUM_DAYS=len(X_all[0])
print("Days, Features:",UNIFORM_NUM_DAYS,UNIFORM_NUM_FEATURES)

y values, X values: 40 40
Days, Features: 29 259


## LSTM

In [7]:
NEURONS=32
def build_model():
    embed_layer = Masking(mask_value=0.,input_shape=(UNIFORM_NUM_DAYS,UNIFORM_NUM_FEATURES))
    rnn1_layer = LSTM(NEURONS, return_sequences=True) 
    rnn2_layer = LSTM(NEURONS, return_sequences=False)
    dense1_layer = Dense(NEURONS)
    dense2_layer = Dense(NEURONS)
    output_layer = Dense(1, activation="sigmoid")
    ann = keras.models.Sequential()
    ann.add(embed_layer)
    ann.add(rnn1_layer)
    ann.add(rnn2_layer)
    ann.add(dense1_layer)
    ann.add(dense2_layer)
    ann.add(output_layer)
    compiled = compile_model(ann)
    return compiled

# Default weight initializers.
# Keras Dense default = Glorot aka Xavier uniform initializer

def compile_model(model):
    bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(loss=bc, optimizer="adam", metrics=["accuracy"])
    return model

In [8]:
lstm = build_model()
lstm = compile_model(lstm)
lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, 29, 259)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 29, 32)            37376     
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 47,841
Trainable params: 47,841
Non-trainable params: 0
____________________________________________________

In [9]:
lstm.fit(X_all,y_all)



<tensorflow.python.keras.callbacks.History at 0x7fc5fb874160>

In [10]:
def convert_predictions(model_outs):
    preds=[]
    for x in model_outs:
        y = 0
        if x[0]>=0.5:
            y=1
        preds.append(y)
    return preds

In [11]:
model_output=lstm.predict(data)
predicted=convert_predictions(model_output)
predicted

NameError: name 'data' is not defined

In [None]:
confusion = confusion_matrix(labels,predicted)
confusion