#  AE Modeling: Script 1

In [1]:
import pandas as pd
import numpy as np
import os
import json
import pickle
from sklearn.model_selection import train_test_split

## Download data from the S3 bucket

In [7]:
! aws s3 cp s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ data/ae_raw/ --exclude "*" --include "ae_patients_365_2011*.csv" --recursive  # these files are death-removed

download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110301.csv to data/ae_raw/ae_patients_365_20110301.csv
download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110201.csv to data/ae_raw/ae_patients_365_20110201.csv
download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110501.csv to data/ae_raw/ae_patients_365_20110501.csv
download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110101.csv to data/ae_raw/ae_patients_365_20110101.csv
download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110401.csv to data/ae_raw/ae_patients_365_20110401.csv
download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110601.csv to data/ae_raw/ae_patients_365_20110601.csv
download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/365-wo-death/ae_patients_365_20110701.csv to data/ae_raw/ae_patients_365_20110701.csv
download: s3://cmsai-mrk-amzn/CSVM

In [10]:
!aws s3 cp s3://cmsai-mrk-amzn/CSVModelInputs/test_ae_patients_365_20111201.csv data/ae_raw/ae_patients_365_20111201.csv

download: s3://cmsai-mrk-amzn/CSVModelInputs/test_ae_patients_365_20111201.csv to data/ae_raw/ae_patients_365_20111201.csv


In [2]:
# Feature columns: one column per day, named '365' down to '0' (366 names in total).
x_lst = list(map(str, range(365, -1, -1)))

# Label columns: the 20 target columns that are passed as `label_colnames`.
y_lst = [
    'd_5990', 'd_78605', 'd_486', 'd_78650', 'd_78079',
    'd_78900', 'd_78609', 'd_7862', 'd_1101', 'd_78701',
    'd_5789', 'd_78791', 'd_6826', 'd_78659', 'd_78907',
    'd_7840', 'd_28860', 'd_4660', 'd_6829', 'd_00845',
]

## Build vocab of events

In [8]:
import model_utils
from collections import Counter
import numbers
import time
import torch
import numbers
from collections import Counter
import torch
import torch.functional as F
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import text_classification
from torchtext.vocab import Vocab

import metrics
import numpy as np

def build_vocab(df, feat_colnames, min_freq=1, specials=['<pad>', '<unk>'], specials_first=True):
    '''
    Create a vocabulary to be used with Script2. This maps all events to an index, including <pad> and <unk> and
    nan, which represents padding of sentences, unknown events, and no events respectively.

    Only "positive" rows are counted, i.e. rows where at least one of the label columns in the
    notebook-level `y_lst` is non-zero.

    Parameters:
        df: DataFrame holding the feature columns and the `y_lst` label columns.
        feat_colnames: names of the columns whose cells hold comma-separated event strings.
        min_freq: minimum token frequency, forwarded to `Vocab`.
        specials: special tokens, forwarded to `Vocab` (a copy is passed, the argument is not mutated).
        specials_first: forwarded to `Vocab`.

    Returns:
        The `Vocab` built from the token counts of the positive rows.

    Raises:
        ValueError: if `min_freq` is not a number.
    '''
    # Validate up front so that a bad min_freq fails before the expensive counting pass.
    if not isinstance(min_freq, numbers.Number):
        raise ValueError(f'Something wrong with {min_freq}')

    def build_counter(df, feat_colnames):
        counter = Counter()
        words = df[feat_colnames].values.ravel('K')
        print("start word number: ", words.shape)
        for x in words:
            # Same normalisation as build_list: fix the 'd_s' prefix, drop every space and
            # split on bare commas, so that 'a, b' and 'a,b' yield the same tokens.
            # Missing cells become the literal token 'nan' via str().
            counter.update(str(x).replace('d_s', 'd_').replace(' ', '').split(','))
        # The sum of all counts equals the number of tokens seen.
        print("exact word number: " , sum(counter.values()))
        return counter

    print("df shape: ", df.shape)
    # `axis` must be given by keyword: it is keyword-only in pandas >= 2.0.
    positive_df = df[(df[y_lst] != 0).any(axis=1)]
    print("postive shape: ", positive_df.shape)
    pos_counter = build_counter(positive_df, feat_colnames)
    print("nan number: ", pos_counter['nan'])

    vocab = Vocab(pos_counter, min_freq=min_freq, specials=list(specials), specials_first=specials_first)

    print('Completed vocabulary')
    return vocab

In [3]:
import model_utils
from collections import Counter
import numbers
import time
import torch
import numbers
from collections import Counter
import torch
import torch.functional as F
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import text_classification
from torchtext.vocab import Vocab

import metrics
import numpy as np

# Notebook-level token counter; the vocab-building loop in a later cell feeds it via counter.update(...).
counter = Counter()

def build_list(df, feat_colnames):
    """Flatten the selected columns of ``df`` into a single list of event tokens.

    Each cell is converted to a string, the 'd_s' prefix is rewritten to 'd_',
    all spaces are removed and the result is split on commas. Missing cells
    therefore contribute the literal token 'nan'.

    Parameters:
        df: DataFrame containing the columns named in ``feat_colnames``.
        feat_colnames: list of column names to read.

    Returns:
        list of str tokens, in the memory order of the underlying array.
    """
    cells = df[feat_colnames].values.ravel('K')
    print("start word number: ", cells.shape)
    tokens = [
        token
        for cell in cells
        for token in str(cell).replace('d_s', 'd_').replace(' ', '').split(',')
    ]
    print("exact word number: ", len(tokens))
    return tokens
    
def build_vocab_mp(month):
    """Read one monthly 2011 CSV and return the event tokens of its positive rows.

    Written so it can be mapped over month numbers (see the commented-out Pool code below).

    Parameters:
        month: integer month number; it is zero-padded to two digits to form the file name
            'data/ae_raw/ae_patients_365_2011<MM>01.csv'.

    Returns:
        list of str tokens produced by build_list from the last 120 columns of x_lst,
        restricted to rows where at least one y_lst label column is non-zero.
    """
    FP = f'data/ae_raw/ae_patients_365_2011{month:02d}01.csv'
    ae_targets_365_df = pd.read_csv(FP)
    # `axis` must be given by keyword: it is keyword-only in pandas >= 2.0.
    positive_df = ae_targets_365_df[(ae_targets_365_df[y_lst] != 0).any(axis=1)]
    print("postive shape: ", positive_df.shape)
    return build_list(positive_df, x_lst[-120:])

# import multiprocessing
# from multiprocessing import Pool
# num_workers = multiprocessing.cpu_count()
# with Pool(processes=num_workers) as pool:
#     words = pool.map(build_vocab_mp, list(range(1, 13)))
    
# for word in words:
#     counter.update(word)
# print(len(counter))
# vocab = Vocab(counter, min_freq=1, specials=['<pad>', '<unk>'], specials_first=True)


In [9]:
# Persist the vocabulary object to disk.
# NOTE(review): no cell above this one assigns `vocab` (the only candidate is commented out),
# so on a fresh kernel run top-to-bottom this line raises NameError. Confirm which cell
# is meant to produce the "last120" vocabulary.
torch.save(vocab, 'data/ae_pos_vocab_last120_whole_non3')

In [24]:
# Read the monthly files for months 02-11 (10 files) and build the vocab from the
# last 90 columns of x_lst, using positive rows only.
# NOTE(review): the original comment said "11 months" but range(2, 12) skips month 01 —
# confirm whether January should be included.
# NOTE(review): `counter` is the notebook-level Counter, so re-running this cell adds the
# same tokens again on top of the previous counts.
for i in range(2, 12):
    FP = f'data/ae_raw/ae_patients_365_2011{i:02d}01.csv'
    ae_targets_365_df = pd.read_csv(FP)
    # `axis` must be given by keyword: it is keyword-only in pandas >= 2.0.
    positive_df = ae_targets_365_df[(ae_targets_365_df[y_lst] != 0).any(axis=1)]
    print("postive shape: ", positive_df.shape)
    counter.update(build_list(positive_df, x_lst[-90:]))
vocab = Vocab(counter, min_freq=1, specials=['<pad>', '<unk>'], specials_first=True)
torch.save(vocab, 'data/ae_pos_vocab_last90_whole_non3')

  interactivity=interactivity, compiler=compiler, result=result)


month size:  768843225
99879
(1886796, 387)
postive shape:  (198397, 387)
start word number:  (17855730,)
exact word number:  24615926


  interactivity=interactivity, compiler=compiler, result=result)


month size:  763305255
100453
(1871912, 387)
postive shape:  (197781, 387)
start word number:  (17800290,)
exact word number:  25243592


  interactivity=interactivity, compiler=compiler, result=result)


month size:  757044756
100937
(1855251, 387)
postive shape:  (182998, 387)
start word number:  (16469820,)
exact word number:  24377172


  interactivity=interactivity, compiler=compiler, result=result)


month size:  751393395
101093
(1840492, 387)
postive shape:  (189135, 387)
start word number:  (17022150,)
exact word number:  25032906


  interactivity=interactivity, compiler=compiler, result=result)


month size:  745668504
101137
(1825655, 387)
postive shape:  (189194, 387)
start word number:  (17027460,)
exact word number:  25033896


  interactivity=interactivity, compiler=compiler, result=result)


month size:  739816290
101403
(1810267, 387)
postive shape:  (170463, 387)
start word number:  (15341670,)
exact word number:  22818562


  interactivity=interactivity, compiler=compiler, result=result)


month size:  733887450
101353
(1794997, 387)
postive shape:  (183100, 387)
start word number:  (16479000,)
exact word number:  24249841


  interactivity=interactivity, compiler=compiler, result=result)


month size:  727565418
101340
(1778674, 387)
postive shape:  (180554, 387)
start word number:  (16249860,)
exact word number:  23898030


  interactivity=interactivity, compiler=compiler, result=result)


month size:  718489881
101556
(1755007, 387)
postive shape:  (185024, 387)
start word number:  (16652160,)
exact word number:  24476747


  interactivity=interactivity, compiler=compiler, result=result)


month size:  708178266
101445
(1728473, 387)
postive shape:  (183489, 387)
start word number:  (16514010,)
exact word number:  24382711


In [26]:
print(len(vocab))

28362


## Build dataset and save it for Script 2

In [3]:
# Column definitions for the dataset-building section.
# NOTE(review): these are the same values as the definition near the top of the notebook.
# Feature columns: the strings '365', '364', ..., '0'.
x_lst = [str(day) for day in reversed(range(366))]
# Label columns handed to build_dataset as `label_colnames`.
y_lst = [
    'd_5990', 'd_78605', 'd_486', 'd_78650',
    'd_78079', 'd_78900', 'd_78609', 'd_7862',
    'd_1101', 'd_78701', 'd_5789', 'd_78791',
    'd_6826', 'd_78659', 'd_78907', 'd_7840',
    'd_28860', 'd_4660', 'd_6829', 'd_00845',
]

In [4]:
import time
import random
def build_dataset(df, vocab, feat_colnames, label_colnames, day_length=90, max_length=30, max_sentence_length=500):
    '''
    Subsets the entire dataset into a dataset to be used later.
    - Specific vocabulary
    - By number of days (whole dataset is 365)

    Each row becomes one fixed-length sequence of vocabulary indices:
    - only the last `day_length` columns of `feat_colnames` are read;
    - within a day the event indices are sorted and cut to at most `max_length`;
    - days are concatenated in column order, and if the total exceeds `max_sentence_length`
      only the trailing (latest-column) `max_sentence_length` indices are kept;
    - shorter sequences are right-padded with the '<pad>' index.
    Rows without any event in the window are dropped.

    Parameters:
        df: DataFrame with the feature columns, the label columns and a 'patient_id' column.
        vocab: object exposing a `stoi` mapping that contains '<pad>' and '<unk>'.
        feat_colnames: ordered list of feature column names.
        label_colnames: list of label column names.
        day_length: number of trailing feature columns to use.
        max_length: maximum number of events kept per day.
        max_sentence_length: length of every output sequence.

    Returns a list of data and attributes needed by Script 2:
    [patient_ids, sequence, labels, pad_mask], all restricted to the kept rows. `pad_mask` is 1 on
    real events and 0 on padded positions.
    '''
    start_time = time.time()
    print("used days: ", feat_colnames[-day_length], feat_colnames[-1])

    data = df[feat_colnames[-day_length:]].to_numpy()
    labels = df[label_colnames].to_numpy()

    # Hoist the loop-invariant lookups. `.get` is used below so that unknown tokens are
    # never inserted into the mapping, whatever dict type `stoi` is.
    stoi = vocab.stoi
    pad_idx = stoi['<pad>']
    unk_idx = stoi['<unk>']

    count = 0
    sequence = []
    valid_id = []
    pad_mask = []

    print("total size before: ", data.shape)
    for i, row in enumerate(data):
        sentence = []
        # Walk from the last column backwards so truncation keeps the latest columns.
        for j in range(len(row) - 1, -1, -1):
            words = str(row[j])
            if words == 'nan':
                continue
            # Spaces are stripped first, so the split must be on a bare comma (as in build_list).
            # The previous split(', ') never matched and turned every multi-event day
            # into a single unknown token.
            words = words.replace('d_s', 'd_').replace(' ', '').split(',')
            words = sorted(stoi.get(w, unk_idx) for w in words)[:max_length]

            sentence = words + sentence

            if len(sentence) > max_sentence_length:
                sentence = sentence[-max_sentence_length:]
                break

        if len(sentence) == 0:
            # Row has no events in the window: drop it, but count it if it carries a label.
            if labels[i].any():
                count += 1
            continue

        valid_id.append(i)
        pad_l = max_sentence_length - len(sentence)
        pad_mask.append([1] * len(sentence) + [0] * pad_l)
        sequence.append(sentence + [pad_idx] * pad_l)

    finish_time = time.time()

    print('New dataset created')
    print("sequence length: ", len(sequence))
    print("empty events with nonzero labels: ", count)

    labels = labels[valid_id]
    patient_ids = df['patient_id'].to_numpy()[valid_id]
    pad_mask = np.array(pad_mask)
    sequence = np.array(sequence)
    print("time: ", finish_time - start_time)

    return [patient_ids, sequence, labels, pad_mask]



In [5]:
# Load the vocabulary from the path written by the torch.save call in the vocab section.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
vocab = torch.load('data/ae_pos_vocab_last120_whole_non3')

In [11]:
import multiprocessing
from multiprocessing import Pool
num_workers = multiprocessing.cpu_count()

def generate_procdata(m):
    """Build and pickle the processed dataset for one month of 2011.

    Reads 'data/ae_raw/ae_patients_365_2011<MM>01.csv', encodes it with build_dataset
    (last 120 columns, at most 30 events per day, sequences of length 500) using the
    notebook-level `vocab`, `x_lst` and `y_lst`, and writes the result to
    'data/ae_process/np_ae_last120_vocab120_non3digit_month<MM>.pkl'.

    Parameters:
        m: integer month number; zero-padded to two digits for the file names.

    Returns:
        None. The dataset is written to disk.
    """
    m = f'{m:02d}'
    FP = 'data/ae_raw/ae_patients_365_2011' + m + '01.csv'
    ae_targets_365_df = pd.read_csv(FP)
    print("month size: ", str(m), ae_targets_365_df.size)

    whole_dataset = build_dataset(ae_targets_365_df, vocab, x_lst, y_lst, day_length=120, max_length=30, max_sentence_length=500)

    # Nothing else in the notebook creates the output directory.
    os.makedirs("data/ae_process", exist_ok=True)
    # Use a context manager so the file is flushed and closed even if pickling fails.
    with open("data/ae_process/np_ae_last120_vocab120_non3digit_month" + m + ".pkl", 'wb') as out_file:
        pickle.dump(whole_dataset, out_file, protocol=4)
    
# Run generate_procdata in a worker pool. Only month 12 is submitted, so a single task
# runs even though the pool is created with num_workers processes.
with Pool(processes=num_workers) as pool:
    pool.map(generate_procdata, [12])

  self._target(*self._args, **self._kwargs)


month size:  12 749593458
used days:  119 0
total size before:  (1936934, 120)
New dataset created
sequence length:  1516165
empty events with nonzero labels:  5818
time:  225.47493076324463


In [None]:
# NOTE(review): leftover debugging cell (never executed). `words` is only a local variable
# inside functions in this notebook, so this raises NameError on a fresh kernel.
print(type(words))

In [10]:
# Generate the processed dataset for months 01-11 and pickle one file per month.
# Nothing else in the notebook creates the output directory.
os.makedirs("data/ae_process", exist_ok=True)
for i in range(1, 12):
    m = f'{i:02d}'

    FP = 'data/ae_raw/ae_patients_365_2011' + m + '01.csv'
    ae_targets_365_df = pd.read_csv(FP)
    print("month size: ", str(i), ae_targets_365_df.size)

    whole_dataset = build_dataset(ae_targets_365_df, vocab, x_lst, y_lst, day_length=120, max_length=30, max_sentence_length=500)
    # Use a context manager so each file is flushed and closed before the next month is read.
    with open("data/ae_process/np_ae_last120_non3digit_month" + m + ".pkl", 'wb') as out_file:
        pickle.dump(whole_dataset, out_file, protocol=4)


  interactivity=interactivity, compiler=compiler, result=result)


month size:  2 730190052
used days:  119 0
total size before:  (1886796, 120)
New dataset created
sequence length:  1478943
empty events with nonzero labels:  8265
time:  839.4235293865204


  interactivity=interactivity, compiler=compiler, result=result)


month size:  3 724429944
used days:  119 0
total size before:  (1871912, 120)
New dataset created
sequence length:  1454134
empty events with nonzero labels:  8709
time:  874.7379584312439


  interactivity=interactivity, compiler=compiler, result=result)


month size:  4 717982137
used days:  119 0
total size before:  (1855251, 120)
New dataset created
sequence length:  1449697
empty events with nonzero labels:  7870
time:  856.9447572231293


  interactivity=interactivity, compiler=compiler, result=result)


month size:  5 712270404
used days:  119 0
total size before:  (1840492, 120)
New dataset created
sequence length:  1452099
empty events with nonzero labels:  8119
time:  321.42476511001587


  interactivity=interactivity, compiler=compiler, result=result)


month size:  6 706528485
used days:  119 0
total size before:  (1825655, 120)
New dataset created
sequence length:  1458385
empty events with nonzero labels:  7842
time:  323.964391708374


  interactivity=interactivity, compiler=compiler, result=result)


month size:  7 700573329
used days:  119 0
total size before:  (1810267, 120)
New dataset created
sequence length:  1462686
empty events with nonzero labels:  6913
time:  287.2219879627228


  interactivity=interactivity, compiler=compiler, result=result)


month size:  8 694663839
used days:  119 0
total size before:  (1794997, 120)
New dataset created
sequence length:  1461779
empty events with nonzero labels:  7383
time:  275.817809343338


  interactivity=interactivity, compiler=compiler, result=result)


month size:  9 688346838
used days:  119 0
total size before:  (1778674, 120)
New dataset created
sequence length:  1461131
empty events with nonzero labels:  7419
time:  288.7220561504364


  interactivity=interactivity, compiler=compiler, result=result)


month size:  10 679187709
used days:  119 0
total size before:  (1755007, 120)
New dataset created
sequence length:  1476727
empty events with nonzero labels:  7492
time:  279.44660115242004


  interactivity=interactivity, compiler=compiler, result=result)


month size:  11 668919051
used days:  119 0
total size before:  (1728473, 120)
New dataset created
sequence length:  1508506
empty events with nonzero labels:  6336
time:  279.8270537853241


## Random things to check

In [90]:
def get_count(df, vocab, feat_colnames, label_colnames, day_length=30, max_length=50, mode='eval', keep_length=6):
    """Histogram the total number of events per labelled row.

    Only rows with at least one non-zero label are inspected, and only the last
    `day_length` columns of `feat_colnames` are read. A cell is split on ', ' to
    count its events; cells that stringify to 'nan' are ignored.

    Besides the returned histogram, one summary line is printed with:
    the largest per-row total, the largest per-cell event count, the number of
    cells with more than `max_length` events, and the number of cells with more
    than `max_length - 5` events.

    `vocab`, `mode` and `keep_length` are accepted but not used.

    Returns:
        dict mapping a per-row event total to the number of rows with that total.
    """
    print("used days: ", feat_colnames[-day_length], feat_colnames[-1])
    data = df[feat_colnames[-day_length:]].to_numpy()
    print("total size before: ", data.shape)
    labels = df[label_colnames].to_numpy()

    max_count = max_day = days = mid_days = 0
    day_dict = {}

    for row, row_labels in zip(data, labels):
        if not row_labels.any():
            continue

        row_total = 0
        for cell in row:
            text = str(cell).replace('d_s', 'd_')
            if text == 'nan':
                continue
            n_events = len(text.split(', '))

            if n_events > max_length:
                days += 1
            if n_events > max_length - 5:
                mid_days += 1
            if n_events > max_day:
                max_day = n_events
            row_total += n_events

        if row_total > max_count:
            max_count = row_total
        day_dict[row_total] = day_dict.get(row_total, 0) + 1

    print(max_count, max_day, days, mid_days)
    return day_dict

In [91]:
# Histogram of per-row event totals over the last 120 columns.
# NOTE(review): `np_ae` is not assigned anywhere in the visible notebook — confirm which
# DataFrame this was and load it explicitly before this call.
day_dict = get_count(np_ae, vocab, x_lst, y_lst, day_length=120, max_length=30)

used days:  119 0
total size before:  (1903423, 120)
1118 65 588 2134


In [140]:
# Spot-check one random row: print the first 5 entries of element 3 and element 1 of
# `whole_dataset` (pad mask and encoded sequence, if it came from build_dataset).
# NOTE(review): the upper bound 1562223 is hard-coded and random.randint includes both
# end points, so this can index past the end if the dataset has fewer rows.
row = random.randint(0, 1562223)
print(whole_dataset[3][row][:5])
print(whole_dataset[1][row][:5])

[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[  28   83 1182    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   2    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   2    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   2    0    0    0    0    0    0    0    0    0  

In [77]:
! aws s3 cp s3://cmsai-mrk-amzn/CSVModelInputs/miguel/ae_patients_365_20110101_wo_0s.csv data/

download: s3://cmsai-mrk-amzn/CSVModelInputs/miguel/ae_patients_365_20110101_wo_0s.csv to data/ae_patients_365_20110101_wo_0s.csv
