In [1]:
import time
import random
import torch 
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch import nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader
import boto3
import pickle
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pyplot as plt
import sys
from torchvision import transforms
import math
import torch.optim as optim
import time
import json


In [2]:
def build_dataset(df, vocab, feat_colnames, label_colnames, day_length=90, max_length=30, max_sentence_length=500):
    '''
    Subsets the entire dataset into a dataset to be used later.
    - Specific vocabulary
    - By number of days (whole dataset is 365)
    
    Returns a list of data and attributes needed by Script 2: patientid_dischargeid key, sequence of events, targets, and 
    mask (identifying padded regions)
    '''
    start_time = time.time()
    print("used days: ", feat_colnames[-day_length], feat_colnames[-1])
    
    data = df[feat_colnames[-day_length:]].to_numpy()
    labels = df[label_colnames].to_numpy()
    
    count = 0
    sequence = []
    valid_id = []
    pad_mask = []
    dates = []
    index_dates = []
    
    whole_dates = df['index_date'].to_numpy()
    print("total size before: ", data.shape)
    for i in range(len(data)):
        sentence = []
        mask = []
        event_date = []
        for j in range(len(data[i])-1, -1, -1):
            words = str(data[i][j])
            if words == 'nan':
                continue
            words = words.replace('d_s', 'd_').replace(' ', '').split(',')
            words = sorted([vocab.stoi[w] if w in vocab.stoi else vocab.stoi['nan'] for w in words]) # training use "unk"
            
            if len(words) > max_length:
                words = words[:max_length]
            
            sentence = words + sentence
            
            event_date = [day_length-j-1] * len(words) + event_date

            if len(sentence) > max_sentence_length:
                sentence = sentence[-max_sentence_length:]
                event_date = event_date[-max_sentence_length:]
                break
                
        if len(sentence) == 0:
            if labels[i].any():
                count += 1
            continue
            
        valid_id.append(i)
        pad_l = (max_sentence_length - len(sentence))
        mask = [1] * len(sentence) + [0] * pad_l
        event_date = event_date + [-1] * pad_l
        sentence = sentence + [vocab.stoi['<pad>']] * pad_l
        sequence.append(sentence)
        pad_mask.append(mask)
        dates.append(event_date)
        index_dates.append(i)
        
        
    finish_time = time.time()
    
    print('New dataset created')
    print("sequence length: ", len(sequence))
    print("empty events with nonzero labels: ", count)
    
    labels = labels[valid_id]
    patient_ids = df['patient_id'].to_numpy()[valid_id]
    pad_mask = np.array(pad_mask)
    sequence = np.array(sequence)
    print("time: ", finish_time - start_time)
    
    return [patient_ids, sequence, labels, pad_mask, np.array(dates), whole_dates[index_dates]]



In [3]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, num_classes, num_events=500, seq_length=120, dropout=0.5):
        '''
        Initialize a transformer model for adverse events. The model consists of the following:
        - Transformer encoder layers
        - Single 1D CNN layer
        - Final fully connected layer to determine probability of readmissions
        
        Args:
            
            ntoken: number of tokens in embedding layer (vocabulary size)
            ninp: embedding dimension (number of inputs)
            
            nhead: number of heads in transformers
            nhid: number of transformer linear dimensions
            
            nlayers: number of layers in transfromer
            
            num_classes: number of classes to predict (in this case, binary)
            
            seq_length: dimension of linear layer output
            num_events: maximum number of events per patient
            
            dropout: strength of regularization
        '''
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        print("parameters: embsize:{}, nhead:{}, nhid:{}, nlayers:{}, dropout:{}".format(ninp, nhead, nhid, nlayers, dropout))
        
        # Inputs into transformer: mask for padding and embeddings
        self.src_mask = None
        self.event_emb = nn.Embedding(ntoken, ninp)
        
        # Transformer layer
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        
         # CNN & fully connected layers
        
        self.ff = nn.Linear(int(num_events), int(seq_length))
        self.fc = nn.Linear(int(seq_length), num_classes)
        self.nonlinear = nn.ReLU()
        self.softmax = nn.Softmax(-1)
        self.Conv1d = nn.Conv1d(ninp, 1, 1, stride=1)
        
        # record
        self.ninp = ninp
        self.dropout = dropout
        self.num_events= num_events
        self.num_classes = num_classes
        
        # initalize weights
        self.init_weights()
    

    def init_weights(self):
        '''Initialize weights in embedding and fully connected layers'''
        initrange = 0.1
        self.event_emb.weight.data.uniform_(-initrange, initrange)
        
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
        
        self.ff.bias.data.zero_()
        self.ff.weight.data.uniform_(-initrange, initrange)

        
    def forward(self, src, mask=None, pos=None): 
        '''
        Forward propagation steps:
        - convert events into embedding vectors & positional encoding
        - transformer encoder layers
        - CNN layer
        - final 
        Notes:
        no position encoding here, no obvious clues for sequential or order found 
        '''        
        if mask is not None:
            #src_key_padding_mask needs boolean mask
            src_mask = (mask == 0)

        src = self.event_emb(src).transpose(0,1) * math.sqrt(self.ninp)
        
        trans_output = self.transformer_encoder(src, src_key_padding_mask=src_mask).transpose(0, 1).transpose(1,2)
        final_feature_map = self.Conv1d(trans_output).squeeze()
        
        out_mask = mask.float().masked_fill(mask == 0.0, float(-100.0)).masked_fill(mask == 1.0, float(0.0)).view(mask.size()[0], -1)
        # extract normalized feature importances per prediction
        importance_out = self.softmax(final_feature_map+out_mask)
        
        output = self.ff(final_feature_map)
        output = self.nonlinear(output)
        output = self.fc(output)

        return output, importance_out

In [4]:
class BuildDataset(Dataset):
    '''
    Read in dataset, if data is already split into train, test, and/or validation sets.
    
    ProcessData: extract input, labels, mask from an existing Python object (via pickle or otherwise)
    '''
    def __init__(self, data_file, event_length=500, data_list=None, mode='read'): 
        if mode != 'read' and data_list != None:
            self.data, self.label, self.mask = self.ProcessData(data_list, event_length)
            
    def ProcessData(self, data_list, event_length):
        input_data, labels, mask = data_list[0][:,-event_length:], data_list[1], data_list[2][:,-event_length:]
        return input_data, labels, mask
    
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return torch.tensor(self.data[idx]), torch.tensor(self.label[idx]), torch.tensor(self.mask[idx])

In [11]:
# Read data from the saved pickle file, the order is [patietn_id, input_data, labels, mask_for_padding, position(no used here)]
def ReadData(file_dir, event_length):
    with open(file_dir, 'rb') as f:
        ids, data, label, mask = pickle.load(f)
        ids = ids.astype(str)
        cut_data = data[:,-event_length:]
        cut_mask = mask[:,-event_length:]
        label = label.astype(int)
    return ids, cut_data, label, cut_mask#, cut_pos

In [6]:
def EopochVal(model, dataloader, device="cuda", metric='auc'):
    '''
    Evaluate model performance, called by ModelProcess function
    
    Returns predictions, metrics and importance scores
    '''
    epoch_loss = 0
    epoch_metric = 0
    
    model.eval()
    
    order_labels = None
    prediction_scores = None
    events = None
    important_scores = None
    
    for idx, [seq, labels, mask] in enumerate(dataloader):
        with torch.no_grad():
            labels = labels.squeeze().float()
            seq, labels, mask = seq.cuda(), labels.cuda(), mask.cuda()
            predictions, importance = model(seq, mask=mask)

            loss = criterion(predictions, labels)

            if order_labels is None:
                order_labels = labels.cpu().numpy()
                prediction_scores = torch.sigmoid(predictions).detach().cpu().numpy()
                events = seq.cpu().numpy()
                important_scores = importance.detach().cpu().numpy()
            else:
                order_labels = np.concatenate((order_labels, labels.cpu().numpy()))
                prediction_scores =np.concatenate((prediction_scores, torch.sigmoid(predictions).detach().cpu().numpy()))
                events = np.concatenate((events, seq.cpu().numpy()))
                important_scores = np.concatenate((important_scores, importance.detach().cpu().numpy()))

            epoch_loss += loss.item()
        del predictions
        del importance
        torch.cuda.empty_cache()
        
    if metric == 'acc':
        epoch_metric = get_average_accuracy(prediction_scores, order_labels)
    elif metric == 'auc':
        epoch_metric = roc_auc_score(order_labels, prediction_scores)
        
    return epoch_loss / len(dataloader), epoch_metric, [order_labels, events, important_scores, prediction_scores]

In [7]:
def TestProcess(model, dataloaders):

        
    if torch.cuda.is_available():
        device="cuda"
    else:
        device="cpu"
    
    print("device: ", device)
    
    #optimizer = optim.AdamW(model.parameters())

    final_test_metric = 0.0

    epoch_test_loss, epoch_test_metric, importance_results = EopochVal(model, dataloaders, device="cuda", metric='auc')
    #test_loss.append(epoch_test_loss)
    print('epoch_test_loss:', np.mean(epoch_test_loss), 'epoch_test_metric:', np.mean(epoch_test_metric))

    final_test_metric = epoch_test_metric
    final_importance_results = importance_results
    #torch.save(model.module.state_dict(), './model_weights/month-{}_emsize-{}_head-{}_layers-{}_valauc-{}.pth'.format(month, emsize, nhead, nlayers, np.round(epoch_val_metric, decimals=3)))
        #scheduler.step()
        
    return final_test_metric, final_importance_results

In [8]:
#! aws s3 cp s3://cmsai-mrk-amzn/pretest\ phase/pretest\ input\ files\ to\ model/ae_patients_365_20120601.csv pretest_ae_patients_365_20120601.csv

In [31]:
FP = './data/ae_data.csv'
ae_targets_365_df = pd.read_csv(FP)

  interactivity=interactivity, compiler=compiler, result=result)


In [32]:
ae_targets_365_df.head()

Unnamed: 0,index_date,patient_id,365,364,363,362,361,360,359,358,...,d_78791,d_6826,d_78659,d_78907,d_7840,d_28860,d_4660,d_6829,d_00845,index_date.1
0,20120601,100000099,,,,,,,,,...,0,0,0,0,0,0,0,0,0,20120601
1,20120601,100000315,,,,,,,"d_4241, d_4241, d_42832, d_7852, h_99214",,...,0,0,0,0,0,0,0,0,0,20120601
2,20120601,100000379,"d_V5861, d_V5861, h_85610",,,,,,,,...,0,0,0,0,0,0,0,0,0,20120601
3,20120601,100000437,,,,,,,,,...,0,0,0,0,0,0,0,0,0,20120601
4,20120601,100000559,"d_V5861, d_V5861, h_85610, h_90999, h_A4657, h...",,"h_90999, h_J2501",,,"h_90999, h_J2501",,"d_5856, d_5856, d_V5861, d_V5861, d_V5869, d_V...",...,0,0,0,0,0,0,0,0,0,20120601


In [33]:
# remove death
indeces = set()
x_lst = [str(x) for x in range(365,-1,-1)]
y_lst = ['d_5990', 'd_78605', 'd_486', 'd_78650', 'd_78079', 'd_78900', 'd_78609', 'd_7862', 'd_1101',
         'd_78701', 'd_5789', 'd_78791', 'd_6826', 'd_78659', 'd_78907',
         'd_7840', 'd_28860', 'd_4660', 'd_6829', 'd_00845']
for i in x_lst:
    indeces.update(ae_targets_365_df[ae_targets_365_df[i].str.contains('death', na=False)].index)
print(len(indeces)) #55568
ae_targets_365_nd_df = ae_targets_365_df[~ae_targets_365_df.index.isin(indeces)]
print(ae_targets_365_nd_df.shape) #(1563578, 370)

863
(5378701, 389)


In [34]:
print(ae_targets_365_nd_df.shape)

(5378701, 389)


In [35]:

vocab = torch.load('data/ae_pos_vocab_last90_whole_non3')

# Define the columns 
x_lst = [str(x) for x in range(365,-1,-1)]
y_lst = ['d_5990', 'd_78605', 'd_486', 'd_78650', 'd_78079', 'd_78900', 'd_78609', 'd_7862', 'd_1101',
         'd_78701', 'd_5789', 'd_78791', 'd_6826', 'd_78659', 'd_78907',
         'd_7840', 'd_28860', 'd_4660', 'd_6829', 'd_00845']

whole_ids, whole_data, whole_labels, whole_mask, whole_dates, index_dates = build_dataset(ae_targets_365_nd_df, vocab, x_lst, y_lst, day_length=120, max_length=30, max_sentence_length=500)


used days:  119 0
total size before:  (5378701, 120)
New dataset created
sequence length:  4484933
empty events with nonzero labels:  27681
time:  1036.592157125473


In [36]:
test_batch_size = 30000
test_dataset = BuildDataset('', event_length=500, data_list=[whole_data, whole_labels, whole_mask], mode='load')
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, num_workers=16)

In [37]:
vocab = torch.load('data/ae_pos_vocab_last90_whole_non3')
ntokens = len(vocab.stoi) # the size of vocabulary
emsize = 16 # embedding dimension
nhid = 32 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 1 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 1 # the number of heads in the multiheadattention models
dropout = 0.1 # the dropout value
n_class = 20

test_metrics = []
results = []


#criterion to use
criterion = nn.BCEWithLogitsLoss(pos_weight=None).cuda()
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, n_class, num_events=500, seq_length=120, dropout=0.5)
model.load_state_dict(torch.load('model_weights/month-11_emsize-16_head-1_layers-1_valauc-0.783.pth'))
if torch.cuda.is_available():
    model = model.cuda()
if torch.cuda.device_count()>1:
    model = nn.DataParallel(model)
test_metric, final_importance_results = TestProcess(model, test_dataloader)

final_importance_results.insert(0, whole_ids)

parameters: embsize:16, nhead:1, nhid:32, nlayers:1, dropout:0.5
device:  cuda
epoch_test_loss: 0.051057802826787034 epoch_test_metric: 0.747287918203524


In [38]:
labels, scores = np.array(final_importance_results[1]), np.array(final_importance_results[-1])
import metrics
df = metrics.compute_metrics(labels, scores, target_names=None, risk_list=[0.5, 1, 2, 5])

In [39]:
df

Unnamed: 0,auroc,avgpr,precis_0.5%,recall_0.5%,precis_1%,recall_1%,precis_2%,recall_2%,precis_5%,recall_5%,calib_mean,calib_mse
0,0.776551,0.121993,0.273311,0.060749,0.247871,0.110188,0.212689,0.189095,0.154299,0.342954,0.000563,0.020749
1,0.777013,0.09998,0.21262,0.048107,0.187291,0.084753,0.166524,0.150708,0.132292,0.299318,-0.000453,0.020941
2,0.828892,0.125873,0.266622,0.140296,0.211349,0.222423,0.152298,0.320553,0.088309,0.464674,0.004095,0.009494
3,0.695966,0.048408,0.095429,0.023995,0.092887,0.046711,0.08562,0.086112,0.070003,0.176014,0.000198,0.019167
4,0.695997,0.058119,0.10961,0.02269,0.104125,0.043109,0.09688,0.080218,0.084148,0.17419,-0.008426,0.023129
5,0.705206,0.031312,0.066667,0.026347,0.060803,0.04806,0.053568,0.084682,0.043077,0.170244,0.000876,0.012367
6,0.780299,0.058858,0.124147,0.052413,0.111126,0.093831,0.097615,0.164844,0.076037,0.321008,1.4e-05,0.011563
7,0.688497,0.036191,0.072464,0.023162,0.066421,0.042462,0.059822,0.076486,0.050926,0.162778,-0.004855,0.015203
8,0.887042,0.228406,0.312553,0.054779,0.308227,0.108042,0.294975,0.206792,0.262817,0.460617,-0.000854,0.023394
9,0.757729,0.028236,0.074961,0.077199,0.053155,0.109483,0.03932,0.161975,0.027983,0.288175,-0.000501,0.004939


In [40]:
def get_top_feat(events, importance_score, dates, vocab, top=10):
    
    indices = np.argsort(importance_score)[:,::-1][:,:top]
    i = np.arange(indices.shape[0]).reshape(indices.shape[0],1)
    top_scores = importance_score[i, indices]
    #print(top_scores.shape)
    top_scores = np.exp(top_scores)/np.sum(np.exp(top_scores), axis=1, keepdims=True)
    #print(np.sum(np.exp(top_scores), axis=1, keepdims=True).shape)
    top_events = events[i, indices]
    top_dates = dates[i, indices]
    f = lambda x: vocab.itos[x]
    vfunc = np.vectorize(f)
    top_names = vfunc(top_events) 
    output = np.dstack((top_names, top_scores, top_dates))
        
    return output

In [41]:
def create_df(final_importance_results, target_names, top=10):
    patient_id, labels, events, importance_score, probability_score = final_importance_results
    probs_df = pd.DataFrame()
    probs_df['patient_id'] = patient_id
    for i in range(20):
        probs_df[target_names[i]] = labels[:, i]
        probs_df[target_names[i] + '_Probs'] = probability_score[:, i]
    event_score = get_top_feat(events, importance_score, whole_dates, vocab, top=10)
    for i in range(top):
        probs_df['event_' + str(i+1)] = event_score[:, i, 0]
        probs_df['score_' + str(i+1)] = event_score[:, i, 1]
        probs_df['dates_' + str(i+1)] = event_score[:, i, 2]
    return probs_df

In [42]:
test_df = create_df(final_importance_results, y_lst)

In [44]:
test_df.head(20)

Unnamed: 0,patient_id,d_5990,d_5990_Probs,d_78605,d_78605_Probs,d_486,d_486_Probs,d_78650,d_78650_Probs,d_78079,...,dates_7,event_8,score_8,dates_8,event_9,score_9,dates_9,event_10,score_10,dates_10
0,100000099,0.0,0.072606,0.0,0.005401,0.0,0.015968,0.0,0.007524,0.0,...,15,d_17362,0.09835929,15,d_17362,0.09835929,15,d_17362,0.09835929,30
1,100000315,0.0,0.132548,0.0,0.012732,0.0,0.013535,0.0,0.018253,1.0,...,92,h_93280,0.099985495,22,d_5950,0.099975646,114,d_5950,0.099975646,114
2,100000379,0.0,0.012923,0.0,0.020842,0.0,0.009473,0.0,0.029747,0.0,...,43,d_1744,0.09993821,56,d_1744,0.09993821,56,h_4177F,0.099926025,17
3,100000437,0.0,0.006942,0.0,0.001616,0.0,0.000609,0.0,0.003745,0.0,...,28,d_V5419,0.09997104,21,d_V5419,0.09997104,21,d_V5419,0.09997104,14
4,100000559,0.0,0.070057,0.0,0.04791,0.0,0.07227,0.0,0.058664,0.0,...,10,h_A0428,0.099961355,10,h_A0428,0.099961355,36,h_A0428,0.099961355,38
5,100000905,0.0,0.007223,0.0,0.007878,0.0,0.002185,0.0,0.020066,0.0,...,109,d_72210,0.0999083,109,h_82306,0.09987372,58,d_34290,0.09986265,44
6,100001193,0.0,0.005848,0.0,0.003948,0.0,0.001426,0.0,0.00843,0.0,...,115,d_43310,0.0999918,115,d_25000,0.09996737,56,d_25000,0.09996737,53
7,100001633,0.0,0.006565,0.0,0.006862,0.0,0.002416,0.0,0.012752,0.0,...,74,d_2392,0.09976251,8,d_2392,0.09976251,8,d_2724,0.09969918,74
8,100001759,0.0,0.035086,1.0,0.009079,1.0,0.006181,0.0,0.01361,0.0,...,53,d_78830,0.099996015,14,d_7917,0.09999229,9,d_7917,0.09999229,9
9,100001871,0.0,0.011194,0.0,0.009376,0.0,0.003273,0.0,0.015286,0.0,...,40,d_25000,0.09990656,109,d_25000,0.09990656,109,h_99214,0.09988528,109


In [18]:
test_df.columns[:41]

Index(['patient_id', 'd_5990', 'd_5990_Probs', 'd_78605', 'd_78605_Probs',
       'd_486', 'd_486_Probs', 'd_78650', 'd_78650_Probs', 'd_78079',
       'd_78079_Probs', 'd_78900', 'd_78900_Probs', 'd_78609', 'd_78609_Probs',
       'd_7862', 'd_7862_Probs', 'd_1101', 'd_1101_Probs', 'd_78701',
       'd_78701_Probs', 'd_5789', 'd_5789_Probs', 'd_78791', 'd_78791_Probs',
       'd_6826', 'd_6826_Probs', 'd_78659', 'd_78659_Probs', 'd_78907',
       'd_78907_Probs', 'd_7840', 'd_7840_Probs', 'd_28860', 'd_28860_Probs',
       'd_4660', 'd_4660_Probs', 'd_6829', 'd_6829_Probs', 'd_00845',
       'd_00845_Probs'],
      dtype='object')

In [45]:
probability_df = test_df[test_df.columns[:41]]
event_df = test_df[['patient_id'] + list(test_df.columns[41:])]

In [53]:
probability_df.head()

Unnamed: 0,patient_id,d_5990,d_5990_Probs,d_78605,d_78605_Probs,d_486,d_486_Probs,d_78650,d_78650_Probs,d_78079,...,d_7840_Probs,d_28860,d_28860_Probs,d_4660,d_4660_Probs,d_6829,d_6829_Probs,d_00845,d_00845_Probs,index_date
0,100000099,0.0,0.072606,0.0,0.005401,0.0,0.015968,0.0,0.007524,0.0,...,0.002237,0.0,0.000742,0.0,0.004378,0.0,0.001584,0.0,0.000246,20120601
1,100000315,0.0,0.132548,0.0,0.012732,0.0,0.013535,0.0,0.018253,1.0,...,0.005328,0.0,0.001624,0.0,0.007198,0.0,0.004284,0.0,0.001491,20120601
2,100000379,0.0,0.012923,0.0,0.020842,0.0,0.009473,0.0,0.029747,0.0,...,0.00827,0.0,0.000843,0.0,0.007195,0.0,0.0037,0.0,0.000365,20120601
3,100000437,0.0,0.006942,0.0,0.001616,0.0,0.000609,0.0,0.003745,0.0,...,0.000804,0.0,5.7e-05,0.0,0.001817,0.0,0.000637,0.0,8e-06,20120601
4,100000559,0.0,0.070057,0.0,0.04791,0.0,0.07227,0.0,0.058664,0.0,...,0.012962,0.0,0.005074,0.0,0.004769,0.0,0.023712,0.0,0.006905,20120601


In [51]:
event_df.head()

Unnamed: 0,patient_id,event_1,score_1,dates_1,event_2,score_2,dates_2,event_3,score_3,dates_3,...,event_8,score_8,dates_8,event_9,score_9,dates_9,event_10,score_10,dates_10,index_date
0,100000099,d_5929,0.100788124,71,d_5929,0.100788124,71,d_5929,0.100788124,85,...,d_17362,0.09835929,15,d_17362,0.09835929,15,d_17362,0.09835929,30,20120601
1,100000315,h_87186,0.100095496,92,h_87088,0.09999611,114,h_87088,0.09999611,92,...,h_93280,0.099985495,22,d_5950,0.099975646,114,d_5950,0.099975646,114,20120601
2,100000379,d_2704,0.10008121,56,d_7010,0.100075185,84,d_7010,0.100075185,84,...,d_1744,0.09993821,56,d_1744,0.09993821,56,h_4177F,0.099926025,17,20120601
3,100000437,d_81601,0.10011583,42,d_81601,0.10011583,42,d_V5419,0.09997104,14,...,d_V5419,0.09997104,21,d_V5419,0.09997104,21,d_V5419,0.09997104,14,20120601
4,100000559,h_90960,0.10008023,0,h_90960,0.10008023,61,h_90960,0.10008023,31,...,h_A0428,0.099961355,10,h_A0428,0.099961355,36,h_A0428,0.099961355,38,20120601


In [52]:
event_df['index_date'] = index_dates
probability_df['index_date'] = index_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [55]:
event_df.to_csv('./test/test_ae_event_importance_withIndex.csv', index=False)
probability_df.to_csv('./test/test_ae_probability_withIndex.csv', index=False)

In [56]:
!aws s3 cp ./test/test_ae_event_importance_withIndex.csv s3://cmsai-mrk-amzn/test\ phase/test_ae_event_importance_withIndex_nan.csv

upload: test/test_ae_event_importance_withIndex.csv to s3://cmsai-mrk-amzn/test phase/test_ae_event_importance_withIndex_nan.csv


In [57]:
!aws s3 cp ./test/test_ae_probability_withIndex.csv s3://cmsai-mrk-amzn/test\ phase/test_ae_probability_withIndex_nan.csv

upload: test/test_ae_probability_withIndex.csv to s3://cmsai-mrk-amzn/test phase/test_ae_probability_withIndex_nan.csv


In [1]:
!aws s3 cp AE_Testing_E2E_full_test_submitted.ipynb s3://cmsai-mrk-amzn/test\ phase/

upload: ./AE_Testing_E2E_full_test_submitted.ipynb to s3://cmsai-mrk-amzn/test phase/AE_Testing_E2E_full_test_submitted.ipynb
