In [5]:
import os
import numpy as np
import pandas as pd
import torch
import sagemaker
import torch
import torchtext
import sagemaker

In [83]:
def get_best_hpo_jobs(hpo_path_pattern, folds, data_all):
    """Get best training job for each fold.

    For every fold the HPO summary CSV at ``hpo_path_pattern.format(fold)`` is
    read and the row with the highest ``FinalObjectiveValue`` is kept. The
    selected rows (plus a ``fold`` column) are written to
    ``hpo_path_pattern.format(data_all)``. If that file already exists it is
    simply reloaded and returned.

    Args:
        hpo_path_pattern: Path template with one ``{}`` placeholder.
        folds: Iterable of fold names substituted into the template.
        data_all: Name substituted into the template for the aggregated file.

    Returns:
        Tuple ``(df, output_path)`` with the per-fold best jobs and the path of
        the aggregated CSV.

    Raises:
        ValueError: If a fold's summary has no row with a valid objective value.
    """
    objective_col = 'FinalObjectiveValue'
    output_path = hpo_path_pattern.format(data_all)
    if os.path.exists(output_path):
        print('Best training jobs file for each fold already created.')
        df = pd.read_csv(output_path)
        return df, output_path

    columns = None
    best_params = []
    for fold in folds:
        df_hpo = pd.read_csv(hpo_path_pattern.format(fold))
        if columns is None:
            columns = df_hpo.columns.tolist() + ['fold']

        scores = df_hpo[objective_col]
        if scores.notna().sum() == 0:
            raise ValueError(
                'No valid {} value found for fold: {}'.format(objective_col, fold))
        # Missing objective values are pushed to -inf so they can never win;
        # max()/list.index() would return a leading NaN as the "maximum".
        best_pos = int(scores.fillna(float('-inf')).to_numpy().argmax())

        hpo_best_params = df_hpo.iloc[best_pos, :].tolist()
        hpo_best_params.append(fold)
        best_params.append(hpo_best_params)

    df = pd.DataFrame(best_params, columns=columns)

    # dirname is '' when output_path is a bare file name; makedirs('') would fail.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    return df, output_path


def get_best_params(df_hpo, criteria='avg', num_params=12):
    """Get the parameters of the best hpo based on the given criteria.

    criteria possible values: ['min', 'max', 'avg']
      - 'min': row with the lowest ``FinalObjectiveValue``.
      - 'max': row with the highest ``FinalObjectiveValue``.
      - 'avg': row ranked in the middle (position ``n // 2`` after sorting by
        ``FinalObjectiveValue``), i.e. the median job, not an arithmetic mean.

    Rows with a missing objective value are ignored. ``df_hpo`` is not modified.

    Args:
        df_hpo: DataFrame whose first ``num_params`` columns are the
            hyperparameters and which has a ``FinalObjectiveValue`` column.
        criteria: One of 'min', 'max', 'avg'.
        num_params: Number of leading columns treated as hyperparameters.

    Returns:
        Tuple ``(params, auc)``: the hyperparameter dict of the selected row
        (integer-valued parameters cast to ``int``) and that row's objective
        value as a float.

    Raises:
        ValueError: If ``criteria`` is invalid or no row has a valid objective.
    """
    auc_col = 'FinalObjectiveValue'
    if criteria not in ('min', 'max', 'avg'):
        raise ValueError('Error! Invalid criteria: {}'.format(criteria))

    valid = df_hpo[df_hpo[auc_col].notna()]
    if valid.empty:
        raise ValueError('No rows with a valid {} value.'.format(auc_col))

    # Work purely with positions: label-based lookups break as soon as the
    # row order differs from the index labels (e.g. after sorting).
    scores = valid[auc_col].to_numpy()
    if criteria == 'min':
        pos = int(scores.argmin())
    elif criteria == 'max':
        pos = int(scores.argmax())
    else:
        ranking = scores.argsort(kind='stable')
        pos = int(ranking[len(ranking) // 2])

    auc = float(scores[pos])
    params = dict(valid.iloc[pos, :num_params])
    # The CSV round trip stores these as floats; XGBoost expects integers.
    int_params = ['max_delta_step', 'max_depth', 'num_round']
    for param in int_params:
        params[param] = int(params[param])
    return params, auc


def train_model(params, container, execution_role, instance_count, instance_type, output_path, 
                sagemaker_session, eval_metric, objective, scale_pos_weight, data_channels):
    """Train a model based on a given data and xgboost params.

    Args:
        params: Dict of tuned hyperparameters forwarded to the estimator.
        container: Training image URI.
        execution_role: IAM role passed to the estimator.
        instance_count: Number of training instances.
        instance_type: Training instance type.
        output_path: S3 prefix under which the job output is stored.
        sagemaker_session: SageMaker session used by the estimator.
        eval_metric: Value for the ``eval_metric`` hyperparameter.
        objective: Value for the ``objective`` hyperparameter.
        scale_pos_weight: Value for the ``scale_pos_weight`` hyperparameter.
        data_channels: Inputs passed to ``fit``.

    Returns:
        The S3 path ``<output_path>/<job name>/output/model.tar.gz``.
    """
    # S3 URIs always use '/', so join with posixpath rather than the
    # platform-dependent os.path.
    import posixpath

    xgb_model = sagemaker.estimator.Estimator(container,
                                        execution_role, 
                                        instance_count=instance_count, 
                                        instance_type=instance_type,
                                        output_path=output_path,
                                        sagemaker_session=sagemaker_session)
    
    xgb_model.set_hyperparameters(eval_metric=eval_metric,
                            objective=objective,
                            scale_pos_weight=scale_pos_weight, #For class imbalance
                            **params)
    
    # Starts the training job with the given channels.
    xgb_model.fit(inputs=data_channels)
    
    # NOTE(review): _current_job_name is a private estimator attribute;
    # consider a public accessor once confirmed for the SDK version in use.
    job_name = xgb_model._current_job_name
    s3_model_path = posixpath.join(output_path, job_name, 'output/model.tar.gz')
    return s3_model_path


def load_class_imbalances(class_imbalances_path):
    """Read the class-imbalance JSON file and return its parsed content."""
    with open(class_imbalances_path, 'r') as json_file:
        return json.load(json_file)

In [76]:
from time import strftime, gmtime
import boto3
from sagemaker.image_uris import retrieve
import sagemaker
import json

# Local directory layout (absolute paths on the notebook instance).
ROOT_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/'
DATA_DIR = os.path.join(ROOT_DIR, 'preprocessed')
TRAIN_DIR0 = os.path.join(ROOT_DIR, 'training')
TRAIN_DIR = os.path.join(ROOT_DIR, 'training2')

# Experiment settings. Defined before the path templates below because the
# templates embed NUM_FEATURES (using it first raises NameError on a fresh kernel).
LABEL = 'unplanned_readmission'
NUM_FEATURES = 100
FOLDS = ['fold_'+str(i) for i in range(5)]
DATA_ALL = 'all'
BEST_JOB_CRITERIA = 'avg'

# Path templates; the '{}' placeholder is filled with a fold name or DATA_ALL.
CLASS_IMBALANCE_PATH_PATTERN = os.path.join(DATA_DIR, '{}', 'class_imbalances.json')
HPO_SUMMARY_PATH_PATTERN = os.path.join(TRAIN_DIR0, str(NUM_FEATURES), '{}', 'hpo_results.csv')
TRAIN_RESULTS_PATH_PATTERN = os.path.join(TRAIN_DIR, str(NUM_FEATURES), '{}', 'train_results.csv')

#Bucket where the trained model is stored
BUCKET = 'cmsai-mrk-amzn'
#Directory prefix where the model training outputs is saved
# Timestamp (UTC) used to give each run its own output prefix.
now = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
DATA_PREFIX = 'FinalData/RE/Models/XGBoost/1000/training/data' #data_split, num_features
MODEL_PREFIX = 'FinalData/RE/Models/XGBoost/1000/training/models2' #data_split, num_features, time

###Algorithm config
ALGORITHM = 'xgboost'
REPO_VERSION = '1.2-1'

###HPO/training job config
TRAIN_INSTANCE_TYPE = 'ml.m4.16xlarge'
TRAIN_INSTANCE_COUNT = 2

EVALUATION_METRIC = 'auc'
OBJECTIVE = 'binary:logistic'
OBJECTIVE_METRIC_NAME = 'validation:auc'

### SageMaker Initialization
# One boto3 session is shared for the region lookup and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
role = sagemaker.get_execution_role()
smclient = boto_session.client('sagemaker')

sess = sagemaker.Session()

# Training image URI for the chosen algorithm, region and version.
container = retrieve(ALGORITHM, region, version=REPO_VERSION)

In [75]:
# Build (or reload from the cached CSV) the best HPO job per fold and preview it.
df_hpo, hpo_all_path = get_best_hpo_jobs(HPO_SUMMARY_PATH_PATTERN, FOLDS, DATA_ALL)
print(df_hpo.shape)
df_hpo.head()

Best training jobs file for each fold already created.
(5, 19)


Unnamed: 0,alpha,colsample_bylevel,colsample_bynode,colsample_bytree,eta,gamma,lambda,max_delta_step,max_depth,min_child_weight,num_round,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds,fold
0,0.280914,0.439027,0.728951,0.687038,0.22816,3.849422,632.604601,9.0,4.0,117.825199,468.0,0.943038,sagemaker-xgboost-201202-0311-001-4a4e847c,Completed,0.63806,2020-12-02 03:14:23+00:00,2020-12-02 03:26:31+00:00,728.0,fold_0
1,1.692901,0.292333,0.345907,0.784256,0.111545,2.994191,849.851227,4.0,4.0,11.328254,360.0,0.83134,sagemaker-xgboost-201202-0424-016-9897b044,Completed,0.6396,2020-12-02 05:13:45+00:00,2020-12-02 05:23:15+00:00,570.0,fold_1
2,0.754618,0.236226,0.335963,0.848627,0.130928,4.673417,900.961673,7.0,8.0,38.211199,234.0,0.939755,sagemaker-xgboost-201202-0532-016-de502039,Completed,0.63827,2020-12-02 06:29:42+00:00,2020-12-02 06:40:34+00:00,652.0,fold_2
3,0.593411,0.230149,0.364672,0.930255,0.223471,2.399267,354.366831,10.0,4.0,104.172578,240.0,0.792599,sagemaker-xgboost-201202-0655-013-6eb27a01,Completed,0.6398,2020-12-02 07:29:36+00:00,2020-12-02 07:36:01+00:00,385.0,fold_3
4,0.057258,0.120515,0.846858,0.875614,0.106298,2.790908,433.184318,0.0,6.0,47.092693,497.0,0.639104,sagemaker-xgboost-201202-0800-020-ca20b47f,Completed,0.63794,2020-12-02 09:12:21+00:00,2020-12-02 09:26:27+00:00,846.0,fold_4


In [82]:
# Choose one hyperparameter set across the folds according to BEST_JOB_CRITERIA.
params, auc = get_best_params(df_hpo, criteria=BEST_JOB_CRITERIA)
print('Val Auc:', auc)
params

{'alpha': 0.7546184098808969,
 'colsample_bylevel': 0.2362259057715237,
 'colsample_bynode': 0.3359626273637306,
 'colsample_bytree': 0.848627105496981,
 'eta': 0.13092817103220053,
 'gamma': 4.6734167634608585,
 'lambda': 900.961672956788,
 'max_delta_step': 7,
 'max_depth': 8,
 'min_child_weight': 38.21119923527896,
 'num_round': 234,
 'subsample': 0.9397545071419916}

In [43]:
#Prepare the input train & validation data path
# Fold to train on. Set explicitly so this cell runs on a fresh kernel;
# 'fold_0' matches the S3 output path recorded for this run.
fold = 'fold_0'
s3_train_path = 's3://{}/{}/{}/{}/train'.format(BUCKET, DATA_PREFIX, fold, NUM_FEATURES)
s3_val_path = 's3://{}/{}/{}/{}/val'.format(BUCKET, DATA_PREFIX, fold, NUM_FEATURES)
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=s3_train_path, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=s3_val_path, content_type='csv')
s3_output_path = 's3://{}/{}/{}/{}/{}/output'.format(BUCKET, MODEL_PREFIX, now, NUM_FEATURES, fold)
#Load class imbalances
class_imbalance_path = CLASS_IMBALANCE_PATH_PATTERN.format(fold)
class_imbalances = load_class_imbalances(class_imbalance_path)
imb = class_imbalances[LABEL]
# NOTE(review): assumes imb is [negative_count, positive_count] with a
# non-zero positive count - confirm against the file that is loaded.
scale_pos_weight = float(imb[0])/imb[1] # negative/positive

data_channels = {'train': s3_input_train, 'validation': s3_input_validation}

In [45]:
# Launch the training job with the selected hyperparameters; the returned value
# is the S3 path of the resulting model.tar.gz.
model_output_path = train_model(params=params, 
                               container=container, 
                               execution_role=role, 
                               instance_count=TRAIN_INSTANCE_COUNT, 
                               instance_type=TRAIN_INSTANCE_TYPE, 
                               output_path=s3_output_path, 
                               sagemaker_session=sess, 
                               eval_metric=EVALUATION_METRIC, 
                               objective=OBJECTIVE, 
                               scale_pos_weight=scale_pos_weight, 
                               data_channels=data_channels)

2020-12-07 19:00:54 Starting - Starting the training job...
2020-12-07 19:00:58 Starting - Launching requested ML instances......
2020-12-07 19:02:13 Starting - Preparing the instances for training.........
2020-12-07 19:03:50 Downloading - Downloading input data
2020-12-07 19:03:50 Training - Downloading the training image...
2020-12-07 19:04:06 Training - Training image download completed. Training in progress.[35mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[35mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[35mReturning the value itself[0m
[35mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[35mReturning the value itself[0m
[35mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[35mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[35mINFO:root:Determined delimiter of C

In [55]:
# Model artifact location recorded from the fold_0 training run.
# NOTE(review): the variable name is misspelled and it is not used by the
# prints below; kept unchanged in case other cells rely on it.
s3_otput_path = 's3://cmsai-mrk-amzn/FinalData/RE/Models/XGBoost/1000/training/models2/2020-12-07-18-21-40/100/fold_0/output/sagemaker-xgboost-2020-12-07-19-00-53-798/output/model.tar.gz'
print(s3_output_path)
# Was `model_ouput_path`, an undefined name (NameError).
print(model_output_path)

s3://cmsai-mrk-amzn/FinalData/RE/Models/XGBoost/1000/training/models2/2020-12-07-18-21-40/100/fold_0/output/sagemaker-xgboost-2020-12-07-19-00-53-798/output/model.tar.gz
s3://cmsai-mrk-amzn/FinalData/RE/Models/XGBoost/1000/training/models2/2020-12-07-18-21-40/100/fold_0/output


In [12]:
# Hard-coded test-data and vocabulary paths used for the vocabulary experiments below.
# Note that vocab_path2 points at fold_2 while data_path2 points at fold_1.
data_path1 = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/fold_0/test/raw_test_data_1000_30days_anony.csv'
data_path2 = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/fold_1/test/raw_test_data_1000_30days_anony.csv'
vocab_path1 = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/fold_0/vocab/vocab_1000_vall_30days'
vocab_path2 = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/fold_2/vocab/vocab_1000_vall_30days'

In [25]:
# Load the two saved vocabulary objects.
# NOTE: torch.load unpickles the file; only use it on trusted files.
vocab1 = torch.load(vocab_path1)
vocab2 = torch.load(vocab_path2)

In [48]:
def combine_all_vocabs(data_dir, folds, data_all='all', vocab_fname='vocab_1000_vall_30days'):
    """Combine all vocabularies and save to disk.

    The vocabulary of every fold is loaded from
    ``<data_dir>/<fold>/vocab/<vocab_fname>``, their frequency counters are
    summed, and the aggregated vocabulary is saved to
    ``<data_dir>/<data_all>/vocab/<vocab_fname>``. If that file already exists
    nothing is recomputed.

    Args:
        data_dir: Directory containing one sub-directory per fold.
        folds: Iterable of fold directory names.
        data_all: Name of the sub-directory receiving the aggregated vocabulary.
        vocab_fname: File name of the vocabulary inside each ``vocab`` directory.

    Returns:
        Path of the aggregated vocabulary file.

    Raises:
        ValueError: If ``folds`` is empty and no aggregated file exists yet.
    """
    def combine_vocabs(vocab1, vocab2):
        """Combine two vocabularies by summing their frequency counters."""
        freqs = vocab1.freqs + vocab2.freqs
        # NOTE(review): the Vocab is rebuilt with default constructor options;
        # confirm the per-fold vocabularies were built with the same options.
        vocab = torchtext.vocab.Vocab(freqs)
        return vocab

    output_dir = os.path.join(data_dir, data_all, 'vocab')
    output_path = os.path.join(output_dir, vocab_fname)
    if os.path.exists(output_path):
        print('Aggregated Vocab already saved to {}!'.format(output_path))
        return output_path

    print('Aggregating vocabularies...')
    vocab_all = None
    for fold in folds:
        vocab_path = os.path.join(data_dir, fold, 'vocab', vocab_fname)
        # NOTE: torch.load unpickles the file; only use it on trusted files.
        vocab = torch.load(vocab_path)
        vocab_all = vocab if vocab_all is None else combine_vocabs(vocab_all, vocab)

    # Without this guard an empty `folds` would save None as the "vocabulary",
    # and the cache check above would then reuse that file forever.
    if vocab_all is None:
        raise ValueError('No folds given; there is no vocabulary to aggregate.')

    os.makedirs(output_dir, exist_ok=True)

    torch.save(vocab_all, output_path)
    print('[SUCCESS] Aggregated Vocab saved to {}!'.format(output_path))

    return output_path

In [49]:
# NOTE(review): this rebinds DATA_DIR and FOLDS, which an earlier configuration
# cell also defines (DATA_DIR with a different value); cells run after this one
# see the values below.
DATA_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/'
FOLDS = ['fold_'+str(i) for i in range(5)]

# Aggregate the per-fold vocabularies (skipped if the aggregated file exists).
output_path = combine_all_vocabs(DATA_DIR, FOLDS)

Aggregating vocabularies...
Aggregated Vocab saved to /home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/all/vocab/vocab_1000_vall_30days!


In [30]:
# Inspect the container types behind the vocabulary's three attributes.
print(type(vocab1.freqs))
print(type(vocab1.stoi))
print(type(vocab1.itos))

<class 'collections.Counter'>
<class 'collections.defaultdict'>
<class 'list'>


In [34]:
#vocab_all.freqs

In [None]:
#torchtext.vocab.Vocab()

In [None]:
#vocab1
         '<pad>': 1171535099,
         'd_20300': 27018,
         'd_20301': 896,
         'h_36430': 4829,
         'h_36592': 1251,
         'h_82247': 5563,
         'h_82310': 17543,
         'h_82374': 4992,
         'h_82435': 5523,
         'h_82565': 25502,
         'h_82784': 3166,
         'h_82977': 4531,
         'h_83615': 15606,
         'h_83735': 34406,
         'h_83883': 1245,
         'h_84075': 6631,
         'h_84100': 27546,
         'h_84132': 19468,
#vocab2
         '<pad>': 1171482512,
         'd_20501': 1206,
         'd_42789': 115446,
         'd_5990': 431618,
         'd_78650': 556214,
         'h_36591': 3458,
         'h_81001': 70454,
         'h_82310': 17786,
         'h_82374': 5036,
         'h_82435': 5568,
         'h_82550': 29469,
         'h_82553': 20323,
         'h_82565': 25622,
         'h_83735': 34241,
         'h_83880': 27401,
         'h_84132': 19476,
         'h_84295': 7684,
         'h_84484': 34140,
            
#vocab1+vocab2
         '<pad>': 2343017611,
         'd_20300': 54272,
         'd_20301': 1779,
         'h_36430': 9634,
         'h_36592': 2457,
         'h_82247': 11173,
         'h_82310': 35329,
         'h_82374': 10028,
         'h_82435': 11091,
         'h_82565': 51124,
         'h_82784': 6373,
         'h_82977': 9084,
         'h_83615': 31197,
         'h_83735': 68647,
         'h_83883': 2474,
         'h_84075': 13353,
         'h_84100': 55045,
         'h_84132': 38944,
         'h_84165': 32491,
         'h_84295': 15344,

In [37]:
# Spot check: the combined count of one code should equal the sum of both vocabularies.
# NOTE(review): `vocab_all` is not assigned at notebook scope in any visible cell
# (it is only a local inside combine_all_vocabs) - confirm where it comes from.
word = 'h_82550'
vocab1.freqs[word] + vocab2.freqs[word], vocab_all.freqs[word]

(58860, 58860)

In [18]:
#vocab1.freqs

In [20]:
#vocab2.freqs

In [26]:
# Modifies vocab1 in place. NOTE(review): presumably this only adds unseen tokens
# and leaves vocab1.freqs untouched (the counts displayed next look unchanged) - confirm.
vocab1.extend(vocab2)

In [27]:
# Display vocab1's frequency counter after the extend() call.
vocab1.freqs

Counter({'<pad>': 1171535099,
         'd_20300': 27018,
         'd_20301': 896,
         'h_36430': 4829,
         'h_36592': 1251,
         'h_82247': 5563,
         'h_82310': 17543,
         'h_82374': 4992,
         'h_82435': 5523,
         'h_82565': 25502,
         'h_82784': 3166,
         'h_82977': 4531,
         'h_83615': 15606,
         'h_83735': 34406,
         'h_83883': 1245,
         'h_84075': 6631,
         'h_84100': 27546,
         'h_84132': 19468,
         'h_84165': 16259,
         'h_84295': 7660,
         'h_84450': 9550,
         'h_84460': 10492,
         'h_84520': 19036,
         'h_85025': 296281,
         'h_85049': 1461,
         'h_86140': 8543,
         'h_87040': 15658,
         'h_87305': 55,
         'h_87449': 609,
         'h_87533': 9,
         'h_96372': 63764,
         'h_96374': 29182,
         'h_96375': 28701,
         'h_99214': 466345,
         'h_C9113': 1473,
         'h_J1440': 2011,
         'h_P9037': 343,
         'h_Q0179': 1154

In [None]:
# Hard-coded location of the anonymized readmission input/target CSV.
anony_data_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/anonymize/RE/Data/Anonymized/365/readmission_input_targets_365_v2.csv'

In [None]:
# Load the anonymized data. NOTE: `df` is reused for unrelated tables by later cells.
df = pd.read_csv(anony_data_path, low_memory=False)
print(df.shape)

In [None]:
# Show the last rows of the table loaded above.
df.tail()

In [None]:
# Fetch the per-training-job results of the latest tuning job as a DataFrame.
# NOTE(review): `tuner` is not defined in any visible cell - confirm it is
# created before this cell runs.
job_name = tuner.latest_tuning_job.name
my_tuner = sagemaker.HyperparameterTuningJobAnalytics(job_name)
df = my_tuner.dataframe()
df.shape #4x18

{'HyperParameterTuningJobName': 'sagemaker-xgboost-201120-2310', 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:293790038521:hyper-parameter-tuning-job/sagemaker-xgboost-201120-2310', 'HyperParameterTuningJobConfig': {'Strategy': 'Bayesian', 'HyperParameterTuningJobObjective': {'Type': 'Maximize', 'MetricName': 'validation:auc'}, 'ResourceLimits': {'MaxNumberOfTrainingJobs': 4, 'MaxParallelTrainingJobs': 4}, 'ParameterRanges': {'IntegerParameterRanges': [{'Name': 'max_depth', 'MinValue': '1', 'MaxValue': '10', 'ScalingType': 'Auto'}, {'Name': 'num_round', 'MinValue': '200', 'MaxValue': '500', 'ScalingType': 'Auto'}, {'Name': 'max_delta_step', 'MinValue': '0', 'MaxValue': '10', 'ScalingType': 'Auto'}], 'ContinuousParameterRanges': [{'Name': 'eta', 'MinValue': '0.1', 'MaxValue': '0.5', 'ScalingType': 'Auto'}, {'Name': 'alpha', 'MinValue': '0', 'MaxValue': '2', 'ScalingType': 'Auto'}, {'Name': 'gamma', 'MinValue': '0', 'MaxValue': '5', 'ScalingType': 'Auto'}, {'Name': 'colsample_bylevel', 'MinValue': '0.1', 'MaxValue': '1.0', 'ScalingType': 'Auto'}, {'Name': 'colsample_bynode', 'MinValue': '0.1', 'MaxValue': '1.0', 'ScalingType': 'Auto'}, {'Name': 'colsample_bytree', 'MinValue': '0.5', 'MaxValue': '1.0', 'ScalingType': 'Auto'}, {'Name': 'lambda', 'MinValue': '0', 'MaxValue': '1000', 'ScalingType': 'Auto'}, {'Name': 'min_child_weight', 'MinValue': '0', 'MaxValue': '120', 'ScalingType': 'Auto'}, {'Name': 'subsample', 'MinValue': '0.5', 'MaxValue': '1.0', 'ScalingType': 'Auto'}], 'CategoricalParameterRanges': []}, 'TrainingJobEarlyStoppingType': 'Off'}, 'TrainingJobDefinition': {'StaticHyperParameters': {'_tuning_objective_metric': 'validation:auc', 'eval_metric': 'auc', 'objective': 'binary:logistic', 'rate_drop': '0.3', 'scale_pos_weight': '4.0'}, 'AlgorithmSpecification': {'TrainingImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1', 'TrainingInputMode': 'File', 'MetricDefinitions': [{'Name': 'train:mae', 'Regex': 
'.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:aucpr', 'Regex': '.*\\[[0-9]+\\].*#011validation-aucpr:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:merror', 'Regex': '.*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:gamma-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011train-gamma-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:mae', 'Regex': '.*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:logloss', 'Regex': '.*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:mlogloss', 'Regex': '.*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:cox-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011train-cox-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:f1', 'Regex': '.*\\[[0-9]+\\].*#011validation-f1:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:cox-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011train-cox-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:accuracy', 'Regex': '.*\\[[0-9]+\\].*#011train-accuracy:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:mse', 'Regex': '.*\\[[0-9]+\\].*#011train-mse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:poisson-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011validation-poisson-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:tweedie-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011train-tweedie-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:error', 'Regex': '.*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:ndcg', 'Regex': '.*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:map', 'Regex': 
'.*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:auc', 'Regex': '.*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:gamma-deviance', 'Regex': '.*\\[[0-9]+\\].*#011validation-gamma-deviance:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:auc', 'Regex': '.*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:error', 'Regex': '.*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:merror', 'Regex': '.*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:poisson-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011train-poisson-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:rmse', 'Regex': '.*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:logloss', 'Regex': '.*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:accuracy', 'Regex': '.*\\[[0-9]+\\].*#011validation-accuracy:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:aucpr', 'Regex': '.*\\[[0-9]+\\].*#011train-aucpr:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:tweedie-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011validation-tweedie-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:rmse', 'Regex': '.*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:gamma-deviance', 'Regex': '.*\\[[0-9]+\\].*#011train-gamma-deviance:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:mse', 'Regex': '.*\\[[0-9]+\\].*#011validation-mse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:ndcg', 'Regex': '.*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:f1', 'Regex': 
'.*\\[[0-9]+\\].*#011train-f1:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:mlogloss', 'Regex': '.*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'train:map', 'Regex': '.*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'validation:gamma-nloglik', 'Regex': '.*\\[[0-9]+\\].*#011validation-gamma-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}, {'Name': 'ObjectiveMetric', 'Regex': '.*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}]}, 'RoleArn': 'arn:aws:iam::293790038521:role/mrlpoc-sagemaker-role', 'InputDataConfig': [{'ChannelName': 'train', 'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/re/final-global/data/fold_0/100/unplanned_readmission/train', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}, {'ChannelName': 'validation', 'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/re/final-global/data/fold_0/100/unplanned_readmission/val', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}], 'OutputDataConfig': {'S3OutputPath': 's3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/re/final-global/xgboost/fold_0/2020-11-20-23-09-58/100/unplanned_readmission/output'}, 'ResourceConfig': {'InstanceType': 'ml.m4.16xlarge', 'InstanceCount': 2, 'VolumeSizeInGB': 30}, 'StoppingCondition': {'MaxRuntimeInSeconds': 86400}, 'EnableNetworkIsolation': False, 'EnableInterContainerTrafficEncryption': False, 'EnableManagedSpotTraining': False}, 'HyperParameterTuningJobStatus': 'Completed', 'CreationTime': datetime.datetime(2020, 11, 20, 23, 10, 2, 977000, tzinfo=tzlocal()), 'HyperParameterTuningEndTime': datetime.datetime(2020, 11, 20, 23, 30, 50, 953000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2020, 11, 20, 23, 30, 50, 953000, tzinfo=tzlocal()), 
'TrainingJobStatusCounters': {'Completed': 4, 'InProgress': 0, 'RetryableError': 0, 'NonRetryableError': 0, 'Stopped': 0}, 'ObjectiveStatusCounters': {'Succeeded': 4, 'Pending': 0, 'Failed': 0}, 'BestTrainingJob': {'TrainingJobName': 'sagemaker-xgboost-201120-2310-001-7d63ba25', 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:293790038521:training-job/sagemaker-xgboost-201120-2310-001-7d63ba25', 'CreationTime': datetime.datetime(2020, 11, 20, 23, 10, 13, tzinfo=tzlocal()), 'TrainingStartTime': datetime.datetime(2020, 11, 20, 23, 12, 41, tzinfo=tzlocal()), 'TrainingEndTime': datetime.datetime(2020, 11, 20, 23, 22, 56, tzinfo=tzlocal()), 'TrainingJobStatus': 'Completed', 'TunedHyperParameters': {'alpha': '0.3506179938150109', 'colsample_bylevel': '0.831342554812431', 'colsample_bynode': '0.47593142571804437', 'colsample_bytree': '0.5126519245758329', 'eta': '0.46287320167861645', 'gamma': '1.4073433566787807', 'lambda': '254.82578210226814', 'max_delta_step': '0', 'max_depth': '5', 'min_child_weight': '76.45545515753648', 'num_round': '461', 'subsample': '0.6143439303307552'}, 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:auc', 'Value': 0.5016300082206726}, 'ObjectiveStatus': 'Succeeded'}, 'ResponseMetadata': {'RequestId': 'a6cb03c4-489c-465e-8cfd-31ec69cd8c6a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'a6cb03c4-489c-465e-8cfd-31ec69cd8c6a', 'content-type': 'application/x-amz-json-1.1', 'content-length': '8608', 'date': 'Mon, 23 Nov 2020 17:05:16 GMT'}, 'RetryAttempts': 0}}

In [None]:
# Hard-coded train/validation CSV paths for the old and new data versions being compared.
old_version_train_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/readmissions_debug/train_1000.csv'
old_version_val_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/readmissions_debug/val_1000.csv'
new_version_train_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/readmissions_debug/raw_train_data_flatten.csv'
new_version_val_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/readmissions_debug/raw_test_data_flatten.csv'

In [None]:
print('Reading old version...')
df_tr_old = pd.read_csv(old_version_train_path, low_memory=False)
df_val_old = pd.read_csv(old_version_val_path, low_memory=False)
print('Aggregating old version...')
# Stack the train and validation rows into one table. axis=0 keeps the column
# names; axis=1 with ignore_index=True placed the two tables side by side and
# replaced every column name with an integer.
df = pd.concat([df_tr_old, df_val_old], ignore_index=True, axis=0)
print(df_val_old.shape, df.shape)
output_dir = os.path.dirname(old_version_train_path)
output_path = os.path.join(output_dir, 'old.csv')
df.to_csv(output_path, index=False)

print('Reading new version...')
df_tr_new = pd.read_csv(new_version_train_path, low_memory=False)
df_val_new = pd.read_csv(new_version_val_path, low_memory=False)
print('Aggregating new version...')
# Same row-wise stacking as for the old version.
df = pd.concat([df_tr_new, df_val_new], ignore_index=True, axis=0)
print(df_val_new.shape, df.shape)
output_dir = os.path.dirname(new_version_train_path)
output_path = os.path.join(output_dir, 'new.csv')
df.to_csv(output_path, index=False)

In [None]:
# Preview the old-version validation table.
print(df_val_old.shape)
df_val_old.head()

In [None]:
# Preview the new-version validation table.
print(df_val_new.shape)
df_val_new.head()

In [None]:
# Flattened fold_0 test data and the vocabulary used for the feature-frequency check below.
data_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/fold_0/test/raw_test_data_flatten_30days.csv'
# Earlier vocabulary location, kept for reference:
#vocab_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final/re/fold_0/vocab/dev_vocab_d30_s30_vpos'
vocab_path = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/raw/fold_0/vocab/vocab_d30_s30_vpos'

In [None]:
# Load the flattened test data. NOTE: rebinds `df`, replacing the table from earlier cells.
df = pd.read_csv(data_path)

In [None]:
# Size and first rows of the flattened test data.
print(df.shape)
df.head()

In [None]:
# Last rows of the flattened test data.
df.tail()

In [None]:
# Relative frequency of each label value (class balance of the target column).
df['unplanned_readmission'].value_counts(normalize=True)

In [None]:
# Load the saved vocabulary. NOTE: torch.load unpickles the file; only use it on trusted files.
vocab = torch.load(vocab_path)

In [None]:
# freq_vocab is the same object as vocab.freqs (no copy is made), so the
# deletion below also changes vocab.freqs.
freq_vocab = vocab.freqs
# NOTE(review): presumably the key 0 is a padding entry - confirm; the keys
# displayed elsewhere in this notebook are strings.
del freq_vocab[0]
freq_vocab.most_common(10)

In [None]:
def get_frequent_features(vocab, num_features, codes_only=True, exclusion_list=None): 
    """Get the most frequent codes/features.

    Tokens are visited from most to least frequent and kept until
    ``num_features`` have been collected. A token is skipped when it is not a
    string, is listed in ``exclusion_list``, contains the substring ``'day'``,
    or (with ``codes_only``) contains no underscore.

    Args:
        vocab: Object exposing a ``freqs`` counter (token -> count).
        num_features: Maximum number of features to return.
        codes_only: If True, keep only tokens containing ``'_'``.
        exclusion_list: Tokens to leave out (exact match). Defaults to none.

    Returns:
        List of at most ``num_features`` tokens, most frequent first. It is
        shorter only when the vocabulary has fewer eligible tokens.
    """
    excluded = [] if exclusion_list is None else exclusion_list
    features = []
    # Walk the whole ranking instead of a fixed-size look-ahead window, which
    # came up short whenever more tokens were filtered out than the window allowed.
    for token, _count in vocab.freqs.most_common():
        if len(features) >= num_features:
            break
        if not isinstance(token, str):
            continue  # e.g. an integer padding key; substring tests need strings
        if token in excluded:
            continue
        if codes_only and '_' not in token:
            continue
        if 'day' in token: #Exclude day features
            continue
        features.append(token)
    return features

In [None]:
# Settings for the frequent-feature selection below.
num_features = 100
# Matched exactly against tokens, so e.g. '<pad>' is not covered by 'pad'.
exclusion_list = ['nan', 'pad', 'unk']
codes_only = True

In [None]:
# Select the most frequent eligible tokens from the loaded vocabulary.
features = get_frequent_features(vocab, num_features, codes_only, exclusion_list)

In [None]:
# Report how many features were selected and show the first ten.
print('Total features:', len(features))
features[:10]