In [1]:
import sagemaker
import boto3
import copy

# Resolve the SageMaker session, plus the AWS account and region this notebook runs in.
sagemaker_session = sagemaker.Session()
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
region = boto3.session.Session().region_name

# The execution role is addressed by its fixed name inside the caller's account
# rather than being looked up with sagemaker.get_execution_role().
role = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(account_id)



#### 2. Setup image and instance type

In [2]:
# Name (with tag) of the custom training image, and the instance type the jobs run on.
pytorch_custom_image_name = "ppi-extractor:gpu-1.0.0-201910130520"
instance_type = "ml.p3.8xlarge"

In [3]:
# Full image URI: <account>.dkr.ecr.<region>.amazonaws.com/<image name with tag>.
ecr_registry_host = "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region)
docker_repo = "/".join([ecr_registry_host, pytorch_custom_image_name])

#### 3. Configure train, test and validation datasets

In [4]:
# S3 bucket that every data, embedding, code and result path in this notebook is built from.
bucket = "aegovan-data"

In [5]:
# Plain (unprocessed) AIMed data: the full file and its train/validation split.
plain_fullfile = "s3://{}/aimed/AIMedFull.json".format(bucket)
plain_trainfile = "s3://{}/aimed/AIMedtrain.json".format(bucket)
plain_valfile = "s3://{}/aimed/AIMedval.json".format(bucket)

# Preprocessed counterparts of the files above.
processed_fullfile = "s3://{}/aimed/AIMedFull_preprocessed.json".format(bucket)
# NOTE(review): this previously pointed at AIMedFull_preprocessed.json (a
# copy/paste of the line above), so every "trainval" run trained on the full
# file instead of the train split. It now follows the AIMedtrain/AIMedval
# naming of the plain files - TODO confirm the object exists under this key.
processed_trainfile = "s3://{}/aimed/AIMedtrain_preprocessed.json".format(bucket)
processed_valfile = "s3://{}/aimed/AIMedval_preprocessed.json".format(bucket)

# Full AIMed file in the "Ylhsieh" variant.
ylhsieh_fullfile = "s3://{}/aimed/AIMedFull_Ylhsieh.json".format(bucket)

# Word embedding file used by the embedding-based runs, and its dimension.
embeddingfile = "s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-2.bin.txt".format(bucket)
embed_dim = 200

# Pretrained BERT (S3 prefix).
pretrained_bert = "s3://{}/embeddings/bert/".format(bucket)

# Collobert embedding and its dimension.
coll_embeddingfile = "s3://{}/embeddings/collobert/words_vocab_collabert.txt".format(bucket)
coll_embed_dim = 50

pyyaslao_embeddingfile = "s3://{}/embeddings/PubMed-and-PMC-w2v.bin.txt".format(bucket)
# Alternative embedding file (window 30), kept for reference:
# embeddingfile="s3://{}/embeddings/bio_nlp_vec/PubMed-shuffle-win-30.bin.txt".format(bucket)

In [6]:
# S3 locations handed to the estimator as output_path and code_location.
s3_output_path = "s3://{bucket}/results/".format(bucket=bucket)
s3_code_path = "s3://{bucket}/aimed_code".format(bucket=bucket)

### Start training

In [7]:
# Git commit hash recorded in every hyperparameter set as "commit_id".
commit_id = "e582183f369e01418f651f5c664d52d1aeeac349"

In [8]:
# Field names passed to the training scripts as "docidfieldname" / "labelfieldname".
docid = "docid"
labelid = "isValid"

#### Inputs

In [9]:
# Input name -> S3 location for runs on the full plain file.
bert_plain_inputs = dict(
    train=plain_fullfile,
    PRETRAINED_BIOBERT=pretrained_bert,
)

embedding_plain_inputs = dict(
    train=plain_fullfile,
    embedding=embeddingfile,
)

In [10]:
# Input name -> S3 location for runs on the full preprocessed file.
bert_processed_inputs = dict(
    train=processed_fullfile,
    PRETRAINED_BIOBERT=pretrained_bert,
)

embedding_processed_inputs = dict(
    train=processed_fullfile,
    embedding=embeddingfile,
)

In [11]:
# Input name -> S3 location for runs on the plain train/validation split.
bert_trainval_plain_inputs = dict(
    train=plain_trainfile,
    val=plain_valfile,
    PRETRAINED_BIOBERT=pretrained_bert,
)

embedding_trainval_plain_inputs = dict(
    train=plain_trainfile,
    val=plain_valfile,
    embedding=embeddingfile,
)

In [12]:
# Input name -> S3 location for runs on the preprocessed train/validation split.
bert_trainval_processed_inputs = dict(
    train=processed_trainfile,
    val=processed_valfile,
    PRETRAINED_BIOBERT=pretrained_bert,
)

embedding_trainval_processed_inputs = dict(
    train=processed_trainfile,
    val=processed_valfile,
    embedding=embeddingfile,
)

In [13]:
# Input name -> S3 location for runs on the Ylhsieh variant of the full file.
ylhsieh_inputs = dict(
    train=ylhsieh_fullfile,
    embedding=embeddingfile,
)

### Hyperparameters

In [14]:
# Values reused by the hyperparameter sets defined in the following cells.
use_loss_objective_metric = 1
loss_function_factory = "algorithms.top_k_cross_entropy_loss_factory.TopKCrossEntropyLossFactory"
top_k_loss = 32

batchsize = 32
lr = 0.0001
patience = 50

#### BiLSTM

In [15]:
def _bilstm_hp(dataset, trainfile):
    """Build the BiLSTM hyperparameter set for one dataset factory and training file.

    Args:
        dataset: Dataset factory name, passed through as the "dataset" hyperparameter.
        trainfile: S3 URI of the training file; only its last path component is kept.

    Returns:
        A new dict. Every value other than "dataset" and "trainfile" is common
        to all BiLSTM runs defined in this cell.
    """
    return {
        "dataset": dataset,
        "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
        "trainfile": trainfile.split("/")[-1],
        "embeddingfile": embeddingfile.split("/")[-1],
        "embeddim": embed_dim,
        "batchsize": batchsize,
        "epochs": "1000",
        "earlystoppingpatience": patience,
        "log-level": "INFO",
        "learningrate": lr,
        "lstm_dropout": 0.5,
        "pooling_kernel_size": 3,
        "lstm_num_layers": 3,
        "lstm_hidden_size": 64,
        "fc_layer_size": 64,
        "fc_drop_out_rate": 0.5,
        "labelfieldname": labelid,
        "commit_id": commit_id,
        "use_loss_objective_metric": use_loss_objective_metric,
        "loss_func_factory_name": loss_function_factory,
        "top_k_loss": top_k_loss,
    }


# Overlap: no "docidfieldname" is passed to the k-fold entry point.
bilstm_full_plain_overlap_hp = _bilstm_hp("PpiAimedDatasetFactory", plain_fullfile)
inputs_bilstm_full_plain_overlap_hp = {
    "inputs": embedding_plain_inputs,
    "hp": bilstm_full_plain_overlap_hp,
    "entry": "main_train_k_fold.py",
}

bilstm_full_processed_overlap_hp = _bilstm_hp("PpiAimedDatasetPreprocessedFactory", processed_fullfile)
inputs_bilstm_full_processed_overlap_hp = {
    "inputs": embedding_processed_inputs,
    "hp": bilstm_full_processed_overlap_hp,
    "entry": "main_train_k_fold.py",
}

# Unique: same settings plus "docidfieldname".
# NOTE(review): presumably this keeps a document's rows within a single fold -
# confirm against main_train_k_fold.py.
bilstm_full_plain_unique_hp = dict(bilstm_full_plain_overlap_hp, docidfieldname=docid)
inputs_bilstm_full_plain_unique_hp = {
    "inputs": embedding_plain_inputs,
    "hp": bilstm_full_plain_unique_hp,
    "entry": "main_train_k_fold.py",
}

bilstm_full_processed_unique_hp = dict(bilstm_full_processed_overlap_hp, docidfieldname=docid)
inputs_bilstm_full_processed_unique_hp = {
    "inputs": embedding_processed_inputs,
    "hp": bilstm_full_processed_unique_hp,
    "entry": "main_train_k_fold.py",
}

# Split: explicit train/validation files, run through main_train.py.
# Derived from the overlap settings, so no "docidfieldname" is set despite the
# "unique" in the name; not registered in bilstm_inputs_hps below.
bilstm_trainval_processed_unique_hp = dict(
    bilstm_full_processed_overlap_hp,
    valfile=processed_valfile.split("/")[-1],
    trainfile=processed_trainfile.split("/")[-1],
)
inputs_bilstm_trainval_processed_unique_hp = {
    "inputs": embedding_trainval_processed_inputs,
    "hp": bilstm_trainval_processed_unique_hp,
    "entry": "main_train.py",
}

# Job type -> job spec for the BiLSTM runs.
bilstm_inputs_hps = {
    "bilstm-full-processed-unique": inputs_bilstm_full_processed_unique_hp,
    "bilstm-full-plain-unique": inputs_bilstm_full_plain_unique_hp,
    "bilstm-full-processed-overlap": inputs_bilstm_full_processed_overlap_hp,
    "bilstm-full-plain-overlap": inputs_bilstm_full_plain_overlap_hp,
}

In [16]:
# BiLSTM settings for the Ylhsieh variant of the dataset (single LSTM layer,
# hidden size 400, train/val vocabulary merge on, document-id field set to None).
bilstm_ylsieh_overlap_hp = {
    # data and network selection
    "dataset": "PpiAimedDatasetFactoryYlhsieh",
    "network": "RelationExtractorBiLstmNetworkFactoryNoPos",
    "trainfile": ylhsieh_fullfile.split("/")[-1],
    "embeddingfile": embeddingfile.split("/")[-1],
    "embeddim": embed_dim,
    # optimisation
    "batchsize": batchsize,
    "epochs": "1000",
    "earlystoppingpatience": patience,
    "log-level": "INFO",
    "learningrate": lr,
    # network shape
    "lstm_dropout": 0.5,
    "lstm_num_layers": 1,
    "lstm_hidden_size": 400,
    "fc_drop_out_rate": 0.5,
    "train_val_vocab_merge": 1,
    # field names and bookkeeping
    "docidfieldname": None,
    "labelfieldname": labelid,
    "commit_id": commit_id,
    # loss
    "use_loss_objective_metric": use_loss_objective_metric,
    "loss_func_factory_name": loss_function_factory,
    "top_k_loss": top_k_loss,
}

#### ResNet

In [17]:
def _resnet_hp(dataset, trainfile):
    """Build the ResNet-CNN hyperparameter set for one dataset factory and training file.

    Args:
        dataset: Dataset factory name, passed through as the "dataset" hyperparameter.
        trainfile: S3 URI of the training file; only its last path component is kept.

    Returns:
        A new dict. Every value other than "dataset" and "trainfile" is common
        to all ResNet runs defined in this cell.
    """
    return {
        "dataset": dataset,
        "network": "RelationExtractorSimpleResnetCnnPosNetworkFactory",
        "earlystoppingpatience": patience,
        "trainfile": trainfile.split("/")[-1],
        "embeddingfile": embeddingfile.split("/")[-1],
        "embeddim": embed_dim,
        "batchsize": batchsize,
        "epochs": "1000",
        "log-level": "INFO",
        "dropout_rate_cnn": 0.5,
        "pooling_kernel_size": 3,
        "pool_stride": 1,
        "cnn_kernel_size": 3,
        "cnn_num_layers": 5,
        "cnn_output": 256,
        "learningrate": lr,
        "weight_decay": 0.00001,
        "fc_layer_size": 512,
        "fc_drop_out_rate": 0.5,
        "input_drop_out_rate": 0.2,
        "use_min_dict": 0,
        "train_val_vocab_merge": 0,
        "commit_id": commit_id,
        "labelfieldname": labelid,
        "use_loss_objective_metric": use_loss_objective_metric,
        "loss_func_factory_name": loss_function_factory,
        "top_k_loss": top_k_loss,
    }


# Overlap: no "docidfieldname" is passed to the k-fold entry point.
resnet_full_processed_overlap_hp = _resnet_hp("PpiAimedDatasetPreprocessedFactory", processed_fullfile)
inputs_resnet_full_processed_overlap_hp = {
    "inputs": embedding_processed_inputs,
    "hp": resnet_full_processed_overlap_hp,
    "entry": "main_train_k_fold.py",
}

resnet_full_plain_overlap_hp = _resnet_hp("PpiAimedDatasetFactory", plain_fullfile)
inputs_resnet_full_plain_overlap_hp = {
    "inputs": embedding_plain_inputs,
    "hp": resnet_full_plain_overlap_hp,
    "entry": "main_train_k_fold.py",
}

# Unique: same settings plus "docidfieldname".
# NOTE(review): presumably this keeps a document's rows within a single fold -
# confirm against main_train_k_fold.py.
resnet_full_plain_unique_hp = dict(resnet_full_plain_overlap_hp, docidfieldname=docid)
inputs_resnet_full_plain_unique_hp = {
    "inputs": embedding_plain_inputs,
    "hp": resnet_full_plain_unique_hp,
    "entry": "main_train_k_fold.py",
}

resnet_full_processed_unique_hp = dict(resnet_full_processed_overlap_hp, docidfieldname=docid)
inputs_resnet_full_processed_unique_hp = {
    "inputs": embedding_processed_inputs,
    "hp": resnet_full_processed_unique_hp,
    "entry": "main_train_k_fold.py",
}

# Split: explicit train/validation files, run through main_train.py.
# Derived from the overlap settings, so no "docidfieldname" is set despite the
# "unique" in the name; not registered in resnet_inputs_hps below.
resnet_trainval_processed_unique_hp = dict(
    resnet_full_processed_overlap_hp,
    valfile=processed_valfile.split("/")[-1],
    trainfile=processed_trainfile.split("/")[-1],
)
inputs_resnet_trainval_processed_unique_hp = {
    "inputs": embedding_trainval_processed_inputs,
    "hp": resnet_trainval_processed_unique_hp,
    "entry": "main_train.py",
}

# Job type -> job spec for the ResNet runs.
resnet_inputs_hps = {
    "resnet-full-processed-unique": inputs_resnet_full_processed_unique_hp,
    "resnet-full-plain-unique": inputs_resnet_full_plain_unique_hp,
    "resnet-full-processed-overlap": inputs_resnet_full_processed_overlap_hp,
    "resnet-full-plain-overlap": inputs_resnet_full_plain_overlap_hp,
}



#### BERT

In [18]:
accumulation_steps = 8


def _bert_hp(dataset, trainfile):
    """Build the BioBERT hyperparameter set for one dataset factory and training file.

    Args:
        dataset: Dataset factory name, passed through as the "dataset" hyperparameter.
        trainfile: S3 URI of the training file; only its last path component is kept.

    Returns:
        A new dict. Every value other than "dataset" and "trainfile" is common
        to all BERT runs defined in this cell.
    """
    return {
        "dataset": dataset,
        "network": "RelationExtractorBioBertFactory",
        "trainfile": trainfile.split("/")[-1],
        # Floor division keeps this an int (4); "/" produced the float 4.0.
        # The per-step batch is batchsize split across accumulation_steps.
        "batchsize": batchsize // accumulation_steps,
        "accumulation_steps": accumulation_steps,
        "epochs": "1000",
        "log-level": "INFO",
        "learningrate": 0.00001,
        "earlystoppingpatience": 20,
        "commit_id": commit_id,
        # Same field-name variables the BiLSTM/ResNet cells use, instead of repeating the literals.
        "docidfieldname": docid,
        "labelfieldname": labelid,
        "use_loss_objective_metric": use_loss_objective_metric,
        "loss_func_factory_name": loss_function_factory,
        "top_k_loss": top_k_loss,
    }


# Unique: "docidfieldname" is set.
bert_full_processed_unique_hp = _bert_hp("PpiAimedDatasetPreprocessedFactory", processed_fullfile)
inputs_bert_full_processed_unique_hp = {
    "inputs": bert_processed_inputs,
    "hp": bert_full_processed_unique_hp,
    "entry": "main_train_bert_k_fold.py",
}

bert_full_plain_unique_hp = _bert_hp("PpiAimedDatasetFactory", plain_fullfile)
inputs_bert_full_plain_unique_hp = {
    "inputs": bert_plain_inputs,
    "hp": bert_full_plain_unique_hp,
    "entry": "main_train_bert_k_fold.py",
}

# Overlap: identical settings with "docidfieldname" reset to None.
bert_full_processed_overlap_hp = dict(bert_full_processed_unique_hp, docidfieldname=None)
inputs_bert_full_processed_overlap_hp = copy.deepcopy(inputs_bert_full_processed_unique_hp)
inputs_bert_full_processed_overlap_hp["hp"] = bert_full_processed_overlap_hp

bert_full_plain_overlap_hp = dict(bert_full_plain_unique_hp, docidfieldname=None)
inputs_bert_full_plain_overlap_hp = copy.deepcopy(inputs_bert_full_plain_unique_hp)
inputs_bert_full_plain_overlap_hp["hp"] = bert_full_plain_overlap_hp

# Job type -> job spec for the BERT runs.
bert_inputs_hps = {
    "bert-full-processed-unique": inputs_bert_full_processed_unique_hp,
    "bert-full-plain-unique": inputs_bert_full_plain_unique_hp,
    "bert-full-processed-overlap": inputs_bert_full_processed_overlap_hp,
    "bert-full-plain-overlap": inputs_bert_full_plain_overlap_hp,
}

In [19]:
# Merge the per-model job tables into one lookup keyed by job type.
# Insertion order is BERT, then ResNet, then BiLSTM.
all_hps = {}
for family_jobs in (bert_inputs_hps, resnet_inputs_hps, bilstm_inputs_hps):
    all_hps.update(family_jobs)

In [20]:
# Metric name -> regex pairs passed to the estimator as metric_definitions.
# Each regex captures the number that follows a "###score: <name>###" marker.
# Raw strings are used so that "\d" reaches the regex engine unchanged and is
# not treated as an (invalid) Python string escape.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainFScore",
     "Regex": r"###score: train_fscore### (\d*[.]?\d*)"},
    {"Name": "ValidationFScore",
     "Regex": r"###score: val_fscore### (\d*[.]?\d*)"},
]

In [21]:
# Show the first line of `git log -1` (the "commit <hash>" line) and its fifth
# line (the commit subject, per the output below) for the current checkout.
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit e582183f369e01418f651f5c664d52d1aeeac349
    Add additional args


In [22]:
from sagemaker.pytorch import PyTorch
import pprint

# Job type (a key of all_hps) to launch. Set to None to launch every job in all_hps.
restrict_job_type = "resnet-full-processed-unique"
pp = pprint.PrettyPrinter(indent=4)

# Resolve the job types up front. The loop previously walked resnet_inputs_hps
# while looking specs up in all_hps, so restricting to a BERT or BiLSTM job
# silently launched nothing.
if restrict_job_type is None:
    job_types = list(all_hps)
elif restrict_job_type in all_hps:
    job_types = [restrict_job_type]
else:
    raise ValueError("Unknown job type {!r}; expected one of {}".format(
        restrict_job_type, sorted(all_hps)))

for job_type in job_types:
    job_spec = all_hps[job_type]
    base_job_name = "aimed-" + job_type

    hyperparameters = job_spec["hp"]
    inputs = job_spec["inputs"]
    entry_point = job_spec["entry"]

    pp.pprint("----Jobname: {}----".format(base_job_name))
    pp.pprint(hyperparameters)
    pp.pprint(inputs)
    pp.pprint(entry_point)

    # Only takes effect if the git_config argument below is re-enabled; as
    # written, the job is built from the local source/ directories.
    git_config = {'repo': 'https://github.com/elangovana/PPI-typed-relation-extractor.git',
                  'branch': 'master',
                  'commit': hyperparameters["commit_id"]
                  }

    estimator = PyTorch(
        entry_point=entry_point,
        source_dir='source/algorithms',
        dependencies=['source/algorithms', 'source/datasets', 'source/preprocessor', 'source/modelnetworks', 'source/metrics'],
        role=role,
        framework_version="1.0.0",
        py_version='py3',
        #git_config= git_config,
        image_name=docker_repo,
        train_instance_count=1,
        train_instance_type=instance_type,
        hyperparameters=hyperparameters,
        output_path=s3_output_path,
        metric_definitions=metric_definitions,
        #train_use_spot_instances = True
        train_volume_size=30,
        code_location=s3_code_path,
        train_max_run=60 * 60 * 24 * 4,  # 4 days, in seconds
        base_job_name=base_job_name)

    # wait=False: submit the job and return without blocking on completion.
    estimator.fit(inputs, wait=False)


'----Jobname: aimed-resnet-full-processed-unique----'
{   'batchsize': 32,
    'cnn_kernel_size': 3,
    'cnn_num_layers': 5,
    'cnn_output': 256,
    'commit_id': 'e582183f369e01418f651f5c664d52d1aeeac349',
    'dataset': 'PpiAimedDatasetPreprocessedFactory',
    'docidfieldname': 'docid',
    'dropout_rate_cnn': 0.5,
    'earlystoppingpatience': 50,
    'embeddim': 200,
    'embeddingfile': 'PubMed-shuffle-win-2.bin.txt',
    'epochs': '1000',
    'fc_drop_out_rate': 0.5,
    'fc_layer_size': 512,
    'input_drop_out_rate': 0.2,
    'labelfieldname': 'isValid',
    'learningrate': 0.0001,
    'log-level': 'INFO',
    'loss_func_factory_name': 'algorithms.top_k_cross_entropy_loss_factory.TopKCrossEntropyLossFactory',
    'network': 'RelationExtractorSimpleResnetCnnPosNetworkFactory',
    'pool_stride': 1,
    'pooling_kernel_size': 3,
    'top_k_loss': 32,
    'train_val_vocab_merge': 0,
    'trainfile': 'AIMedFull_preprocessed.json',
    'use_loss_objective_metric': 1,
    'use_min