In [1]:
import sagemaker
import boto3
from uuid import uuid4
import os
import shutil

# Resolve the AWS context for this notebook: SageMaker session, caller account id
# (via STS) and the region of the default boto3 session.
sagemaker_session = sagemaker.Session()
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.session.Session().region_name

# The execution role is a fixed, named role in the caller's account rather than
# the role discovered from the notebook environment.
# role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"

# NOTE(review): max_runs is not referenced by any cell visible in this notebook section.
max_runs = 1

In [2]:
# Local scratch directory (relative to the notebook's working directory); the
# transformers checkout is placed underneath it.
temp_dir = "temp"

In [3]:
# Where the Hugging Face transformers repository is cloned; later cells use its
# examples/pytorch/text-classification folder as the job's source_dir.
transformer_examples_dir = os.path.join(temp_dir, "hugging_face_example")


### 2. Setup image and instance type

In [4]:
# Hugging Face PyTorch training image (repository:tag) and the AWS account id
# that is used as the ECR registry prefix when the image URI is built.
custom_image_name = "huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04"
image_account_id = "763104351884"

# Instance type/count for the GPU jobs, plus a GPU-count lookup per instance type.
instance_type = "ml.p3.2xlarge"
instance_type_gpu_map = {
    "ml.p3.8xlarge": 4,
    "ml.p3.2xlarge": 1,
    "ml.p3.16xlarge": 8,
}
instance_count = 1

In [5]:
# Full image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
docker_repo = f"{image_account_id}.dkr.ecr.{region}.amazonaws.com/{custom_image_name}"

### 3. Configure train, test and validation datasets



In [6]:
# S3 bucket name that is interpolated into the S3 URIs configured in the next cell.
bucket = "aegovan-data"

In [28]:
# All S3 locations are derived from `bucket`, so pointing the notebook at another
# bucket only requires changing that one variable.

# Pretrained BERT prefix; the model-packaging job reads config.json and vocab.txt
# from the prefix held in s3_model_config_vocab_path (same location).
pretrained_bert = f"s3://{bucket}/embeddings/bert_base_cased/"

# Dataset locations. The commented-out assignments are the alternative choices
# (full dev set for validation / mini set for training).
trainfile = f"s3://{bucket}/glue_dataset/train/multinli_1.0_train.jsonl"
# valfile = f"s3://{bucket}/mnli_dataset/val/multinli_1.0_dev_matched.jsonl"

# trainfile = f"s3://{bucket}/mnli_dataset_mini/train/multinli.jsonl"
# NOTE(review): valfile points at the mini *train* file - confirm this is intended.
valfile = f"s3://{bucket}/glue_dataset_mini/train/multinli.jsonl"

# model.tar.gz produced by an earlier job (input of the packaging job) and the
# prefix the packaging job writes to (input of the reverse-MNLI run).
s3_model_path = f"s3://{bucket}/mnli_sagemakerresults/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/output/model.tar.gz"
s3_model_package_path = f"s3://{bucket}/models/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/output"
# Same prefix as pretrained_bert; reuse it instead of repeating the literal.
s3_model_config_vocab_path = pretrained_bert

# Job outputs, uploaded source code, and a checkpoint prefix that gets a fresh
# UUID every time this cell is executed.
s3_output_path = f"s3://{bucket}/glue_sagemakerresults/"
s3_code_path = f"s3://{bucket}/glue_code"
s3_checkpoint = f"s3://{bucket}/mnli_bert_checkpoint/{uuid4()}"

## Run training as a SageMaker Processing job

### Get the training script (Hugging Face `transformers` examples)

In [8]:
# Start from a clean checkout location: when a previous clone is present, delete it
# and leave an empty directory behind for the `git clone` in the next cell.
# When the path does not exist yet, nothing is created here.
if os.path.exists(transformer_examples_dir):
    shutil.rmtree(transformer_examples_dir)
    os.makedirs(transformer_examples_dir)

In [9]:
# Clone the Hugging Face transformers repository into the local checkout directory
# and pin the working tree to tag v4.12.3, the transformers version named in the
# training image tag (transformers4.12.3).
!git clone https://github.com/huggingface/transformers $transformer_examples_dir
!git -C $transformer_examples_dir checkout tags/v4.12.3

Cloning into 'temp/hugging_face_example'...
remote: Enumerating objects: 99654, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 99654 (delta 7), reused 17 (delta 5), pack-reused 99631[K
Receiving objects: 100% (99654/99654), 84.61 MiB | 2.65 MiB/s, done.
Resolving deltas: 100% (72298/72298), done.
Note: switching to 'tags/v4.12.3'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 3ea15d278 Style


In [10]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor for the base MNLI run: executes a script inside the Hugging Face image
# as a SageMaker Processing job on the configured GPU instance type.
framework_processor = FrameworkProcessor(
    HuggingFace,
    # --- image / runtime ---
    image_uri=docker_repo,
    framework_version=None,
    # NOTE(review): the image tag says py38 while this says py36; the SDK documents
    # py_version as ignored when image_uri is provided - TODO confirm.
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    # --- code upload / identity ---
    code_location=s3_code_path,
    role=role,
    base_job_name="glue-processing",
    # --- compute ---
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size_in_gb=250,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    network_config=NetworkConfig(enable_network_isolation=False),
)



## Run base mnli

In [20]:

# Container-side paths. Only sm_local_output is passed to this run; the three
# input paths are defined but not referenced by the call below.
sm_local_input_models = "/opt/ml/processing/input/data/models"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Fine-tune bert-base-cased on MNLI with the transformers example script.
# NOTE(review): a checkpoint is saved under the output dir every 200 steps and the
# whole output dir is uploaded to s3_output_path; consider --save_total_limit if
# disk/S3 usage matters - TODO confirm.
# NOTE(review): s3_output_path is also the destination of the reverse-MNLI run;
# confirm the two runs are meant to share one prefix.
framework_processor.run(
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        # task and starting model
        "--task_name", "mnli",
        "--model_name_or_path", "bert-base-cased",
        # phases to run
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        # optimisation settings
        "--max_seq_length", "512",
        "--per_device_train_batch_size", "8",
        "--gradient_accumulation_steps", "4",
        "--learning_rate", str(2e-5),
        "--num_train_epochs", "3",
        # outputs and checkpointing
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # load the best model when finished training (default metric is loss)
        "--load_best_model_at_end", "1",
        "--eval_steps", "200",
        "--save_steps", "200",
        "--evaluation_strategy", "steps",
        "--disable_tqdm", "1",
    ],
    # No S3 inputs are mounted for this run.
    inputs=[],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_output_path,
            output_name="predictions",
        )
    ],
)

[34m[INFO|trainer.py:540] 2022-02-12 19:57:20,966 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-12 19:57:20,969 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-12 19:57:20,969 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-12 19:57:20,969 >>   Batch size = 8[0m
[34m02/12/2022 19:59:06 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.545918345451355, 'eval_accuracy': 0.780539989811513, 'eval_runtime': 105.4771, 'eval_samples_per_second': 93.053, 'eval_steps_per_second': 11.633, 'epoch': 0.18}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 19:59:06,446 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-2200[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 19:59:06,447 >> Conf

[34m[INFO|trainer.py:540] 2022-02-12 20:27:42,209 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-12 20:27:42,211 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-12 20:27:42,211 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-12 20:27:42,211 >>   Batch size = 8[0m
[34m02/12/2022 20:29:27 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.5076935887336731, 'eval_accuracy': 0.7969434538970963, 'eval_runtime': 104.8362, 'eval_samples_per_second': 93.622, 'eval_steps_per_second': 11.704, 'epoch': 0.28}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 20:29:27,048 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-3400[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 20:29:27,048 >> Co

[34m02/12/2022 20:54:44 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.5182867646217346, 'eval_accuracy': 0.8025471217524197, 'eval_runtime': 104.5524, 'eval_samples_per_second': 93.876, 'eval_steps_per_second': 11.736, 'epoch': 0.36}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 20:54:44,152 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-4400[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 20:54:44,153 >> Configuration saved in /opt/ml/processing/output/checkpoint-4400/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-12 20:54:44,812 >> Model weights saved in /opt/ml/processing/output/checkpoint-4400/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-12 20:54:44,812 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-4400/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-12 20:54:44,812 >> Special t

[34m{'loss': 0.513, 'learning_rate': 1.7012168622338114e-05, 'epoch': 0.45}[0m
[34m[INFO|trainer.py:540] 2022-02-12 21:23:20,247 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-12 21:23:20,249 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-12 21:23:20,249 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-12 21:23:20,249 >>   Batch size = 8[0m
[34m02/12/2022 21:25:04 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.4832491874694824, 'eval_accuracy': 0.8120224146714213, 'eval_runtime': 104.6278, 'eval_samples_per_second': 93.809, 'eval_steps_per_second': 11.727, 'epoch': 0.46}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 21:25:04,878 >> Saving model checkpoint to /opt/ml/processing/output/checkpo

[34m02/12/2022 21:50:21 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.4841580390930176, 'eval_accuracy': 0.8102903718797758, 'eval_runtime': 104.3836, 'eval_samples_per_second': 94.028, 'eval_steps_per_second': 11.755, 'epoch': 0.54}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 21:50:21,456 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-6600[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 21:50:21,457 >> Configuration saved in /opt/ml/processing/output/checkpoint-6600/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-12 21:50:22,139 >> Model weights saved in /opt/ml/processing/output/checkpoint-6600/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-12 21:50:22,139 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-6600/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-12 21:50:22,139 >> Special t

[34m02/12/2022 23:01:09 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.4597398042678833, 'eval_accuracy': 0.8180336220071319, 'eval_runtime': 104.4995, 'eval_samples_per_second': 93.924, 'eval_steps_per_second': 11.742, 'epoch': 0.77}[0m
[34m[INFO|trainer.py:1995] 2022-02-12 23:01:09,002 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-9400[0m
[34m[INFO|configuration_utils.py:417] 2022-02-12 23:01:09,003 >> Configuration saved in /opt/ml/processing/output/checkpoint-9400/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-12 23:01:09,675 >> Model weights saved in /opt/ml/processing/output/checkpoint-9400/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-12 23:01:09,675 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-9400/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-12 23:01:09,675 >> Special t

[34m{'loss': 0.4663, 'learning_rate': 1.3481095176010431e-05, 'epoch': 0.98}[0m
[34m[INFO|trainer.py:540] 2022-02-13 00:05:08,852 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 00:05:08,854 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 00:05:08,854 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 00:05:08,854 >>   Batch size = 8[0m
[34m02/13/2022 00:06:53 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.43200671672821045, 'eval_accuracy': 0.8288334182373918, 'eval_runtime': 104.559, 'eval_samples_per_second': 93.87, 'eval_steps_per_second': 11.735, 'epoch': 0.98}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 00:06:53,414 >> Saving model checkpoint to /opt/ml/processing/output/checkpo

[34m{'loss': 0.353, 'learning_rate': 1.2394611038678836e-05, 'epoch': 1.14}[0m
[34m[INFO|trainer.py:540] 2022-02-13 00:55:49,312 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 00:55:49,314 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 00:55:49,315 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 00:55:49,315 >>   Batch size = 8[0m
[34m02/13/2022 00:57:33 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.46772637963294983, 'eval_accuracy': 0.8269994905756495, 'eval_runtime': 104.5142, 'eval_samples_per_second': 93.911, 'eval_steps_per_second': 11.74, 'epoch': 1.14}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 00:57:33,829 >> Saving model checkpoint to /opt/ml/processing/output/checkpo

[34m02/13/2022 01:43:06 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.45735758543014526, 'eval_accuracy': 0.8283239938869078, 'eval_runtime': 105.0996, 'eval_samples_per_second': 93.388, 'eval_steps_per_second': 11.675, 'epoch': 1.29}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 01:43:06,905 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-15800[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 01:43:06,906 >> Configuration saved in /opt/ml/processing/output/checkpoint-15800/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-13 01:43:07,579 >> Model weights saved in /opt/ml/processing/output/checkpoint-15800/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-13 01:43:07,580 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-15800/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-13 01:43:07,580 >> Spec

[34m{'loss': 0.3596, 'learning_rate': 1.0764884832681443e-05, 'epoch': 1.39}[0m
[34m[INFO|trainer.py:540] 2022-02-13 02:11:41,239 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 02:11:41,241 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 02:11:41,241 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 02:11:41,241 >>   Batch size = 8[0m

[34m02/13/2022 02:13:25 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.445220410823822, 'eval_accuracy': 0.8318899643402955, 'eval_runtime': 104.6784, 'eval_samples_per_second': 93.763, 'eval_steps_per_second': 11.722, 'epoch': 1.39}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 02:13:25,919 >> Saving model checkpoint to /opt/ml/processing/output/checkp

[34m[INFO|trainer.py:540] 2022-02-13 03:22:32,439 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 03:22:32,441 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 03:22:32,441 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 03:22:32,441 >>   Batch size = 8[0m
[34m02/13/2022 03:24:17 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.4462297558784485, 'eval_accuracy': 0.8342333163525216, 'eval_runtime': 104.7816, 'eval_samples_per_second': 93.671, 'eval_steps_per_second': 11.71, 'epoch': 1.61}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 03:24:17,223 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-19800[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 03:24:17,224 >> Co

[34m02/13/2022 04:29:59 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.4555748999118805, 'eval_accuracy': 0.8332144676515537, 'eval_runtime': 104.4354, 'eval_samples_per_second': 93.982, 'eval_steps_per_second': 11.749, 'epoch': 1.83}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 04:29:59,990 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-22400[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 04:29:59,991 >> Configuration saved in /opt/ml/processing/output/checkpoint-22400/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-13 04:30:00,681 >> Model weights saved in /opt/ml/processing/output/checkpoint-22400/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-13 04:30:00,682 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-22400/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-13 04:30:00,682 >> Speci

[34m02/13/2022 05:40:47 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.5132477283477783, 'eval_accuracy': 0.8367804381049414, 'eval_runtime': 105.2381, 'eval_samples_per_second': 93.265, 'eval_steps_per_second': 11.659, 'epoch': 2.05}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 05:40:47,553 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-25200[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 05:40:47,554 >> Configuration saved in /opt/ml/processing/output/checkpoint-25200/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-13 05:40:48,242 >> Model weights saved in /opt/ml/processing/output/checkpoint-25200/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-13 05:40:48,242 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-25200/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-13 05:40:48,243 >> Speci

[34m02/13/2022 06:41:25 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.49645334482192993, 'eval_accuracy': 0.8392256749872644, 'eval_runtime': 104.6611, 'eval_samples_per_second': 93.779, 'eval_steps_per_second': 11.724, 'epoch': 2.25}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 06:41:25,903 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-27600[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 06:41:25,904 >> Configuration saved in /opt/ml/processing/output/checkpoint-27600/config.json[0m
[34m[INFO|modeling_utils.py:1058] 2022-02-13 06:41:26,583 >> Model weights saved in /opt/ml/processing/output/checkpoint-27600/pytorch_model.bin[0m
[34m[INFO|tokenization_utils_base.py:2034] 2022-02-13 06:41:26,584 >> tokenizer config file saved in /opt/ml/processing/output/checkpoint-27600/tokenizer_config.json[0m
[34m[INFO|tokenization_utils_base.py:2040] 2022-02-13 06:41:26,584 >> Spec

[34m[INFO|trainer.py:540] 2022-02-13 07:45:30,833 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 07:45:30,835 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 07:45:30,835 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 07:45:30,835 >>   Batch size = 8[0m
[34m02/13/2022 07:47:15 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.5095880031585693, 'eval_accuracy': 0.8385124808965868, 'eval_runtime': 105.0672, 'eval_samples_per_second': 93.416, 'eval_steps_per_second': 11.678, 'epoch': 2.46}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 07:47:15,903 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-30200[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 07:47:15,904 >> C

[34m[INFO|trainer.py:540] 2022-02-13 08:46:18,425 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 08:46:18,427 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 08:46:18,427 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 08:46:18,427 >>   Batch size = 8[0m
[34m02/13/2022 08:48:03 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.5006418228149414, 'eval_accuracy': 0.8396332144676516, 'eval_runtime': 104.685, 'eval_samples_per_second': 93.757, 'eval_steps_per_second': 11.721, 'epoch': 2.66}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 08:48:03,113 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-32600[0m
[34m[INFO|configuration_utils.py:417] 2022-02-13 08:48:03,113 >> Co

[34m{'loss': 0.242, 'learning_rate': 7.149065623641896e-07, 'epoch': 2.89}[0m
[34m[INFO|trainer.py:540] 2022-02-13 10:02:13,109 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.[0m
[34m[INFO|trainer.py:2243] 2022-02-13 10:02:13,111 >> ***** Running Evaluation *****[0m
[34m[INFO|trainer.py:2245] 2022-02-13 10:02:13,111 >>   Num examples = 9815[0m
[34m[INFO|trainer.py:2248] 2022-02-13 10:02:13,111 >>   Batch size = 8[0m
[34m02/13/2022 10:03:57 - INFO - datasets.metric - Removing /root/.cache/huggingface/metrics/glue/mnli/default_experiment-1-0.arrow[0m
[34m{'eval_loss': 0.5020843744277954, 'eval_accuracy': 0.8398369842078451, 'eval_runtime': 104.604, 'eval_samples_per_second': 93.83, 'eval_steps_per_second': 11.73, 'epoch': 2.9}[0m
[34m[INFO|trainer.py:1995] 2022-02-13 10:03:57,716 >> Saving model checkpoint to /opt/ml/processing/output/checkpoint-3

## Run with reverse train

### Run model packaging

In [29]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor for the model-packaging step: same image and settings as the training
# processor, but on a single ml.m5.large instance.
framework_processor = FrameworkProcessor(
    HuggingFace,
    # --- image / runtime ---
    image_uri=docker_repo,
    framework_version=None,
    # NOTE(review): the image tag says py38 while this says py36; the SDK documents
    # py_version as ignored when image_uri is provided - TODO confirm.
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    # --- code upload / identity ---
    code_location=s3_code_path,
    role=role,
    base_job_name="model-packaging",
    # --- compute ---
    instance_type="ml.m5.large",
    instance_count=1,
    volume_size_in_gb=250,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    network_config=NetworkConfig(enable_network_isolation=False),
)

# Container-side mount points for the two S3 inputs and for the job output.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_config_vocab = "/opt/ml/processing/input/data/config_vocab"
sm_local_output = "/opt/ml/processing/output"

# Run the project's packaging script on the trained model archive together with the
# BERT config/vocab files; whatever it writes to sm_local_output is uploaded to
# s3_model_package_path.
framework_processor.run(
    code="model_package_bert_utils.py",
    source_dir="../src/utils",
    arguments=[
        "--modeltarfile", f"{sm_local_input_model}/model.tar.gz",
        "--modelconfigfile", f"{sm_local_input_config_vocab}/config.json",
        "--vocabfile", f"{sm_local_input_config_vocab}/vocab.txt",
        "--outdir", sm_local_output,
    ],
    inputs=[
        # trained model archive
        ProcessingInput(
            source=s3_model_path,
            s3_data_type="S3Prefix",
            destination=sm_local_input_model,
            s3_data_distribution_type="FullyReplicated",
        ),
        # config.json / vocab.txt
        ProcessingInput(
            source=s3_model_config_vocab_path,
            destination=sm_local_input_config_vocab,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_model_package_path,
            output_name="predictions",
        )
    ],
)


Job Name:  model-packaging-2022-02-14-03-38-34-680
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/mnli_sagemakerresults/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/input/data/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/embeddings/bert_base_cased/', 'LocalPath': '/opt/ml/processing/input/data/config_vocab', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/glue_code/model-packaging-2022-02-14-03-38-34-680/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionT

### Train with reverse mnli

In [None]:
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface import HuggingFace


# Processor for the reverse-MNLI run: same image and settings as the base MNLI
# processor, under a different job-name prefix.
framework_processor = FrameworkProcessor(
    HuggingFace,
    # --- image / runtime ---
    image_uri=docker_repo,
    framework_version=None,
    # NOTE(review): the image tag says py38 while this says py36; the SDK documents
    # py_version as ignored when image_uri is provided - TODO confirm.
    py_version="py36",
    command=["python"],
    env={"mode": "python", "PYTHONPATH": "/opt/ml/code"},
    # --- code upload / identity ---
    code_location=s3_code_path,
    role=role,
    base_job_name="glue-reverse-mnli",
    # --- compute ---
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size_in_gb=250,
    max_runtime_in_seconds=5 * 24 * 60 * 60,  # 5 days
    network_config=NetworkConfig(enable_network_isolation=False),
)






# Container-side paths. sm_local_input_model receives the packaged model from S3;
# sm_local_input_data and sm_local_input_vocab are defined but not referenced by
# the call below.
sm_local_input_model = "/opt/ml/processing/input/data/model"
sm_local_input_data = "/opt/ml/processing/input/data/jsonlines"
sm_local_input_vocab = "/opt/ml/processing/input/data/vocab"
sm_local_output = "/opt/ml/processing/output"

# Same MNLI fine-tuning recipe as the base run, but starting from the packaged
# model mounted at sm_local_input_model instead of "bert-base-cased".
# NOTE(review): the output destination is the same s3_output_path as the base
# MNLI run - confirm the two runs are meant to share one prefix.
framework_processor.run(
    code="run_glue.py",
    source_dir=f"{transformer_examples_dir}/examples/pytorch/text-classification",
    arguments=[
        # task and starting model
        "--task_name", "mnli",
        "--model_name_or_path", sm_local_input_model,
        # phases to run
        "--do_train", "1",
        "--do_eval", "1",
        "--do_predict", "1",
        # optimisation settings
        "--max_seq_length", "512",
        "--per_device_train_batch_size", "8",
        "--gradient_accumulation_steps", "4",
        "--learning_rate", str(2e-5),
        "--num_train_epochs", "3",
        # outputs and checkpointing
        "--output_dir", sm_local_output,
        "--overwrite_output_dir", "1",
        # load the best model when finished training (default metric is loss)
        "--load_best_model_at_end", "1",
        "--eval_steps", "200",
        "--save_steps", "200",
        "--evaluation_strategy", "steps",
        "--disable_tqdm", "1",
    ],
    inputs=[
        # packaged model written by the model-packaging job
        ProcessingInput(
            source=s3_model_package_path,
            destination=sm_local_input_model,
            s3_data_distribution_type="FullyReplicated",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source=sm_local_output,
            destination=s3_output_path,
            output_name="predictions",
        )
    ],
)


Job Name:  glue-reverse-mnli-2022-02-14-03-46-02-535
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/models/mnli-reverse-lang-bert-accuracy-2022-01-23-21-29-34-327/output', 'LocalPath': '/opt/ml/processing/input/data/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/glue_code/glue-reverse-mnli-2022-02-14-03-46-02-535/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://aegovan-data/glue_code/glue-reverse-mnli-2022-02-14-03-46-02-535/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3Dat

[34m#015Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]#015Downloading:   0%|          | 2.05k/313M [00:00<4:16:00, 20.4kB/s]#015Downloading:   0%|          | 55.3k/313M [00:00<16:14, 321kB/s]   #015Downloading:   0%|          | 139k/313M [00:00<09:21, 557kB/s] #015Downloading:   0%|          | 296k/313M [00:00<05:27, 954kB/s]#015Downloading:   0%|          | 609k/313M [00:00<03:00, 1.73MB/s]#015Downloading:   0%|          | 1.25M/313M [00:00<01:34, 3.30MB/s]#015Downloading:   1%|          | 2.53M/313M [00:00<00:48, 6.38MB/s]#015Downloading:   2%|▏         | 5.09M/313M [00:00<00:24, 12.4MB/s]#015Downloading:   3%|▎         | 9.31M/313M [00:00<00:14, 21.6MB/s]#015Downloading:   4%|▍         | 13.5M/313M [00:01<00:10, 27.5MB/s]#015Downloading:   6%|▌         | 17.7M/313M [00:01<00:09, 31.7MB/s]#015Downloading:   7%|▋         | 21.7M/313M [00:01<00:08, 34.5MB/s]#015Downloading:   8%|▊         | 25.9M/313M [00:01<00:07, 36.3MB/s]#015Downloading:  10%|▉         | 30.1M/313M [00:01<

[34m[INFO|modeling_utils.py:1607] 2022-02-14 03:53:15,961 >> All model checkpoint weights were used when initializing BertForSequenceClassification.[0m
[34mYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.[0m
[34m02/14/2022 03:53:16 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-89e6c98322b0322d.arrow[0m
[34m#015Running tokenizer on dataset:   0%|          | 0/393 [00:00<?, ?ba/s]#015Running tokenizer on dataset:   0%|          | 1/393 [00:00<02:41,  2.42ba/s]#015Running tokenizer on dataset:   1%|          | 2/393 [00:00<01:45,  3.71ba/s]#015Running tokenizer on dataset:   1%|          | 3/393 [00:00<01:36,  4.02ba/s]#015Running tokenizer on dataset:   1%|          | 4/393 [00:00<01:24,  4.58ba/s]#015Running tokenizer on dataset:   1%|▏         | 5/393 [00:01<01:18,  4.97ba/s]#015Run

[34m393 [00:33<00:36,  5.48ba/s]#015Running tokenizer on dataset:  49%|████▉     | 193/393 [00:33<00:35,  5.58ba/s]#015Running tokenizer on dataset:  49%|████▉     | 194/393 [00:34<00:35,  5.64ba/s]#015Running tokenizer on dataset:  50%|████▉     | 195/393 [00:34<00:34,  5.68ba/s]#015Running tokenizer on dataset:  50%|████▉     | 196/393 [00:34<00:36,  5.35ba/s]#015Running tokenizer on dataset:  50%|█████     | 197/393 [00:34<00:35,  5.49ba/s]#015Running tokenizer on dataset:  50%|█████     | 198/393 [00:34<00:34,  5.62ba/s]#015Running tokenizer on dataset:  51%|█████     | 199/393 [00:34<00:33,  5.74ba/s]#015Running tokenizer on dataset:  51%|█████     | 200/393 [00:35<00:33,  5.79ba/s]#015Running tokenizer on dataset:  51%|█████     | 201/393 [00:35<00:33,  5.81ba/s]#015Running tokenizer on dataset:  51%|█████▏    | 202/393 [00:35<00:32,  5.83ba/s]#015Running tokenizer on dataset:  52%|█████▏    | 203/393 [00:35<00:32,  5.85ba/s]#015Running tokenizer on dataset:  52%|█████▏    | 204

[34m#015Running tokenizer on dataset:   0%|          | 0/10 [00:00<?, ?ba/s]#015Running tokenizer on dataset:  10%|█         | 1/10 [00:00<00:01,  4.84ba/s]#015Running tokenizer on dataset:  20%|██        | 2/10 [00:00<00:01,  4.48ba/s]#015Running tokenizer on dataset:  30%|███       | 3/10 [00:00<00:01,  5.06ba/s]#015Running tokenizer on dataset:  40%|████      | 4/10 [00:00<00:01,  5.41ba/s]#015Running tokenizer on dataset:  50%|█████     | 5/10 [00:00<00:00,  5.59ba/s]#015Running tokenizer on dataset:  60%|██████    | 6/10 [00:01<00:00,  5.70ba/s]#015Running tokenizer on dataset:  70%|███████   | 7/10 [00:01<00:00,  5.77ba/s]#015Running tokenizer on dataset:  80%|████████  | 8/10 [00:01<00:00,  5.80ba/s]#015Running tokenizer on dataset:  90%|█████████ | 9/10 [00:01<00:00,  5.74ba/s]#015Running tokenizer on dataset: 100%|██████████| 10/10 [00:01<00:00,  5.98ba/s]#015Running tokenizer on dataset: 100%|██████████| 10/10 [00:01<00:00,  5.62ba/s][0m
[34m02/14/2022 03:54:31 - INFO - __