# 1 main.SM_Multigpu Distributed Training-ScriptMode-DALLE
---

본 모듈에서는 Amazon SageMaker API를 효과적으로 이용하기 위해 multi-GPU 분산(distributed) 학습을 위한 PyTorch 프레임워크 자체 구현만으로 모델 훈련을 수행해 봅니다.

In [55]:
# Gate for the setup cell below: when True, that cell installs dependencies
# and shuts down/restarts the kernel. Switch to False after the first run.
install_needed = True  # should only be True once
# install_needed = False

In [56]:
import sys
import IPython

# One-time environment setup, guarded by `install_needed`.
if install_needed:
    print("installing deps and restarting kernel")
    # Optional extras, currently disabled:
#     !{sys.executable} -m pip install -U split-folders tqdm albumentations crc32c wget
    # Use the interpreter backing this kernel so packages land in the same environment.
    !{sys.executable} -m pip install 'sagemaker[local]' --upgrade
    !{sys.executable} -m pip install -U bokeh smdebug sagemaker-experiments gdown
    !{sys.executable} -m pip install -U sagemaker torch torchvision
    # Project script that prepares this instance for SageMaker local mode.
    !/bin/bash ./local/local_mode_setup.sh
    # Restart the kernel (restart=True) so the upgraded packages are picked up.
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
nvidia-docker2 already installed. We are good to go!
Stopping docker: [60G[[0;32m  OK  [0;39m]
Starting docker:	.[60G[[0;32m  OK  [0;39m]
SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!


## 2. 환경 설정

<p>SageMaker 학습에 필요한 기본 패키지를 import 합니다. </p>
<p>boto3는 HTTP API 호출을 감싸는 편리한 추상화 계층을 제공하며, Amazon EC2 인스턴스나 S3 버킷과 같은 AWS 리소스를 다루는 파이썬 클래스를 제공합니다. </p>
<p>SageMaker Python SDK는 Amazon SageMaker에서 기계 학습 모델을 학습 및 배포하기 위한 오픈 소스 라이브러리입니다.</p>

In [1]:
import joblib
import matplotlib.pyplot as plt
import sagemaker
# import splitfolders

import datetime
import glob
import os
import time
import warnings

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

# import wget
# import tarfile
import shutil

import boto3
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision

# from tqdm import tqdm
from time import strftime
from PIL import Image
from torch.utils.data import Dataset
from torchvision import datasets, transforms

from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

from sagemaker.debugger import (Rule,
                                rule_configs,
                                ProfilerConfig, 
                                FrameworkProfile, 
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig)

# Suppress all Python warnings for the rest of this session.
warnings.filterwarnings('ignore')
# Render inline figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [2]:
# Execution role resolved by the SageMaker SDK; passed to the estimator as `role`.
role = get_execution_role()

In [3]:
# Show the installed SageMaker SDK version as the cell output.
sagemaker.__version__

'2.59.2'

In [4]:
def create_experiment(experiment_name):
    """Load the experiment named *experiment_name*, creating it if loading fails.

    Args:
        experiment_name: Name of the SageMaker experiment to load or create.

    Returns:
        The loaded or newly created ``Experiment`` object. (Previously the
        object was built and then discarded; callers that ignore the return
        value are unaffected.)
    """
    try:
        return Experiment.load(experiment_name)
    except Exception:
        # Loading failed -- presumably because the experiment does not exist
        # yet -- so create it with the tags this notebook uses.
        # `Exception` rather than a bare `except` so Ctrl-C still interrupts.
        return Experiment.create(
            experiment_name=experiment_name,
            tags=[
                {'Key': 'multigpu', 'Value': 'yes'},
                {'Key': 'multinode', 'Value': 'yes'},
            ],
        )

In [5]:
def create_trial(experiment_name, set_param, i_type, i_cnt, spot):
    """Create a trial under *experiment_name* and return its name.

    The trial name has the form
    ``{experiment_name}-{tag}-{i_cnt}-{s|d}-{MMDD-HHMM}{epoch seconds}``.

    Args:
        experiment_name: Experiment the new trial is attached to.
        set_param: Unused; kept so existing call sites keep working.
        i_type: Instance type string. The known P3/P3dn/P4d types map to a
            short tag; anything else is tagged ``test``.
        i_cnt: Instance count, embedded in the trial name.
        spot: Truthy adds ``s`` to the name, falsy adds ``d``.

    Returns:
        The created trial's name, which the notebook also uses as job name.
    """
    import time

    # The original format string ended in "%s" (epoch seconds), which is a
    # glibc extension and not portable. Build the same suffix explicitly.
    now = time.time()
    create_date = time.strftime("%m%d-%H%M", time.localtime(now)) + str(int(now))

    instance_tags = {
        'ml.p3.16xlarge': 'p3',
        'ml.p3dn.24xlarge': 'p3dn',
        'ml.p4d.24xlarge': 'p4d',
    }
    i_tag = instance_tags.get(i_type, 'test')
    spot_tag = 's' if spot else 'd'

    trial = "-".join([i_tag, str(i_cnt), spot_tag])

    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    return f'{sm_trial.trial_name}'

In [6]:
# S3 locations handed to the estimator: `code_location` and `output_path`
# are both prefixes inside `bucket`.
bucket = 'bucket-exp-dalle-210914'
code_location = f's3://{bucket}/sm_codes'
output_path = f's3://{bucket}/vqgan_poc/output/' 

In [7]:
# Regexes used to extract metrics from the training log; passed to the
# estimator as `metric_definitions`.
# NOTE(review): the 'lr' pattern has a space after the dash ('lr - ') while
# the 'loss' pattern does not ('loss -') -- confirm both match the log format.
metric_definitions=[
     {'Name': 'train:lr', 'Regex': 'lr - (.*?),'},
     {'Name': 'train:Loss', 'Regex': 'loss -(.*?),'},
]

In [8]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# Two built-in debugger rules first, then the profiler report rule.
rules = [
    Rule.sagemaker(make_config())
    for make_config in (rule_configs.loss_not_decreasing, rule_configs.overfit)
]
rules.append(ProfilerRule.sagemaker(rule_configs.ProfilerReport()))

In [9]:
# !gdown https://drive.google.com/uc?id=1vF8Ht0VThpobtmShD52_INhpIgy6eEXq
# !gdown https://drive.google.com/uc?id=1kaIqFwTLD7Ml3ib9NQpjoUSD4FUD21-I

In [10]:
# !rm -rf dataset
# !mkdir dataset
# !unzip birds.zip -d dataset/
# !tar zxvf CUB_200_2011.tgz -C dataset/

In [11]:
# Hyperparameters handed to the estimator (a 'gpus' entry is left disabled).
hyperparameters = dict(
    t=True,
    base='/opt/ml/code/configs/faceshq_vqgan_test.yaml',
    output_s3=output_path,
    # gpus=8,
)

experiment_name = 'vqgan-poc-exp2'

# Instance types used with this notebook:
# 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
instance_type = 'local_gpu'
instance_count = 1

# Spot training is off; `max_wait` is only filled in later when it is enabled.
do_spot_training = False
max_wait = None
max_run = 60 * 60

In [12]:
# Pick the session, dataset location and source directory for the chosen
# instance type. Anything other than 'local_gpu' uses regular AWS sessions.
if instance_type != 'local_gpu':
    sess = boto3.Session()
    sagemaker_session = sagemaker.Session()
    sm = sess.client('sagemaker')
    # Earlier dataset locations, kept for reference:
    #   s3://dataset-cyj-coco-210410/dataset1
    #   s3://dataset-cyj-us-east-1/CUB-BIRD
    s3_data_path = 's3://dataset-cyj-us-east-1/conceptual_captions/validation'
    source_dir = 'taming-transformers'
else:
    from sagemaker.local import LocalSession
    from pathlib import Path

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    # Alternative local dataset: file:///home/ec2-user/SageMaker/test-fsx
    s3_data_path = 'file:///home/ec2-user/SageMaker/dataset/BIRDS'
    source_dir = f'{Path.cwd()}/taming-transformers'


In [13]:
# Job-level settings that are echoed by the summary print below.
# No custom image is selected, and the distribution config only carries an
# explicitly disabled MPI entry. The smdistributed data-parallel and
# model-parallel variants would go under distribution["smdistributed"]; MPI
# options such as "processes_per_host" / "custom_mpi_options" are not set.
image_uri = None
train_job_name = 'smp-dist'
distribution = {"mpi": {"enabled": False}}

# With spot training enabled, allow waiting for as long as the run limit.
if do_spot_training:
    max_wait = max_run

print(
    "train_job_name : {} \n"
    "train_instance_type : {} \n"
    "train_instance_count : {} \n"
    "image_uri : {} \n"
    "distribution : {}".format(
        train_job_name, instance_type, instance_count, image_uri, distribution
    )
)

train_job_name : smp-dist 
train_instance_type : local_gpu 
train_instance_count : 1 
image_uri : None 
distribution : {'mpi': {'enabled': False}}


In [14]:
# image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04'

In [58]:
%%time

# Build the PyTorch estimator for the training job.
# All input configurations, parameters, and metrics specified in the estimator
# definition are automatically tracked.
# Disabled options (custom image, distribution, debugger rules, VPC settings)
# are kept as commented-out keyword arguments for quick re-enabling.
estimator = PyTorch(
    entry_point='main.py',
    source_dir=source_dir,
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version='1.8.1', 
    py_version='py36',
#     image_uri=image_uri,
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=1024,
    code_location = code_location,
    output_path=output_path,
    hyperparameters=hyperparameters,
#     distribution=distribution,
    disable_profiler=True,
    debugger_hook_config=False,
    metric_definitions=metric_definitions,
#     rules=rules,
    max_run=max_run,
    use_spot_instances=do_spot_training,  # use spot instances when enabled
    max_wait=max_wait,
#     subnets=['subnet-05c77affac40aa7f3'],  # 0fbae6d01dc673923 (IAD12) subnet-0c775b056a6e540ee  , 	subnet-05b7d4713e03d2bfe , subnet-0b731e2124d43368d  ## subnet-v0d589322c4853e860
#     security_group_ids=['sg-05ee89dd5a66c25f5'],  # sg-0b945c6599df74ec6 sg-04e9a37dbd74e3ade 	sg-04d095a9088c808c4
)

CPU times: user 192 µs, sys: 30 µs, total: 222 µs
Wall time: 228 µs


In [59]:
# Remove the wandb directory and everything under logs/ from the source
# directory (run via sudo).
!sudo rm -rf ./taming-transformers/wandb
!sudo rm -rf ./taming-transformers/logs/*

In [60]:
# FSx for Lustre input channel for the SageMaker training job.

from sagemaker.inputs import FileSystemInput

# Which file system to mount ...
file_system_id = 'fs-0ffed11a31906f7ee'
file_system_type = 'FSxLustre'

# ... and which directory of it, with read/write access.
file_system_directory_path = '/ksmjfbmv'
file_system_access_mode = 'rw'

train_fs = FileSystemInput(
    file_system_id=file_system_id,
    file_system_type=file_system_type,
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
)

In [61]:
# input_data = sagemaker.inputs.TrainingInput(
#         s3_data=s3_data_path,
#         distribution='ShardedByS3Key',
#         s3_data_type='S3Prefix',
#         input_mode='File',
#         shuffle_config=sagemaker.inputs.ShuffleConfig(123)
#         )

In [62]:
# Local mode trains from the local dataset path; every other instance type
# uses the FSx input channel.
inputs = {'training': s3_data_path if instance_type == 'local_gpu' else train_fs}

In [69]:
# Make sure the experiment exists, then create a fresh trial; the trial name
# doubles as the training job name.
create_experiment(experiment_name)
job_name = create_trial(experiment_name, hyperparameters, instance_type, instance_count, do_spot_training)

# Now associate the estimator with the Experiment and Trial
estimator.fit(
    inputs=inputs,
    job_name=job_name,
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    # Do not block on the call; logs are followed in a later cell.
    wait=False,
)

INFO:sagemaker:Creating training-job with name: vqgan-poc-exp2-test-1-d-0919-13291632058180
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-f1w08:
    command: train
    container_name: amvk3iobmi-algo-1-f1w08
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-gpu-py36
    networks:
      sagemaker-local:
        aliases:
        - algo-1-f1w08
    runtime: nvidia
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmp2ctpldhs/algo-1-f1w08/output:/opt/ml/output
    - /tmp/tmp2ctpldhs/algo-1-f1w08/output/data:/opt/ml/output/data
    - /tmp/tmp2ctpldhs/algo-1-f1w08/input:/opt/ml/input
    - /tmp/tmp2ctpldhs/model:/opt/ml/model
    - /opt/ml/metadata:/opt/m

Creating amvk3iobmi-algo-1-f1w08 ... 
Creating amvk3iobmi-algo-1-f1w08 ... done
Attaching to amvk3iobmi-algo-1-f1w08
[36mamvk3iobmi-algo-1-f1w08 |[0m 2021-09-19 13:29:43,718 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mamvk3iobmi-algo-1-f1w08 |[0m 2021-09-19 13:29:43,834 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mamvk3iobmi-algo-1-f1w08 |[0m 2021-09-19 13:29:43,837 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mamvk3iobmi-algo-1-f1w08 |[0m 2021-09-19 13:29:43,838 sagemaker-training-toolkit INFO     Installing module with the following command:
[36mamvk3iobmi-algo-1-f1w08 |[0m /opt/conda/bin/python3.6 -m pip install . -r requirements.txt
[36mamvk3iobmi-algo-1-f1w08 |[0m Processing /opt/ml/code
[36mamvk3iobmi-algo-1-f1w08 |[0m Collecting albumentations==0.4.3
[36mamvk3iobmi-algo-1-f1w08 |[0m   Downloading albumentations-0.4.3.tar.gz (3.2 MB

In [48]:
# job_name_x = 'vqgan-poc-exp2-p4d-2-d-0613-03581623556727'

In [None]:
# sagemaker_session = sagemaker.Session()
# sagemaker_session.logs_for_job(job_name=job_name_x, wait=True)

In [None]:
# Follow the logs of the job started above (wait=True).
# A dedicated session variable is used here: rebinding the global
# `sagemaker_session` would silently replace the session configured earlier
# (a LocalSession in local mode) for any cell that is re-run afterwards.
logs_session = sagemaker.Session()
logs_session.logs_for_job(job_name=job_name, wait=True)

In [70]:
import glob

In [71]:
# Directory whose contents are listed in the next cell.
model_dir='test'

In [76]:
# Print every entry directly under `model_dir` (non-recursive glob).
print(f"************** file : {glob.glob(model_dir+'/*')}")

************** file : []


In [84]:
# Install the g_mlp_pytorch package with whichever `pip` is first on PATH.
# NOTE(review): the setup cell uses `{sys.executable} -m pip` instead; confirm
# this installs into the kernel's environment.
!pip install g_mlp_pytorch

Collecting g_mlp_pytorch
  Downloading g_mlp_pytorch-0.0.16-py3-none-any.whl (5.2 kB)
Collecting einops>=0.3
  Downloading einops-0.3.0-py2.py3-none-any.whl (25 kB)
Installing collected packages: einops, g-mlp-pytorch
Successfully installed einops-0.3.0 g-mlp-pytorch-0.0.16


In [161]:
from omegaconf import OmegaConf

In [None]:
# Reference URL only (a bare URL in a code cell is a SyntaxError, so it is
# kept as a comment); presumably the VGG16 weights file -- TODO confirm:
# https://download.pytorch.org/models/vgg16-397923af.pth

In [526]:
# Per-checkpoint download metadata, all keyed by the same checkpoint name:
# source URL, local file name, and expected MD5 digest.
URL_MAP = dict(
    vgg_lpips="https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1",
)
CKPT_MAP = dict(vgg_lpips="vgg.pth")
MD5_MAP = dict(vgg_lpips="d507d7349b931f0638a25a48a722f98a")

In [527]:
import os, hashlib
import requests
from tqdm import tqdm

In [178]:
def download(url, local_path, chunk_size=1024):
    """Stream *url* into *local_path* while showing a tqdm progress bar.

    Args:
        url: Address to fetch with ``requests.get(..., stream=True)``.
        local_path: Destination file; its parent directory is created if the
            path has one.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    parent_dir = os.path.split(local_path)[0]
    if parent_dir:
        # os.makedirs('') raises, so skip it for a bare file name.
        os.makedirs(parent_dir, exist_ok=True)
    with requests.get(url, stream=True) as r:
        # Fail loudly instead of saving an HTTP error page as the target file.
        r.raise_for_status()
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        # Count the bytes actually received; the final chunk
                        # is usually shorter than chunk_size.
                        pbar.update(len(data))


def md5_hash(path):
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.md5(content).hexdigest()

In [179]:
# Relative destination for the downloaded checkpoint file.
path = 'taming/modules/autoencoder/lpips/vgg.pth'

In [180]:
# Fetch the checkpoint from its download URL into `path`.
download('https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1', path)

8.19kB [00:00, 385kB/s]                    


In [181]:
# MD5 digest of the file that was just downloaded.
md5 = md5_hash(path)

In [182]:
# Compare against the expected digest; on mismatch the assertion message is
# the digest that was actually computed.
assert md5 == MD5_MAP["vgg_lpips"], md5

AssertionError: d507d7349b931f0638a25a48a722f98a

In [160]:
def get_ckpt_path(name, root, check=False):
    """Return the local path of checkpoint *name* under *root*, downloading it if needed.

    The file is downloaded when it is missing, or when *check* is true and
    its MD5 digest does not match ``MD5_MAP[name]``. A fresh download is
    always verified against the expected digest.

    Args:
        name: Checkpoint key; must be present in ``URL_MAP``.
        root: Directory that holds (or will hold) the checkpoint file.
        check: When true, also re-verify an already existing file.

    Returns:
        The path ``os.path.join(root, CKPT_MAP[name])``.

    Raises:
        AssertionError: If *name* is unknown or the downloaded file's digest
            does not match; the message of the latter is the actual digest.
    """
    # The `def` header had been commented out, leaving an indented orphan
    # body that could not run; the function is restored here.
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path


In [None]:
def resource_check():
    """Run ``df -h`` and print what it wrote to standard output."""
    from subprocess import PIPE, Popen

    # Only stdout is captured; stderr is left attached to the parent process.
    with Popen(['df', '-h'], stdout=PIPE) as proc:
        raw_output, _ = proc.communicate()
    print(str(raw_output, 'utf-8'))

In [None]:
# !pip install wandb

In [356]:
import wandb
# Start a wandb run in the "test" project; the resume/config/dir options are
# left disabled below.
run = wandb.init(
    project="test",  # 'dalle_train_transformer' by default
#     resume=RESUME,
#     config=model_config,
#             dir=wandb_dir
)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc


In [47]:
def get_dir_size(path='.'):
    """Return the combined size in bytes of all files below *path*.

    Sub-directories are descended into; entries that are neither a file nor
    a directory contribute nothing.
    """
    import os

    size_in_bytes = 0
    pending = [path]  # directories that still have to be scanned
    while pending:
        with os.scandir(pending.pop()) as listing:
            for item in listing:
                if item.is_dir():
                    pending.append(item.path)
                elif item.is_file():
                    size_in_bytes += item.stat().st_size
    return size_in_bytes

In [48]:
# Total size in bytes of everything under ./source_code (shown as cell output).
get_dir_size("./source_code")

11244848

In [445]:
# Source and destination for the file copy in the next cell.
src = './local/daemon.json'
target = './local_test/daemon.json'

In [447]:
# Copy the single file; the directory-tree variant is left disabled.
shutil.copyfile(src, target)
# shutil.copytree(src, target, dirs_exist_ok=True)

'./local_test/daemon.json'

In [66]:
# Demo: the undefined name below raises NameError on purpose, to show what
# the generic handler prints.
try:
    pritn
except Exception as ex:
    print("*********Error ********", ex)

*********Error ******** name 'pritn' is not defined


In [175]:
import sys
import traceback

try:
    ans = 1/0
except Exception as ex:
    # Type, value and traceback of the exception currently being handled.
    ex_type, ex_value, ex_traceback = sys.exc_info()

    # Raw frame summaries, one per traceback entry.
    trace_back = traceback.extract_tb(ex_traceback)

    # Render each frame as a single descriptive string.
    stack_trace = [
        "File : %s , Line : %d, Func.Name : %s, Message : %s" % (file_name, line_no, func_name, source_text)
        for file_name, line_no, func_name, source_text in trace_back
    ]

    print("Exception type : %s " % ex_type.__name__)
    print("Exception message : %s" %ex_value)
    print("Stack trace : %s" %stack_trace)

Exception type : ZeroDivisionError 
Exception message : division by zero
Stack trace : ['File : <ipython-input-175-d8b2aacbc88b> , Line : 5, Func.Name : <module>, Message : ans = 1/0']
