In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
import os
from sagemaker.debugger import TensorBoardOutputConfig
import boto3
from sagemaker.inputs import FileSystemInput
from datetime import datetime

## Set up general AWS resources

In [None]:
# SageMaker session and the region it is bound to; the region is used
# further down to build the ECR image URI.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_session.region_name

# IAM role handed to the estimator as the execution role of the training job.
role = "arn:aws:iam::427352492539:role/SageMaker-execution-role"

# Account ID of the credentials this notebook is running with (via STS).
client = boto3.client("sts")
account = client.get_caller_identity()["Account"]

## Set up FSx

In [None]:
# --- Networking settings passed to the estimator ---
subnets = ["subnet-00a29ff9a8e0d9f80"]
security_group_ids = ["sg-04c12342ca0d49fd7"]

# --- File system that backs the "train" channel ---
file_system_id = "fs-06263091c52ae2e8d"
file_system_type = "FSxLustre"
file_system_access_mode = "rw"
# The path must start with the file system's root directory, followed by
# the local path inside FSx.
file_system_directory_path = "/ctx6hbmv/datasets"

train_fs = FileSystemInput(
    file_system_id=file_system_id,
    file_system_type=file_system_type,
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
)

# Channel name -> input definition, as consumed by estimator.fit().
data_channels = {"train": train_fs}

## Set up variables

In [None]:
# Timestamp (to the second) embedded in the job name.
date = datetime.now().strftime("%y%m%d-%H%M%S")
epochs = 5
instance_count = 1
entry_point = 'train.py'

s3_uri_model = 's3://16062023-sagemaker-bucket-01/models/'
s3_uri_training_data = 's3://16062023-sagemaker-bucket-01/datasets/'
s3_uri = 's3://16062023-sagemaker-bucket-01'

instance_type = "ml.c4.xlarge" # choose instance

# Label used only in the job name. It was previously 'gpu', which
# contradicted the CPU-only instance type above and the CPU image below;
# keep it in sync with both when switching to a GPU setup.
device = 'cpu'

image_uri_cpu='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04'.format(region)

# Job name layout: <instance count>-TensorFlow-Mnist-data-loading-<timestamp>-<instance>-<device>-<epochs>e
# The instance type is rewritten ("ml.c4.xlarge" -> "c4-xlarge") so the name
# contains no dots.
job_name = '{}-TensorFlow-Mnist-data-loading-{}-{}-{}-{}e'.format(
    instance_count,
    date,
    instance_type.replace('.','-').replace('ml-', ''),
    device,
    epochs)

## Set up Tensorboard

In [None]:
# Local directory inside the training container that is configured as the
# TensorBoard output location.
LOG_DIR = "/opt/ml/output/tensorboard"

# S3 URIs always use "/" as their separator, so they are assembled with an
# explicit "/" join instead of os.path.join, which uses the separator of the
# host OS (and would produce backslashes on Windows).
# NOTE(review): within the visible cells output_path is only used for the
# TensorBoard location; it is not passed to the estimator -- confirm intended.
output_path = "/".join([s3_uri.rstrip("/"), job_name])

tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path="/".join([output_path, "tensorboard"]),
    container_local_output_path=LOG_DIR,
)

## Construct the TensorFlow Estimator

In [None]:
# Build the TensorFlow estimator from the settings defined in the cells above.
#
# NOTE(review): image_uri points at a TensorFlow 2.3.1 / py37 CPU image while
# framework_version / py_version declare 2.12 / py39. Presumably the explicit
# image decides what actually runs -- confirm which version is intended and
# align the two.
# NOTE(review): script_mode=False is kept as-is; verify that the installed
# SageMaker SDK version still honours this argument.
estimator = TensorFlow(
    entry_point=entry_point,  # use the notebook variable instead of repeating the literal
    source_dir=".",
    role=role,
    sagemaker_session=sagemaker_session,
    instance_count=instance_count,
    instance_type=instance_type,
    image_uri=image_uri_cpu,
    framework_version="2.12",
    py_version="py39",
    subnets=subnets,
    security_group_ids=security_group_ids,
    model_dir=s3_uri_model,
    tensorboard_output_config=tensorboard_output_config,
    hyperparameters={
        'epochs': epochs,
    },
    script_mode=False,
)

## Start the training job

In [None]:
# Submit the training job without blocking the notebook. Set wait=True to
# make the notebook wait and log the output in real time.
estimator.fit(
    inputs=data_channels,
    job_name=job_name,
    wait=False,
)