In [2]:
import os
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.pytorch import PyTorch
 
sagemaker_session = sagemaker.Session()
bucket = <bucket name> 
prefix = "sagemaker/pytorch-bert-financetext"
role = sagemaker.get_execution_role()

output_path = f"s3://{bucket}/{prefix}"

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210425T210467 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


In [3]:
inputs_train = sagemaker_session.upload_data("./data/train.csv", bucket=bucket, key_prefix=prefix)
inputs_test = sagemaker_session.upload_data("./data/test.csv", bucket=bucket, key_prefix=prefix)


## Distributed training

In [6]:
estimator = PyTorch(
    entry_point="train-dis.py",
    source_dir="code",
    role=role,
    framework_version="1.6",
    py_version="py3",
    instance_count=2,  
    instance_type= "ml.g4dn.12xlarge", # "ml.p3.2xlarge",  ml.g4dn.12xlarge
    output_path=output_path,
    hyperparameters={
        "epochs": 10,
        "lr" : 5e-5,
        "num_labels": 3,
        "train_file": "train.csv",
        "test_file" : "test.csv",
        "MAX_LEN" : 315,
        "batch_size" : 64,
        "test_batch_size" : 10,
        "backend": "nccl"
    },
    
)
estimator.fit({"training": inputs_train, "testing": inputs_test}, logs = "None")



2021-09-25 23:40:43 Starting - Starting the training job
2021-09-25 23:40:45 Starting - Launching requested ML instances............
2021-09-25 23:41:49 Starting - Preparing the instances for training.................
2021-09-25 23:43:21 Downloading - Downloading input data..
2021-09-25 23:43:35 Training - Downloading the training image.............
2021-09-25 23:44:48 Training - Training image download completed. Training in progress.........................................................................................
2021-09-25 23:52:16 Uploading - Uploading generated training model..............
2021-09-25 23:53:31 Completed - Training job completed


In [7]:
model_data = estimator.model_data
print(model_data)

s3://sagemaker-studio-300165273893-gk8ivkp5ane/sagemaker/pytorch-bert-financetext/pytorch-training-2021-09-25-23-40-42-947/output/model.tar.gz


## Deployment

In [8]:
from sagemaker.pytorch.model import PyTorchModel 

pytorch_model = PyTorchModel(model_data=model_data,
                             role=role,
                             framework_version="1.3.1",
                             source_dir="code",
                             py_version="py3",
                             entry_point="inference.py")

predictor = pytorch_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

---------------!

In [9]:
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [11]:
result = predictor.predict("The market went up 15% today.  This is better than average")
print("predicted class: ", np.argmax(result))

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from model with message "Unable to load weights from pytorch checkpoint file. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ". See https://ca-central-1.console.aws.amazon.com/cloudwatch/home?region=ca-central-1#logEventViewer:group=/aws/sagemaker/Endpoints/pytorch-inference-2021-09-25-23-59-23-532 in account 300165273893 for more information.

In [12]:
predictor.delete_endpoint()