In [1]:
import os
import json

import boto3
import sagemaker
from sagemaker.huggingface import get_huggingface_llm_image_uri, HuggingFaceModel


# AWS Configuration

In [2]:
# Load the deployment settings from a local JSON file so that secrets are not
# written into the notebook itself. The file is expected to provide the keys
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION, ROLE_NAME and
# HF_TOKEN. The context manager closes the file handle as soon as parsing is
# done instead of leaving it open for the lifetime of the kernel.
with open('config.json') as config_file:
    config_data = json.load(config_file)

In [3]:
# Display only the key names of the loaded config (a quick sanity check that
# does not echo any of the secret values into the cell output).
config_data.keys()

dict_keys(['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION', 'ROLE_NAME', 'HF_TOKEN'])

In [4]:
# Unpack the individual settings from the loaded config into module-level
# names used by the cells below. A missing key raises KeyError.
(
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_DEFAULT_REGION,
    ROLE_NAME,
    HF_TOKEN,
) = (
    config_data[setting]
    for setting in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_DEFAULT_REGION',
        'ROLE_NAME',
        'HF_TOKEN',
    )
)

In [6]:
# Export the AWS credentials and default region as process environment
# variables (presumably so the boto3 / SageMaker clients created later in the
# notebook pick them up -- confirm if a named profile is preferred instead).
os.environ.update(
    {
        'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,
        'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,
        'AWS_DEFAULT_REGION': AWS_DEFAULT_REGION,
    }
)


In [None]:
# Resolve the ARN of the IAM role that SageMaker should assume.
try:
    # Works only inside SageMaker-managed environments such as notebook
    # instances; it does not work when run from a local machine.
    role = sagemaker.get_execution_role()
except ValueError:
    # Local fallback: look the role up by its configured name through IAM.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName=ROLE_NAME)['Role']['Arn']



In [12]:
# Look up the container image URI for the Hugging Face LLM backend,
# pinned to container version 0.9.3.
llm_image = get_huggingface_llm_image_uri(backend="huggingface", version="0.9.3")

In [17]:
# SageMaker endpoint settings used by the model config and the deploy call.
endpoint_name = "llama-2-endpoint"  # name given to the deployed endpoint
instance_type = "ml.g5.xlarge"      # instance type hosting the endpoint
number_of_gpu = 1                   # forwarded to the container as SM_NUM_GPUS
health_check_timeout = 600          # seconds (10 minutes) allowed for startup

In [14]:
# Environment variables handed to the LLM inference container.
# Every value must be a string, hence json.dumps() on the numeric settings.
config = {
    'HF_MODEL_ID': "meta-llama/Llama-2-7b-chat-hf",
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    'MAX_INPUT_LENGTH': json.dumps(2048),
    'MAX_TOTAL_TOKENS': json.dumps(4096),
    'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),
    # Fixed: the container documentation names this variable
    # HUGGING_FACE_HUB_TOKEN; the previous spelling 'HUGGINGFACE_HUB_TOKEN'
    # is not a documented name, so the token for the gated Llama 2 weights
    # would not have been picked up.
    'HUGGING_FACE_HUB_TOKEN': HF_TOKEN,
    # Fixed: was misspelled 'HF_MODEL_QUNATIZE', which the container ignores,
    # so the model was being loaded without bitsandbytes quantization.
    'HF_MODEL_QUANTIZE': 'bitsandbytes',
}

In [15]:
# Describe the SageMaker model: the LLM container image, the IAM role it runs
# under, and the environment variables passed to the container.
# NOTE(review): the captured deploy output reports "Using already existing
# model: llama-2-model", so a model with this fixed name appears to be reused --
# confirm whether changes to `config` take effect without deleting it first.
llm_model = HuggingFaceModel(
    role=role,
    image_uri=llm_image,
    env=config,
    name='llama-2-model',
)

In [None]:

# Create the endpoint from the model definition and keep the returned handle.
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    # Allow up to 10 minutes for the container to load the model.
    container_startup_health_check_timeout=health_check_timeout,
)

print('\nLLAMA 2 model deployed to Sagemaker')

Using already existing model: llama-2-model


---------------------