In [37]:
import json
import os

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Bootstrap session: only used here to look up the default bucket.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# fall back to looking up the role named 'sagemaker_execution_role' via IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Final session bound to the chosen bucket; later cells use this `sess`.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

# Hub Model configuration. https://huggingface.co/models
hub = {
    'HF_MODEL_ID': 'mistralai/Mistral-7B-Instruct-v0.1',
    # 'HF_MODEL_ID': 'meta-llama/Meta-Llama-3-8B',
    'SM_NUM_GPUS': json.dumps(1),
}

# Never hardcode the Hugging Face token in the notebook: read it from the
# environment and only forward it to the container when it is actually set.
hf_token = os.environ.get('HUGGING_FACE_HUB_TOKEN')
if hf_token:
    hub['HUGGING_FACE_HUB_TOKEN'] = hf_token




sagemaker role arn: arn:aws:iam::555043101106:role/Weka-Test-Stack-SageMakerExecutionRole-19XRWUDUCV474
sagemaker session region: us-east-1


In [38]:
# create Hugging Face Model Class
# Resolve the container image first, then describe the model around it.
llm_image_uri = get_huggingface_llm_image_uri("huggingface", version="1.4.2")
huggingface_model = HuggingFaceModel(image_uri=llm_image_uri, env=hub, role=role)

In [39]:
# deploy model to SageMaker Inference
# Endpoint settings are collected in one place so they are easy to read and tweak.
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.g5.48xlarge",
    "container_startup_health_check_timeout": 300,
}
llm = huggingface_model.deploy(**deploy_settings)

-----------!

In [40]:
# send request
# Smoke test: a single raw-text prompt, no generation parameters.
sample_payload = {"inputs": "My name is Julien and I like to"}
llm.predict(sample_payload)

[{'generated_text': "My name is Julien and I like to spend time outside, walking in the mountains or cycling through the fields. I love photography, especially landscape and nature, capturing the beauty of the world around us. When I'm not out doors, I enjoy working in my home studio, experimenting with new techniques and styles. I also love music and spending time with friends and family."}]

In [41]:
# Display the name of the deployed endpoint (the load test passes this same
# attribute to the CloudWatch metrics helper).
llm.endpoint_name

'huggingface-pytorch-tgi-inference-2024-05-10-13-15-48-841'

In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading tokenizers-0

In [8]:
!pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
Successfully installed huggingface_hub-0.23.0


In [10]:
# Log in to the Hugging Face Hub from the notebook (renders a login widget).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
from transformers import AutoTokenizer
# Tokenizer for the same model id configured in `hub`; its chat template is
# used by make_request() to format prompts.
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf")
tokenizer_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_id)
tokenizer



tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.1', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [42]:
import sys
import time
import concurrent.futures
from tqdm import tqdm
import json
 
sys.path.append("utils")
from get_metrics import get_metrics_from_cloudwatch
 
# Generation arguments sent with every load-test request
parameters = dict(
    do_sample=True,
    top_p=0.6,
    temperature=0.9,
    max_new_tokens=250,
    return_full_text=False,
)
 
def make_request(payload):
    """Perform a single generation request against the deployed endpoint.

    ``payload`` becomes the content of one user message, which is rendered to
    a prompt string via the tokenizer's chat template and sent together with
    the module-level ``parameters``.

    Returns:
        200 if the call completes, 500 if anything raises. The exception is
        printed rather than propagated so one failure does not abort the run.
    """
    try:
        # Prompt construction stays inside the try so that template errors
        # are reported as a failed request too.
        conversation = [{"role": "user", "content": payload}]
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
        request_body = {"inputs": prompt, "parameters": parameters}
        llm.predict(data=request_body)
    except Exception as err:
        print(err)
        return 500
    return 200
    
def run_load_test(
    total_requests,
    concurrent_users,
    model_name="Mistral 7B Instruct",
    instance_type="g5.48xlarge",
    results_path="results.json",
):
    """Run the load test against the deployed endpoint and report metrics.

    Sends ``total_requests`` copies of the same prompt through
    ``make_request`` using a thread pool of ``concurrent_users`` workers,
    prints the wall-clock time and success rate, then fetches metrics for the
    test window via ``get_metrics_from_cloudwatch`` and writes them to
    ``results_path`` as JSON.

    Args:
        total_requests: Number of requests to send; must be > 0.
        concurrent_users: Thread-pool size; must be > 0.
        model_name: Label used in the printed summary line.
        instance_type: Label used in the printed summary line.
        results_path: File the metrics are dumped to.

    Returns:
        The metrics object returned by ``get_metrics_from_cloudwatch``. It is
        indexed here with the keys 'Thorughput (tokens/second)' and
        'Latency (ms/token) p(50)'.

    Raises:
        ValueError: If ``total_requests`` or ``concurrent_users`` is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError in the
    # success-rate computation (or an opaque error from the executor).
    if total_requests <= 0:
        raise ValueError(f"total_requests must be a positive integer, got {total_requests!r}")
    if concurrent_users <= 0:
        raise ValueError(f"concurrent_users must be a positive integer, got {concurrent_users!r}")

    # Prepare a list of the same inputs to hit multiple times
    tasks = ["Write a long story about llamas and why should protect them."] * total_requests

    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_users) as executor:
        start_time = time.time()
        # run the requests; list() drains the iterator so all requests finish here
        results = list(tqdm(executor.map(make_request, tasks), total=total_requests, desc="Running load test"))
        end_time = time.time()

    # The pool is no longer needed: reporting and metric collection happen
    # after the worker threads have been released.
    print(f"Total time for {total_requests} requests with {concurrent_users} concurrent users: {end_time - start_time:.2f} seconds")
    print(f"Successful rate: {results.count(200) / total_requests * 100:.2f}%")

    # Get the metrics for the [start_time, end_time] window (epoch seconds)
    metrics = get_metrics_from_cloudwatch(
        endpoint_name=llm.endpoint_name,
        st=int(start_time),
        et=int(end_time),
        cu=concurrent_users,
        total_requests=total_requests,
        boto3_session=sess.boto_session,
    )

    # store results
    with open(results_path, "w") as f:
        json.dump(metrics, f)

    # print results
    # NOTE: 'Thorughput' is spelled this way on purpose; it is the key read
    # from the helper's return value and must not be "corrected" here.
    print(f"{model_name} results on `{instance_type}`:")
    print(f"Throughput: {metrics['Thorughput (tokens/second)']:,.2f} tokens/s")
    print(f"Latency p(50): {metrics['Latency (ms/token) p(50)']:,.2f} ms/token")
    return metrics


In [43]:
# Run the load test: 100 requests in total, issued by 5 concurrent workers
concurrent_users = 5
number_of_requests = 100
res = run_load_test(
    total_requests=number_of_requests,
    concurrent_users=concurrent_users,
)

Running load test: 100%|██████████| 100/100 [02:58<00:00,  1.79s/it]


Total time for 100 requests with 5 concurrent users: 178.85 seconds
Successful rate: 100.00%
Waiting for logs to be available ...
Waiting for query to complete ...
Waiting for query to complete ...
Waiting for query to complete ...
Mistral 7B Instruct results on `g5.48xlarge`:
Throughput: 139.66 tokens/s
Latency p(50): 35.47 ms/token
