# Inference

Here we will expand and finish inference testing.

In [1]:
# Show the kernel's current working directory (bare `pwd` relies on IPython automagic).
pwd

'c:\\Users\\RaviB\\GitHub\\vegan-ai-nutritionist\\notebooks'

In [2]:
import os

# Work from the repository root so relative paths (e.g. the MLflow SQLite files) resolve.
# Guarded on the directory name so that re-running this cell does not climb one
# directory higher on every execution.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [3]:
# Confirm the working directory after the chdir above (bare `pwd` relies on IPython automagic).
pwd

'c:\\Users\\RaviB\\GitHub\\vegan-ai-nutritionist'

## Loading ENV Variables

In [8]:
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv() before reading any settings.
load_dotenv(find_dotenv())

# Read each setting from the process environment; a variable that is not set yields None.
(
    aws_access_key_id,
    aws_secret_access_key,
    sagemaker_role,
    huggingface_access_token,
) = (
    os.environ.get(env_name)
    for env_name in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "SAGEMAKER_ROLE",
        "HUGGINGFACE_ACCESS_TOKEN",
    )
)

In [9]:
import boto3
import sagemaker

# Build the boto3 session from the credentials read from the environment above.
boto3_session = boto3.Session(
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

# Wrap it in a SageMaker session so the SDK calls below reuse the same boto3 session.
sess = sagemaker.Session(boto_session=boto3_session)

In [2]:
import json
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
from sagemaker.huggingface import get_huggingface_llm_image_uri


 
# Retrieve the URI of the Hugging Face LLM (TGI) inference container image.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.3.3"
)

print(llm_image)

max_input_length = 1024
max_total_tokens = 2048
number_of_gpu = 1

# Environment variables handed to the inference container.
# Values must be strings, hence json.dumps() on the numeric/boolean settings.
config = {
    'HF_MODEL_ID': "/opt/ml/model",  # Path to where SageMaker stores the model
    'SM_NUM_GPUS': json.dumps(number_of_gpu),  # Number of GPUs used per replica (modify as needed)
    'MAX_INPUT_LENGTH': json.dumps(max_input_length),  # Max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(max_total_tokens),  # Max length of the generation (including input text)
    # Environment variable names are case-sensitive inside the container; the TGI
    # launcher reads TRUST_REMOTE_CODE, so the previous lowercase key was ignored.
    'TRUST_REMOTE_CODE': json.dumps(True),
    # Optional: uncomment to load the weights quantized with bitsandbytes.
    #'HF_MODEL_QUANTIZE': 'bitsandbytes'
}

763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04


## Configuring MLflow

We need the model URI. Because we want the one from the most recent training run, we query the MLflow tracking database for it.

In [18]:
import mlflow

# Point the MLflow client at the SQLite tracking store (path is relative to the working directory).
mlflow.set_tracking_uri("sqlite:///mlflow/mlflow.db")

# Fetch the runs ordered by start time, newest first.
runs = mlflow.search_runs(order_by=["start_time DESC"])

# Positional row 0 is therefore the most recent run; .iloc[0] raises IndexError when there are no runs.
latest_run = runs.iloc[0]
latest_run

run_id                                            8d92bc21177343dfa2dd5c17a9b22321
experiment_id                                                                    0
status                                                                    FINISHED
artifact_uri                     file:///C:/Users/RaviB/GitHub/vegan-ai-nutriti...
start_time                                        2024-11-01 15:25:28.078000+00:00
end_time                                          2024-11-01 15:44:41.803000+00:00
params.job_name                                   falcon-qlora-2024-11-01-12-25-26
params.learning_rate                                                        0.0002
params.epochs                                                                    1
params.model_data_uri            s3://sagemaker-us-east-1-590184030535/falcon-q...
params.batch_size                                                                2
params.model_id                                          tiiuae/falcon-7b-instruct
tags

In [17]:
# Pull the run identifier out of the latest-run row by its column label.
latest_run_id = latest_run["run_id"]
latest_run_id

'8d92bc21177343dfa2dd5c17a9b22321'

In [20]:
# Fetch the full run record by id and keep only its logged parameters (a dict of strings,
# as the output below shows).
latest_run_params = mlflow.get_run(latest_run_id).data.params
latest_run_params

{'model_data_uri': 's3://sagemaker-us-east-1-590184030535/falcon-qlora-2024-11-01-12-25-26-2024-11-01-15-25-28-836/output/model.tar.gz',
 'model_id': 'tiiuae/falcon-7b-instruct',
 'job_name': 'falcon-qlora-2024-11-01-12-25-26',
 'epochs': '1',
 'batch_size': '2',
 'learning_rate': '0.0002'}

In [23]:
# The S3 location of the trained model archive logged by the latest run.
latest_run_params['model_data_uri']

's3://sagemaker-us-east-1-590184030535/falcon-qlora-2024-11-01-12-25-26-2024-11-01-15-25-28-836/output/model.tar.gz'

Now let's define a function to do this all in one step.

In [29]:
def get_model_data_uri(params_to_retrieve=None, run_number=0, mlflow_data_path="sqlite:///mlflow/mlflow.db"):
    """Return selected logged parameters from one run in an MLflow tracking store.

    Despite the name, this can fetch any logged parameter; it only defaults to
    ``model_data_uri``.

    Note: this sets the MLflow tracking URI to ``mlflow_data_path`` and leaves
    it set after returning.

    Args:
        params_to_retrieve: Parameter name, or iterable of parameter names, to
            look up. ``None`` (the default) means ``['model_data_uri']``.
        run_number: Position of the run in the list sorted by start time,
            newest first (0 is the most recent run). Negative values count
            from the oldest run, as with normal Python indexing.
        mlflow_data_path: MLflow tracking URI to read from.

    Returns:
        dict mapping each requested parameter name to its logged value, or to
        ``None`` when the run did not log that parameter.

    Raises:
        ValueError: If the tracking store contains no runs.
        IndexError: If ``run_number`` does not refer to an existing run.
    """
    mlflow.set_tracking_uri(mlflow_data_path)

    # Newest run first, so run_number is "how many runs back from the latest".
    runs = mlflow.search_runs(order_by=["start_time DESC"])

    if runs.empty:
        raise ValueError("No runs found in the specified MLflow database.")

    # Check the position ourselves so the caller gets a message that names the
    # argument instead of a generic positional-indexer error.
    run_count = len(runs)
    if not -run_count <= run_number < run_count:
        raise IndexError(
            f"run_number {run_number} is out of range: only {run_count} run(s) found."
        )

    selected_run_id = runs.iloc[run_number].run_id
    selected_run_params = mlflow.get_run(selected_run_id).data.params

    if params_to_retrieve is None:
        params_to_retrieve = ['model_data_uri']
    elif isinstance(params_to_retrieve, str):
        # A bare string would otherwise be iterated character by character.
        params_to_retrieve = [params_to_retrieve]

    return {param: selected_run_params.get(param) for param in params_to_retrieve}

In [32]:
# Look up the model archive location logged by the most recent run.
s3_model_uri = get_model_data_uri()['model_data_uri']

# get_model_data_uri() yields None for a parameter the run never logged; stop here
# instead of building a model definition with no artifact attached.
if not s3_model_uri:
    raise ValueError("The latest MLflow run has no 'model_data_uri' parameter.")

# Model definition: fine-tuned weights from S3 served by the LLM container image.
llm_model = HuggingFaceModel(
    role=sagemaker_role,
    image_uri=llm_image,
    model_data=s3_model_uri,
    env=config,
    sagemaker_session=sess
)

Now we can deploy it.

In [34]:
# Create the endpoint for the fine-tuned model; delete it when it is no longer
# needed (see "Deleting Endpoint" below).
llm = llm_model.deploy(
    instance_type="ml.g5.2xlarge",  # Choose an instance type that suits your model and budget
    initial_instance_count=1,
    container_startup_health_check_timeout=300,  # Increase timeout for loading large models
)

--------------------*

Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-endpoint


UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2024-11-01-23-12-21-438: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html

## Deploying Base Model

If deploying the fine-tuned model fails, we fall back to deploying the base model instead.

In [None]:
# Same LLM container image as for the fine-tuned model, looked up again so this
# section can run on its own.
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.3.3")

In [None]:
from sagemaker.huggingface import HuggingFaceModel

# Falcon 7b: the container is given a model id here rather than an S3 archive.
hub = dict(HF_MODEL_ID='tiiuae/falcon-7b-instruct')

# Model definition for the base model, reusing the role, image and session from above.
huggingface_model = HuggingFaceModel(
    image_uri=llm_image,
    role=sagemaker_role,
    env=hub,
    sagemaker_session=sess,
)

In [None]:
# Create the endpoint for the base model, with the same instance settings as the
# fine-tuned deployment above.
predictor = huggingface_model.deploy(
    instance_type='ml.g5.2xlarge',
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
)

## Deleting Endpoint

We need to delete the endpoint once we no longer need it, so we first look up the model and endpoint names recorded in the MLflow endpoints database.

In [15]:
#we already have a function to get the data from mlflow sqlite databases
from modules.inference.src.model_utils import get_model_data_uri

In [24]:
# Reuse the parameter-lookup helper against the endpoints tracking database:
# run_number=0 requests the most recent run, and the list names the parameters to return.
latest_endpoint = get_model_data_uri(['model_name', 'endpoint_name'], run_number=0, mlflow_data_path="sqlite:///mlflow/endpoints.db")
latest_endpoint

{'model_name': 'huggingface-pytorch-tgi-inference-2024-11-02-18-25-29-952',
 'endpoint_name': 'huggingface-pytorch-tgi-inference-2024-11-02-18-25-31-152'}

In [25]:
# Model name recorded for the latest run.
latest_endpoint['model_name']

'huggingface-pytorch-tgi-inference-2024-11-02-18-25-29-952'

In [22]:
# Endpoint name recorded for the latest run.
# NOTE(review): the saved output below comes from an earlier execution (In [22], before the
# lookup cell was re-run as In [24]) and shows a predictor repr rather than a plain endpoint name.
latest_endpoint['endpoint_name']

"HuggingFacePredictor: {'endpoint_name': 'huggingface-pytorch-tgi-inference-2024-11-02-18-07-05-569', 'sagemaker_session': <sagemaker.session.Session object at 0x00000235EEC4BDC0>, 'serializer': <sagemaker.base_serializers.JSONSerializer object at 0x00000235ED21F430>, 'deserializer': <sagemaker.base_deserializers.JSONDeserializer object at 0x00000235ED21E590>}"

Now delete it.

In [23]:
from modules.utils.utils import AWSConnector

aws_connector = AWSConnector()

# Delete the resources recorded for the latest run rather than pasted-in names,
# which go stale as soon as a newer endpoint is deployed.
if not latest_endpoint['model_name'] or not latest_endpoint['endpoint_name']:
    raise ValueError("The latest run has no model_name/endpoint_name recorded; nothing to delete.")

aws_connector.delete_sagemaker_model(latest_endpoint['model_name'])
aws_connector.delete_sagemaker_endpoint(latest_endpoint['endpoint_name'])

