# Model Inference using Snowpark Container Services (SPCS)

Documentation: https://docs.snowflake.com/en/developer-guide/snowflake-ml/model-registry/container

#### Upgrade `snowflake-ml-python` package

In [None]:
! pip install snowflake-ml-python --upgrade -q

In [None]:
# Import python packages
import json
import os

import pandas as pd
import requests
import transformers

import snowflake.connector
from snowflake.ml import version
from snowflake.ml.registry import registry as registry_module
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.ml.model import target_platform

from snowflake.snowpark.context import get_active_session

# Reuse the Snowpark session that is already active in this notebook environment.
session = get_active_session()
# Print the snowflake-ml-python version that this kernel is using.
print("Snowflake ML version: ", version.VERSION)

#### Create a Hugging Face `transformers` pipeline model

In [None]:
# Build a Hugging Face text-generation pipeline for the Llama 3.1 8B Instruct checkpoint.
llama_3_model = transformers.pipeline(
    task="text-generation",
    model="meta-llama/Llama-3.1-8B-Instruct",
    device_map="auto",
    # TODO: Add your token to environment variable if not already set
    token=os.environ["HUGGINGFACE_TOKEN"],
)

# Display the pipeline object as the cell output.
llama_3_model

In [None]:
# Open the model registry through the active session.
# NOTE(review): no database/schema is passed, so this presumably uses the
# session's current database and schema -- confirm for your environment.
registry = registry_module.Registry(session=session)
registry

In [None]:
# Log the pipeline to the registry as version "V1" of the model "llama_3".
mv = registry.log_model(
    model=llama_3_model,
    model_name="llama_3",
    version_name="V1",
    # Target Snowpark Container Services only.
    target_platforms=target_platform.SNOWPARK_CONTAINER_SERVICES_ONLY,
)
# Display the returned model version handle as the cell output.
mv

#### Create a service from the logged model

In [None]:
# Name of the inference service; later cells reference it when calling the service.
service_name = "llama_3_service"
# TODO: Add your image repo here
image_repo = "<your-image-repo>"
# TODO: Add your compute pool here
service_compute_pool = "<your-compute-pool>"

In [None]:
# Deploy the logged model version as a service on the configured compute pool.
mv.create_service(
    # Use the shared `service_name` variable (instead of repeating the literal)
    # so the service created here is the same one that `mv.run(...)` targets.
    service_name=service_name,
    image_repo=image_repo,
    service_compute_pool=service_compute_pool,
    # TODO: Modify number of GPUs here
    gpu_requests="1",
    # Expose an ingress endpoint so the service can also be called over REST.
    ingress_enabled=True,
)

In [None]:
# Look up version "V1" of the "llama_3" model from the registry and rebind `mv`
# to it, so the cells below can run without logging the model again.
mv = registry.get_model("llama_3").version("V1")
mv

In [None]:
# List the services associated with this model version
services_df = mv.list_services()
# Take the endpoint from the first listed service.
# NOTE(review): this assumes the first row is the service created above and that
# its ingress endpoint has already been provisioned -- confirm before relying on it.
inference_endpoint = services_df.iloc[0].inference_endpoint
inference_endpoint

#### Call the service function of the model

In [None]:
# One input row: a single chat conversation expressed as a list of role/content messages.
x = [
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]
]

# Single-row frame whose "inputs" column holds the whole conversation.
x_df = pd.DataFrame({"inputs": x})
x_df

In [None]:
# Run inference on `x_df` through the model version, targeting the named service.
output_df = mv.run(
    X=x_df,
    service_name=service_name,
)
output_df

In [None]:
# Show the first cell of the result. A single positional lookup avoids the chained
# `[0]` on a label-indexed row Series, which pandas has deprecated as positional access.
output_df.iloc[0, 0]

#### Invoke inference using the REST API

In [None]:
def get_headers():
    """Build the HTTP headers used to authenticate a request to the service.

    Returns:
        A dict with a single ``Authorization`` entry that carries the
        programmatic access token (PAT).
    """
    # TODO: change the `pat_token` to your PAT token
    pat_token = "<PAT_token>"
    return {"Authorization": f'Snowflake Token="{pat_token}"'}

In [None]:
# The URL is built from `inference_endpoint`, which was read from `mv.list_services()`.
# The `--call--` path targets the pipeline's `__call__` method (underscores become dashes).
# Use https so the PAT in the Authorization header is never sent in clear text.
url = f"https://{inference_endpoint}/--call--"

# Each row in "data" starts with its row index, followed by that row's argument
# values; the rows mirror the `x_df` frame that is passed to `mv.run`.
payload = {
    "data": [[row_idx, *row] for row_idx, row in enumerate(x_df.values.tolist())]
}

response = requests.post(
    url,
    json=payload,
    headers=get_headers(),
    timeout=15,
)

# Show the raw response body (also useful for reading error messages).
response.text