Article for inspiration: https://www.snowflake.com/blog/container-services-llama2-snowpark-ml/

In [65]:
from snowflake.snowpark.session import Session
from snowflake.ml.registry import model_registry
from snowflake.ml.model import deploy_platforms

import os
import json
import sys
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [66]:
# Load the Snowflake connection parameters from a JSON credentials file.
# The location can be overridden with the SNOWFLAKE_CREDS_PATH environment
# variable so the notebook is not tied to one machine's directory layout;
# the original path is kept as the fallback.
creds_path = os.environ.get(
    "SNOWFLAKE_CREDS_PATH", "/Users/skhara/Documents/Code/creds_spcs.json"
)
# A context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open(creds_path) as creds_file:
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

#### Compute Pool
A compute pool is a collection of virtual machines (nodes), which can include GPUs.
Creating a compute pool takes some time.

#### Learn More:
https://medium.com/snowflake/snowpark-container-services-a-tech-primer-99ff2ca8e741#:~:text=my_image%3Alatest-,Compute%20Pools,-A%20service%20in

In [67]:
# Create the single-node GPU compute pool used for the deployment.
# IF NOT EXISTS keeps the cell safe to re-run (e.g. Restart & Run All) once
# the pool has already been created; a plain CREATE fails in that case.
session.sql('''CREATE COMPUTE POOL IF NOT EXISTS SKHARA_COMPUTE_GPU3
MIN_NODES = 1
MAX_NODES = 1
INSTANCE_FAMILY = "GPU_3"
''').collect()

[Row(status='Compute Pool SKHARA_COMPUTE_GPU3 successfully created.')]

# 1.0 LLAMA Model Setup

## 1.1 Load LLAMA Model

In [68]:
# Hugging Face access token, read from the environment so the secret is never
# stored in the notebook. Set HF_AUTH_TOKEN before starting the kernel.
# NOTE(review): the token that was previously hardcoded on this line has been
# exposed and should be revoked in the Hugging Face account settings.
# When the variable is unset the value is None — presumably the pipeline then
# falls back to a cached `huggingface-cli login`; confirm.
HF_AUTH_TOKEN = os.environ.get("HF_AUTH_TOKEN")

In [69]:
# Snowpark ML wrapper around a Hugging Face pipeline definition.
from snowflake.ml.model.models import huggingface_pipeline

# Describe the Llama-2 7B chat text-generation pipeline. Generation is
# configured to return only the completion (return_full_text=False) and to
# produce at most 100 new tokens.
llama_model = huggingface_pipeline.HuggingFacePipelineModel(
    task="text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    token=HF_AUTH_TOKEN,
    return_full_text=False,
    max_new_tokens=100,
)

In [70]:
# Size in bytes of the wrapper object itself; sys.getsizeof does not follow
# references, so this number says nothing about the size of the model weights.
sys.getsizeof(llama_model)

48

## 1.2 Register the model

In [71]:
# Database and schema that back the model registry.
registry_name = "SKHARA"  # Replace this with the name of a database that you have access to
schema_name = "BUILD_REGISTRY"

# Create the registry objects in Snowflake ...
model_registry.create_model_registry(
    session=session,
    database_name=registry_name,
    schema_name=schema_name,
)

# ... then open a handle to the registry for logging and deploying models.
registry = model_registry.ModelRegistry(
    session=session,
    database_name=registry_name,
    schema_name=schema_name,
)

create_model_registry() is in private preview since 0.2.0. Do not use it in production. 


In [72]:
# Name and version under which the pipeline is stored in the registry.
MODEL_NAME = "LLAMA2_MODEL_7b_CHAT"
MODEL_VERSION = "1"

# Log the pipeline definition; the returned reference is what gets deployed
# in section 1.3.
llama_model_ref = registry.log_model(
    model=llama_model,
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
)

llama_model_ref



<snowflake.ml.registry.model_registry.ModelReference at 0x105b355b0>

## 1.3 Deploy Model

Pre-req: create a GPU compute pool

In [84]:
# The compute pool has to be ready before the model can be deployed.
# Provisioning may take some time, so re-run this cell to check its state.
session.sql("Show compute pools like 'SKHARA_%';").collect()

[Row(name='SKHARA_COMPUTE_GPU3', state='IDLE', min_nodes=1, max_nodes=1, instance_family='GPU_3', num_services=0, num_jobs=0, auto_suspend_secs=3600, auto_resume='true', active_nodes=0, idle_nodes=1, created_on=datetime.datetime(2023, 10, 31, 13, 28, 56, 790000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2023, 10, 31, 13, 28, 56, 802000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), updated_on=datetime.datetime(2023, 10, 31, 15, 35, 31, 274000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), owner='SPC_USER_ROLE', comment=None),
 Row(name='SKHARA_COMPUTE_GPU7', state='SUSPENDED', min_nodes=1, max_nodes=1, instance_family='GPU_7', num_services=0, num_jobs=0, auto_suspend_secs=0, auto_resume='true', active_nodes=0, idle_nodes=0, created_on=datetime.datetime(2023, 10, 16, 13, 42, 37, 390000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2023, 10, 2

In [None]:
# Deploy the logged model to Snowpark Container Services on the GPU compute
# pool, under the deployment name that the inference section looks up later.
llama_model_ref.deploy(
    deployment_name="llama_predict",
    platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
    permanent=True,
    options={
        "compute_pool": "SKHARA_COMPUTE_GPU3",
        "num_gpus": 1,
        # Remove the 'prebuilt_snowflake_image' argument below when running .deploy() for the first time
        #"prebuilt_snowflake_image": "sfsenorthamerica-fcto-spc.registry.snowflakecomputing.com/skhara/build_registry/snowml_repo/116da812e88f2751324c6a16eb00de3726ed06a3:latest"
    },
)



# 2.0 Data Processing

We will load a JSON file into a Snowflake table. For prediction, we have two options: use a Snowpark DataFrame or use a local pandas DataFrame.

Snowpark dataframes allow us to work at scale and enable us to keep the data on server side without ever bringing data locally.

## 2.1 Load Data

In this lab, the required data is not yet in Snowflake, so we will load a local JSON Lines (`.jsonl`) file into Snowflake using the snowflake-snowpark library.

In [None]:
# Read the line-delimited JSON transcripts (one record per line) and let
# pandas pick the best nullable dtype for each column.
json_dataset = pd.read_json("data/frosty_transcripts_all.jsonl", lines=True)
json_dataset = json_dataset.convert_dtypes()
json_dataset.head()

In [None]:
# Upload the local frame to Snowflake, creating the table if needed and
# replacing its contents if it already exists.
TABLE_NAME = "BUILD_HOL_DATA"
session.write_pandas(
    json_dataset,
    table_name=TABLE_NAME,
    auto_create_table=True,
    overwrite=True,
)

## 2.2 Input: Prompt Engineering

In [None]:
# Only two rows are pulled to the local machine; they serve as the worked
# examples embedded in the prompt. The full table stays in Snowflake.
sdf_input = session.table("BUILD_HOL_DATA")
df_local = sdf_input.limit(2).to_pandas()
df_local.head()

In [None]:
# Llama-2 chat prompt: a system block holding the target JSON schema plus two
# worked examples taken from the locally fetched rows, followed by the slot
# for the transcript to process. Literal braces are doubled because this is an
# f-string.
#
# The example outputs must themselves be valid JSON, otherwise the model is
# shown malformed examples while being told to emit valid JSON. The name and
# location values are therefore encoded with json.dumps so they are quoted and
# escaped. toy_list is inserted as-is.
# NOTE(review): this assumes toy_list comes back from Snowflake already
# serialized as a JSON array string — confirm against df_local.
prompt_prefix = f'''[INST] <<SYS>>
Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema:
{{
  "name": {{
    "type": "string",
    "description": "The name of the person calling"
  }},
  "location": {{
    "type": "string",
    "description": "The name of the location where the person is calling from."
  }},
  "toy_list": {{
    "type": "array",
    "description": "The list of toys requested by the person calling."
  }},
  "required": ["name", "location", "toy_list"]
}}

Example 1:
Input: "{df_local['transcript'].iloc[0]}"
Output: {{"name": {json.dumps(df_local['name'].iloc[0], ensure_ascii=False)}, "location": {json.dumps(df_local['location'].iloc[0], ensure_ascii=False)}, "toy_list": {df_local['toy_list'].iloc[0]}}}

Example 2:
Input: "{df_local['transcript'].iloc[1]}"
Output: {{"name": {json.dumps(df_local['name'].iloc[1], ensure_ascii=False)}, "location": {json.dumps(df_local['location'].iloc[1], ensure_ascii=False)}, "toy_list": {df_local['toy_list'].iloc[1]}}}
<</SYS>>

Input:

'''

# Closes the instruction block after the transcript has been appended.
prompt_suffix = " [/INST]"

In [27]:
import snowflake.snowpark.functions as F

# Assemble the full prompt inside Snowflake: prefix, transcript and suffix
# joined with single spaces and stored in a new "inputs" column. The column
# names are double-quoted so their lowercase spelling is preserved.
input_df = sdf_input.with_column(
    '"inputs"',
    F.concat_ws(
        F.lit(" "),
        F.lit(prompt_prefix),
        F.col('"transcript"'),
        F.lit(prompt_suffix),
    ),
)

In [37]:
# Persist the prompt-augmented rows as DATA_WITH_PROMPT; "overwrite" mode
# replaces the table if it already exists, so the cell can be re-run.
input_df.write.mode("overwrite").save_as_table("DATA_WITH_PROMPT")

In [None]:
# df_local['inputs'] = df_local['transcript'].apply(add_prompt)
# print(df_local['inputs'].iloc[3])

# 3.0 LLM Inference

## 3.1 Get Deployed Model

In [32]:
# Identifiers of the registry, model version and deployment used for inference.
# NOTE(review): section 1.2 logs the model with MODEL_VERSION "1", while this
# cell targets version "7" — confirm which version was actually deployed.
REGISTRY_NAME = "SKHARA"
SCHEMA_NAME = "BUILD_REGISTRY"
MODEL_NAME = "LLAMA2_MODEL_7b_CHAT"
MODEL_VERSION = "7"
DEPLOYMENT_NAME = "llama_predict"

In [33]:
# Open a handle to the existing registry; nothing is created here.
registry = model_registry.ModelRegistry(
    session=session,
    database_name=REGISTRY_NAME,
    schema_name=SCHEMA_NAME,
)

In [34]:
# List every model version stored in the registry and show it as a local
# pandas DataFrame.
model_list = registry.list_models()
model_list.to_pandas()

ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. 


Unnamed: 0,CREATION_CONTEXT,CREATION_ENVIRONMENT_SPEC,CREATION_ROLE,CREATION_TIME,ID,INPUT_SPEC,NAME,OUTPUT_SPEC,RUNTIME_ENVIRONMENT_SPEC,TYPE,URI,VERSION,ARTIFACT_IDS,DESCRIPTION,METRICS,TAGS,REGISTRATION_TIMESTAMP
0,,"{\n ""python"": ""3.9.17""\n}","""SPC_USER_ROLE""",2023-10-18 11:19:26.257000-07:00,d90f1c246de211eeae210a72b796458c,,LLAMA2_MODEL_7b_CHAT,,,huggingface_pipeline,sfc://SKHARA.BUILD_REGISTRY.SNOWML_MODEL_D90F1...,3,[],,,,2023-10-18 11:19:27.494000-07:00
1,,"{\n ""python"": ""3.9.17""\n}","""SPC_USER_ROLE""",2023-10-18 11:20:31.620000-07:00,0168781e6de311eeae210a72b796458c,,LLAMA2_MODEL_7b_CHAT,,,huggingface_pipeline,sfc://SKHARA.BUILD_REGISTRY.SNOWML_MODEL_01687...,4,[],,,,2023-10-18 11:20:33.306000-07:00
2,,"{\n ""python"": ""3.9.17""\n}","""SPC_USER_ROLE""",2023-10-23 08:23:21.668000-07:00,1454b4e671b811eeb25b0a72b796458c,,LLAMA2_MODEL_7b_CHAT,,,huggingface_pipeline,sfc://SKHARA.BUILD_REGISTRY.SNOWML_MODEL_1454B...,5,[],,,,2023-10-23 08:23:22.988000-07:00
3,,"{\n ""python"": ""3.9.17""\n}","""SPC_USER_ROLE""",2023-10-23 11:06:19.746000-07:00,d870c6ec71ce11ee9c1d0a72b796458c,,LLAMA2_MODEL_7b_CHAT,,,huggingface_pipeline,sfc://SKHARA.BUILD_REGISTRY.SNOWML_MODEL_D870C...,6,[],,,,2023-10-23 11:06:21.405000-07:00
4,,"{\n ""python"": ""3.9.17""\n}","""SPC_USER_ROLE""",2023-10-23 12:33:41.971000-07:00,0dfc6cb071db11ee9c1d0a72b796458c,,LLAMA2_MODEL_7b_CHAT,,,huggingface_pipeline,sfc://SKHARA.BUILD_REGISTRY.SNOWML_MODEL_0DFC6...,7,[],,,,2023-10-23 12:33:43.307000-07:00


In [35]:
# List the deployments that exist for the selected model version.
# NOTE(review): `model_list` is reused here and now holds deployments, not
# models.
model_list = registry.list_deployments(model_name = MODEL_NAME, model_version = MODEL_VERSION)
model_list.to_pandas()

ModelRegistry.list_deployments() is in private preview since 1.0.1. Do not use it in production. 


Unnamed: 0,MODEL_NAME,MODEL_VERSION,DEPLOYMENT_NAME,CREATION_TIME,TARGET_METHOD,TARGET_PLATFORM,SIGNATURE,OPTIONS,STAGE_PATH,ROLE
0,LLAMA2_MODEL_7b_CHAT,7,llama_predict,2023-10-23 12:35:19.101000-07:00,__call__,SNOWPARK_CONTAINER_SERVICES,"{\n ""inputs"": [\n {\n ""name"": ""inputs...","{\n ""compute_pool"": ""SKHARA_COMPUTE_GPU3"",\n ...",@SKHARA.BUILD_REGISTRY._SYSTEM_REGISTRY_DEPLOY...,"""SPC_USER_ROLE"""


In [36]:
# Reference to the registered model version that will serve predictions.
model = model_registry.ModelReference(
    registry=registry,
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
)

## 3.2 Inference using LLMs

In [41]:
# Snowpark handle to the prompt-augmented table written in section 2.2;
# only two rows are fetched locally as a preview.
sdf_data_prompt = session.table('DATA_WITH_PROMPT')
sdf_data_prompt.limit(2).to_pandas()

Unnamed: 0,transcript,name,location,toy_list,inputs
0,frosty: Hi there! This is Frosty. How can I he...,Alex,Houston,"[\n ""Barbie Science Lab Playset"",\n ""Pokémon...",[INST] <<SYS>>\nYour output will be parsed by ...
1,"frosty: Hello, happy holiday! How can I help y...",Amber,London,"[\n ""Dog-E"",\n ""2023 Holiday Fox 12-Inch Plu...",[INST] <<SYS>>\nYour output will be parsed by ...


In [50]:
# List all compute pools visible to the current role, to check their state
# before running inference.
session.sql('SHOW COMPUTE POOLS').collect()

[Row(name='FAZEEM_COMPUTE_POOL', state='ACTIVE', min_nodes=1, max_nodes=1, instance_family='STANDARD_1', num_services=2, num_jobs=0, auto_suspend_secs=3600, auto_resume='true', active_nodes=1, idle_nodes=0, created_on=datetime.datetime(2023, 7, 13, 11, 13, 46, 46000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2023, 10, 2, 14, 48, 7, 832000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), updated_on=datetime.datetime(2023, 10, 31, 11, 50, 30, 70000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), owner='SPC_USER_ROLE', comment=None),
 Row(name='FSDEMO', state='ACTIVE', min_nodes=1, max_nodes=2, instance_family='STANDARD_1', num_services=2, num_jobs=0, auto_suspend_secs=0, auto_resume='true', active_nodes=1, idle_nodes=0, created_on=datetime.datetime(2023, 5, 17, 14, 17, 22, 289000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2023, 10, 5, 6, 40, 4

In [51]:
# Run the deployed model over the prompt table. The input is a Snowpark
# DataFrame, so the data is not pulled to the client for scoring.
res = model.predict(deployment_name=DEPLOYMENT_NAME, data=sdf_data_prompt)

# Bring only the first 5 result rows to the local machine for inspection.
df_local = res.limit(5).to_pandas()

## 3.3 Output Processing
Ensure that processing code conforms to the JSON Structure provided during Prompt Engineering.

In [59]:
import json

def format_output(output_string):
    """Extract the JSON object the model produced from a raw prediction value.

    The raw value is expected to be a JSON-encoded list whose first element is
    a dictionary with a ``'generated_text'`` key. The text may contain extra
    words around the JSON, so everything from the first ``{`` to the last
    ``}`` is taken and parsed.

    Args:
        output_string: JSON string holding the pipeline output.

    Returns:
        The parsed JSON value (normally a dict) on success, or the string
        ``'Could not parse output'`` when the input does not have the
        expected shape or contains no valid JSON object.
    """
    try:
        # Step 1: Parse the outer string to convert it to a list of dictionaries
        outer_list = json.loads(output_string)

        # Step 2: Extract the 'generated_text' value from the first dictionary in the list
        generated_text_str = outer_list[0]['generated_text']

        # Step 3: Locate the JSON object within the 'generated_text' value
        start_pos = generated_text_str.find('{')
        end_pos = generated_text_str.rfind('}')
        if start_pos == -1 or end_pos == -1:
            raise ValueError("No JSON object found in generated_text")
        json_str = generated_text_str[start_pos:end_pos + 1]

        # Step 4: Parse the JSON object to convert it to a dictionary
        return json.loads(json_str)
    except (ValueError, TypeError, KeyError, IndexError, AttributeError):
        # Only parsing/lookup failures are treated as "unparseable"
        # (json.JSONDecodeError is a ValueError). A bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit, making the cell impossible
        # to interrupt cleanly.
        return 'Could not parse output'

In [61]:
# Print each transcript followed by the structured output parsed from the
# model's response for that row.
for row_number, (transcript, raw_output) in enumerate(
    zip(df_local['transcript'], df_local['outputs'])
):
    print(f'\n\n **** Transcript # {row_number} ****')
    print(transcript)
    print('\n')
    print(format_output(raw_output))



 **** Transcript # 0 ****
frosty: Hi there! This is Frosty. How can I help you today?
caller: Hi Frosty, I want to make my holiday wish.
frosty: Of course! May I know your name, please?
caller: I'm Alex.
frosty: Hi Alex! Where are you calling from?
caller: From Houston.
frosty: Wonderful! Now, what's your holiday wish?
caller: I want the barbie science doll set and pokemon plushie.
frosty: Awesome choices, Alex! Your list has been added. Thanks for calling and have a jolly Holiday!


{'name': 'Alex', 'location': 'Houston', 'toy_list': ['Barbie Science Lab Playset', 'Pokémon 8-Inch Plush First Partner Three-Pack']}


 **** Transcript # 1 ****
frosty: Hello, happy holiday! How can I help you today?
caller: I'm Amber. I want to give my wish list.
frosty: Of course, Amber! What's on your wish list?
caller: robot dog and the fox plushie.
frosty: Brilliant choices, Amber! And where are you calling from? 
caller: From London.
frosty: Alright, Amber from London. Your list has been recorded. 

# 4.0 Clean Up

In [64]:
# Issue STOP ALL against the compute pool (if it exists) to stop what is
# running on it.
# NOTE(review): the pool itself is neither suspended nor dropped here —
# confirm whether that is needed to avoid ongoing cost.
session.sql("ALTER COMPUTE POOL IF EXISTS SKHARA_COMPUTE_GPU3 STOP ALL").collect()

[Row(status='Statement executed successfully.')]