Article for inspiration: https://www.snowflake.com/blog/container-services-llama2-snowpark-ml/

# 0.0 Pre-Reqs

In [None]:
from snowflake.snowpark.session import Session
from snowflake.ml.registry import model_registry
from snowflake.ml.model import deploy_platforms

import os
import json
import sys
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Snowflake connection settings from a local JSON credentials file.
# The context manager closes the file handle as soon as parsing is done.
with open('/Users/skhara/Documents/Code/creds_spcs.json', encoding="utf-8") as creds_file:
    connection_parameters = json.load(creds_file)

# Open the Snowpark session used by every later cell.
session = Session.builder.configs(connection_parameters).create()

#### Compute Pool
A compute pool is a collection of virtual machines (nodes), which can include GPUs.
Creating a compute pool takes time, so the pool may not be ready immediately after the command below returns.

#### Learn More:
https://medium.com/snowflake/snowpark-container-services-a-tech-primer-99ff2ca8e741#:~:text=my_image%3Alatest-,Compute%20Pools,-A%20service%20in

In [None]:
# Create the single-node GPU compute pool used for model deployment.
# IF NOT EXISTS keeps this cell re-runnable: without it, a second run fails
# because the pool already exists.
session.sql('''CREATE COMPUTE POOL IF NOT EXISTS SKHARA_COMPUTE_GPU3
MIN_NODES = 1
MAX_NODES = 1
INSTANCE_FAMILY = "GPU_3"
''').collect()

# 1.0 LLAMA Model Setup

## 1.1 Load LLAMA Model

In [None]:
# Your token from Hugging Face. It is read from the HF_AUTH_TOKEN environment
# variable so the secret never has to be saved inside the notebook; when the
# variable is unset the value is "" exactly as before.
HF_AUTH_TOKEN = os.environ.get("HF_AUTH_TOKEN", "")

In [None]:
# from transformers import pipeline
from snowflake.ml.model.models import huggingface_pipeline

# Describe the Llama 2 7B chat text-generation pipeline that will be logged
# to the model registry in section 1.2.
llama_model = huggingface_pipeline.HuggingFacePipelineModel(
    task="text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    token=HF_AUTH_TOKEN,
    return_full_text=False,
    max_new_tokens=100,
)

In [None]:
# Size in bytes of the llama_model object itself. sys.getsizeof is shallow: it
# does not follow references, so this is not the size of the model weights.
sys.getsizeof(llama_model)

## 1.2 Register the model

In [None]:
registry_name = 'SKHARA'  # Replace this with the name of a database that you have access to
schema_name = 'BUILD_REGISTRY'

# Create the model registry in the chosen database/schema ...
model_registry.create_model_registry(
    session=session,
    database_name=registry_name,
    schema_name=schema_name,
)

# ... then open a handle to it for logging and deploying models.
registry = model_registry.ModelRegistry(
    session=session,
    database_name=registry_name,
    schema_name=schema_name,
)

In [None]:
MODEL_NAME = "LLAMA2_MODEL_7b_CHAT"
MODEL_VERSION = "1"

# Log the pipeline model under the name/version above and keep the returned
# reference; it is used to deploy the model in section 1.3.
llama_model_ref = registry.log_model(
    model=llama_model,
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
)

# Display the reference as the cell output.
llama_model_ref

## 1.3 Deploy Model

Pre-req: create a GPU compute pool

In [None]:
# Check whether the compute pool is ready before deploying.
# It may take some time for the compute resource to become available.
session.sql("Show compute pools like 'SKHARA_%';").collect()

In [None]:
# Deploy the logged model to Snowpark Container Services on the GPU compute
# pool created in section 0.0, under the deployment name used for inference.
llama_model_ref.deploy(
    deployment_name="llama_predict",
    platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
    options={"compute_pool": "SKHARA_COMPUTE_GPU3", "num_gpus": 1},
    permanent=True,
)

# 2.0 Data Processing

We will load a JSON Lines file into a Snowflake table. For prediction, we have two options: use a Snowpark DataFrame or use a local pandas DataFrame.

Snowpark dataframes allow us to work at scale and enable us to keep the data on server side without ever bringing data locally.

## 2.1 Load Data

In this lab, the required data is not yet in Snowflake, so we will load a local .jsonl (JSON Lines) file into Snowflake using the snowflake-snowpark library.

In [None]:
# Read the JSON Lines transcripts file (one JSON record per line), then let
# pandas pick the best-fitting dtype for each column.
json_dataset = pd.read_json("data/frosty_transcripts_all.jsonl", lines=True)
json_dataset = json_dataset.convert_dtypes()

# Preview the first rows as the cell output.
json_dataset.head()

In [None]:
# Show the first record as a plain {column name: value} dict.
dict(json_dataset.iloc[0])

In [None]:
TABLE_NAME = "BUILD_HOL_DATA"

# Upload the local DataFrame to Snowflake, creating the table when needed
# and replacing any existing contents.
session.write_pandas(
    json_dataset,
    table_name=TABLE_NAME,
    auto_create_table=True,
    overwrite=True,
)

## 2.2 Input: Prompt Engineering

In [None]:
# Bring only 2 rows from the Snowflake table to the local machine; they are
# used as the worked examples inside the prompt built in the next cell.
# sdf_input stays a server-side Snowpark DataFrame over the full table.
sdf_input = session.table('BUILD_HOL_DATA')
df_local = sdf_input.limit(2).to_pandas()
df_local.head()

In [None]:
# Few-shot system prompt in Llama 2 chat format ([INST] <<SYS>> ... <</SYS>>).
# The two example rows come from df_local. The example "name" and "location"
# values are rendered with json.dumps so they appear as quoted, escaped JSON
# strings; interpolating them bare produced example outputs that were not
# valid JSON, contradicting the instruction given to the model.
# toy_list is interpolated as-is, exactly as before.
prompt_prefix = f'''[INST] <<SYS>>
Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema:
{{
  "name": {{
    "type": "string",
    "description": "The name of the person calling"
  }},
  "location": {{
    "type": "string",
    "description": "The name of the location where the person is calling from."
  }},
  "toy_list": {{
    "type": "array",
    "description": "The list of toys requested by the person calling."
  }},
  "required": ["name", "location", "toy_list"]
}}

Example 1:
Input: "{df_local['transcript'].iloc[0]}"
Output: {{"name": {json.dumps(str(df_local['name'].iloc[0]), ensure_ascii=False)}, "location": {json.dumps(str(df_local['location'].iloc[0]), ensure_ascii=False)}, "toy_list": {df_local['toy_list'].iloc[0]}}}

Example 2:
Input: "{df_local['transcript'].iloc[1]}"
Output: {{"name": {json.dumps(str(df_local['name'].iloc[1]), ensure_ascii=False)}, "location": {json.dumps(str(df_local['location'].iloc[1]), ensure_ascii=False)}, "toy_list": {df_local['toy_list'].iloc[1]}}}
<</SYS>>

Input:

'''

# Closes the instruction block after the transcript has been appended.
prompt_suffix = " [/INST]"

In [None]:
import snowflake.snowpark.functions as F

# Build the full prompt server-side: prefix, transcript and suffix joined
# with a single space, stored in a new "inputs" column.
input_df = sdf_input.with_column(
    '"inputs"',
    F.concat_ws(
        F.lit(" "),
        F.lit(prompt_prefix),
        F.col('"transcript"'),
        F.lit(prompt_suffix),
    ),
)

In [None]:
# Persist the prompt-augmented data as table DATA_WITH_PROMPT, replacing any
# existing table of that name; section 3.2 reads it back for inference.
input_df.write.mode("overwrite").save_as_table("DATA_WITH_PROMPT")

In [None]:
# df_local['inputs'] = df_local['transcript'].apply(add_prompt)
# print(df_local['inputs'].iloc[3])

# 3.0 LLM Inference

## 3.1 Get Deployed Model

In [None]:
# Identifiers of the registry, model and deployment used for inference.
REGISTRY_NAME = 'SKHARA'
SCHEMA_NAME = 'BUILD_REGISTRY'
MODEL_NAME = 'LLAMA2_MODEL_7b_CHAT'
# Must match the version logged in section 1.2 and deployed in section 1.3;
# any other value makes the deployment lookup and predict call below miss.
MODEL_VERSION = '1'
DEPLOYMENT_NAME = 'llama_predict'

In [None]:
# Re-open the model registry so this section can run without section 1.
registry = model_registry.ModelRegistry(
    session=session,
    database_name=REGISTRY_NAME,
    schema_name=SCHEMA_NAME,
)

In [None]:
# List the models known to the registry and display them as a pandas table.
model_list = registry.list_models()
model_list.to_pandas()

In [None]:
# List the deployments of the chosen model version and display them.
# Note: this rebinds model_list, which previously held the model listing.
model_list = registry.list_deployments(model_name = MODEL_NAME, model_version = MODEL_VERSION)
model_list.to_pandas()

In [None]:
# Reference to the registered model version; used below to call predict().
model = model_registry.ModelReference(registry=registry, model_name=MODEL_NAME, model_version=MODEL_VERSION)

## 3.2 Inference using LLMs

In [None]:
# Snowpark DataFrame over the table saved in section 2.2; only 2 rows are
# pulled locally for a quick preview.
sdf_data_prompt = session.table('DATA_WITH_PROMPT')
sdf_data_prompt.limit(2).to_pandas()

In [None]:
# List the compute pools; presumably to confirm the GPU pool is up before
# running inference. NOTE(review): confirm intent.
session.sql('SHOW COMPUTE POOLS').collect()

In [None]:
# Run the deployed model over the prompt table. The input is a Snowpark
# DataFrame and the result is used as one below (limit / to_pandas).
res = model.predict(deployment_name=DEPLOYMENT_NAME, data=sdf_data_prompt)

In [None]:
# Note: this rebinds df_local, which section 2.2 used for the two prompt
# example rows.
df_local = res.limit(5).to_pandas()  # bring 5 rows locally

## 3.3 Output Processing
Ensure that processing code conforms to the JSON Structure provided during Prompt Engineering.

In [None]:
import json

def format_output(output_string):
    """Extract the JSON object produced by the model from one raw prediction.

    Args:
        output_string: JSON text encoding a list whose first element is a
            dict holding the model response under the 'generated_text' key.

    Returns:
        The first JSON object found in 'generated_text', as a dict, or the
        string 'Could not parse output' when the input does not have the
        expected shape or contains no parseable JSON object.
    """
    try:
        # Step 1: Parse the outer string to convert it to a list of dictionaries
        outer_list = json.loads(output_string)

        # Step 2: Extract the 'generated_text' value from the first dictionary in the list
        generated_text_str = outer_list[0]['generated_text']

        # Step 3: Locate the start of the JSON object within the 'generated_text' value
        start_pos = generated_text_str.find('{')
        if start_pos == -1:
            raise ValueError("No JSON object found in generated_text")

        # Step 4: Decode exactly one JSON value starting at that brace.
        # raw_decode stops at the end of the object, so commentary the model
        # appends afterwards (even text containing '}') no longer breaks parsing.
        json_obj, _ = json.JSONDecoder().raw_decode(generated_text_str[start_pos:])

        return json_obj
    except (ValueError, TypeError, KeyError, IndexError, AttributeError):
        # Only malformed-input errors are expected here (json.JSONDecodeError is
        # a ValueError). Unlike a bare except, KeyboardInterrupt and SystemExit
        # still propagate.
        return 'Could not parse output'

In [None]:
# Print each transcript followed by the parsed model output for that row.
for idx, (transcript, raw_output) in enumerate(
    zip(df_local['transcript'], df_local['outputs'])
):
    print(f'\n\n **** Transcript # {idx} ****')
    print(transcript)
    print('\n')
    print(format_output(raw_output))

# 4.0 Clean Up

In [None]:
# Stop everything running on the GPU compute pool (no error if the pool is
# missing). This statement alters the pool; it does not drop it.
# NOTE(review): consider also suspending or dropping the pool if it is no
# longer needed — confirm against the Snowflake docs.
session.sql("ALTER COMPUTE POOL IF EXISTS SKHARA_COMPUTE_GPU3 STOP ALL").collect()