## Install Prerequisites

Let's start by installing some packages we'll need.

**NOTE: You may need to restart the notebook after installing these for them to work right!**

In [None]:
!pip install "snowflake-connector-python[pandas]" "transformers~=4.31.0" "torch~=2.0.1"

## Imports

In [1]:
import getpass
import json
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import snowflake.connector
from transformers.models.bert.modeling_bert import BertModel
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


## Download The Embedding Model For Internet-Free Use

In [2]:
# Settings shared by the download, archive, and upload steps below.
MODEL_NAME = "intfloat/e5-base-v2"

# Local directory layout for the saved tokenizer and model assets.
SAVE_DIR_NAME = "e5_base_v2_assets"
SAVE_DIR = Path(SAVE_DIR_NAME)
MODEL_DIR = SAVE_DIR.joinpath("model")
TOKENISER_DIR = SAVE_DIR.joinpath("tokenizer")

# Archive that bundles the assets directory into one uploadable file.
ARCHIVE_FORMAT = "tar"
ARCHIVE_FILE_NAME = ".".join((SAVE_DIR_NAME, ARCHIVE_FORMAT))

In [3]:
# Download the tokenizer and model and save copies to specific local directories.
# The saved copies under TOKENISER_DIR and MODEL_DIR are what later cells reload and archive.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoids warnings later.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)
assert isinstance(model, BertModel)  # This appeases the typechecker, if you're using one.
# Write each asset into its own subdirectory of SAVE_DIR.
tokenizer.save_pretrained(TOKENISER_DIR)
model.save_pretrained(MODEL_DIR)

In [4]:
# Validate that our saved files work by reloading from them.
# `local_files_only=True` mirrors how the UDF loads these assets and ensures the
# check cannot silently succeed by falling back to a network download.
tokenizer = BertTokenizerFast.from_pretrained(TOKENISER_DIR, local_files_only=True)
model = BertModel.from_pretrained(MODEL_DIR, local_files_only=True)

In [5]:
# Pack up the assets into an archive for uploading to a Snowflake stage as a single file.
# NOTE: `verbose` is not passed because `shutil.make_archive` documents it as unused and deprecated.
archive_path = shutil.make_archive(
    base_name=SAVE_DIR_NAME,
    format=ARCHIVE_FORMAT,
    root_dir=SAVE_DIR.parent,
    base_dir=SAVE_DIR,
)
# Fail fast if the archive is not named the way the upload and UDF cells expect.
assert Path(archive_path).name == ARCHIVE_FILE_NAME, f"unexpected archive name: {archive_path}"

## Upload The Model To Snowflake

In [6]:
# Edit these parameters.
connection_params = {
    "account"   : "<your_account_identifier_goes_here>",
    "user"      : "<your_username_goes_here>",
    "role"      : "ACCOUNTADMIN",
}

# Establish the connection. The password is prompted for interactively and passed
# straight to `connect`, so it is never kept in a long-lived notebook variable.
connection = snowflake.connector.connect(
    **connection_params,
    password=getpass.getpass("Enter password:"),
)

Enter password: ········


In [7]:
# First, let's create a new warehouse, DB, and schema to use for this quickstart.
# Each object is created (or replaced) and then selected for the session, in order.
setup_statements = [
    "create or replace warehouse text_embedding_quickstart_wh",
    "use warehouse text_embedding_quickstart_wh",
    "create or replace database text_embedding_quickstart_db",
    "use database text_embedding_quickstart_db",
    "create or replace schema text_embedding_quickstart_schema",
    "use schema text_embedding_quickstart_schema",
]
*leading_statements, final_statement = setup_statements
for statement in leading_statements:
    connection.execute_string(statement)
# Kept as the cell's last expression so its result is still what the cell displays.
connection.execute_string(final_statement)

[<snowflake.connector.cursor.SnowflakeCursor at 0xffff25f39ac0>]

In [8]:
# Create a fresh stage, then upload the model archive file to it.
# NOTE: Be patient, this ~0.5GB upload can take several minutes over some internet connections.
stage_name = "text_embedding_quickstart_stage"
create_stage_sql = f"create or replace stage {stage_name}"
put_archive_sql = f"PUT 'file://{ARCHIVE_FILE_NAME}' @{stage_name}/"
connection.execute_string(create_stage_sql)
connection.execute_string(put_archive_sql)

## "Write" A Python UDF

Before we can locally test a Python UDF, we need to write one!

Normally this takes a fair amount of effort, but the theme of this guide is blazing through to a working system first, then optionally coming back to discuss how it all works under the hood. Therefore, all we need to do to "write" our UDF here is to invoke the next cell and let the `%%writefile` cell magic write our premade UDF implementation to disk as `udf_implementation.py`.

Feel free to pause and take as much time as you'd like reading the implementation below, but know that it's equally fine to just run the cell and move on for now.

In [9]:
%%writefile udf_implementation.py
import fcntl
import itertools
import shutil
import sys
import threading
from pathlib import Path
from typing import List
from typing import Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers.models.bert.modeling_bert import BertModel
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast


####
#### CONFIG
####
ARCHIVE_FORMAT = "tar"  # Suffix of the asset archive this module looks for.
SAVE_DIR_NAME = "e5_base_v2_assets"  # Archive base name and the directory expected after extraction.
MAX_BATCH_SIZE = 8  # Max number of texts sent through the model per forward pass in `embed`.
EMBEDDING_SIZE = 768  # Number of float32 components packed into each binary embedding.
# A real fixed-width void dtype (one element = one whole embedding's bytes), rather
# than a `np.void` scalar instance that only works through implicit `.dtype` coercion.
EMBEDDING_AS_BYTES_DTYPE = np.dtype(
    (np.void, EMBEDDING_SIZE * np.dtype(np.float32).itemsize)
)

####
#### BUILDING BLOCKS
####


# Lock pattern adapted from "Unzipping a Staged File" official example.
# https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-examples#unzipping-a-staged-file
# File lock lets us synchronize access to /tmp in parallelized execution.
class FileLock:
    """Context manager that serializes a critical section across threads and processes.

    Two locks are combined:

    * a class-level ``threading.Lock`` shared by every instance, which excludes
      other threads in the same process, and
    * an exclusive ``fcntl.lockf`` lock on ``/tmp/lockfile.LOCK``, which excludes
      other processes.

    The lock is not reentrant: nesting ``with FileLock():`` blocks in one thread
    will deadlock on the shared thread lock.
    """

    # Shared across all instances. A lock created inside ``__enter__`` would be
    # private to each ``with FileLock():`` and so could never block another thread.
    _thread_lock = threading.Lock()

    def __enter__(self):
        """Acquire the thread lock, then the exclusive file lock; return ``self``."""
        self._thread_lock.acquire()
        fd = None
        try:
            fd = open("/tmp/lockfile.LOCK", "w+")
            fcntl.lockf(fd, fcntl.LOCK_EX)
        except BaseException:
            # Do not leave the thread lock held (or the file open) if locking failed.
            if fd is not None:
                fd.close()
            self._thread_lock.release()
            raise
        self._fd = fd
        return self

    def __exit__(self, type, value, traceback):
        """Release the file lock by closing the lock file, then the thread lock."""
        try:
            self._fd.close()
        finally:
            # Always release the thread lock, even if closing the file raised.
            self._thread_lock.release()


def _load_assets(archive_path: Path) -> Tuple[BertTokenizerFast, BertModel]:
    """Unpack the asset archive into /tmp and load the tokenizer and model from it.

    Args:
        archive_path: Location of the archive holding the saved tokenizer and model.

    Returns:
        The ``(tokenizer, model)`` pair loaded from the extracted directories.

    Raises:
        AssertionError: If the archive is missing, or the tokenizer or model
            directory is absent after extraction.
    """
    unpack_root = Path("/tmp")
    assets_root = unpack_root / SAVE_DIR_NAME
    tokenizer_dir = assets_root / "tokenizer"
    model_dir = assets_root / "model"
    expected_dirs = (
        (tokenizer_dir, "failed to extract tokenizer dir"),
        (model_dir, "failed to extract model dir"),
    )

    # Both extraction and loading happen while the lock is held.
    with FileLock():
        assert archive_path.exists(), f"{archive_path} not found!"
        shutil.unpack_archive(archive_path, unpack_root)
        for asset_dir, failure_message in expected_dirs:
            assert asset_dir.exists(), failure_message
        loaded_tokenizer = BertTokenizerFast.from_pretrained(
            tokenizer_dir, local_files_only=True
        )
        loaded_model = BertModel.from_pretrained(model_dir, local_files_only=True)

    assert isinstance(loaded_model, BertModel)  # Narrow the type for typecheckers.
    return loaded_tokenizer, loaded_model


def _average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def _embed_batch(
    tokenizer: BertTokenizerFast, model: BertModel, texts: List[str]
) -> np.ndarray:
    """Tokenize ``texts``, run the model, and return L2-normalized pooled embeddings.

    Args:
        tokenizer: Tokenizer used to encode the texts (padded, truncated to 512 tokens).
        model: Model whose ``last_hidden_state`` output is pooled.
        texts: The strings to embed together as one batch.

    Returns:
        The normalized embeddings converted to a NumPy array.
    """
    encoded = tokenizer(
        texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = _average_pool(hidden_states, encoded["attention_mask"])  # type: ignore
        unit_vectors = F.normalize(pooled, p=2, dim=1)

    return unit_vectors.numpy()


def _byte_pack_embedding_matrix_rows(embedding_matrix: np.ndarray) -> np.ndarray:
    """Reinterpret each row of a float32 matrix as one opaque binary element.

    Args:
        embedding_matrix: C-contiguous float32 matrix with one embedding per row.

    Returns:
        1-D array of ``EMBEDDING_AS_BYTES_DTYPE`` with one element per input row.

    Raises:
        AssertionError: If the dtype or memory layout is not as expected, or the
            packed result does not have one element per row.
    """
    assert embedding_matrix.dtype == np.float32, "expect float32 embedding matrix"
    assert embedding_matrix.flags["C_CONTIGUOUS"], "expect c_contiguous embedding matrix"
    packed_rows = embedding_matrix.ravel().view(EMBEDDING_AS_BYTES_DTYPE)
    expected_shape = (embedding_matrix.shape[0],)
    assert packed_rows.shape == expected_shape, "output shape"
    return packed_rows


####
#### LOADING STATE
####

_archive_suffix = f".{ARCHIVE_FORMAT}"

if "snowflake_import_directory" not in sys._xoptions:
    # Running locally: look for the archive relative to the working directory.
    sf_archive_path = Path(SAVE_DIR_NAME).with_suffix(_archive_suffix)
else:
    # Running in Snowflake: the import directory is supplied through sys._xoptions.
    sf_import_dir = Path(sys._xoptions["snowflake_import_directory"])
    sf_archive_path = (sf_import_dir / SAVE_DIR_NAME).with_suffix(_archive_suffix)

# Loaded once at import time and reused by every call to the UDF.
tokenizer, model = _load_assets(sf_archive_path)


####
#### DEFINING ACTUAL UDF
####
def embed(df: pd.DataFrame) -> np.ndarray:
    """Embed a batch of texts and return each embedding packed as a binary scalar.

    Args:
        df: Frame with exactly one column, labelled ``0``, holding the input strings.

    Returns:
        1-D array with one ``EMBEDDING_AS_BYTES_DTYPE`` element per input row,
        in input order. Each element holds that row's float32 embedding bytes.

    Raises:
        AssertionError: If ``df`` does not have exactly the single column ``0``,
            is empty, or contains non-string values.
    """
    # Unpack and validate our inputs.
    # Compare as a plain list: `df.columns == (0,)` is elementwise, so multi-column
    # input raised a length/ambiguity error instead of this assertion.
    assert list(df.columns) == [0], "expect single column"
    texts = df[0].tolist()
    assert len(texts) > 0, "expect one or more inputs"
    assert all(isinstance(text, str) for text in texts), "expect string inputs"

    # Run the embedding in chunks of at most `MAX_BATCH_SIZE` texts.
    # Note: We're byte-packing our float32 embedding vectors into binary scalars
    # so that we have a scalar output compatible with Snowflake BINARY type.
    result = np.empty(len(texts), dtype=EMBEDDING_AS_BYTES_DTYPE)
    for start in range(0, len(texts), MAX_BATCH_SIZE):
        batch = texts[start : start + MAX_BATCH_SIZE]
        embedding_matrix = _embed_batch(tokenizer=tokenizer, model=model, texts=batch)
        result[start : start + len(batch)] = _byte_pack_embedding_matrix_rows(embedding_matrix)

    return result


# Mark the handler as vectorized: it takes a DataFrame, capped at 32 rows per call.
setattr(embed, "_sf_vectorized_input", pd.DataFrame)
setattr(embed, "_sf_max_batch_size", 32)


Overwriting udf_implementation.py


## Locally Test Our UDF

Now that we have "written" a UDF implementation, let's take the code for a spin locally before installing it in Snowpark.

**NOTE: Importing our udf implementation has the side-effect of loading the tokenizer and model from the tarfile on disk.** If you skipped the model downloading and archiving steps above, you will get an error in the next cell. Normally it is considered bad form to make a Python module load state on import, but in Snowpark this is commonly recommended [as a performance optimization](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-designing#put-expensive-initialization-in-the-module).

In [10]:
# NOTE: The `udf_implementation` module requires the model tarfile to exist at import time.
from udf_implementation import embed

In [11]:
# Run the UDF locally on 20 mock rows and decode the first packed embedding.
mock_texts = ["test text", "another test text"] * 10
df_mock = pd.DataFrame({0: mock_texts})
result_array = embed(df_mock)
first_embedding = np.frombuffer(result_array[0], dtype=np.float32)
(result_array.shape, first_embedding.shape, first_embedding[:10])

((20,),
 (768,),
 array([-0.01143286, -0.00579326, -0.02625675,  0.0083553 ,  0.03573489,
        -0.01909555,  0.02105159,  0.03357907, -0.00106065, -0.00575643],
       dtype=float32))

## Deploy Our UDF To Snowflake

Now that we have verified our UDF implementation works locally on the exact same model archive file we have stored in our Snowflake stage, the last step is to push our UDF implementation to Snowpark.

To do this, we first upload our UDF code to a Snowflake stage, then we invoke a `create function` SQL statement to tell Snowflake to use it.

In [12]:
# Upload the UDF source file to the stage, overwriting any earlier copy.
udf_implemenation_file = "udf_implementation.py"
put_udf_sql = f"PUT 'file://{udf_implemenation_file}' @{stage_name}/ OVERWRITE = true"
connection.execute_string(put_udf_sql)

[<snowflake.connector.cursor.SnowflakeCursor at 0xffff5a7ec130>]

In [13]:
# Register the UDF, pointing its handler and imports at the staged files.
handler_name = f"{Path(udf_implemenation_file).stem}.embed"
staged_imports = ", ".join(
    f"'@{stage_name}/{file_name}'"
    for file_name in (udf_implemenation_file, ARCHIVE_FILE_NAME)
)
create_sql = f"""
create or replace function warehouse_text_embed(s string)
returns binary
language python
runtime_version = '3.8'
packages = ('numpy', 'pandas', 'pytorch==2.0.1', 'transformers==4.29.2')
handler = '{handler_name}'
imports = ({staged_imports})
"""
print(create_sql)
connection.execute_string(create_sql)


create or replace function warehouse_text_embed(s string)
returns binary
language python
runtime_version = '3.8'
packages = ('numpy', 'pandas', 'pytorch==2.0.1', 'transformers==4.29.2')
handler = 'udf_implementation.embed'
imports = ('@text_embedding_quickstart_stage/udf_implementation.py', '@text_embedding_quickstart_stage/e5_base_v2_assets.tar')



[<snowflake.connector.cursor.SnowflakeCursor at 0xffff5a216d30>]

In [14]:
# Call the deployed UDF on a sample string and fetch the result as a DataFrame.
query = "select warehouse_text_embed('hello world!') as embeding"
cursor = connection.cursor()
df_result = cursor.execute(query).fetch_pandas_all()
df_result

Unnamed: 0,EMBEDING
0,b'\x07\xfbS;\x95\x9e\x0c\xbcG^\x1a\xbd\xaf{Y\x...


In [15]:
# Validate the output is what we expect.
# Compare with a small tolerance rather than bitwise equality: the Snowflake and
# local runs use different machines and library builds, so float32 results may
# differ in the last bits even when the embeddings agree.
result_embedding = np.frombuffer(df_result.iat[0, 0], dtype=np.float32)
expected_batch = embed(pd.DataFrame({0: ["hello world!"]}))
expected_embedding = np.frombuffer(expected_batch[0], dtype=np.float32)
embeddings_match = np.allclose(expected_embedding, result_embedding, atol=1e-5)
result_embedding.shape, result_embedding[:10], embeddings_match

((768,),
 array([ 0.00323457, -0.00858273, -0.03768757, -0.00331853,  0.0448132 ,
        -0.03027021,  0.03205844,  0.05281176, -0.00108271, -0.01863021],
       dtype=float32),
 True)

## BINARY vs. ARRAY Vectors

For storage and computational efficiency, our text embedding UDF stores embedding vectors as BINARY blobs. If you want to treat them as Snowflake ARRAY type instead, all it takes is one line of JavaScript to convert them!

In [16]:
# Create a UDF to unpack arrays.
# `B.slice()` copies the bytes into a fresh, exactly-sized buffer first, so the
# Float32Array covers only this value's bytes even if `B` is a view into a larger
# or offset underlying buffer.
connection.execute_string("""
create or replace function unpack_array(B binary)
returns array
language javascript
as $$ return Array.from(new Float32Array(B.slice().buffer)); $$;
""")

[<snowflake.connector.cursor.SnowflakeCursor at 0xffff599d5460>]

In [17]:
# Unpack from binary embeddings to array-type embeddings.
# `json` comes from the notebook's top import cell. Results go into their own
# names so the `df_result` used by the earlier validation cell is not overwritten.
connection.execute_string("""
create or replace temporary table tmp_emb (embedding binary) as
select warehouse_text_embed('hello world!') as embedding
""")
unpack_query = """
select embedding, unpack_array(embedding) as embedding_array from tmp_emb
"""
df_unpacked = connection.cursor().execute(unpack_query).fetch_pandas_all()
print(
    df_unpacked["EMBEDDING"].iat[0][:30],  # Vector as binary.
    json.loads(df_unpacked["EMBEDDING_ARRAY"].iat[0])[:30],  # Vector as array (json string in Pandas).
)

b'\x07\xfbS;\x95\x9e\x0c\xbcG^\x1a\xbd\xaf{Y\xbb\x0b\x8e7=:\xf9\xf7\xbc\xb6O\x03=%Q' [0.003234566887840629, -0.008582730777561665, -0.03768756613135338, -0.003318529343232512, 0.04481319710612297, -0.03027020767331123, 0.03205844014883041, 0.05281176045536995, -0.001082708826288581, -0.01863021403551102, -0.02498815208673477, 0.04769045859575272, -0.08714178204536438, 0.018616558983922, -0.03132862970232964, 0.006546470336616039, 0.02404151484370232, -0.008242154493927956, 0.03759677708148956, -0.02006378769874573, -0.04824942350387573, -0.05238137021660805, 0.04970219358801842, -0.00879327766597271, 0.005695960950106382, 0.01114101614803076, -0.005848507396876812, 0.001621721195988357, -0.04690505191683769, -0.03712567314505577]
