In [2]:
from IPython.display import display, HTML , Markdown
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf, col, pandas_udf
import snowflake.snowpark.types as T
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from io import BytesIO
from PIL import Image
import pandas as pd
import numpy as np
from cachetools import cached
import base64, json ,logging
from tqdm import tqdm
import sklearn.metrics
# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../..'
config = L.get_config(PROJECT_HOME_DIR)
session = L.connect_to_snowflake(PROJECT_HOME_DIR)

if(session == None):
   raise Exception(f'Unable to connect to snowflake. Validate connection information ')

session.use_role(f'''{config['APP_DB']['role']}''')
session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

df1 = session.sql('select current_account(), current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df1)

### Initialization

Unnamed: 0,CURRENT_ACCOUNT(),CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,ANA95816,JPRUSA,SYSADMIN,FASHION,PUBLIC


In [3]:
# Run SHOW STAGES and render the returned rows as a DataFrame so the
# stages referenced later in this notebook (@MODEL_STG) can be checked.
output = session.sql('SHOW STAGES;').collect()
pd.DataFrame(output)

Unnamed: 0,created_on,name,database_name,schema_name,url,has_credentials,has_encryption_key,owner,comment,region,type,cloud,notification_channel,storage_integration,owner_role_type
0,2023-05-26 09:00:39.684000-07:00,MODEL_STG,FASHION,PUBLIC,,N,N,SYSADMIN,used for holding ml models.,,INTERNAL,,,,ROLE
1,2023-05-26 09:00:39.996000-07:00,UDF_STG,FASHION,PUBLIC,,N,N,SYSADMIN,,,INTERNAL,,,,ROLE


## Top n most similar images from a new input

This stored procedure (sproc) returns the names of the top *n* images most similar to an input image, which may be one that has not been seen before.

The input is transformed and passed to the embedding model, which is loaded from the stage. The sproc returns a list of the top *n* images most similar to the provided input.

The sproc requires that the input image is already base64-encoded, e.g.

`with open(img_path, "rb") as img_file:`

`    img = base64.b64encode(img_file.read()).decode("utf-8")`



Already-encoded images can be found in the `DATA` column of the table `IMAGES_ENCODED`.

In [318]:
def load_transform() -> object:
    """Build the preprocessing pipeline applied to an image before embedding.

    The pipeline resizes to 256, center-crops to 224, converts the image to a
    tensor and normalizes each of the three channels with a fixed mean and
    standard deviation.

    Returns:
        A ``torchvision.transforms.Compose`` object.
    """
    # Imported here so the function is self-contained wherever it is executed.
    from torchvision import transforms

    # Per-channel statistics; these match the commonly used ImageNet values.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    return transforms.Compose(steps)

def load_model(model_path: str) -> object:
    """Load a fully pickled PyTorch model from disk and put it in eval mode.

    Args:
        model_path: Path to a file written with ``torch.save(model, path)``.

    Returns:
        The deserialized model, mapped onto the CPU, with ``eval()`` applied.

    Notes:
        * ``map_location='cpu'`` lets a checkpoint saved on a GPU machine load
          on a CPU-only runtime.
        * SECURITY: the file is a pickle of a whole module, so loading it can
          execute arbitrary code. Only load files from a trusted location.
    """
    # Load pytorch model
    import torch

    try:
        # Newer PyTorch releases default to weights_only=True, which rejects
        # a pickled nn.Module; request the full unpickler explicitly.
        model = torch.load(model_path, map_location='cpu', weights_only=False)
    except TypeError:
        # Older PyTorch releases do not know the weights_only keyword.
        model = torch.load(model_path, map_location='cpu')
    model.eval()
    return model


def top_n_new(sim_df, n=10):
    """Return the ``n`` rows most similar to the 'NEW' entry.

    Args:
        sim_df: DataFrame with a 'NEW' column of similarity scores and an
            index of item names that includes the label 'NEW' itself.
        n: Maximum number of rows to return.

    Returns:
        A single-column ('NEW') DataFrame sorted by descending similarity,
        without the 'NEW' row.
    """
    # Remove the query row by label rather than by position: another item
    # with an identical embedding ties at 1.0, so 'NEW' is not guaranteed
    # to sort first.
    candidates = sim_df[['NEW']].drop(index='NEW')
    return candidates.sort_values('NEW', ascending=False).head(n) # top n

def sproc_recommender_model(session: Session, 
                            image: str,
                            table: str,
                            n: int,
                            ) -> T.Variant:
    """Return the names of the ``n`` stored images most similar to a new image.

    Args:
        session: Snowpark session used to read the embeddings table.
        image: The query image as a base64-encoded string.
        table: Name of a table with an 'EMBEDDING' column (JSON-encoded list
            of numbers) and a 'NAME' column.
        n: Maximum number of names to return.

    Returns:
        A list of image names ordered from most to least similar; empty when
        the table has no rows.
    """
    import sys
    import os
    import json
    import base64
    from io import BytesIO
    from PIL import Image
    import numpy as np
    import pandas as pd
    import torch
    from sklearn.metrics.pairwise import cosine_similarity

    # get embedding for new image
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    model_name = 'embedding_model.pt'
    # os.path.join works whether or not import_dir ends with a separator.
    model = load_model(os.path.join(import_dir, model_name))
    transform = load_transform()
    # Force three channels: the normalization step is defined for exactly
    # three, so RGBA / grayscale / palette images would otherwise fail.
    img = Image.open(BytesIO(base64.b64decode(image))).convert('RGB')
    img = transform(img).unsqueeze(0)

    with torch.no_grad():
        embedding = model(img)
        embedding = embedding['features'][0].squeeze().numpy().astype('float')

    # get prior embeddings
    stored = session.table(table).select('EMBEDDING', 'NAME').to_pandas()
    if stored.empty:
        # Nothing to compare against.
        return []
    stored_matrix = np.array([json.loads(raw) for raw in stored['EMBEDDING']], dtype=float)

    # Only the similarity of each stored image to the new one is needed, so
    # compare against the single query vector instead of building the full
    # N x N matrix.
    similarity_to_new = cosine_similarity(stored_matrix, embedding.reshape(1, -1)).ravel()

    # top_n_new expects a 'NEW' column whose index also contains the 'NEW'
    # entry itself (self-similarity 1.0), so append it.
    sim_df = pd.DataFrame({'NEW': np.append(similarity_to_new, 1.0)},
                          index=list(stored['NAME']) + ['NEW'])
    return list(top_n_new(sim_df, n=n).index)

# Registering the function as a Stored Procedure
# NOTE: the assignment rebinds the name `sproc_recommender_model` from the
# Python function defined above to the object returned by register(), so
# re-running only this cell passes that returned object as `func`.
# The embedding model file is attached through `imports`; the procedure reads
# it from the import directory at run time.
sproc_recommender_model = session.sproc.register(func=sproc_recommender_model, 
                                                      name='sproc_recommender_model', 
                                                      is_permanent=True, 
                                                      replace=True,
                                                      stage_location='@MODEL_STG', 
                                                      packages=['snowflake-snowpark-python',
                                                                'numpy',
                                                                'pandas',
                                                                'scikit-learn',
                                                                'pillow', 
                                                                'torchvision', 
                                                               ], 
                                                      imports=['@MODEL_STG/embedding_model.pt'])

In [319]:
# to get an image for testing purposes
# `sdf` is a Snowpark handle on the table of encoded images; later cells
# select from it.
data_table = "IMAGES_ENCODED"
sdf = session.table(data_table)

In [320]:
# Pull the DATA column of five rows into pandas to use as sample inputs.
test_images = sdf.select('DATA').limit(5).to_pandas()

In [321]:
# Positional arguments map to (image, table, n); the session is passed by keyword.
output = sproc_recommender_model(test_images.DATA[1], 'IMAGES_ENCODED', 5, session=session,)

In [322]:
# `output` is a string holding a list of names. Parse it as JSON instead of
# eval(): eval would execute whatever text the remote call returned.
json.loads(output)

['undershirt_9bdac063-6c07-4bfc-a04a-e45224c503df.jpg',
 'undershirt_8a95a682-4a51-461e-9ad6-8783acee3034.jpg',
 'undershirt_34e679bc-137f-4cde-a732-9dba70a8fbae.jpg',
 'undershirt_0f58b637-b85c-4082-ba14-64c4dc120b93.jpg',
 'dress_4114a9c0-eea0-442c-af14-8df6c586a016.jpg']

## Top n most similar images to an already seen image

Similar to the above; however, the input image has already been seen and encoded into the table `IMAGES_ENCODED`.

Instead of providing the data for the input image, only the name of the image needs to be provided. Since we already have the associated embedding for the image, we do not need to load or use the embedding model in this sproc.


In [304]:
def top_n(sim_df, item, n=10, best=True):
    """Return the ``n`` rows most similar to ``item``.

    Args:
        sim_df: DataFrame with a column named ``item`` holding similarity
            scores and an index of item names that includes ``item``.
        item: Name of the query item.
        n: Maximum number of rows to return.
        best: Currently unused; kept so existing callers keep working.

    Returns:
        A single-column DataFrame sorted by descending similarity, without
        the row for ``item`` itself.

    Raises:
        KeyError: If ``item`` is not a column / index label of ``sim_df``.
    """
    # Remove the query row by label rather than by position: another item
    # with an identical embedding ties at 1.0, so ``item`` is not guaranteed
    # to sort first.
    candidates = sim_df[[item]].drop(index=item)
    return candidates.sort_values(item, ascending=False).head(n) # top n

def sproc_existing_recommender_model(session: Session, 
                            item: str,
                            table: str,
                            n: int,
                            ) -> T.Variant:
    """Return the names of the ``n`` stored images most similar to a stored image.

    Args:
        session: Snowpark session used to read the embeddings table.
        item: Value of the 'NAME' column identifying the query image.
        table: Name of a table with an 'EMBEDDING' column (JSON-encoded list
            of numbers) and a 'NAME' column.
        n: Maximum number of names to return.

    Returns:
        A list of image names ordered from most to least similar.

    Raises:
        KeyError: If no row in ``table`` has NAME equal to ``item``.
    """
    import json
    import numpy as np
    import pandas as pd
    from sklearn.metrics.pairwise import cosine_similarity

    # get prior embeddings
    embeddings = session.table(table).select('EMBEDDING', 'NAME').to_pandas()
    is_query = (embeddings['NAME'] == item).to_numpy()
    if not is_query.any():
        raise KeyError(f"{item!r} not found in the NAME column of {table!r}")

    matrix = np.array([json.loads(raw) for raw in embeddings['EMBEDDING']], dtype=float)
    # argmax on a boolean mask gives the position of the first matching row.
    query_vector = matrix[is_query.argmax()].reshape(1, -1)

    # Only the similarities to the query image are needed, so compare against
    # that single vector instead of building the full N x N matrix.
    similarity_to_item = cosine_similarity(matrix, query_vector).ravel()

    sim_df = pd.DataFrame({item: similarity_to_item},
                          index=list(embeddings['NAME']))

    return list(top_n(sim_df, item, n=n).index)

# Registering the function as a Stored Procedure
# NOTE: the assignment rebinds the name `sproc_existing_recommender_model`
# from the Python function defined above to the object returned by register().
# No `imports` are attached: this procedure works purely from the embeddings
# stored in the table and never reads a model file.
sproc_existing_recommender_model = session.sproc.register(func=sproc_existing_recommender_model,
                                                      name='sproc_existing_recommender_model',
                                                      is_permanent=True,
                                                      replace=True,
                                                      stage_location='@MODEL_STG',
                                                      packages=['snowflake-snowpark-python',
                                                                'numpy',
                                                                'pandas',
                                                                'scikit-learn'])

In [305]:
# Pull the NAME column of five rows into pandas to use as sample inputs.
test_image_names = sdf.select('NAME').limit(5).to_pandas()

In [306]:
# Display the sampled names.
test_image_names

Unnamed: 0,NAME
0,dress_bac70656-a50a-4a13-8c3f-63e3a0a28338.jpg
1,shirt_3d20c462-34a4-4501-9c91-f6d3a9684715.jpg
2,body_ff1b1bd2-1248-4863-b486-cf0fe8b3d210.jpg
3,hoodie_27895f13-d204-4cd0-9c05-816692747854.jpg
4,images_original_f77a41f0-e774-40a5-8442-2b7a02...


In [307]:
# Positional arguments map to (item, table, n); the session is passed by keyword.
output = sproc_existing_recommender_model(test_image_names.NAME[0], 'IMAGES_ENCODED', 5, session=session)

In [308]:
# `output` is a string holding a list of names. Parse it as JSON instead of
# eval(): eval would execute whatever text the remote call returned.
json.loads(output)

['dress_cbf45350-1a65-4cec-abad-0f64a4a459eb.jpg',
 'dress_e4f8a796-8963-4a43-8e20-de6430d33780.jpg',
 'dress_cc66ab14-3372-4c9a-9171-9007d7154538.jpg',
 'dress_15eb72dd-9ab6-4c31-8b87-17f56e9f1290.jpg',
 'dress_f1c78283-2da0-45fc-8f20-4f874519d415.jpg']