## Imports and sflk session


In [263]:
from IPython.display import display, HTML , Markdown
from snowflake.snowpark.session import Session
from PIL import Image
import pandas as pd
import logging
import tqdm
import numpy as np
import os
import torch
from torch.utils.data import Dataset
from torchvision.io import read_image


# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
session = L.connect_to_snowflake(PROJECT_HOME_DIR)

if(session == None):
   raise Exception(f'Unable to connect to snowflake. Validate connection information ')

session.use_role(f'''{config['APP_DB']['role']}''')
session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

df1 = session.sql('select current_account(), current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df1)

### Initialization

Unnamed: 0,CURRENT_ACCOUNT(),CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,CIB92733,JPRUSA,SERVICESNOW_USER_ROLE,FASHION,PUBLIC


## Train/Test annotations 

To train our embedding model, we need a train and validation split. We elect a 90:10 split, where 90% of the data is used for training and 10% to validate model performance to identify the best model iteration and avoid overfitting.

We create a train and test annotation file containing the image names and labels (classes) for the images selected for training and validation. Since model training resizes images to 256x256 pixels, we are using our preprocessed images which have already been resized to have have a dimension of 256 on their shortest size. This saves time/bandwidth when pushing the images into staging, and uses less space in staging.  


In [44]:
# Create dataframe of all preprocessed images and labels
image_files = os.listdir('../../data_preprocessed')
image_labels = [x.replace('t_shirt', 'tshirt').split('_')[0] for x in image_files]

data = pd.DataFrame({
    'img_name': image_files,
    'label': image_labels
})
# randomly split into train and test with a 90:10 ratio
msk = np.random.rand(len(data)) < 0.9
train = data[msk]
test = data[~msk]

# save as csvs
train.to_csv('sim_train.csv', index=False)
test.to_csv('sim_test.csv', index=False)

## Upload data to to staging

The best way for our model to ingest images while training is to have them mounted in a volume on the training container created by Snowflake Container Services (SCS). SCS lets us mount any stage provided it is encrypted with `SNOWFLAKE_SSE` encryption. Thus, we create a new stage `Image_STG` with `SNOWFLAKE_SSE` encryption and push our preprocessed images to this stage. We also push both annotation files to this stage so that they can be leveraged by our training script.

In [110]:
#only run the first time to create the stage. do not run again, or we'll lose our images

#session.sql("CREATE OR REPLACE STAGE IMAGE_STG ENCRYPTION = (type = 'SNOWFLAKE_SSE')").collect()

[Row(status='Stage area IMAGE_STG successfully created.')]

In [115]:
pd.DataFrame(session.sql('SHOW STAGES').collect())

Unnamed: 0,created_on,name,database_name,schema_name,url,has_credentials,has_encryption_key,owner,comment,region,type,cloud,notification_channel,storage_integration,owner_role_type
0,2023-09-11 11:29:40.914000-07:00,FASHION_REPOSITORY,FASHION,PUBLIC,,N,N,SERVICESNOW_USER_ROLE,,,IMAGE REPOSITORY,,,,ROLE
1,2023-09-26 09:12:25.666000-07:00,IMAGE_STG,FASHION,PUBLIC,,N,N,SERVICESNOW_USER_ROLE,,,INTERNAL NO CSE,,,,ROLE
2,2023-08-31 14:55:29.865000-07:00,MODELS,FASHION,PUBLIC,,N,N,SERVICESNOW_USER_ROLE,,,INTERNAL NO CSE,,,,ROLE
3,2023-07-31 09:00:30.527000-07:00,MODEL_STG,FASHION,PUBLIC,,N,N,SERVICESNOW_USER_ROLE,,,INTERNAL,,,,ROLE
4,2023-09-13 13:18:56.893000-07:00,UDF_STG,FASHION,PUBLIC,,N,N,SERVICESNOW_USER_ROLE,,,INTERNAL,,,,ROLE


In [112]:
# this lets us resume from partial upload, always run before running the cell below

uploaded = pd.DataFrame(session.sql('LIST @IMAGE_STG/data_preprocessed').collect())
if 'name' in uploaded.columns:
    uploaded = uploaded.name.apply(lambda x: x.split('/')[-1]).values
else:
    uploaded=[]

In [113]:
# preprocessed images
failed = [] # the event any fail we can track which fail
for img in tqdm.tqdm(image_files):
    if img not in uploaded:
        try:
            session.sql(f"""PUT file:///Users/jprusa/documents/github/FASHION/data_preprocessed/{img} 
        @IMAGE_STG/data_preprocessed AUTO_COMPRESS=FALSE OVERWRITE=TRUE;""").collect()
        except:
            failed.append(img)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5762/5762 [1:03:10<00:00,  1.52it/s]


In [159]:
# train and test annotation file
session.sql(f"""PUT file:///Users/jprusa/documents/github/FASHION/src/notebook/sim_train.csv
@IMAGE_STG AUTO_COMPRESS=FALSE OVERWRITE=TRUE;""").collect()
session.sql(f"""PUT file:///Users/jprusa/documents/github/FASHION/src/notebook/sim_test.csv
@IMAGE_STG AUTO_COMPRESS=FALSE OVERWRITE=TRUE;""").collect()

[Row(source='sim_test.csv', target='sim_test.csv', source_size=31655, target_size=31655, source_compression='NONE', target_compression='NONE', status='UPLOADED', message='')]

In [160]:
# check that everything is in place in staging
pd.DataFrame(session.sql('LIST @IMAGE_STG').collect())

Unnamed: 0,name,size,md5,last_modified
0,image_stg/data_preprocessed/blazer_00b8048d-63...,8820,6992a671d7e3c4a275c11480628abc17,"Tue, 26 Sep 2023 16:52:21 GMT"
1,image_stg/data_preprocessed/blazer_01a54355-b5...,9562,5ecc710646521b8f64ef441fc0cfae09,"Tue, 26 Sep 2023 17:11:40 GMT"
2,image_stg/data_preprocessed/blazer_03c6360d-73...,13572,7e7b4463fc0179868a922ca580cdd983,"Tue, 26 Sep 2023 16:59:38 GMT"
3,image_stg/data_preprocessed/blazer_06aaacf1-ba...,11016,888db1e4bec0a2c9a6bcdf1f8f3314a8,"Tue, 26 Sep 2023 16:17:20 GMT"
4,image_stg/data_preprocessed/blazer_094a6288-3f...,8488,1baba40dfa672e830fe03f12ebc4fcf4,"Tue, 26 Sep 2023 16:58:09 GMT"
...,...,...,...,...
5759,image_stg/data_preprocessed/undershirt_f8bf048...,12078,64aff4398e86193390d85209baa7d124,"Tue, 26 Sep 2023 16:30:51 GMT"
5760,image_stg/data_preprocessed/undershirt_fdee18e...,10073,c65a2896d1bea677640ef4aac6096e73,"Tue, 26 Sep 2023 16:23:12 GMT"
5761,image_stg/data_preprocessed/undershirt_ff80d9e...,12834,6b24f2a36a9806e6d6be80aba1fa4c26,"Tue, 26 Sep 2023 17:14:33 GMT"
5762,image_stg/sim_test.csv,31655,a8240f5d96bc475a7851387458371292,"Tue, 26 Sep 2023 19:16:51 GMT"


## Create and Push Docker Image
0. open a terminal window and `cd` into `/src/docker`

1. Using the Docker CLI, execute the docker build command, specifying the current working directory (.):

    `docker build --rm --platform linux/amd64 -t <image_name> .`

2. Tag the image with the image URL:
image_name = 'fashion:recommender'
repository_url = 'sfsenorthamerica-polaris2.registry.snowflakecomputing.com/fashion/public/fashion_repository'
db = 'fashion'
schema = 'public'
repo = 'fashion_repository'
user_name = 'jprusa'
    `docker tag <image_name> <repository_url>/<image_name>`

3. To authenticate Docker with the Snowflake registry, execute the docker login command:

    `docker login <repository_url> -u <user_name>`

4. To upload the image to the image repository, execute the following docker push command:

    `docker push <repository_url>/<image_name>`


In [85]:
image_name = 'fashion:image_embeddings'
repository_url = 'sfsenorthamerica-polaris2.registry.snowflakecomputing.com/fashion/public/fashion_repository'
db = 'fashion'
schema = 'public'
repo = 'fashion_repository'
user_name = 'jprusa'

In [124]:
print(f"docker build --rm --platform linux/amd64 -t {image_name} .")
print(f"\ndocker tag {image_name} {repository_url}/{image_name}")
print(f"\ndocker login {repository_url} -u {user_name}")
print(f"\ndocker push {repository_url}/{image_name}")

docker build --rm --platform linux/amd64 -t fashion:image_embeddings .

docker tag fashion:image_embeddings sfsenorthamerica-polaris2.registry.snowflakecomputing.com/fashion/public/fashion_repository/fashion:image_embeddings

docker login sfsenorthamerica-polaris2.registry.snowflakecomputing.com/fashion/public/fashion_repository -u jprusa

docker push sfsenorthamerica-polaris2.registry.snowflakecomputing.com/fashion/public/fashion_repository/fashion:image_embeddings


## Launch Container Service

Since we are running a training job, we will create a service that when executed:

1. downloads a base image classification model
2. finetunes the model on our data
3. saves the best model to staging 
4. converts the best model to an embedding model
5. saves the embedding model to staging

### Create Compute Pool

Since we are training a deep learning model in pytorch, we create a gpu compute pool to execute our service on

In [103]:
session.sql("""CREATE OR REPLACE COMPUTE POOL embedding_compute_pool
  MIN_NODES = 1
  MAX_NODES = 1
  INSTANCE_FAMILY = GPU_7;""").collect() # HIGH_MEMORY_5

[Row(status='Compute Pool EMBEDDING_COMPUTE_POOL successfully created.')]

### Upload spec.yaml and run service

This yaml provides important specifications for our training service. In addition to specifying the docker image we will be using, this is also where we specify three mounted volumes in this yaml. 

    1. `images`, from `@IMAGE_STG`. This contains our images and annotation files
    2. `models`, from `@MODELS`. This is where we will save our trained models to
    3. `dshm`, from memory. This is a shared space used while training our deep learning model. If we fail to define this, a default value is used that is far to small and results in segmentation faults while training.
    
Additionally, we define 3 environment variables

    1. IMAGE_PATH: location of training images in the mounted volume
    2. ANNOTATION_PATH: location of image annotations in the mounted volume
    3. SAVE_PATH: location of the mounted volume we will save our model to
    
    
`EXECUTE SERVICE` is used (instead of CREATE SERVICE) as we want our service to run, then automatically shut down the container once training is complete.


In [266]:
# push spec.yaml to stage
session.sql("""PUT file:///Users/jprusa/documents/github/Fashion/src/docker_embedding/embedding_spec.yaml 
@MODEL_STG AUTO_COMPRESS=FALSE OVERWRITE=TRUE;""").collect()

[Row(source='embedding_spec.yaml', target='embedding_spec.yaml', source_size=691, target_size=704, source_compression='NONE', target_compression='NONE', status='UPLOADED', message='')]

In [241]:
# execute service
output = session.sql("""EXECUTE SERVICE
  IN COMPUTE POOL embedding_compute_pool
  FROM @MODEL_STG
  SPEC='embedding_spec.yaml';""").collect()
UUID = output[0][0].split()[-1][:-1]

### Check status and logs

In [251]:
# check status of service
eval(session.sql(f"SELECT SYSTEM$GET_JOB_STATUS('{UUID}');").collect()[0][0])

[{'status': 'READY',
  'message': 'Running',
  'containerName': 'fashion',
  'instanceId': '0',
  'serviceName': 'JOB_01AF41060001C60F00226D870235EC7A',
  'image': 'sfsenorthamerica-polaris2.registry.snowflakecomputing.com/fashion/public/fashion_repository/fashion:image_embeddings',
  'restartCount': 0,
  'startTime': '2023-09-26T20:22:51Z'}]

In [262]:
# check logs for training progress 
print(session.sql(f"SELECT SYSTEM$GET_JOB_LOGS('{UUID}', 'fashion');").collect()[0][0])

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 253MB/s]
Epoch 0/24
----------
100%|██████████| 325/325 [01:38<00:00,  3.29it/s]
train Loss: 1.9486 Acc: 0.4467
100%|██████████| 36/36 [00:11<00:00,  3.22it/s]
val Loss: 1.3178 Acc: 0.6277

Epoch 1/24
----------
100%|██████████| 325/325 [00:18<00:00, 17.85it/s]
train Loss: 1.4555 Acc: 0.5727
100%|██████████| 36/36 [00:01<00:00, 18.01it/s]
val Loss: 1.1248 Acc: 0.6507

Epoch 2/24
----------
100%|██████████| 325/325 [00:17<00:00, 18.66it/s]
train Loss: 1.2793 Acc: 0.6179
100%|██████████| 36/36 [00:01<00:00, 18.74it/s]
val Loss: 1.0156 Acc: 0.6809

Epoch 3/24
----------
100%|██████████| 325/325 [00:17<00:00, 18.78it/s]
train Loss: 1.2248 Acc: 0.6368
100%|██████████| 36/36 [00:01<00:00, 18.34it/s]
val Loss: 1.0476 Acc: 0.6986

Epoch 4/24
----------
100%|██████████| 325/325 [00:16<00:00, 19.12it/s]
train Loss: 1.1282 

### Check that models were saved

In [267]:
pd.DataFrame(session.sql('LIST @MODELS').collect())

Unnamed: 0,name,size,md5,last_modified
0,models/.ipynb_checkpoints/image_captioning-che...,5483,cca7bff8fdad45e40503201398e412c7,"Thu, 31 Aug 2023 23:00:00 GMT"
1,models/best_model.pt,44838851,3cfa9610a9af4a5decc6bf5d38d562af-5,"Tue, 26 Sep 2023 20:32:59 GMT"
2,models/embedding_model.pt,44795880,a11751037dd96d7dc50d3c9bcb9bde53-5,"Tue, 26 Sep 2023 20:33:00 GMT"
3,models/image_captioning.ipynb,5483,cca7bff8fdad45e40503201398e412c7,"Thu, 31 Aug 2023 23:00:00 GMT"
4,models/images.csv,130087123,cd19eff08e06f2717f96fde15d2c5e3d,"Thu, 31 Aug 2023 22:27:33 GMT"
5,models/images_captioned.csv,130370149,f7f7ed81de778dad65989757e3df5768-13,"Thu, 31 Aug 2023 22:59:56 GMT"
6,models/spec-finetune.yaml,535,aa7f212e0607ea6c9c1ae54a7b99298e,"Fri, 1 Sep 2023 13:34:48 GMT"
7,models/spec_img_desc.yaml,553,611deac90a1f4ac646fc098ebeff0be6,"Thu, 31 Aug 2023 21:56:22 GMT"


## Stop Service and Clean Up

we need to stop any running service(s) on our compute pool and stop the compute pool as we will continue to be charged as long as the compute exists, even if no services are actively running on it.

In [264]:
#stop services running on the compute pool
session.sql("""ALTER COMPUTE POOL embedding_compute_pool STOP ALL;""").collect()

[Row(status='Statement executed successfully.')]

In [265]:
# shut down compute pool
session.sql("""DROP COMPUTE POOL embedding_compute_pool;""").collect()

[Row(status='EMBEDDING_COMPUTE_POOL successfully dropped.')]