# Data / Feature Engineering

In this notebook, we carry out the following feature engineering steps:

- use an image processing library to parse each image into a matrix
- reshape the matrices
- store the matrices in a table

In [3]:
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

if(sp_session == None):
    raise Exception(f'Unable to connect to snowflake. Validate connection information ')

sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['APP_DB']['snow_opt_wh']}''')

df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,VSEKAR,PUBLIC,INDSOL_DICOM_DB,PUBLIC


In [18]:
# SQL statements executed in order against the current session:
#   1. upload one sample image to the data stage
#   2. refresh the data stage
#   3. upload the parser script to the library stage
#   4. (re)create the Python UDF whose handler is `main` in that script
setup_statements = [
    f'''
    PUT file://{PROJECT_HOME_DIR}/data/train/PNEUMONIA/person374_bacteria_1710.jpeg @data_stg 
    auto_compress = false
    overwrite = true;
    ''',
    ''' alter stage data_stg refresh; ''',
    '''
    PUT file://../python/skimage_parser_fn.py @lib_stg/scripts 
    auto_compress = false
    overwrite = true;
    ''',
    '''
    create or replace function skimage_parser_fn(image_fl varchar)
     returns variant
     language python
     runtime_version = '3.8'
     packages = ('snowflake-snowpark-python','numpy', 'pandas', 'scikit-learn' ,'scikit-image')
     imports = ('@lib_stg/scripts/skimage_parser_fn.py')
     handler = 'skimage_parser_fn.main'
     ;
    ''',
]

# Each statement is submitted on its own; results are discarded.
for statement in setup_statements:
    sp_session.sql(statement).collect()

In [19]:
# Apply the parser UDF to every file listed in the data stage's directory
# table and pull the result into pandas for display.
parsed_preview_sql = '''
    select 
    relative_path
    ,concat('@data_stg/',relative_path) as full_image_path
    ,skimage_parser_fn(full_image_path) as parsed_image_info
from directory(@data_stg)
;
'''
df = sp_session.sql(parsed_preview_sql).to_pandas()
display(df)

Unnamed: 0,RELATIVE_PATH,FULL_IMAGE_PATH,PARSED_IMAGE_INFO
0,person374_bacteria_1710.jpeg,@data_stg/person374_bacteria_1710.jpeg,"{\n ""image_array"": ""[54, 30, 26, 24, 26, 29, ..."


In [8]:
# Call the `skimage_parser` procedure for the sample image; the returned rows
# are the cell's output.
# NOTE(review): `skimage_parser` is not created in this notebook -- it must
# already exist in the target schema.
parser_call_sql = '''
    call skimage_parser('@data_stg/person374_bacteria_1710.jpeg');
'''
sp_session.sql(parser_call_sql).collect()

[Row(SKIMAGE_PARSER='{\n  "elapsed": "=> 0:00:03.012706 ",\n  "status": true\n}')]

In [None]:
# Look up the rows stored for the sample image in `image_parsed_raw`.
# NOTE(review): this table is not created in this notebook -- presumably it is
# populated by the `skimage_parser` call; confirm.
sample_rows_sql = '''
    select *
    from image_parsed_raw
    where image_filepath like '%person374_bacteria_1710.jpeg%';
'''
sp_session.sql(sample_rows_sql).collect()

In [9]:
import pandas as pd

# Parse the train, test and val folders (in that order) with the
# `images_to_pddf` helper, then join the three results into one collection.
list_1, list_2, list_3 = (
    images_to_pddf(split_dir)
    for split_dir in ('../../data/train', '../../data/test', '../../data/val')
)

images_parsed_list = list_1 + list_2 + list_3

In [12]:
# The parsed images are then stored in a table

# Build a pandas frame from the parsed records; one column per parsed field.
images_parsed_pddf = pd.DataFrame(images_parsed_list
    , columns =['image_filepath', 'class_label','class_num','status','parsing_exception'
        ,'image_array' ,'image_array_shape_0' ,'image_array_shape_1' 
        ,'normalized_image_array' ,'resized_feature'])

# Upper-case the column names so they line up with the field names in the
# table schema declared below.
images_parsed_pddf.columns = images_parsed_pddf.columns.str.upper()

# The Snowpark types are referenced through the `T` alias imported at the top
# of the notebook, so this cell does not depend on the bare names having been
# pulled into the namespace by some other script.
tbl_schema = T.StructType([
    T.StructField('IMAGE_FILEPATH', T.StringType())
    ,T.StructField('CLASS_LABEL', T.StringType())
    ,T.StructField('CLASS_NUM', T.IntegerType())
    ,T.StructField('STATUS', T.BooleanType())
    ,T.StructField('PARSING_EXCEPTION', T.StringType())
    ,T.StructField('IMAGE_ARRAY', T.VariantType())
    ,T.StructField('IMAGE_ARRAY_SHAPE_0', T.IntegerType())
    ,T.StructField('IMAGE_ARRAY_SHAPE_1', T.IntegerType())
    ,T.StructField('NORMALIZED_IMAGE_ARRAY', T.VariantType())
    ,T.StructField('RESIZED_FEATURE', T.VariantType())
])

# Target table lives in the `public` schema of the configured database.
img_table = f'''{config['APP_DB']['database']}.public.images_parsed'''

# Convert to a Snowpark DataFrame and replace any existing table contents.
df = sp_session.create_dataframe(images_parsed_pddf
    , schema=tbl_schema)
df.write.save_as_table(img_table, mode="overwrite" ,table_type='transient')

In [4]:
# Read back the first five rows of the stored table as a quick visual check.
img_table = f"{config['APP_DB']['database']}.public.images_parsed"

images_parsed_tbl = sp_session.table(img_table)
df = images_parsed_tbl.limit(5).to_pandas()
display(df)

Unnamed: 0,IMAGE_FILEPATH,CLASS_LABEL,CLASS_NUM,STATUS,PARSING_EXCEPTION,IMAGE_ARRAY,IMAGE_ARRAY_SHAPE_0,IMAGE_ARRAY_SHAPE_1,NORMALIZED_IMAGE_ARRAY,RESIZED_FEATURE
0,../../data/val/PNEUMONIA/person354_bacteria_16...,PNEUMONIA,0,True,,"[\n 217,\n 217,\n 210,\n 204,\n 204,\n 2...",150,150,"[\n 8.509803921568627e-01,\n 8.5098039215686...","[\n 8.509803921568627e-01,\n 8.5098039215686..."
1,../../data/val/PNEUMONIA/person1880_bacteria_4...,PNEUMONIA,0,True,,"[\n 0,\n 0,\n 0,\n 1,\n 3,\n 10,\n 23,\...",150,150,"[\n 0.000000000000000e+00,\n 0.0000000000000...","[\n 0.000000000000000e+00,\n 0.0000000000000..."
2,../../data/val/PNEUMONIA/person324_virus_658.jpeg,PNEUMONIA,0,True,,"[\n 3,\n 8,\n 11,\n 23,\n 33,\n 38,\n 2...",150,150,"[\n 1.176470588235294e-02,\n 3.1372549019607...","[\n 1.176470588235294e-02,\n 3.1372549019607..."
3,../../data/val/PNEUMONIA/person1817_bacteria_4...,PNEUMONIA,0,True,,"[\n 0,\n 0,\n 3,\n 6,\n 10,\n 13,\n 24,...",150,150,"[\n 0.000000000000000e+00,\n 0.0000000000000...","[\n 0.000000000000000e+00,\n 0.0000000000000..."
4,../../data/val/PNEUMONIA/person319_bacteria_14...,PNEUMONIA,0,True,,"[\n 56,\n 51,\n 63,\n 45,\n 53,\n 54,\n ...",150,150,"[\n 2.196078431372549e-01,\n 2.0000000000000...","[\n 2.196078431372549e-01,\n 2.0000000000000..."


--- 
### Closeout

With that, we have finished this section of the demo setup.

In [None]:
# Close the Snowpark session opened in the initialization cell; any cell run
# after this one needs the initialization cell to be executed again first.
sp_session.close()
print('Finished!!!')

Finished!!!
