In [None]:
from dotenv import load_dotenv
import os 

# Read a local .env file (if one exists) into the process environment first;
# without this call the imported load_dotenv is never used and the lookups
# below only see variables that were already exported in the shell.
load_dotenv()

# Snowflake account identifier and login name; either is None when the
# corresponding variable is not set.
SF_ACCOUNT    = os.getenv('SF_ACCOUNT')
SF_USER       = os.getenv('SF_USER')


In [None]:
# Settings handed to Session.builder.configs(...) when the Snowflake session is opened.
# The host name is intentionally left empty: it was Epic's Snowflake domain.
CONNECTION_PARAMETERS = dict(
    host='',
    user=SF_USER,
    authenticator='externalbrowser',
    account=SF_ACCOUNT,
    warehouse='ANALYSIS_XL_WH',
    database='WAREHOUSE',
    schema='Analytics',
)

In [None]:
#import required modules
from snowflake.snowpark import Session
from ydata_profiling import ProfileReport
from snowflake.snowpark.types import VariantType,StringType
#from ipywidgets import widgets
import pandas as pd
import json as js

def poc_fun(
    snowpark_session: Session,
    table: str,
    col: str,
    where: str,
    limit: str = '1000',
):
    """Profile a single column of a Snowflake table and return the report as a dict.

    Args:
        snowpark_session: Session used to run the query.
        table: Table to select from (interpolated into the SQL text as-is).
        col: Column to select; its upper-cased name is used to pick the
            column out of the resulting pandas frame.
        where: Value compared against ``event_date_partition``. It is
            interpolated without quotes, so the caller supplies any quoting.
        limit: Maximum number of rows to pull (interpolated as-is).

    Returns:
        The ProfileReport JSON output parsed into a Python dict.
    """
    # Assemble the query text, then pull the rows down into pandas.
    query_template = """Select %s from %s  where event_date_partition = %s  limit %s"""
    query_text = query_template % (col, table, where, limit)
    fetched = snowpark_session.sql(query_text).to_pandas()

    # Keep only the requested column, as a one-column DataFrame.
    column_frame = fetched[col.upper()].to_frame()

    # Profile it and hand the report back as a plain dict.
    return js.loads(ProfileReport(column_frame).to_json())
  

In [None]:
#import required modules
from snowflake.snowpark import Session
from ydata_profiling import ProfileReport
from snowflake.snowpark.types import VariantType,StringType
#from ipywidgets import widgets
import pandas as pd
import json as js
from operator import itemgetter # Needed to retrieve all the necessary dict entries        
from datetime import date
def poc_fun_test(
    snowpark_session: Session,
    table: str,
    col: str,
    filt: str,
    where: str = None,
    where_end: str = None,
    limit: int = 1000,
):
    """Profile one column of a Snowflake table over a range and summarise data-quality counts.

    Runs ``Select <col> from <table> where <filt> BETWEEN '<where>' AND
    '<where_end>' limit <limit>``, profiles the result with ProfileReport
    (missing-value diagrams, correlations, interactions and samples disabled)
    and extracts a handful of counters from the report.

    Args:
        snowpark_session: Session used to run the query.
        table: Table to select from. Interpolated into the SQL verbatim.
        col: Column to select and profile. Interpolated verbatim; its
            upper-cased name is used to look up the column in the report.
        filt: Column (or expression) the BETWEEN filter is applied to.
            Interpolated verbatim.
        where: Lower bound of the BETWEEN filter. Defaults to today's date,
            resolved when the function is called.
        where_end: Upper bound of the BETWEEN filter. Defaults to today's
            date, resolved when the function is called.
        limit: Maximum number of rows to pull; coerced with ``int()``.

    Returns:
        dict with keys ``num_missing``, ``num_duplicates``, ``num_negative``,
        ``num_infinite``, ``num_zeros``, ``num_distinct``, ``num_unique`` and
        ``specific_duplicates``. A per-column metric that the report does not
        contain for this column is returned as ``"N/A"``.

    Raises:
        ValueError, TypeError: if ``limit`` cannot be converted to an int.
        KeyError: if the report has no entry for ``col.upper()`` or lacks the
            expected table-level keys.
    """

    def _sql_string_literal(value) -> str:
        # Render value as a single-quoted SQL string constant. Backslashes are
        # doubled before quotes so an input such as \' cannot close the
        # literal early.
        escaped = str(value).replace("\\", "\\\\").replace("'", "''")
        return "'" + escaped + "'"

    # Resolve the date defaults per call. A date.today() default in the
    # signature is evaluated once, when the def statement runs, and would go
    # stale in a long-lived kernel or a registered procedure.
    today = date.today()
    if where is None:
        where = today
    if where_end is None:
        where_end = today

    # Force the row limit to an integer so arbitrary text cannot reach the SQL.
    row_limit = int(limit)

    # NOTE(review): table, col and filt are identifiers and cannot be quoted as
    # literals; they are still interpolated verbatim and must come from a
    # trusted caller.
    query = "Select %s from %s  where %s BETWEEN %s AND %s limit %d" % (
        col,
        table,
        filt,
        _sql_string_literal(where),
        _sql_string_literal(where_end),
        row_limit,
    )
    df_pd = snowpark_session.sql(query).to_pandas()

    # Run the profile report with the expensive/optional sections switched off.
    report = ProfileReport(
        df_pd,
        missing_diagrams={"bar": False, "matrix": False, "heatmap": False, "histogram": False},
        correlations={
            "auto": {"calculate": False},
            "pearson": {"calculate": False},
            "spearman": {"calculate": False},
            "kendall": {"calculate": False},
            "phi_k": {"calculate": False},
            "cramers": {"calculate": False},
        },
        interactions={"continuous": False},
        sample=None,
    )
    dict_report = js.loads(report.to_json())

    table_stats = dict_report["table"]
    column_stats = dict_report["variables"][col.upper()]

    # Not every column carries every metric (the original note here says the
    # numeric ones are missing for string columns), so fall back to "N/A" per
    # key instead of relying on a KeyError for the whole group.
    def _metric(key):
        return column_stats.get(key, "N/A")

    return {
        "num_missing": table_stats["n_cells_missing"],
        "num_duplicates": table_stats["n_duplicates"],
        "num_negative": _metric("n_negative"),
        "num_infinite": _metric("n_infinite"),
        "num_zeros": _metric("n_zeros"),
        "num_distinct": _metric("n_distinct"),
        "num_unique": _metric("n_unique"),
        "specific_duplicates": dict_report["duplicates"],
    }

In [None]:
poc_fun_test(session,'item_sales_daily_test_v6','template_id','purchase_date','2023-09-05','2023-09-05',1000)

In [None]:
print('Connecting to Snowflake...')
session =  Session.builder.configs(CONNECTION_PARAMETERS).create()
print('Connected Successfully!')

In [None]:
## Register the stored procedure in Snowflake
# Gotcha: the kernel can get stuck retrying to retrieve a folder that has
# since been removed.
from snowflake.snowpark.types import VariantType,StringType,IntegerType

### Packages made available to the procedure
session.add_packages(
    'snowflake-snowpark-python',
    'pandas',
    'ydata-profiling',
    'ipywidgets',
)
# must only attach zip of desired library

### Upload poc_fun_test as the permanent procedure 'poc_fun', replacing any existing one.
# Six input types, one for each poc_fun_test parameter after snowpark_session:
# table, col, filt, where, where_end (strings) and limit (integer).
session.sproc.register(
    func=poc_fun_test,
    name='poc_fun',
    return_type=VariantType(),
    input_types=[StringType() for _ in range(5)] + [IntegerType()],
    is_permanent=True,
    replace=True,
    stage_location='@tfingland_stage',
)