# Part 1 - Data Engineering



In [10]:
import os
from getpass import getpass

import pandas as pd
from snowflake.core import Root
from snowflake.snowpark.session import Session
from snowflake.snowpark.version import VERSION

## Make a connection to Snowflake via Snowpark

In [120]:

# Connection settings are read from the environment so that no secret is stored
# in the notebook. Account, user and role fall back to the workshop defaults;
# the password has no default and is prompted for when SNOWFLAKE_PASSWORD is unset.
connection_config = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "fcb00776.us-east-1"),
    "user": os.environ.get("SNOWFLAKE_USER", "admin"),
    "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: "),
    "role": os.environ.get("SNOWFLAKE_ROLE", "accountadmin"),
}


# Open the Snowpark session and wrap it in a Root object for the
# snowflake.core (Python API) calls used later in the notebook.
session = Session.builder.configs(connection_config).create()
root = Root(session)
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

User                        : ADMIN
Role                        : "ACCOUNTADMIN"
Database                    : None
Schema                      : None
Warehouse                   : None
Snowflake version           : 8.13.3
Snowpark for Python version : 1.13.0


### Challenge 1: Setup your environment with the Session and Root object

Let's create a new database, schema, and warehouse for our work.

The Snowpark session is no different than a regular Snowflake connection for this.
 

In [12]:
# Names of the database, schema and warehouse used throughout the lab.
db_name, schema_name, wh_name = 'SNOWPARK_DB', 'HOL_SCHEMA', 'HOL_WH'

#Write code to create a Database, Schema , and Warehouse


[Row(status='Statement executed successfully.')]

#### The Snowflake Core Python library lets you do the same steps with less code, no SQL, and in an object-oriented manner

[Docs: Snowflake Python API](https://docs.snowflake.com/en/developer-guide/snowflake-python-api/snowflake-python-overview)

In [107]:
from snowflake.core.database import Database
from snowflake.core.schema import Schema 
from snowflake.core.warehouse import Warehouse



### Challenge 2: To connect Snowflake to external cloud storage, like S3, you need to create a Stage.

At the moment, **anything besides Databases, Schemas, Tables, Tasks, and Warehouses (compute pools) is SQL-only** until the Python API reaches GA later this year.

[Create a Stage SQL](https://docs.snowflake.com/en/sql-reference/sql/create-stage)

In [22]:

# Maps each stage name to the public S3 prefix it should point at.
stage_info = dict(
    campaign_data_stage='s3://sfquickstarts/ad-spend-roi-snowpark-python-scikit-learn-streamlit/campaign_spend/',
    monthly_revenue_data_stage='s3://sfquickstarts/ad-spend-roi-snowpark-python-scikit-learn-streamlit/monthly_revenue/',
)


#Write a function below to create a stage using the Session objects SQL cursor()


In [24]:
#Run your function for all data sources



[Row(status='Stage area CAMPAIGN_DATA_STAGE successfully created.')]
[Row(status='Stage area MONTHLY_REVENUE_DATA_STAGE successfully created.')]


## Let's take a look at what's in the S3 bucket

In [31]:
#Use the list command to check out the files in your stage!

### Challenge 3: Time for our first Snowpark Dataframe! 

Snowpark is capable of loading staged files into a DataFrame using the DataFrameReader. This reader takes options similar to those of a SQL COPY command.

[Docs: Snowpark DataFrameReader](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.DataFrameReader)

In [111]:
#Read the monthly revenue data to a DataFrame and show it



SyntaxError: invalid syntax (4271019390.py, line 2)

In [36]:
#read the campaign spend data to a DataFrame



------------------------------------------------------------------------------------------------------
|"CAMPAIGN"              |"CHANNEL"      |"DATE"      |"TOTAL_CLICKS"  |"TOTAL_COST"  |"ADS_SERVED"  |
------------------------------------------------------------------------------------------------------
|winter_sports           |video          |2012-06-03  |213             |1762          |426           |
|sports_across_cultures  |video          |2012-06-02  |87              |678           |157           |
|building_community      |search_engine  |2012-06-03  |66              |471           |134           |
|world_series            |social_media   |2017-12-28  |72              |591           |149           |
|winter_sports           |email          |2018-02-09  |252             |1841          |473           |
|spring_break            |video          |2017-11-14  |162             |1155          |304           |
|nba_finals              |email          |2017-11-22  |68              |4

### Challenge 4: In reality, we need a Pivot Table to break out different marketing channels

In [38]:
#get your pivot list without hardcoding

['video', 'search_engine', 'social_media', 'email']

#### Let's leverage some handy Snowpark functions to perform our aggregate

In [41]:
from snowflake.snowpark.functions import month,year,col,sum



In [42]:
# Preview the monthly spend aggregate. `spend_per_month` is not defined in any
# visible cell; it is expected to be built in the preceding exercise cell.
spend_per_month.show()

---------------------------------------------------
|"YEAR"  |"MONTH"  |"CHANNEL"      |"TOTAL_COST"  |
---------------------------------------------------
|2012    |5        |search_engine  |516431        |
|2012    |5        |video          |516729        |
|2012    |5        |email          |517208        |
|2012    |5        |social_media   |517618        |
|2012    |6        |video          |501098        |
|2012    |6        |search_engine  |506497        |
|2012    |6        |social_media   |504679        |
|2012    |6        |email          |501947        |
|2012    |7        |search_engine  |522780        |
|2012    |7        |email          |518405        |
---------------------------------------------------



In [43]:
# Pivot the CHANNEL values into columns: one row per (YEAR, MONTH), one
# summed TOTAL_COST column per entry of channel_pivot_list.
spend_per_channel = (
    spend_per_month
    .pivot('CHANNEL', channel_pivot_list)
    .sum('TOTAL_COST')
    .sort('YEAR', 'MONTH')
)
spend_per_channel.show()

-----------------------------------------------------------------------------------
|"YEAR"  |"MONTH"  |"'video'"  |"'search_engine'"  |"'social_media'"  |"'email'"  |
-----------------------------------------------------------------------------------
|2012    |5        |516729     |516431             |517618            |517208     |
|2012    |6        |501098     |506497             |504679            |501947     |
|2012    |7        |522762     |522780             |521395            |518405     |
|2012    |8        |520685     |519959             |520537            |521584     |
|2012    |9        |511364     |507211             |507404            |507363     |
|2012    |10       |522768     |518942             |520863            |519950     |
|2012    |11       |505292     |505715             |505221            |503748     |
|2012    |12       |521427     |520148             |520711            |520724     |
|2013    |1        |520583     |522151             |518635            |52116

In [44]:
# Rename the quoted pivot columns to clean identifiers, then join the
# revenue data on YEAR/MONTH and preview the result.
(
    spend_per_channel
    .select(
        col("YEAR"),
        col("MONTH"),
        *[
            col(pivot_name).as_(clean_name)
            for pivot_name, clean_name in (
                ("'search_engine'", "SEARCH_ENGINE"),
                ("'social_media'", "SOCIAL_MEDIA"),
                ("'video'", "VIDEO"),
                ("'email'", "EMAIL"),
            )
        ],
    )
    .join(revenue_data, ['YEAR', 'MONTH'])
    .show()
)


----------------------------------------------------------------------------------------
|"YEAR"  |"MONTH"  |"SEARCH_ENGINE"  |"SOCIAL_MEDIA"  |"VIDEO"  |"EMAIL"  |"REVENUE"   |
----------------------------------------------------------------------------------------
|2012    |5        |516431           |517618          |516729   |517208   |3264300.11  |
|2012    |6        |506497           |504679          |501098   |501947   |3208482.33  |
|2012    |7        |522780           |521395          |522762   |518405   |3311966.98  |
|2012    |8        |519959           |520537          |520685   |521584   |3311752.81  |
|2012    |9        |507211           |507404          |511364   |507363   |3208563.06  |
|2012    |10       |518942           |520863          |522768   |519950   |3334028.46  |
|2012    |11       |505715           |505221          |505292   |503748   |3185894.64  |
|2012    |12       |520148           |520711          |521427   |520724   |3334570.96  |
|2013    |1        |5

### Automation: Run Campaign Spend Data Transformations As a Snowflake Task
Note: Optionally you can run all these transformations as an automated task by deploying the code to Snowflake as a Snowpark Stored Procedure and executing it as a Snowflake Task.



In [114]:
from snowflake.snowpark.functions import sproc
# Select the warehouse for this session.
# NOTE(review): this is the literal 'compute_wh', not `wh_name` ('HOL_WH'), which
# the task cell further down passes to use_warehouse() — confirm which is intended.
session.use_warehouse('compute_wh')
# Register snowflake-snowpark-python as a session-level package; presumably so
# stored procedures created from this session get it as a dependency.
session.add_packages('snowflake-snowpark-python')

def campaign_spend_data_pipeline(session: Session) -> bool:
    """Load the staged campaign data, aggregate it and persist the results.

    Steps:
      1. Read the campaign-spend CSV files from ``@campaign_data_stage`` and
         append them to the ``CAMPAIGN_SPEND`` table.
      2. Read the monthly-revenue CSV files from ``@monthly_revenue_data_stage``.
      3. Sum ``TOTAL_COST`` per year, month and channel, then pivot the
         channels into columns.
      4. Join spend with revenue on ``YEAR``/``MONTH`` and append the result to
         ``SPEND_AND_REVENUE_PER_MONTH``.

    NOTE(review): both writes use mode('append') while the stages are re-read
    in full on every call, so repeated runs presumably add the same rows
    again — confirm whether 'overwrite' is the intended mode.

    Args:
        session: Snowpark session used for every read and write.

    Returns:
        True once both tables have been written.
    """
    csv_options = {'infer_schema': True, 'parse_header': True}

    # Load data from stage
    campaign_spend_data = session.read.options(csv_options).csv('@campaign_data_stage')

    # Persist the raw spend rows. This used to write `spend_and_revenue`, a
    # local that is only assigned at the end of the function, so the call
    # raised UnboundLocalError before anything was saved.
    campaign_spend_data.write.mode('append').save_as_table('CAMPAIGN_SPEND')

    revenue_data = session.read.options(csv_options).csv('@monthly_revenue_data_stage')

    # Dynamically get list of channels to pivot
    channel_pivot_list = list(campaign_spend_data.select('channel').distinct().to_pandas().CHANNEL)

    # Total cost per (year, month, channel); the generated column names
    # "YEAR(DATE)" / "MONTH(DATE)" are renamed to plain YEAR / MONTH.
    spend_per_month = campaign_spend_data.group_by(
        year('DATE'),
        month('DATE'),
        'CHANNEL').agg(
                    sum('TOTAL_COST').as_('TOTAL_COST')).\
                                        with_column_renamed('"YEAR(DATE)"',"YEAR").\
                                        with_column_renamed('"MONTH(DATE)"',"MONTH").sort('YEAR','MONTH')

    # Pivot channels into columns; the pivot names the columns with embedded
    # single quotes (e.g. "'video'"), so they are aliased to clean identifiers.
    spend_per_channel = spend_per_month.pivot('CHANNEL', channel_pivot_list).sum('TOTAL_COST').sort('YEAR','MONTH').\
                                        select(
                                            col("YEAR"),
                                            col("MONTH"),
                                            col("'search_engine'").as_("SEARCH_ENGINE"),
                                            col("'social_media'").as_("SOCIAL_MEDIA"),
                                            col("'video'").as_("VIDEO"),
                                            col("'email'").as_("EMAIL")
                                        )

    spend_and_revenue = spend_per_channel.join(revenue_data, ['YEAR' ,'MONTH'])
    # write to snowflake
    spend_and_revenue.write.mode('append').save_as_table('SPEND_AND_REVENUE_PER_MONTH')

    return True

In [45]:
# Create  Stored Procedure and Task from my python function


<snowflake.snowpark.stored_procedure.StoredProcedure at 0x12e7a2c10>

In [125]:
from snowflake.core.task import Task, StoredProcedureCall, Cron
from datetime import timedelta

# Point the session at the lab's database, schema and warehouse before creating
# any objects, then (re)create the stage that the task definition below passes
# as its stage_location.
session.use_database(db_name)
session.use_schema(schema_name)
session.use_warehouse(wh_name)
session.sql('create or replace stage snowpark_stage').collect()

# Task definition: wraps the Python pipeline function in a StoredProcedureCall
# and schedules it with a cron expression on the lab warehouse.
# Read as a standard five-field cron line, '0 0 2 * *' is 00:00 on day 2 of
# every month (UTC here) — confirm this is the intended cadence.
run_campaign_task =  Task('campaign_spend_pipeline', 
                          StoredProcedureCall(campaign_spend_data_pipeline, 
                                              stage_location='@snowpark_stage',
                                              packages=["snowflake-snowpark-python"]
                                             ) ,
                           schedule= Cron('0 0 2 * *', timezone = 'UTC'),
                           warehouse= wh_name
                                             )


In [126]:
# Create the task in the lab schema with mode='orreplace', then call execute()
# on the returned object — presumably triggering one run right away rather than
# waiting for the schedule; confirm against the snowflake.core task docs.
root.databases[db_name].schemas[schema_name].tasks.create(run_campaign_task, mode='orreplace').execute()

In [127]:
# Look at your tasks: materialise the schema's task collection into a list so
# the notebook displays it as the cell output.
list(root.databases[db_name].schemas[schema_name].tasks)

['campaign_spend_pipeline']

#### Final Checkpoint for Part 1. Do you have a clean table ready for ML training that looks like the below?

In [128]:
# Read back the table written by campaign_spend_data_pipeline and preview it.
snow_df_spend_and_revenue_per_month = session.table('spend_and_revenue_per_month')
snow_df_spend_and_revenue_per_month.show()

----------------------------------------------------------------------------------------
|"YEAR"  |"MONTH"  |"SEARCH_ENGINE"  |"SOCIAL_MEDIA"  |"VIDEO"  |"EMAIL"  |"REVENUE"   |
----------------------------------------------------------------------------------------
|2012    |5        |516431           |517618          |516729   |517208   |3264300.11  |
|2012    |6        |506497           |504679          |501098   |501947   |3208482.33  |
|2012    |7        |522780           |521395          |522762   |518405   |3311966.98  |
|2012    |8        |519959           |520537          |520685   |521584   |3311752.81  |
|2012    |9        |507211           |507404          |511364   |507363   |3208563.06  |
|2012    |10       |518942           |520863          |522768   |519950   |3334028.46  |
|2012    |11       |505715           |505221          |505292   |503748   |3185894.64  |
|2012    |12       |520148           |520711          |521427   |520724   |3334570.96  |
|2013    |1        |5

# Part 2 - Data Science and MLOps with Snowpark ML


In [76]:
from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import PolynomialFeatures, StandardScaler
from snowflake.ml.modeling.linear_model import LinearRegression
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import Registry



### Challenge 1: Preprocess your table to get to a Feature table.

##### Quick Cleaning

In [77]:


# Delete rows with missing values from your view


# Exclude columns we don't need for modeling, like date and year


# Save features into a Snowflake table call MARKETING_BUDGETS_FEATURES



##### Feature Preprocessing
[Docs: Snowpark ML - Preprocessing](https://docs.snowflake.com/en/developer-guide/snowpark-ml/snowpark-ml-modeling)

In [80]:


# Create train and test Snowpark DataFrames


# Preprocess the Numeric columns
# We apply PolynomialFeatures and StandardScaler preprocessing steps to the numeric columns
# NOTE: High degrees can cause overfitting.

# Combine the preprocessed step together using the Column Transformer module

# The next step is to integrate the features we just preprocessed with our Machine Learning algorithm so that we can build a model





R2 score on Train : 0.9948365977535002
R2 score on Test  : 0.9672847611985426


#### Challenge 2: Model Training and MLOps

In [None]:
#Create larger warehouse 

# Number of cross-validation folds for the grid search in this cell.
CROSS_VALIDATION_FOLDS: int = 10
# Degree for the PolynomialFeatures preprocessing step; higher degrees can overfit.
POLYNOMIAL_FEATURES_DEGREE: int = 2

# Use GridSearch to find the best fitting model based on number_of_folds folds

# Fit and Score

# R2 score on train and test datasets

##### Use the Model Registry

[Docs: Snowpark ML Model Registry](https://docs.snowflake.com/en/developer-guide/snowpark-ml/snowpark-ml-mlops-model-registry#opening-the-snowpark-model-registry)


In [92]:
# Dont forget your import 

# create the Registry

#Log the model and create a reference. 


<snowflake.ml.model._client.model.model_version_impl.ModelVersion at 0x13f34a010>

#### Create fake data to test your model against


In [129]:
# (Re)create the mock budget table. Its channel columns carry no underscores
# (SEARCHENGINE, SOCIALMEDIA), unlike SEARCH_ENGINE / SOCIAL_MEDIA in
# SPEND_AND_REVENUE_PER_MONTH.
session.sql("""CREATE or REPLACE TABLE BUDGET_ALLOCATIONS_AND_ROI (
  MONTH varchar(30),
  SEARCHENGINE integer,
  SOCIALMEDIA integer,
  VIDEO integer,
  EMAIL integer,
  ROI float);""").collect()

# Seed six months (January to June) of made-up allocations and ROI values.
session.sql("""
INSERT INTO BUDGET_ALLOCATIONS_AND_ROI (MONTH, SEARCHENGINE, SOCIALMEDIA, VIDEO, EMAIL, ROI)
VALUES
('January',35,50,35,85,8.22),
('February',75,50,35,85,13.90),
('March',15,50,35,15,7.34),
('April',25,80,40,90,13.23),
('May',95,95,10,95,6.246),
('June',35,50,35,85,8.22);""").collect()

# Read the table back as a Snowpark DataFrame and preview it.
budgets = session.table('BUDGET_ALLOCATIONS_AND_ROI')
budgets.show()

-------------------------------------------------------------------------
|"MONTH"   |"SEARCHENGINE"  |"SOCIALMEDIA"  |"VIDEO"  |"EMAIL"  |"ROI"  |
-------------------------------------------------------------------------
|January   |35              |50             |35       |85       |8.22   |
|February  |75              |50             |35       |85       |13.9   |
|March     |15              |50             |35       |15       |7.34   |
|April     |25              |80             |40       |90       |13.23  |
|May       |95              |95             |10       |95       |6.246  |
|June      |35              |50             |35       |85       |8.22   |
-------------------------------------------------------------------------



In [130]:
#Run the inference on your mock data


SnowparkSQLException: (1300) (1304): 390114 (08001): Authentication token has expired.  The user must authenticate again.

## Part 3: Quickly build a data app with Streamlit

In [131]:
# You will need this handy charting function, but otherwise we will build the dashboard live!

def chart(chart_data, highlight_month="July"):
  """Draw the layered budget / ROI chart in the Streamlit app.

  Bars show BUDGET per MONTH coloured by CHANNEL; a line with point markers
  shows ROI on an independent y-axis. The highlighted month is drawn with
  fully opaque bars and larger points, every other month is dimmed.

  Requires `alt` (altair), `calendar` and `st` (streamlit) to be imported
  before the function is called.

  Args:
    chart_data: table with MONTH, CHANNEL, BUDGET and ROI columns.
    highlight_month: MONTH value to emphasise; defaults to "July", which was
      previously hard-coded.
  """
  is_highlighted = alt.datum.MONTH == highlight_month
  base = alt.Chart(chart_data).encode(alt.X("MONTH", sort=list(calendar.month_name), title=None))
  bars = base.mark_bar().encode(
    y=alt.Y("BUDGET", title="Budget", scale=alt.Scale(domain=[0, 300])),
    color=alt.Color("CHANNEL", legend=alt.Legend(orient="top", title=" ")),
    opacity=alt.condition(is_highlighted, alt.value(1), alt.value(0.3)),
  )
  lines = base.mark_line(size=3).encode(
    y=alt.Y("ROI", title="Revenue", scale=alt.Scale(domain=[0, 25])),
    color=alt.value("#808495"),
  )
  points = base.mark_point(strokeWidth=3).encode(
    y=alt.Y("ROI"),
    stroke=alt.value("#808495"),
    fill=alt.value("white"),
    size=alt.condition(is_highlighted, alt.value(300), alt.value(70)),
  )
  # Named `layered` rather than `chart` so the function name is not shadowed.
  layered = (
    alt.layer(bars, lines + points)
    .resolve_scale(y="independent")
    .configure_view(strokeWidth=0)
    .configure_axisY(domain=False)
    .configure_axis(labelColor="#808495", tickColor="#e6eaf1", gridColor="#e6eaf1", domainColor="#e6eaf1", titleFontWeight=600, titlePadding=10, labelPadding=5, labelFontSize=14)
    .configure_range(category=["#FFE08E", "#03C0F2", "#FFAAAB", "#995EFF"])
  )
  st.altair_chart(layered, use_container_width=True)

In [None]:
import calendar 
import altair as alt
import streamlit as st
import pandas as pd
from snowflake.snowpark.functions import col
from snowflake.ml.registry import Registry
from snowflake.snowpark.context import get_active_session


#Create a header
st.header("SkiGear Co Ad Spend Optimizer")
forcast_month = 'July'
#Get your session
session = get_active_session()

#Get an unpivoted view of your Budget without the forecast month
campaign_spend_data = session.table('campaign_spend')
budget_table = session.table('SNOWPARK_DB.HOL_SCHEMA.BUDGET_ALLOCATIONS_AND_ROI')

# Channel names in the budget table's spelling (underscores removed, upper-cased).
spend_channels = {
    c.replace('_', '').upper()
    for c in campaign_spend_data.select('channel').distinct().to_pandas().CHANNEL
}

# DISTINCT returns the channels in no guaranteed order, but `budgets` is later
# consumed by position (model input, the July rows, the saved row). Take the
# order from the budget table's own columns so that it is deterministic.
channels_upper = [c for c in budget_table.columns if c in spend_channels]

data = budget_table.unpivot("Budget", "Channel", channels_upper).filter(col("MONTH") != forcast_month)

# Order the June rows like channels_upper so that slider i (and budgets[i])
# belongs to channels_upper[i].
last_allocation = (
    data.filter(col("MONTH") == "June")
    .to_pandas()
    .sort_values("CHANNEL", key=lambda names: names.map(channels_upper.index))
    .reset_index(drop=True)
)

# Sort by calendar month so the last row really is the most recent month;
# predict() compares against previous_rois["ROI"].iloc[-1].
month_rank = {name: rank for rank, name in enumerate(calendar.month_name)}
previous_rois = (
    data.drop(["CHANNEL", "BUDGET"])
    .distinct()
    .to_pandas()
    .sort_values("MONTH", key=lambda months: months.map(month_rank), na_position="first")
    .reset_index(drop=True)
)


#st.dataframe(data)
st.subheader("Expected Advertising budgets")

#Create a set of sliders in columns
col1, _, col2 = st.columns([4, 1, 4])
budgets = []
# The loop variable is `slider_column`, not `col`, so the Snowpark `col`
# function imported above is not rebound to a Streamlit column.
for alloc, slider_column in zip(last_allocation.itertuples(), [col1, col1, col2, col2]):
  budgets.append(slider_column.slider(alloc.CHANNEL, 0, 100, alloc.BUDGET, 5))

# One row with one column per slider, in channels_upper order.
budgets_df = pd.DataFrame(budgets).T

#st.dataframe(budgets_df)

# Open the model registry in SNOWPARK_DB.HOL_SCHEMA and fetch the PREDICT_ROI model.
ml_registry = Registry(session , database_name='SNOWPARK_DB', schema_name='HOL_SCHEMA')

# NOTE(review): takes the last element of versions(); this assumes the list is
# ordered oldest to newest so that [-1] is the latest version — confirm.
ROI_MODEL = ml_registry.get_model(model_name= "PREDICT_ROI").versions()[-1]

def predict(budgets_from_widget):
    """Run the ROI model on the slider budgets.

    The budgets are multiplied by 1000 before inference and the first
    PREDICTED_REVENUE value is divided by 100000 (presumably converting slider
    units to model units and back — confirm). The change is the percentage
    difference from the ROI in the last row of `previous_rois`, rounded to one
    decimal place.

    Returns:
        (prediction, percentage change) tuple.
    """
    scaled_budgets = budgets_from_widget * 1000
    model_output = ROI_MODEL.run(scaled_budgets, function_name='predict')

    predicted = model_output["PREDICTED_REVENUE"].values[0] / 100000
    baseline_roi = previous_rois["ROI"].iloc[-1]
    pct_change = round(((predicted / baseline_roi) - 1) * 100, 1)

    return predicted, pct_change


#st.dataframe(predict(budgets_df))
# Predict from the current slider values and show the result as a metric
# (empty label, prediction as the value, change vs. last month as the delta).
pred, change = predict(budgets_df)
st.metric("", f"Predicted revenue ${pred:.2f} million", f"{change:.1f} % vs last month")
# Rows for the forecast month, one per channel, to append to the chart data.
# NOTE(review): assumes exactly four channels and that `budgets` is in the same
# order as `channels_upper` — confirm.
july = pd.DataFrame({"MONTH": ["July"]*4, "CHANNEL": channels_upper, "BUDGET": budgets, "ROI": [pred]*4})

def chart(chart_data, highlight_month="July"):
  """Draw the layered budget / ROI chart in the Streamlit app.

  Bars show BUDGET per MONTH coloured by CHANNEL; a line with point markers
  shows ROI on an independent y-axis. The highlighted month is drawn with
  fully opaque bars and larger points, every other month is dimmed.

  Args:
    chart_data: table with MONTH, CHANNEL, BUDGET and ROI columns.
    highlight_month: MONTH value to emphasise; defaults to "July", which was
      previously hard-coded.
  """
  is_highlighted = alt.datum.MONTH == highlight_month
  base = alt.Chart(chart_data).encode(alt.X("MONTH", sort=list(calendar.month_name), title=None))
  bars = base.mark_bar().encode(
    y=alt.Y("BUDGET", title="Budget", scale=alt.Scale(domain=[0, 300])),
    color=alt.Color("CHANNEL", legend=alt.Legend(orient="top", title=" ")),
    opacity=alt.condition(is_highlighted, alt.value(1), alt.value(0.3)),
  )
  lines = base.mark_line(size=3).encode(
    y=alt.Y("ROI", title="Revenue", scale=alt.Scale(domain=[0, 25])),
    color=alt.value("#808495"),
  )
  points = base.mark_point(strokeWidth=3).encode(
    y=alt.Y("ROI"),
    stroke=alt.value("#808495"),
    fill=alt.value("white"),
    size=alt.condition(is_highlighted, alt.value(300), alt.value(70)),
  )
  # Named `layered` rather than `chart` so the function name is not shadowed.
  layered = (
    alt.layer(bars, lines + points)
    .resolve_scale(y="independent")
    .configure_view(strokeWidth=0)
    .configure_axisY(domain=False)
    .configure_axis(labelColor="#808495", tickColor="#e6eaf1", gridColor="#e6eaf1", domainColor="#e6eaf1", titleFontWeight=600, titlePadding=10, labelPadding=5, labelFontSize=14)
    .configure_range(category=["#FFE08E", "#03C0F2", "#FFAAAB", "#995EFF"])
  )
  st.altair_chart(layered, use_container_width=True)

#st.dataframe(july)

# Plot the historical months together with the forecast rows.
chart(pd.concat([data.to_pandas(), july]).reset_index(drop=True))

# On click, write the July allocation and prediction as one row to the budget table.
# NOTE(review): budgets[0..3] are written by position as SEARCHENGINE, SOCIALMEDIA,
# VIDEO, EMAIL — this assumes `budgets` is in exactly that order; verify.
# NOTE(review): the table name is unqualified here, whereas the read above uses
# SNOWPARK_DB.HOL_SCHEMA.BUDGET_ALLOCATIONS_AND_ROI — confirm the session's
# current database/schema resolves to the same table.
if st.button("❄️ Save to Snowflake"):
  with st.spinner("Making snowflakes..."):
    df = pd.DataFrame({"MONTH": ["July"], "SEARCHENGINE": [budgets[0]], "SOCIALMEDIA": [budgets[1]], "VIDEO": [budgets[2]], "EMAIL": [budgets[3]], "ROI": [pred]})
    session.write_pandas(df, "BUDGET_ALLOCATIONS_AND_ROI")  
    st.success("✅ Successfully wrote budgets & prediction to your Snowflake account!")
    st.snow()
    