In [125]:
# Optional step: dispose of an existing session if you get an error that multiple
# sessions are open. Guarded so the cell is a no-op on a fresh kernel (a bare
# `del session` raises NameError when no session has been created yet).
if "session" in globals():
    # Close the connection before dropping the name; deleting the variable alone
    # only removes the Python reference.
    session.close()
    del session

# Initialize Notebook, import libraries and create Snowflake connection

In [3]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import IntegerType, FloatType
from snowflake.snowpark.functions import avg, sum, col, udf, call_udf, call_builtin, year
import streamlit as st
import pandas as pd
from datetime import date

# scikit-learn (install: pip install -U scikit-learn)
from sklearn.linear_model import LinearRegression

# Snowflake connection settings.
# Replace the <...> placeholders with your own values before running; avoid
# committing real credentials together with the notebook.
connection_parameters = dict(
    account="<account_identifier>",
    user="<username>",
    password="<password>",
    warehouse="compute_wh",
    role="accountadmin",
    database="summit_hol",
    schema="public",
)

# Open the Snowpark session used by every cell below
session = (
    Session.builder
    .configs(connection_parameters)
    .create()
)


# Connectivity check: show the warehouse, database, schema and server version in use
session.sql(
    "select current_warehouse() wh, current_database() db, current_schema() schema, current_version() v"
).show()


2022-06-07 20:30:01.778 INFO    snowflake.connector.connection: Snowflake Connector for Python Version: 2.7.8, Python Version: 3.8.13, Platform: macOS-10.16-x86_64-i386-64bit
2022-06-07 20:30:01.779 INFO    snowflake.connector.connection: This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
2022-06-07 20:30:02.423 INFO    snowflake.snowpark.session: Snowpark Session information: 
"version" : 0.7.0,
"python.version" : 3.8.13,
"python.connector.version" : 2.7.8,
"python.connector.session.id" : 82794089108550,
"os.name" : Darwin

2022-06-07 20:30:02.425 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM (select current_warehouse() wh, current_database() db, current_s...]
2022-06-07 20:30:02.527 INFO    snowflake.connector.cursor: query execution done


---------------------------------------------
|"WH"      |"DB"        |"SCHEMA"  |"V"     |
---------------------------------------------
|SMALL_WH  |SUMMIT_HOL  |PUBLIC    |6.17.0  |
---------------------------------------------



# Query the data

In [4]:
# Explore the raw table with plain SQL: annual PCE price-index rows, ordered by date.
# The SQL text is split across adjacent string literals purely for readability;
# they concatenate to a single statement.
session.sql(
    "SELECT * FROM ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA"
    " WHERE \"Table Name\" = 'Price Indexes For Personal Consumption Expenditures By Major Type Of Product'"
    " AND \"Indicator Name\" = 'Personal consumption expenditures (PCE)'"
    " AND \"Frequency\" = 'A'"
    " ORDER BY \"Date\""
).show()

2022-06-07 20:30:06.878 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM (SELECT * FROM ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA WHERE "Table N...]
2022-06-07 20:30:08.952 INFO    snowflake.connector.cursor: query execution done


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"Table"  |"Table Name"                                        |"Table Description"  |"Table Full Name"                                   |"Indicator"  |"Indicator Name"                         |"Indicator Description"  |"Indicator Full Name"  |"Units"          |"Scale"  |"Frequency"  |"Date"      |"Value"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|T20304   |Price Indexes For Personal Consumption Expendit...  |NUL

In [5]:
# Now express the same selection with the Snowpark dataframe API.
# The table's column names are mixed-case and some contain spaces, so every
# identifier is wrapped in double quotes explicitly (same convention for all
# filters) instead of relying on implicit quoting of names with spaces.
snow_df_pce = (
    session.table("ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA")
    .filter(col('"Table Name"') == 'Price Indexes For Personal Consumption Expenditures By Major Type Of Product')
    .filter(col('"Indicator Name"') == 'Personal consumption expenditures (PCE)')
    .filter(col('"Frequency"') == 'A')          # annual observations only
    .filter(col('"Date"') >= '1972-01-01')      # keep 1972 onwards
)
snow_df_pce.show()

2022-06-07 20:30:12.642 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM ( SELECT  *  FROM ( SELECT  *  FROM ( SELECT  *  FROM ( SELECT  ...]
2022-06-07 20:30:12.820 INFO    snowflake.connector.cursor: query execution done


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"Table"  |"Table Name"                                        |"Table Description"  |"Table Full Name"                                   |"Indicator"  |"Indicator Name"                         |"Indicator Description"  |"Indicator Full Name"  |"Units"          |"Scale"  |"Frequency"  |"Date"      |"Value"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|T20304   |Price Indexes For Personal Consumption Expendit...  |NUL

In [6]:
# Let Snowflake perform the filtering, year extraction and ordering using the
# Snowpark pushdown, then display the (small) result as a Pandas dataframe.
snow_df_pce = (session.table("ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA")
                        .filter(col('"Table Name"') == 'Price Indexes For Personal Consumption Expenditures By Major Type Of Product')
                        .filter(col('"Indicator Name"') == 'Personal consumption expenditures (PCE)')
                        .filter(col('"Frequency"') == 'A')
                        .filter(col('"Date"') >= '1972-01-01'))

# Sort explicitly: without an ORDER BY the row order is not guaranteed, and the
# training cell prints `tail()` as "the last 5 years".
pd_df_pce_year = (
    snow_df_pce
    .select(year(col('"Date"')).alias('"Year"'), col('"Value"').alias('PCE'))
    .sort(col('"Year"'))
    .to_pandas()
)
pd_df_pce_year


2022-06-07 20:30:21.668 INFO    snowflake.connector.cursor: query: [SELECT year("Date") AS "Year", "Value" AS "PCE" FROM ( SELECT  *  FROM ( SELECT ...]
2022-06-07 20:30:22.120 INFO    snowflake.connector.cursor: query execution done


Unnamed: 0,Year,PCE
0,1972,22.542
1,1973,23.756
2,1974,26.229
3,1975,28.415
4,1976,29.974
5,1977,31.923
6,1978,34.145
7,1979,37.178
8,1980,41.182
9,1981,44.871


# Train the Linear Regression model

In [7]:
# Fit a linear regression of the PCE index on the calendar year

# Features: the year as a single-column 2-D array; target: the PCE value
x = pd_df_pce_year[["Year"]].to_numpy()
y = pd_df_pce_year["PCE"].to_numpy()

model = LinearRegression()
model.fit(x, y)

# Try the fitted model on a single year
predictYear = 2021
pce_pred = model.predict([[predictYear]])
# Show the five most recent rows for comparison
print(pd_df_pce_year.tail(5))
# Report the prediction, rounded to two decimals
print(f'Prediction for {predictYear!s}: {round(pce_pred[0], 2)!s}')


    Year      PCE
44  2016  104.148
45  2017  106.051
46  2018  108.318
47  2019  109.922
48  2020  111.225
Prediction for 2021: 116.23


# Create a User-Defined Function (UDF) in Snowflake to run the scoring there

In [8]:
def predict_pce(predictYear: int) -> float:
    """Return the PCE index predicted for ``predictYear``, rounded to 2 decimals.

    Uses the notebook-level ``model`` (the fitted LinearRegression), so the
    training cell must have been run before this function is registered.
    """
    return model.predict([[predictYear]])[0].round(2).astype(float)

# Register the function as a permanent UDF named "predict_pce_udf", replacing any
# existing definition; its code is uploaded to the @udf_stage stage.
# Note: the keyword is `input_types` (plural) and takes a list with one entry per
# argument. The original used `input_type=`, which is not the documented parameter.
_ = session.udf.register(predict_pce,
                        return_type=FloatType(),
                        input_types=[IntegerType()],
                        packages= ["pandas","scikit-learn"],
                        is_permanent=True, 
                        name="predict_pce_udf", 
                        replace=True,
                        stage_location="@udf_stage")

2022-06-07 20:30:38.317 INFO    snowflake.connector.cursor: query: [ls '@udf_stage']
2022-06-07 20:30:38.446 INFO    snowflake.connector.cursor: query execution done
2022-06-07 20:30:38.447 INFO    snowflake.connector.cursor: query: [SELECT "name" FROM ( SELECT  *  FROM  TABLE ( RESULT_SCAN('01a4cb16-3201-26b9-00...]
2022-06-07 20:30:38.946 INFO    snowflake.connector.cursor: query execution done
2022-06-07 20:30:38.948 INFO    snowflake.connector.cursor: query: [select package_name, version from information_schema.packages where language='py...]
2022-06-07 20:30:39.626 INFO    snowflake.connector.cursor: query execution done
2022-06-07 20:30:39.728 INFO    snowflake.connector.cursor: query: [CREATE OR REPLACE FUNCTION predict_pce_udf(arg1 BIGINT) RETURNS FLOAT LANGUAGE P...]
2022-06-07 20:30:55.950 INFO    snowflake.connector.cursor: query execution done


# Test the trained model by invoking the UDF via a SQL statement

In [9]:
# Call the registered UDF from SQL for the year 2021 and print the result
(
    session
    .sql("select predict_pce_udf(2021)")
    .show()
)


2022-06-07 20:30:59.569 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM (select predict_pce_udf(2021)) LIMIT 10]
2022-06-07 20:31:11.103 INFO    snowflake.connector.cursor: query execution done


---------------------------
|"PREDICT_PCE_UDF(2021)"  |
---------------------------
|116.23                   |
---------------------------

