In [1]:
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import udf, udtf
from snowflake.snowpark.types import IntegerType, StringType, VariantType, DateType, PandasSeries, PandasSeriesType, StructField, StructType
from snowflake.snowpark.functions import table_function

from datetime import date
from tokenize import String
import json
import pandas
import zipfile
import sys
import io
import os
import re
import pickle
import cachetools
from joblib import load

import numpy as np
import pandas as pd

In [2]:
connection_parameters = json.load(open('creds.json'))
session = Session.builder.configs(connection_parameters).create()

In [None]:
# create the stage for python and model data
session.sql('create stage if not exists raw_data').collect()
session.sql('create stage if not exists model_data').collect()
session.sql('create stage if not exists python_load').collect()

# create the directory stage for the data
session.sql('create stage if not exists raw_data_stage directory = (enable = true)').collect()

# upload the unstructured file and stop words to the stages
session.file.put('reviews__0_0_0.dat','@raw_data_stage',auto_compress=False)
session.file.put('en_core_web_sm.zip','@model_data')

# refresh the stage
session.sql('alter stage raw_data_stage refresh').collect()

# Data Prep

We'll start this demo by first building our sentiment model, in order to do this we have a set of training data containing previous reviews and their classification for sentiment that requires cleaning and transforming.

First we'll need to refine the text (remove punctuation, stopwords etc.) and then we'll want to make the sentiment classification more suitable for our algoritm. 

---
To get started, lets take a look at the training data we have:

In [None]:
session.table("TRAINING_DATA").show(30)

And check the distribution of data:

In [None]:
import seaborn as sns

df = session.table('TRAINING_DATA') \
    .group_by(F.col('SENTIMENT')) \
    .agg(F.count(F.col('PRODUCT_ID')).alias('COUNT')).to_pandas()

sns.set(rc={'figure.figsize':(20,10)})
sns.barplot(x='SENTIMENT',y='COUNT',data=df)

We can see we have various reviews for products with their corresponding sentiment classification.

---

The first transformation will be to process the review text. To do this we create a UDF that will perform the following:

- Remove stop words
- Remove punctuation
- Remove currency values
- Lemmatize the text

<br>

Note that we create a vectorized UDF, so we can take advantage of batch processing in the UDF, additionally we cache the stopwords lexicon for better performance.

In [None]:
import spacy

session.add_import('@model_data/en_core_web_sm.zip.gz')

@cachetools.cached(cache={})
def load_file(import_dir):
    input_file = import_dir + 'en_core_web_sm.zip'
    output_dir = '/tmp/en_core_web_sm' + str(os.getpid())
            
    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        
    return spacy.load(output_dir + "/en_core_web_sm/en_core_web_sm-2.3.0")    

@udf(name='remove_stopwords_vect',packages=['spacy==2.3.5','cachetools'], session=session, is_permanent=True, replace=True, max_batch_size=1000,stage_location='python_load',)
def remove_stopwords_vect(raw_text: PandasSeries[str]) -> PandasSeries[str]:
    nlp = load_file(sys._xoptions['snowflake_import_directory'])
    stop_words = nlp.Defaults.stop_words

    result = []
    
    for s in raw_text:
        doc = nlp(s)
        text = [str(t.lemma_) for t in doc if  
                t not in stop_words 
                and not t.is_punct 
                and not t.is_currency
                and t.lemma_ != '-PRON-']
        text = list(map(lambda x: x.replace(' ', ''), text))
        text = list(map(lambda x: x.replace('\n', ''), text))
        result.append(' '.join(token.lower() for token in text))
    
    return pandas.Series(result)

So we can use this on text to see how this is processed: `This surfboard is amazing. I can ride some epic waves on this thing` will turn into:

In [None]:
df = session.sql('''select remove_stopwords_vect('This surfboard is awesome. I can ride some epic waves on this thing and I only paid $1,000') as processed_text''').toPandas()
str_sentiment = df.iat[0,0]
print(str_sentiment)

---
The next transformation we'll need to do is convert the string value for sentiment into a numeric value, in order to make it more optimized for our ML algorithm. 

To do this we can create a simple UDF to bin the sentiment string to a value.

In [None]:
# Create and upload the UDF to bin the rating to sentiment 
@udf(name='convert_rating',
     is_permanent=True,
     replace=True,
     stage_location='python_load')

def convert_rating(x: str) -> int:
    if x == 'NEGATIVE':
        return -1
    elif x == 'NEUTRAL':
        return 0
    elif x == 'POSITIVE':
        return 1

---
With these UDFs we can now run a query and see what our data will look like for training:

In [None]:
df = session.table('TRAINING_DATA') \
    .filter(
        F.col('REVIEWTEXT') != ''
    ) \
    .select( \
        F.col('PRODUCT_ID'),
        F.col('REVIEWDATE'),
        F.call_udf(
            'REMOVE_STOPWORDS_VECT',
            F.col('REVIEWTEXT')).alias('PROCESSED_REVIEW'),
        F.call_udf(
            'CONVERT_RATING',
            F.col('SENTIMENT')).alias('SENTIMENT')).show(30)

---
## Model Training

Next we want to train a model. Doing this in Snowflake is as simple are creating a Python Stored Procedure, which also allows us to re-run this when we want to retrain the model. Model training uses Snowflake Compute.

The model will be saved to an internal stage, and can be used in a UDF for model inference within Snowflake. 

In [None]:
# Create and upload a stored proc to train our sentiment model
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

import os
from joblib import dump

def save_file(session, model, path):
    input_stream = io.BytesIO()
    pickle.dump(model, input_stream)
    session._conn._cursor.upload_stream(input_stream, path)
    
def train_sentiment_model(session: snowflake.snowpark.Session) -> float:        
    # build a pd with review data
    df = session.table('TRAINING_DATA') \
        .filter(
            F.col('REVIEWTEXT') != '') \
        .select(
            F.call_udf(
                'REMOVE_STOPWORDS_VECT',
                F.col('REVIEWTEXT')).alias('PROCESSED_TEXT'),
            F.call_udf(
                'CONVERT_RATING',
                F.col('SENTIMENT')).alias('SENTIMENT')).toPandas()
    
    index = df.index
    df['RANDOM'] = np.random.randn(len(index))
    train = df[df['RANDOM'] <= 0.8] # 0.8
    test = df[df['RANDOM'] > 0.8] # 0.8
    
    # vectorize the data
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    train_matrix = vectorizer.fit_transform(train['PROCESSED_TEXT'])
    test_matrix = vectorizer.transform(test['PROCESSED_TEXT'])
    
    # split feature and label 
    x_train = train_matrix
    x_test = test_matrix
    y_train = train['SENTIMENT']
    y_test = test['SENTIMENT']
    
    # Logistic Regression Model
    lr = LogisticRegression(multi_class='multinomial', max_iter=10000)
    lr.fit(x_train,y_train)
    predictions = lr.predict(x_test)

    model_output_dir = '/tmp'

    # Save model file
    model_file = os.path.join(model_output_dir, 'model.joblib')
    dump(lr, model_file)
    session.file.put(model_file, "@model_data",overwrite=True)

    # Save vectorizer file
    vect_file = os.path.join(model_output_dir, 'vectorizer.joblib')
    dump(vectorizer, vect_file)
    session.file.put(vect_file, "@model_data",overwrite=True)

    return accuracy_score(y_test, predictions)

# Register the Stored Procedure
session.sproc.register(name='train_sentiment_model',
                       func=train_sentiment_model, 
                       packages=['snowflake-snowpark-python','pandas','scikit-learn', 'joblib'],
                       replace=True, 
                       is_permanent=True,
                       stage_location='python_load')

In [None]:
%%time

session.call('TRAIN_SENTIMENT_MODEL')

---
## Model Deployment

With the model artifact produced from the Stored Procedure, we can create a UDF that can be used to infer sentiment for future data ingested into Snowflake.

In [3]:
session.clear_packages()
session.clear_imports()
session.add_import('@MODEL_DATA/model.joblib.gz')
session.add_import('@MODEL_DATA/vectorizer.joblib.gz')

@cachetools.cached(cache={})
def load_model(file_name):
    model_file_path = sys._xoptions.get("snowflake_import_directory") + file_name
    return load(model_file_path)

columns = ('NEGATIVE','NEUTRAL','POSITIVE')
    
@udf(name='predict_sentiment_vect',
     is_permanent=True,
     replace=True,
     stage_location='python_load',
     max_batch_size=50000,
     input_types=[PandasSeriesType(StringType())], 
     return_type=PandasSeriesType(VariantType()),
     packages=['pandas','scikit-learn','cachetools','joblib'])     
def predict_sentiment_vector(sentiment_str):  
    model = load_model('model.joblib.gz')
    vectorizer = load_model('vectorizer.joblib.gz')                            
    
    result = []
    
    for s in sentiment_str:        
        matrix = vectorizer.transform([s])
        
        df = pd.DataFrame(model.predict_proba(matrix),columns=columns)
                
        response = df.loc[0].to_json()
        result.append(json.loads(response))
        
    return pandas.Series(result)

We can quickly test our UDF with a simple SQL call:

In [4]:
session.sql('''select predict_sentiment_vect('PRACTICALLY PERFECT IN EVERY WAY') sentiment''').show()

------------------------------
|"SENTIMENT"                 |
------------------------------
|{                           |
|  "NEGATIVE": 0.08606723,   |
|  "NEUTRAL": 0.1035442952,  |
|  "POSITIVE": 0.8103884748  |
|}                           |
------------------------------



## Ingesting new data for scoring

Now the model is trained and deployed, we can begin ingesting our unstructured data and using our new arctifacts - all within Snowflake, using Snowflake compute infrastructure. 

---

First we create a UDTF that using the `snowflake` class which exposes functions allowing us to read data from unstructued files. We're then able to pull our data elements and return this as part of the UDTF response.

In [5]:
# create the UDTF to read the file
schema = StructType([
    StructField("product_id", StringType()),
    StructField("review_date", DateType()),
    StructField("product_review", StringType())   
])

@udtf(name = "read_unstructured_reviews",is_permanent = True, session=session, stage_location="model_data", replace=True, input_types=[StringType()], output_schema=schema)
class read_reviews:
    def process(self, stagefile):
        import _snowflake
        
        with _snowflake.open(stagefile) as f:
            data = f.readall().decode('utf-8')
            lines = data.split('\n')
            for line in lines:
                lineStr = line.strip()
                d = lineStr.split("|")
                try:
                    # Read the product_id, the product review and the review date.                    
                    review_date = date.fromisoformat(d[1])
                    product_id = d[0]                    
                    product_review = d[2]
                    yield (product_id, review_date, product_review, )
                except:
                    pass



Using a view that queries the Directory Table (this is the only way at present to query Directory Tables in Snowpark) we can get Stage URLs that can be passed to our UDTF. The Directory table looks like this:

In [6]:
session.sql('''
    select 
        relative_path,
        file_url
    from
        directory(@raw_data_stage)
''').show()

---------------------------------------------------------------------------
|"RELATIVE_PATH"     |"FILE_URL"                                          |
---------------------------------------------------------------------------
|reviews__0_0_0.dat  |https://pza13411.us-east-1.privatelink.snowflak...  |
---------------------------------------------------------------------------



In [7]:
df = session.sql('''
    select 
        relative_path,
        product_id,
        review_date,        
        product_review
    from 
        directory(@raw_data_stage) f,
        table(read_unstructured_reviews(f.file_url))
    order by 1,2,3
''')

df_temp = df
df_temp.show()

----------------------------------------------------------------------------------------------------------
|"RELATIVE_PATH"     |"PRODUCT_ID"  |"REVIEW_DATE"  |"PRODUCT_REVIEW"                                    |
----------------------------------------------------------------------------------------------------------
|reviews__0_0_0.dat  |B001W9Y4PK    |2013-04-02     |I've used Tenergy in the past for specialty Li-...  |
|reviews__0_0_0.dat  |B001W9Y4PK    |2013-04-04     |Despite reports and complaints regarding the in...  |
|reviews__0_0_0.dat  |B001W9Y4PK    |2013-04-04     |I got these for a fenix flashlight, and so far ...  |
|reviews__0_0_0.dat  |B001W9Y4PK    |2013-04-04     |I have had really good luck with these , which ...  |
|reviews__0_0_0.dat  |B001W9Y4PK    |2013-04-08     |These batteries are great. The price is right a...  |
|reviews__0_0_0.dat  |B001W9Y4PK    |2013-04-10     |These batteries last a good long time in my tac...  |
|reviews__0_0_0.dat  |B001W9Y4PK    |

In [8]:
df.write.save_as_table('new_reviews_raw',mode="overwrite", table_type="temporary")

In [9]:
session.table('new_reviews_raw').select(
    F.col('product_id'),
    F.col('review_date'),
    F.col('product_review'), 
    F.call_udf(
        'REMOVE_STOPWORDS_VECT',
        F.col('PRODUCT_REVIEW')).alias('PROCESSED_REVIEW')    
).write.save_as_table('new_reviews_ready',mode="overwrite", table_type="temporary")


In [10]:
df = session.table('new_reviews_ready').select(
    F.col('product_id'),
    F.col('review_date'),
    F.col('product_review'),
    F.col('PROCESSED_REVIEW'),
    F.call_udf(
        'predict_sentiment_vect',
        F.col('PROCESSED_REVIEW')).alias('SENTIMENT'))

df = df.select(
    F.col('product_id'),
    F.col('review_date'),
    F.col('product_review'),
    F.col('PROCESSED_REVIEW'),
    F.col('sentiment')['NEGATIVE'].alias('negative'),
    F.col('sentiment')['NEUTRAL'].alias('neutral'),    
    F.col('sentiment')['POSITIVE'].alias('positive')
).write.save_as_table('new_reviews_scored',mode="overwrite")

session.table('new_reviews_scored').select(
    F.col('product_id'),
    F.col('review_date'),
    F.col('product_review'),  
    F.col('positive'),
    F.col('neutral'),
    F.col('negative')).show(50)

-------------------------------------------------------------------------------------------------------------------------------------------------------------
|"PRODUCT_ID"  |"REVIEW_DATE"  |"PRODUCT_REVIEW"                                    |"POSITIVE"             |"NEUTRAL"              |"NEGATIVE"             |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
|B0023B14TU    |2010-01-02     |We have an HD Flip Video which we bought at the...  |0.1427829016           |0.2248342077           |0.6323828907           |
|B0023B14TU    |2010-01-02     |The camera is perfect for portable recording.  ...  |0.8444131441           |0.1553306645           |0.0002561914           |
|B0023B14TU    |2010-01-02     |I purchased this flip camcorder because it had ...  |0.9990063715           |0.0009583783           |3.525010000000000e-05  |
|B0023B14TU    |2010-01-03     |I bought myself a Fl