In [2]:
!pip install stepfunctions
!pip show sagemaker stepfunctions

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting stepfunctions
  Using cached stepfunctions-2.2.0-py2.py3-none-any.whl
Installing collected packages: stepfunctions
Successfully installed stepfunctions-2.2.0
Name: sagemaker
Version: 2.68.0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /opt/conda/lib/python3.7/site-packages
Requires: attrs, boto3, google-pasta, importlib-metadata, numpy, packaging, pandas, pathos, protobuf, protobuf3-to-dict, smdebug-rulesconfig
Required-by: stepfunctions
---
Name: stepfunctions
Version: 2.2.0
Summary: Open source library for develping data science workflows on AWS Step Functions.
Home-page: https://github.com/aws/aws-step-functions-data-science-sdk-python
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0

In [3]:
import stepfunctions
from stepfunctions import steps
from stepfunctions.inputs import ExecutionInput
from stepfunctions.steps import (
    Chain,
    ChoiceRule,
    ModelStep,
    ProcessingStep,
    TrainingStep,
    TransformStep,
)

from sagemaker.workflow.lambda_step import (
    LambdaStep,
    LambdaOutput,
    LambdaOutputTypeEnum,
)

from stepfunctions.template import TrainingPipeline
from stepfunctions.template.utils import replace_parameters_with_jsonpath
from stepfunctions.workflow import Workflow
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.s3 import S3Uploader

import io
import logging
import os
import random
import time
import uuid

In [4]:
sagemaker.__version__

'2.68.0'

In [5]:
workflow_execution_role = "(your workflow execution role here)"

In [6]:
import boto3
import sagemaker
from sagemaker import get_execution_role

region = boto3.session.Session().region_name

role = get_execution_role()

In [7]:
# Combine My Reviews With Review Dataset

# StepFunction Workflow Execution Input Schema

In [None]:
# TODO: need to create a custom training container for TF Probability

In [7]:
# SageMaker expects unique names for each job, model and endpoint.
# If these names are not unique the execution will fail. Pass these
# dynamically for each execution using placeholders.
# Placeholder names (and their types) that every workflow execution must supply.
execution_input_schema = {
    "ProcessingJobName": str,
    "TrainingJobName": str,
    "SaveModelJobName": str,
}
execution_input = ExecutionInput(schema=execution_input_schema)

In [8]:
sagemaker_session = sagemaker.Session()

In [9]:
# Set To True For Testing
testing = True

# Create Preprocessing Job

In [17]:
from sagemaker.sklearn.processing import SKLearnProcessor

# NOTE: the job currently runs on ml.m5.xlarge. If it runs out of memory, or the
# dataset grows, scale up to a larger instance (e.g. ml.m5.2xlarge).
processor_settings = dict(
    framework_version="0.20.0",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)
sklearn_processing = SKLearnProcessor(**processor_settings)

In [2]:
%%writefile retrieval_preprocessing.py

import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


def users_test_train_split(df, split_ratio=0.1):
    """Split review rows into test and train sets without sharing users.

    The first ``int(n_users * split_ratio)`` distinct usernames, in the order
    they first appear in ``df``, are held out: every row written by one of
    them goes to the test set and all remaining rows go to the train set.

    Args:
        df: DataFrame with a ``username`` column.
        split_ratio: Fraction of distinct users to hold out for testing.

    Returns:
        Tuple ``(test_df, train_df)`` -- note that the test set comes first.
    """
    usernames = df['username']
    distinct_users = usernames.unique()
    n_test_users = int(len(distinct_users) * split_ratio)

    # Build the membership mask once and reuse it for both halves.
    is_test_row = usernames.isin(distinct_users[:n_test_users])

    return df[is_test_row], df[~is_test_row]


if __name__ == "__main__":
    # Container-local paths. They must match the input destination and the
    # output sources configured on the processing job that runs this script.
    input_data_path_reviews = os.path.join("/opt/ml/processing/input/reviews", "final_reviews.csv")

    train_output_path = os.path.join("/opt/ml/processing/output/train_data", "retrieval_train.csv")
    test_output_path = os.path.join("/opt/ml/processing/output/test_data", "retrieval_test.csv")

    print("Reading review input data from {}".format(input_data_path_reviews))
    review_df = pd.read_csv(input_data_path_reviews, index_col="Unnamed: 0")

    # Shuffle rows. The seed is fixed so that the user-level split below
    # (which depends on row order) is reproducible from run to run.
    review_df = review_df.sample(frac=1, random_state=42)

    # Keep only users with more than 10 and fewer than 100 reviews.
    review_counts = review_df['username'].value_counts()
    users_with_favorable_ratings = (review_counts
                                    .loc[lambda x: x > 10]
                                    .loc[lambda x: x < 100]
                                    .index.values)

    review_df = review_df[review_df['username'].isin(users_with_favorable_ratings)]

    # Generate the test/train split; users never appear in both sets.
    test_df, train_df = users_test_train_split(review_df)

    print("Saving Train Data {}".format(train_output_path))
    train_df.to_csv(train_output_path, header=True, index=True)

    print("Saving Test Data {}".format(test_output_path))
    test_df.to_csv(test_output_path, header=True, index=True)
    
    
    
    

Overwriting retrieval_preprocessing.py


In [19]:
# S3 bucket pathing (plain literals: the paths do not depend on the region)
input_reviews = "s3://beer-reviews-models-pb/Rec Automation/Review Data/Initial Data/final_reviews.csv"

output_folder_train = "s3://beer-reviews-models-pb/Rec Automation/Review Data/Retrieval Data/Train"
output_folder_test = "s3://beer-reviews-models-pb/Rec Automation/Review Data/Retrieval Data/Test"

In [20]:
# Upload the preprocessing script so the processing job can fetch it from S3.
PROCESSING_SCRIPT_LOCATION = "retrieval_preprocessing.py"

script_upload_target = dict(
    bucket="beer-reviews-models-pb",
    key_prefix="Rec Automation/Job Scripts",
)
processing_code = sagemaker_session.upload_data(PROCESSING_SCRIPT_LOCATION, **script_upload_target)

In [21]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Test the preprocessing job (only run once to check functionality).
# Nothing happens when `testing` is False.
if testing:
    smoke_test_inputs = [
        ProcessingInput(source=input_reviews, destination="/opt/ml/processing/input/reviews"),
    ]
    smoke_test_outputs = [
        ProcessingOutput(
            output_name="training_data",
            source="/opt/ml/processing/output/train_data",
            destination=output_folder_train,
        ),
        ProcessingOutput(
            output_name="test_data",
            source="/opt/ml/processing/output/test_data",
            destination=output_folder_test,
        ),
    ]
    sklearn_processing.run(
        code=processing_code,
        inputs=smoke_test_inputs,
        outputs=smoke_test_outputs,
    )
    


Job Name:  sagemaker-scikit-learn-2021-11-17-23-04-53-863
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://beer-reviews-models-pb/Rec Automation/Review Data/Initial Data/final_reviews.csv', 'LocalPath': '/opt/ml/processing/input/reviews', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://beer-reviews-models-pb/Rec Automation/Job Scripts/retrieval_preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'training_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://beer-reviews-models-pb/Rec Automation/Review Data/Retrieval Data/Train', 'LocalPath': '/opt/ml/processing/output/train_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 'AppM

In [24]:
# Describe the most recent job launched by the processor (test runs only).
if testing:
    latest_processing_job = sklearn_processing.jobs[-1]
    preprocessing_job_description = latest_processing_job.describe()
    print(preprocessing_job_description)

{'ProcessingInputs': [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://beer-reviews-models-pb/Rec Automation/Review Data/Initial Data/final_reviews.csv', 'LocalPath': '/opt/ml/processing/input/reviews', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://beer-reviews-models-pb/Rec Automation/Job Scripts/retrieval_preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}], 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'training_data', 'S3Output': {'S3Uri': 's3://beer-reviews-models-pb/Rec Automation/Review Data/Retrieval Data/Train', 'LocalPath': '/opt/ml/processing/output/train_data', 'S3UploadMode': 'EndOfJob'}, 'AppManaged': False}, {'OutputName': 'test_data', 'S3Output': {'S3Uri': '

In [None]:
# Inputs for the workflow's processing step: the raw reviews plus the
# preprocessing script itself (the step runs it via its container entrypoint).
processing_inputs = [
    ProcessingInput(
        source=input_reviews,  # was `input_samples`, which is never defined in this notebook
        destination="/opt/ml/processing/input/reviews",
        input_name="input review data",
    ),
    ProcessingInput(
        source=processing_code,
        destination="/opt/ml/processing/input/code",
        input_name="code",
    ),
]

# Outputs: train and test CSVs, each uploaded to its own S3 prefix.
processing_outputs = [
    ProcessingOutput(
        output_name="training_data",
        source="/opt/ml/processing/output/train_data",
        destination=output_folder_train,
    ),
    ProcessingOutput(
        output_name="test_data",
        source="/opt/ml/processing/output/test_data",
        destination=output_folder_test,
    ),
]

In [None]:
# The script is mounted through the "code" processing input, so the container
# entrypoint points at its location under /opt/ml/processing/input/code.
preprocessing_entrypoint = ["python3", '/opt/ml/processing/input/code/retrieval_preprocessing.py']

processing_step = ProcessingStep(
    "SageMaker Labeling Step",
    processor=sklearn_processing,
    job_name=execution_input["ProcessingJobName"],
    inputs=processing_inputs,
    outputs=processing_outputs,
    container_entrypoint=preprocessing_entrypoint,
)

# Create Training Script

In [12]:
import pandas as pd

In [40]:
# Testing With Pandas
# Despite the variable names, this reads the *training* split (retrieval_train.csv).

s3 = boto3.client('s3')
test_data = s3.get_object(
    Bucket='beer-reviews-models-pb',
    Key='Rec Automation/Review Data/Retrieval Data/Train/retrieval_train.csv',
)

csv_bytes = io.BytesIO(test_data['Body'].read())
df_test = pd.read_csv(csv_bytes, index_col="Unnamed: 0")
df_test.head()

Unnamed: 0,score,username,text,style,abv,brewery_name,beer_name
412226,4.03,cantal,,American IPA,4.9,Oskar Blues Grill & Brew,Pinner Throwback IPA
458058,4.0,warnerry,0%,Belgian Saison,4.2,Crooked Stave Artisan Beer Project,Vieille Artisanal Saison
116967,4.0,NorthCoastPranqster,,American Imperial Stout,9.5,Great Divide Brewing Company,Espresso Oak Aged Yeti Imperial Stout
87978,3.88,cvstrickland,12-ounce longneck poured into my DFH pint g...,English Pale Ale,5.4,Great Divide Brewing Company,Denver Pale Ale
319909,3.89,klewis,A: Pours a reddish copper with an audibly f...,German Doppelbock,7.7,Left Hand Brewing Company,Goosinator - Smoked


In [42]:

%%writefile tf_ret_train.py


import tensorflow as tf
import tensorflow_recommenders as tfrs

from typing import Dict, Text
import argparse
import numpy as np
import json
import os
import pandas as pd



# disable tf logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 


def parse_args():
    """Parse the training script's command-line arguments.

    Hyperparameters arrive as command-line flags; data and model locations
    default to the ``SM_*`` environment variables when those are set.

    Returns:
        The ``(namespace, unknown_args)`` tuple produced by
        ``ArgumentParser.parse_known_args``; unrecognized flags are returned
        rather than rejected.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters - sent by the client, passed as command-line args to the script
    parser.add_argument('--epochs', type=int, default=4)
    parser.add_argument('--learning_rate', type=float, default=0.5)
    parser.add_argument('--returned_recommendations', type=int, default=500)

    # data directories
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))

    # model directory - /opt/ml/model by default for sagemaker
    parser.add_argument("--model_dir", type=str)
    parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    # SM_HOSTS holds a JSON list. Fall back to "[]" when it is unset so that
    # json.loads is never handed None, and parse an explicit --hosts value as
    # JSON too (type=list would split the string into single characters).
    parser.add_argument("--hosts", type=json.loads, default=json.loads(os.environ.get("SM_HOSTS", "[]")))
    parser.add_argument("--current-host", type=str, default=os.environ.get("SM_CURRENT_HOST"))

    return parser.parse_known_args()


def get_train_data(train_dir):
    """Load ``retrieval_train.csv`` from ``train_dir`` into a DataFrame.

    The CSV's ``Unnamed: 0`` column is used as the index. The frame's shape
    is printed before it is returned.
    """
    csv_path = os.path.join(train_dir, 'retrieval_train.csv')
    df_train = pd.read_csv(csv_path, index_col="Unnamed: 0")

    print('x train: ', df_train.shape)
    return df_train



def df_to_tensor(df):
    """Convert the review DataFrame into the datasets used for training.

    Args:
        df: DataFrame with ``username`` and ``beer_name`` columns.

    Returns:
        Tuple ``(ratings, beer_list)``: ``ratings`` is a dataset of
        ``{"beer_name", "username"}`` dicts, one per review row that has both
        values; ``beer_list`` is a dataset of the distinct beer names.
    """
    # Drop missing names before de-duplicating, matching the dropna() applied
    # to the ratings below; otherwise a NaN would end up among the candidate
    # beer names alongside the strings.
    df_beer = df['beer_name'].dropna().unique()
    df_beer = pd.DataFrame(df_beer, columns=['beer_name'])

    df_ratings = df[['username', 'beer_name']]
    df_ratings = df_ratings.dropna()

    # convert dataframes to tensors
    tf_beer_dict = tf.data.Dataset.from_tensor_slices(dict(df_beer))
    tf_ratings_dict = tf.data.Dataset.from_tensor_slices(dict(df_ratings))

    # map rows to a dictionary
    ratings = tf_ratings_dict.map(lambda x: {
        "beer_name": x["beer_name"],
        "username": x["username"]
    })
    beer_list = tf_beer_dict.map(lambda x: x['beer_name'])

    print('converted df to tensors')
    return ratings, beer_list


def get_unique_beers_and_users(ratings, beer_list):
    """Collect the distinct usernames and beer names from the datasets.

    Args:
        ratings: Dataset of dicts with a ``username`` entry.
        beer_list: Dataset of beer names.

    Returns:
        Tuple ``(unique_users, unique_beers)`` of arrays from ``np.unique``.
    """
    def _distinct_values(dataset):
        # Read the dataset in batches of 1000, join the batches, de-duplicate.
        return np.unique(np.concatenate(list(dataset.batch(1000))))

    unique_users = _distinct_values(ratings.map(lambda row: row['username']))
    unique_beers = _distinct_values(beer_list)

    print("unique users: ", len(unique_users), "unique_beers: ", len(unique_beers))
    return unique_users, unique_beers

    
def test_train_split(ratings, df):
    """Shuffle ``ratings`` with a fixed seed and split it 80/20.

    Args:
        ratings: Dataset of rating rows; its length must be known.
        df: Source DataFrame. Kept for backward compatibility; sizes are now
            taken from ``ratings`` itself, which can hold fewer rows than
            ``df`` once rows with missing values have been dropped.

    Returns:
        Tuple ``(test, train)`` -- note that the test set comes first.
    """
    tf.random.set_seed(42)

    # Size the shuffle buffer and both slices from the dataset actually being
    # split, so the 80/20 proportions hold even if it is shorter than `df`.
    n_rows = len(ratings)
    shuffled = ratings.shuffle(n_rows, seed=42, reshuffle_each_iteration=False)

    train = shuffled.take(int(n_rows*0.8))
    test = shuffled.skip(int(n_rows*0.8)).take(int(n_rows*0.2))
    print("test data len: ", len(test), "train data len: ", len(train))
    return test, train
    
    
# extend the tfrs class
class BeerRetreival(tfrs.Model):
    """Two-tower retrieval model: one embedding tower for users, one for beers.

    The constructor reads the module-level names ``unique_users``,
    ``unique_beers`` and ``beer_list``; they must be defined before the class
    is instantiated.
    """

    def __init__(self):
        super().__init__()

        embedding_dims = 32

        def build_tower(vocabulary):
            # String -> index lookup followed by an embedding table with one
            # extra row (presumably for the lookup's out-of-vocabulary index).
            return tf.keras.Sequential([
                tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(len(vocabulary)+1, embedding_dims)
            ])

        self.user_model = build_tower(unique_users)
        self.beer_model = build_tower(unique_beers)

        # Top-K metrics are computed against the embeddings of every candidate beer.
        candidate_embeddings = beer_list.batch(128).cache().map(self.beer_model)
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Embed the batch's usernames and beer names and return the task's loss."""
        user_embeddings = self.user_model(features['username'])
        beer_embeddings = self.beer_model(features['beer_name'])
        return self.task(user_embeddings, beer_embeddings)
        
        

if __name__ == "__main__":

    args, _ = parse_args()

    print('Training data location: {}'.format(args.train))

    # These names are module-level on purpose: BeerRetreival reads
    # unique_users, unique_beers and beer_list as globals.
    df_train = get_train_data(args.train)
    ratings, beer_list = df_to_tensor(df_train)
    unique_users, unique_beers = get_unique_beers_and_users(ratings, beer_list)
    test, train = test_train_split(ratings, df_train)

    returned_recommendations = args.returned_recommendations
    epochs = args.epochs
    learning_rate = args.learning_rate
    print('returned reccomendations = {}, epochs = {}, learning rate = {}'.format(returned_recommendations, epochs, learning_rate))

    # create + train model
    model = BeerRetreival()
    optimizer = tf.keras.optimizers.Adagrad(learning_rate)
    model.compile(optimizer)
    model.fit(train.batch(8192),
              validation_data=test.batch(512),
              validation_freq=2,
              epochs=epochs,
              verbose=0)

    # Eval model
    scores = model.evaluate(test.batch(8192), return_dict=True, verbose=0)

    print("top 10 score: ", scores['factorized_top_k/top_10_categorical_accuracy'])
    print("top 50 score: ", scores['factorized_top_k/top_50_categorical_accuracy'])
    print("top 100 score: ", scores['factorized_top_k/top_100_categorical_accuracy'])

    # Build the serving index. k now honours the --returned_recommendations
    # hyperparameter (default 500) instead of a hard-coded 500.
    brute_force = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=returned_recommendations)
    brute_force.index_from_dataset(
        beer_list.batch(128).map(lambda beer_name: (beer_name, model.beer_model(beer_name)))
    )

    # save model - need to call first (one query with a sample username)
    _ = brute_force(np.array(["pblackburn"]))

    # Only the first host in the host list writes the model.
    if args.current_host == args.hosts[0]:

        print("Host arg:", args.hosts[0])
        # save the index in TensorFlow SavedModel format under <sm_model_dir>/01
        tf.saved_model.save(
            brute_force,
            os.path.join(args.sm_model_dir, "01"))
        

Overwriting tf_ret_train.py


In [43]:
%%writefile requirements.txt
tensorflow-recommenders
pandas


Overwriting requirements.txt


In [44]:
# Test-
from sagemaker.tensorflow import TensorFlow


# Training configuration for the retrieval model.
train_instance_type = 'ml.p2.xlarge'
hyperparameters = {
    'epochs': 4,
    'returned_recommendations': 500,
    'learning_rate': 0.5,
}

# requirements.txt is shipped with the entry point as a dependency.
retrieval_estimator = TensorFlow(
    entry_point='tf_ret_train.py',
    dependencies=['requirements.txt'],
    instance_type=train_instance_type,
    instance_count=1,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    framework_version='2.5',
    py_version='py37',
)

In [45]:
# S3 prefix holding the training CSV written by the preprocessing job.
ret_train_data_loc = "s3://beer-reviews-models-pb/Rec Automation/Review Data/Retrieval Data/Train"

retrieval_estimator.fit(inputs=ret_train_data_loc)

2021-11-20 00:34:37 Starting - Starting the training job...
2021-11-20 00:34:39 Starting - Launching requested ML instancesProfilerReport-1637368477: InProgress
......
2021-11-20 00:35:54 Starting - Preparing the instances for training.........
2021-11-20 00:37:34 Downloading - Downloading input data
2021-11-20 00:37:34 Training - Downloading the training image.....................
2021-11-20 00:41:08 Training - Training image download completed. Training in progress..[34m2021-11-20 00:41:10.297131: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-11-20 00:41:10.303675: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2021-11-20 00:41:10.436481: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0[0m
[34m2021-

##### To get Model

In [None]:
!aws s3 cp {estimator.model_data} ./model/model.tar.gz

In [None]:
!tar -xvzf ./model/model.tar.gz -C ./model

## Create Training Job

In [None]:
# TrainingStep takes the training data through `data=`. It has no `inputs` or
# `role` parameter, so those keywords would be handed on as state fields.
# NOTE(review): the job's IAM role presumably comes from the estimator, and the
# workflow execution role is set on the Workflow object -- confirm.
training_step = steps.TrainingStep(
    "Training Step",
    estimator=retrieval_estimator,
    data=ret_train_data_loc,
    job_name=execution_input["TrainingJobName"],
)

## Create Hyperparameter Tuning Job (In progress)

In [None]:

'''
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {
  'learning_rate': ContinuousParameter(0.001, 0.2, scaling_type="Logarithmic"),
  'epochs': IntegerParameter(10, 50),
  'batch_size': IntegerParameter(64, 256),
}

metric_definitions = [{'Name': 'loss',
                       'Regex': ' loss: ([0-9\\.]+)'},
                     {'Name': 'val_loss',
                       'Regex': ' val_loss: ([0-9\\.]+)'}]

objective_metric_name = 'val_loss'
objective_type = 'Minimize'
'''

# Save Model Step

In [None]:
# The model name placeholder must be a key of the ExecutionInput schema. The
# schema (and the inputs passed to workflow.execute) call it "SaveModelJobName";
# "ModelName" is not declared anywhere.
model_step = steps.ModelStep(
    "Save Model",
    model=training_step.get_expected_model(),
    model_name=execution_input["SaveModelJobName"],
)

# Create Async Endpoint

# Create Workflow

In [None]:
# Steps run in order: preprocess -> train -> save model.
ordered_steps = [processing_step, training_step, model_step]
workflow_definition = steps.Chain(ordered_steps)

In [None]:
# Bundle the chained steps into a named workflow.
workflow_settings = dict(
    name="Retrieval_Model_Automation_v1",
    definition=workflow_definition,
    role=workflow_execution_role,
    execution_input=execution_input,
)
workflow = Workflow(**workflow_settings)

In [None]:
workflow.create()

In [None]:
# Generate Unique Names-
# Each name gets a fresh UUID suffix so repeated executions do not collide.
processing_job_name = f"SK-processing-{uuid.uuid1().hex}"
training_job_name = f"TFRS-Retrieval-training-{uuid.uuid1().hex}"
model_name = f"TFRS-Retrieval-model-{uuid.uuid1().hex}"

execution_inputs = {
    "ProcessingJobName": processing_job_name,
    "TrainingJobName": training_job_name,
    "SaveModelJobName": model_name,
}
execution = workflow.execute(inputs=execution_inputs)

In [None]:
execution.render_progress()

In [None]:
execution.list_events(html=False)