In [1]:
# download the sigir17 data from https://sites.google.com/site/limkwanhui/datacode
# curl 'https://doc-00-b8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/jbv2cduus8cnmhdbt7drradcako8r0ft/1550836800000/15465375115408245011/*/1TXw_HoUiyMLylcZY3VB5vsZG9jG0W1Je' > ~/SageMaker/mastering-ml-on-aws/chapter6/data-sigir17.zip
# cd /SageMaker/mastering-ml-on-aws/chapter6/
# unzip data-sigir17.zip
# remove README files from all dirs

In [2]:
from os.path import expanduser

SRC_PATH = expanduser("~") + '/SageMaker/mastering-ml-on-aws/chapter6/'


In [3]:
from pyspark.context import SparkContext

sc = SparkContext('local', 'test')


In [4]:
from pyspark.sql import SQLContext

spark = SQLContext(sc)

In [5]:
poi_df = spark.read.csv(SRC_PATH + 'data-sigir17/poiList-sigir17', header=True, inferSchema=True, sep=';')

In [6]:
poi_df.limit(4).toPandas().head()

Unnamed: 0,poiID,poiName,lat,long,rideDuration,theme,theme2,theme3,theme4
0,1,Gadget's Go Coaster,33.810259,-117.918438,1.0,Kiddie,Roller Coaster,,
1,2,Astro Orbitor,28.418532,-81.579153,1.5,Spinning Ride,,,
2,3,Mad Tea Party,33.813458,-117.918289,1.5,Family,Spinning Ride,,
3,4,Dumbo the Flying Elephant,33.81368,-117.918928,1.67,Family,Spinning Ride,,


In [7]:
visits_df = spark.read.csv(SRC_PATH + 'data-sigir17/userVisits-sigir17', header=True, inferSchema=True, sep=';')


In [8]:
sample_df = visits_df.limit(1000).toPandas()

In [9]:
sample_df.head()

Unnamed: 0,id,nsid,takenUnix,poiID,poiTheme,poiFreq,rideDuration,seqID
0,5858403310,10004778@N07,1308262550,6,Ride,1665,120.0,1
1,5857850631,10004778@N07,1308270702,26,Family,18710,900.0,1
2,5858399220,10004778@N07,1308631356,6,Ride,1665,120.0,2
3,8277294024,10004778@N07,1355568624,26,Family,18710,900.0,3
4,9219062165,10004778@N07,1373030964,29,Water,10427,900.0,4


In [10]:
visits_df.describe().toPandas()

Unnamed: 0,summary,id,nsid,takenUnix,poiID,poiTheme,poiFreq,rideDuration,seqID
0,count,332091.0,332091,332091.0,332091.0,332091,332091.0,332091.0,332091.0
1,mean,8916292302.139416,,1323382407.5555675,15.975127299445033,,6181.338365086678,740.7857015095311,4288.19415762547
2,stddev,6226917245.549271,,74244858.13151878,8.695388902420351,,5199.41535123871,488.5329445328169,3093.323953206581
3,min,102530213.0,10000151@N02,1187918299.0,1.0,Dark,162.0,60.0,1.0
4,max,29475731115.0,99987318@N03,1471870895.0,31.0,Water,18710.0,2700.0,11758.0


In [11]:
sample_df.describe()

Unnamed: 0,id,takenUnix,poiID,poiFreq,rideDuration,seqID
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,8448781000.0,1328379000.0,20.785,7764.625,625.2606,50.349
std,4805738000.0,63463680.0,8.138243,6233.964628,324.947216,20.027838
min,1643421000.0,1191397000.0,1.0,580.0,60.0,1.0
25%,6075781000.0,1308877000.0,15.0,2757.0,270.0,33.0
50%,6222417000.0,1310770000.0,23.0,4082.0,600.0,56.0
75%,11517500000.0,1376198000.0,28.0,16366.0,900.0,69.0
max,27776010000.0,1466605000.0,31.0,18710.0,1500.0,73.0


In [12]:
sample_df.nsid.describe()

count             1000
unique              36
top       10182842@N08
freq               365
Name: nsid, dtype: object

In [13]:
visits_df.createOrReplaceTempView('visits')

In [14]:
poi_df.createOrReplaceTempView('points')

In [15]:
spark.sql('select distinct poiID from visits').count()

31

In [16]:
spark.sql('select nsid,count(distinct poiID) as cnt from visits group by nsid').describe().show()

+-------+------------+-----------------+
|summary|        nsid|              cnt|
+-------+------------+-----------------+
|  count|        8903|             8903|
|   mean|        null| 4.86027181848815|
| stddev|        null|5.965584836576787|
|    min|10000151@N02|                1|
|    max|99987318@N03|               31|
+-------+------------+-----------------+



In [17]:
spark.sql('select nsid,poiID,count(*) from visits group by nsid,poiID').describe().show()

+-------+------------+------------------+-----------------+
|summary|        nsid|             poiID|         count(1)|
+-------+------------+------------------+-----------------+
|  count|       43271|             43271|            43271|
|   mean|        null|14.920061935245315|7.674678190936193|
| stddev|        null| 8.437883931275111|52.93100615991835|
|    min|10000151@N02|                 1|                1|
|    max|99987318@N03|                31|             4128|
+-------+------------+------------------+-----------------+



In [28]:
train_df = spark.sql('select hash(nsid) as user_hash_id, poiID, count(*) as pictures_taken from visits group by 1,2')

In [21]:
train_df.count()

43271

In [51]:
train_df.limit(3).toPandas()

Unnamed: 0,user_hash_id,poiID,pictures_taken
0,-1861435726,19,7
1,-1064654977,26,8
2,-636721096,17,1


In [29]:
from pyspark.ml.recommendation import ALS

recommender = ALS(userCol="user_hash_id", itemCol="poiID", ratingCol="pictures_taken", coldStartStrategy="drop")

model = recommender.fit(train_df)

In [30]:
recommendations = model.recommendForAllUsers(10)

In [31]:
recommendations.limit(3).toPandas()

Unnamed: 0,user_hash_id,recommendations
0,413285690,"[(25, 39.260990142822266), (18, 34.83002853393..."
1,1005782960,"[(29, 6.377601146697998), (25, 6.2345833778381..."
2,1410121870,"[(25, 12.15351390838623), (29, 11.446855545043..."


In [71]:
recommendations.createOrReplaceTempView('recommendations')

In [39]:
row_list = spark.sql('select distinct p.poiName, p.poiID from visits v join points p on (p.poiID=v.poiID) ').collect()

In [40]:
id_to_poi_name =  dict(map(lambda x: (x.poiID, x.poiName), row_list))

In [42]:
id_to_poi_name

{1: 'Test Track',
 10: 'Golden Zephyr',
 19: "Tarzan's Treehouse",
 22: 'Country Bear Jamboree',
 9: "Pinocchio's Daring Journey",
 21: 'Red Car Trolley & News Boys',
 13: 'Haunted Mansion',
 26: 'Sleeping Beauty Castle Walkthrough',
 8: 'The Great Movie Ride',
 12: "It's A Small World",
 20: 'Splash Mountain',
 29: 'Pirates of the Caribbean',
 16: 'Buzz Lightyear Astro Blasters',
 25: "It's A Small World",
 14: 'The Many Adventures of Winnie the Pooh',
 11: "California Screamin'",
 15: 'The Twilight Zone Tower of Terror',
 3: "Soarin'",
 4: 'Journey Into Imagination With Figment',
 24: 'Jungle Cruise',
 2: 'Astro Orbiter',
 5: 'Silly Symphony Swings',
 6: "Snow White's Scary Adventures",
 30: 'Mark Twain Riverboat',
 28: 'Main Street Cinema',
 7: 'Voyage of The Little Mermaid',
 23: 'Redwood Creek Challenge Trail',
 18: 'Tom Sawyer Island',
 27: 'Walt Disney World Railroad',
 17: 'Rose & Crown Pub Musician',
 31: 'Fantasmic!'}

In [79]:
def poi_names(recommendations, visited_pois):
    visited_set = set([id_to_poi_name[poi] for poi in visited_pois])
    recommended = str([(id_to_poi_name[poi], weight) for (poi,weight) in recommendations 
                       if id_to_poi_name[poi] not in visited_set])
    return "recommended: %s ; visited: %s "%(recommended, visited_set)

spark.udf.register("poi_names", poi_names)

<function __main__.poi_names(recommendations, visited_pois)>

In [80]:
recommendation_sample = spark.sql('select user_hash_id, collect_list(poiID), poi_names(max(recommendations), collect_list(poiID)) as recommendation from recommendations r join visits v on (r.user_hash_id = hash(v.nsid)) group by 1').sample(fraction=0.1, withReplacement=False).collect()

In [81]:
print(recommendation_sample[0].recommendation)

recommended: [("It's A Small World", 31.352962493896484), ('Walt Disney World Railroad', 23.464025497436523), ('Pirates of the Caribbean', 21.36219596862793), ('Buzz Lightyear Astro Blasters', 17.21680450439453), ('Haunted Mansion', 15.873616218566895), ('Country Bear Jamboree', 9.63521957397461), ('Astro Orbiter', 9.164801597595215), ('The Great Movie Ride', 8.167647361755371)] ; visited: {"California Screamin'", 'Sleeping Beauty Castle Walkthrough', 'Voyage of The Little Mermaid', "Tarzan's Treehouse", 'Main Street Cinema', 'The Many Adventures of Winnie the Pooh', 'Jungle Cruise', 'Tom Sawyer Island', 'Test Track', 'The Twilight Zone Tower of Terror'} 


In [85]:
print(recommendation_sample[200].recommendation)

recommended: [('Splash Mountain', 0.9785523414611816), ('Sleeping Beauty Castle Walkthrough', 0.8383632302284241), ("Pinocchio's Daring Journey", 0.7456990480422974), ('Journey Into Imagination With Figment', 0.4501221477985382), ("California Screamin'", 0.44446268677711487), ('Tom Sawyer Island', 0.41949236392974854), ("It's A Small World", 0.40130260586738586), ('Astro Orbiter', 0.37899214029312134), ('The Twilight Zone Tower of Terror', 0.3728359639644623)] ; visited: {"Snow White's Scary Adventures"} 


In [88]:
print(recommendation_sample[600].recommendation)

recommended: [('Fantasmic!', 20.900590896606445), ('Pirates of the Caribbean', 9.25596809387207), ("It's A Small World", 8.825133323669434), ('Buzz Lightyear Astro Blasters', 5.474684715270996), ('Main Street Cinema', 5.1001691818237305), ('Country Bear Jamboree', 4.3145904541015625), ("California Screamin'", 3.717888832092285), ("It's A Small World", 3.6027705669403076), ('The Many Adventures of Winnie the Pooh', 3.429044246673584)] ; visited: {'Haunted Mansion', 'The Twilight Zone Tower of Terror', 'Journey Into Imagination With Figment'} 


In [97]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import VectorAssembler


pipeline = Pipeline(stages = [
    StringIndexer(inputCol='user_hash_id', outputCol="user_hash_id_index", handleInvalid='keep'),
    OneHotEncoder(inputCol='user_hash_id_index', outputCol='user_hash_id_encoded'),
    StringIndexer(inputCol='poiID', outputCol='poi_id_indexed', handleInvalid='keep'),
    OneHotEncoder(inputCol='poi_id_indexed', outputCol='poi_id_encoded'),
    QuantileDiscretizer(numBuckets=5, inputCol='pictures_taken', outputCol='interest_level'),
    VectorAssembler(inputCols=['poi_id_encoded', 'user_hash_id_encoded'],
                    outputCol='features'),
])

model = pipeline.fit(train_df)

In [98]:
sparse_df = model.transform(train_df)

In [99]:
sparse_df.show(3)

+------------+-----+--------------+------------------+--------------------+--------------+---------------+--------------+--------------------+
|user_hash_id|poiID|pictures_taken|user_hash_id_index|user_hash_id_encoded|poi_id_indexed| poi_id_encoded|interest_level|            features|
+------------+-----+--------------+------------------+--------------------+--------------+---------------+--------------+--------------------+
| -1861435726|   19|             7|             279.0|  (8903,[279],[1.0])|          17.0|(31,[17],[1.0])|           3.0|(8934,[17,310],[1...|
| -1064654977|   26|             8|             181.0|  (8903,[181],[1.0])|           5.0| (31,[5],[1.0])|           3.0|(8934,[5,212],[1....|
|  -636721096|   17|             1|            2187.0| (8903,[2187],[1.0])|           4.0| (31,[4],[1.0])|           1.0|(8934,[4,2218],[1...|
+------------+-----+--------------+------------------+--------------------+--------------+---------------+--------------+--------------------+

In [100]:
sagemaker_train_df, sagemaker_test_df = sparse_df.randomSplit([0.8, 0.2], seed=17)


In [45]:
# once pyspark supports writing to protobuf directly, as one can do in scala we could do:
#
# sagemaker_train_df.write.format("sagemaker").option("labelColumnName", "interest_level").option("featuresColumnName", "features").save("s3://mastering-ml-aws/chapter6/train-data/")

In [101]:
from scipy.sparse import csr_matrix
import numpy as np 
import boto3
import io
import numpy as np
import scipy.sparse as sp
import sagemaker.amazon.common as smac


def spark_vector_to_sparse_matrix(row):
    vect = row['features']
    return csr_matrix((vect.values, vect.indices, np.array([0, vect.values.size])),(1, vect.size), dtype=np.float32)

def upload_matrices_to_s3(dataframe, dataset_name):
    features_matrices = dataframe.select("features").rdd.map(spark_vector_to_sparse_matrix).collect()
    interest_levels = dataframe.select("interest_level").rdd.map(lambda r: r['interest_level']).collect()
    
    interest_level_vector = np.array(interest_levels, dtype=np.float32)
    buffer = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buffer, sp.vstack(features_matrices), interest_level_vector)
    buffer.seek(0)
    bucket = boto3.resource('s3').Bucket('mastering-ml-aws')
    bucket.Object('chapter6/%s-data.protobuf'%dataset_name).upload_fileobj(buffer)

In [102]:
upload_matrices_to_s3(sagemaker_train_df, 'train')
upload_matrices_to_s3(sagemaker_test_df, 'test')

In [103]:
sagemaker_train_df.select("features").limit(1).collect()


[Row(features=SparseVector(8934, {22: 1.0, 858: 1.0}))]

In [60]:
feature_dimension = len(sagemaker_train_df.select("features").limit(1).collect()[0].features)
feature_dimension

8934

In [61]:
model.stages

[StringIndexer_408c93fb63bc1870f856,
 OneHotEncoder_417081dda23913c94abf,
 StringIndexer_4c6db0dcee74ed526966,
 OneHotEncoder_4342a6d727d179d2e97a,
 Bucketizer_4760870914f95951004a,
 VectorAssembler_4ccd914048999de2d238]

In [67]:
model.stages[0].labels[:5]

['-339797423', '-45351632', '-1550420186', '333791386', '344980893']

In [68]:
model.stages[2].labels[:5]

['944065163', '94926449', '-559580957', '-554124381', '-1355542311']

In [70]:
len(model.stages[0].labels), len(model.stages[2].labels)

(8903, 31)

In [72]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()
role = get_execution_role()

container = sagemaker.amazon.amazon_estimator.get_image_uri('us-east-1', "factorization-machines", "latest")

s3_train_data = 's3://mastering-ml-aws/chapter6/train-data.protobuf'
s3_test_data = 's3://mastering-ml-aws/chapter6/train-data.protobuf'
s3_output_location = 's3://mastering-ml-aws/chapter6/sagemaker/output/'
s3_model_location = 's3://mastering-ml-aws/chapter6/sagemaker/model/'

In [74]:
from sagemaker.session import s3_input

sess = sagemaker.Session()

recommender = sagemaker.estimator.Estimator(container,
                                            role, 
                                            train_instance_count=1, 
                                            train_instance_type='ml.c4.xlarge',
                                            output_path=s3_output_location,
                                            sagemaker_session=sess)

recommender.set_hyperparameters(predictor_type='regressor',
                                feature_dim=feature_dimension,
                                epochs=200,
                                mini_batch_size=100,
                                num_factors=128)


recommender.fit({'train': s3_input(s3_train_data), \
                  'test': s3_input(s3_test_data)})

INFO:sagemaker:Creating training-job with name: factorization-machines-2019-02-22-14-25-41-535


2019-02-22 14:25:41 Starting - Starting the training job...
2019-02-22 14:25:43 Starting - Launching requested ML instances............
2019-02-22 14:27:47 Starting - Preparing the instances for training......
2019-02-22 14:29:10 Downloading - Downloading input data..
[31mDocker entrypoint called with argument(s): train[0m
[31m[02/22/2019 14:29:24 INFO 140042773481280] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_batch_size': u'1000', u'_use_full_symbolic': u'true', u'batch_metrics_publish_interv


2019-02-22 14:29:22 Training - Training image download completed. Training in progress.[31m[02/22/2019 14:29:30 INFO 140042773481280] #quality_metric: host=algo-1, epoch=8, train rmse <loss>=0.630365094681[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 604.363203048706, "sum": 604.363203048706, "min": 604.363203048706}}, "EndTime": 1550845770.38344, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845769.778471}
[0m
[31m[02/22/2019 14:29:30 INFO 140042773481280] #progress_metric: host=algo-1, completed 4 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 3115, "sum": 3115.0, "min": 3115}, "Total R

[31m[02/22/2019 14:29:40 INFO 140042773481280] #quality_metric: host=algo-1, epoch=25, train rmse <loss>=0.543338305246[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 578.0069828033447, "sum": 578.0069828033447, "min": 578.0069828033447}}, "EndTime": 1550845780.3995, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845779.821034}
[0m
[31m[02/22/2019 14:29:40 INFO 140042773481280] #progress_metric: host=algo-1, completed 13 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 8997, "sum": 8997.0, "min": 8997}, "Total Records Seen": {"count": 1, "max": 898660, "sum": 898660.0, "min": 898660}, "Max Reco

[31m[02/22/2019 14:29:50 INFO 140042773481280] #quality_metric: host=algo-1, epoch=42, train rmse <loss>=0.450812121802[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 570.5058574676514, "sum": 570.5058574676514, "min": 570.5058574676514}}, "EndTime": 1550845790.653091, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845790.082126}
[0m
[31m[02/22/2019 14:29:50 INFO 140042773481280] #progress_metric: host=algo-1, completed 21 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 14879, "sum": 14879.0, "min": 14879}, "Total Records Seen": {"count": 1, "max": 1486180, "sum": 1486180.0, "min": 1486180}, "

[31m[02/22/2019 14:30:00 INFO 140042773481280] #quality_metric: host=algo-1, epoch=59, train rmse <loss>=0.34357239303[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 575.6139755249023, "sum": 575.6139755249023, "min": 575.6139755249023}}, "EndTime": 1550845800.71201, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845800.135833}
[0m
[31m[02/22/2019 14:30:00 INFO 140042773481280] #progress_metric: host=algo-1, completed 30 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 20761, "sum": 20761.0, "min": 20761}, "Total Records Seen": {"count": 1, "max": 2073700, "sum": 2073700.0, "min": 2073700}, "Ma

[31m[02/22/2019 14:30:10 INFO 140042773481280] #quality_metric: host=algo-1, epoch=76, train rmse <loss>=0.271676683421[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 579.2219638824463, "sum": 579.2219638824463, "min": 579.2219638824463}}, "EndTime": 1550845810.879541, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845810.299893}
[0m
[31m[02/22/2019 14:30:10 INFO 140042773481280] #progress_metric: host=algo-1, completed 38 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 26643, "sum": 26643.0, "min": 26643}, "Total Records Seen": {"count": 1, "max": 2661220, "sum": 2661220.0, "min": 2661220}, "

[31m[02/22/2019 14:30:20 INFO 140042773481280] #quality_metric: host=algo-1, epoch=93, train rmse <loss>=0.23205343464[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 608.3459854125977, "sum": 608.3459854125977, "min": 608.3459854125977}}, "EndTime": 1550845820.884078, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845820.275276}
[0m
[31m[02/22/2019 14:30:20 INFO 140042773481280] #progress_metric: host=algo-1, completed 47 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 32525, "sum": 32525.0, "min": 32525}, "Total Records Seen": {"count": 1, "max": 3248740, "sum": 3248740.0, "min": 3248740}, "M

[31m[02/22/2019 14:30:30 INFO 140042773481280] #quality_metric: host=algo-1, epoch=110, train rmse <loss>=0.223481784641[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 608.0150604248047, "sum": 608.0150604248047, "min": 608.0150604248047}}, "EndTime": 1550845830.885026, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845830.276345}
[0m
[31m[02/22/2019 14:30:30 INFO 140042773481280] #progress_metric: host=algo-1, completed 55 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 38407, "sum": 38407.0, "min": 38407}, "Total Records Seen": {"count": 1, "max": 3836260, "sum": 3836260.0, "min": 3836260}, 

[31m[02/22/2019 14:30:40 INFO 140042773481280] #quality_metric: host=algo-1, epoch=127, train rmse <loss>=0.225142132804[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 579.7219276428223, "sum": 579.7219276428223, "min": 579.7219276428223}}, "EndTime": 1550845840.865238, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845840.285062}
[0m
[31m[02/22/2019 14:30:40 INFO 140042773481280] #progress_metric: host=algo-1, completed 64 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 44289, "sum": 44289.0, "min": 44289}, "Total Records Seen": {"count": 1, "max": 4423780, "sum": 4423780.0, "min": 4423780}, 

[31m[02/22/2019 14:30:50 INFO 140042773481280] #quality_metric: host=algo-1, epoch=144, train rmse <loss>=0.273728688349[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 584.4399929046631, "sum": 584.4399929046631, "min": 584.4399929046631}}, "EndTime": 1550845850.921501, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845850.336605}
[0m
[31m[02/22/2019 14:30:50 INFO 140042773481280] #progress_metric: host=algo-1, completed 72 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 50171, "sum": 50171.0, "min": 50171}, "Total Records Seen": {"count": 1, "max": 5011300, "sum": 5011300.0, "min": 5011300}, 

[31m[02/22/2019 14:31:00 INFO 140042773481280] #quality_metric: host=algo-1, epoch=161, train rmse <loss>=0.25522409174[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 581.9649696350098, "sum": 581.9649696350098, "min": 581.9649696350098}}, "EndTime": 1550845860.968799, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845860.386416}
[0m
[31m[02/22/2019 14:31:00 INFO 140042773481280] #progress_metric: host=algo-1, completed 81 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 56053, "sum": 56053.0, "min": 56053}, "Total Records Seen": {"count": 1, "max": 5598820, "sum": 5598820.0, "min": 5598820}, "

[31m[02/22/2019 14:31:11 INFO 140042773481280] #quality_metric: host=algo-1, epoch=178, train rmse <loss>=0.284077982209[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 611.6819381713867, "sum": 611.6819381713867, "min": 611.6819381713867}}, "EndTime": 1550845871.001684, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845870.389535}
[0m
[31m[02/22/2019 14:31:11 INFO 140042773481280] #progress_metric: host=algo-1, completed 89 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 61935, "sum": 61935.0, "min": 61935}, "Total Records Seen": {"count": 1, "max": 6186340, "sum": 6186340.0, "min": 6186340}, 

[31m[02/22/2019 14:31:20 INFO 140042773481280] #quality_metric: host=algo-1, epoch=195, train rmse <loss>=0.255086008703[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 564.7931098937988, "sum": 564.7931098937988, "min": 564.7931098937988}}, "EndTime": 1550845880.980498, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1550845880.415276}
[0m
[31m[02/22/2019 14:31:20 INFO 140042773481280] #progress_metric: host=algo-1, completed 98 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Batches Since Last Reset": {"count": 1, "max": 346, "sum": 346.0, "min": 346}, "Number of Records Since Last Reset": {"count": 1, "max": 34560, "sum": 34560.0, "min": 34560}, "Total Batches Seen": {"count": 1, "max": 67817, "sum": 67817.0, "min": 67817}, "Total Records Seen": {"count": 1, "max": 6773860, "sum": 6773860.0, "min": 6773860}, 


2019-02-22 14:31:32 Uploading - Uploading generated training model
2019-02-22 14:31:32 Completed - Training job completed
Billable seconds: 143


In [75]:
predictor = recommender.deploy(instance_type='ml.c5.xlarge', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2019-02-22-14-48-53-845
INFO:sagemaker:Creating endpoint with name factorization-machines-2019-02-22-14-25-41-535


----------------------------------------------------------------!

In [83]:
def build_request(row):
    vect = row['features']
    return {'data':{ 'features': {'shape':[int(vect.size)], 'keys':list(map(int,vect.indices)), 'values':list(vect.values)}}}


In [141]:
sagemaker_test_df.select('features').where('user_hash_id=-1942492617').rdd.map(build_request).collect()

[{'data': {'features': {'shape': [8934],
    'keys': [22, 3926],
    'values': [1.0, 1.0]}}}]

In [142]:
import json
from sagemaker.predictor import json_deserializer
from sagemaker.predictor import json_serializer

predictor.content_type = 'application/json'
predictor.deserializer = json_deserializer
predictor.serializer = lambda x:x

predictor.predict(json.dumps({'instances': [{'data': {'features': {'shape': [8934], 'keys': [22, 3926], 'values': [1, 1]}}}]}))

{'predictions': [{'score': 0.7248020172119141}]}

In [150]:
def predict_poi(poi_position):
    prediction = predictor.predict(json.dumps({'instances': [{'data': {'features': {'shape': [8934], 'keys': [poi_position, 3926], 'values': [1, 1]}}}]}))
    return prediction['predictions'][0]['score']

predictions = [(poi_position, predict_poi(poi_position)) for poi_position in range(0,31)]

In [151]:
predictions.sort(key=lambda x:x[1], reverse=True)

In [163]:
predictions[:4]

[(0, 0.8622169494628906),
 (16, 0.8516387939453125),
 (3, 0.8335628509521484),
 (8, 0.8225231170654297)]

In [167]:
user_visited_pois = set(recommendation_sample[3]['collect_list(poiName)'])
for (poi_position, score) in predictions[:10]:
   recommended_poi = hash_to_poi_mapping[int(model.stages[2].labels[poi_position])]
   if recommended_poi not in user_visited_pois:
        print(recommended_poi) 

Casey Jr. Circus Train
Star Tours
The Barnstormer
Radiator Springs Racers
Ellen's Energy Adventure
Test Track
Mickey's Fun Wheel
Frontierland Shootin' Arcade


In [161]:
# compare with spark recommendations
recommendation_sample[3]

Row(user_hash_id=-1942492617, collect_list(poiName)=['Animation Academy', 'Swiss Family Treehouse', "It's A Small World", 'Main Street Cinema', 'Indiana Jones Epic Stunt Spectacular!', 'Impressions de France', 'Monsters, Inc. Mike & Sulley to the Rescue!', 'Haunted Mansion', "Roger Rabbit's Car Toon Spin"], poi_names(max(recommendations), collect_list(poiName, 0, 0))='[(\'Pirates of the Caribbean\', 1.0013824701309204), ("Ellen\'s Energy Adventure", 0.9788758158683777), (\'Disney Junior - Live on Stage!\', 0.7382391691207886), ("Frontierland Shootin\' Arcade", 0.4183497130870819), (\'Test Track\', 0.34399351477622986), (\'Casey Jr. Circus Train\', 0.23990291357040405), (\'Pirates of the Caribbean\', 0.19927802681922913)]')