In [6]:
import csv
import numpy as np
from scipy.sparse import lil_matrix


num_users = 945
num_movies = 1682
num_features = num_users + num_movies
num_ratings_train = 90570
num_ratings_test = 9430


#Function to upload the datasets
def loadDataset(filename, lines, columns):
# Sparse Matric    
    X = scipy.sparse.lil_matrix((lines, columns)).astype('float32')
#  Y is the movie rating 
    Y = []  
    line = 0
    with open (filename, 'r') as f:
        samples = csv.reader(f, delimiter='\t')
        for userId, movieId, rating, timestap in samples:
            X[line, int(userId)] = 1
            X[line, int(num_users) + int(movieId) -1] = 1
            Y.append(int(rating))
            line=line+1  #index for loop

    Y=np.array(Y).astype('float32')
    return X, Y

#Split data for trainning and test
X_train, Y_train = loadDataset('/home/ec2-user/SageMaker/database/ml-100k/ua.base.shuffled', num_ratings_train, num_features)
X_test, Y_test = loadDataset('/home/ec2-user/SageMaker/database/ml-100k/ua.test', num_ratings_test, num_features)

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)


(90570, 2627)
(90570,)
(9430, 2627)
(9430,)


In [9]:
import io,boto3
import sagemaker.amazon.common as smac
import sagemaker

bucket = sagemaker.Session().default_bucket()
prefix = 'fm-movielens'
train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train')
test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test')
output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)


# Function that converts a dataset to the RecordIO-wrapped protobuf and uploads it to an S3 bucket 
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):

    buf = io.BytesIO()  # in memory binary stream
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)  # sparse matrix
    buf.seek(0)
    print(buf)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    print('Wrote dataset: {}/{}'.format(bucket,obj))
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    

print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

<_io.BytesIO object at 0x7f0058c84fc0>
Wrote dataset: sagemaker-us-east-1-027893685092/fm-movielens/train/train.protobuf
<_io.BytesIO object at 0x7f0058c84fc0>
Wrote dataset: sagemaker-us-east-1-027893685092/fm-movielens/test/test.protobuf
s3://sagemaker-us-east-1-027893685092/fm-movielens/train/train.protobuf
s3://sagemaker-us-east-1-027893685092/fm-movielens/test/test.protobuf
Output: s3://sagemaker-us-east-1-027893685092/fm-movielens/output


In [11]:
from sagemaker import image_uris

sess = sagemaker.Session()

#Find the name of the factorization machines container, configure the estimator function and hyperparameters

region = boto3.Session().region_name
#define algorythm to use in container
container = image_uris.retrieve('factorization-machines', region)
role =  sagemaker.get_execution_role()


fm = sagemaker.estimator.Estimator(container,
                                   role, 
                                   instance_count=1, 
                                   instance_type='ml.c5.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sess)
#required hyperparameters
fm.set_hyperparameters(feature_dim= num_features,
                      predictor_type='regressor',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=10)
#launch the training job
fm.fit({'train': train_data, 'test':test_data})

2020-12-01 00:01:18 Starting - Starting the training job...
2020-12-01 00:01:23 Starting - Launching requested ML instances.........
2020-12-01 00:02:54 Starting - Preparing the instances for training......
2020-12-01 00:04:14 Downloading - Downloading input data
2020-12-01 00:04:14 Training - Downloading the training image...
2020-12-01 00:04:41 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[12/01/2020 00:04:42 INFO 139969969338176] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u

In [53]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

#Create endpoint for inferance
endpoint_name = 'fm-movielens-100k'


In [54]:
import json


#This appears to be due to the removal of the methods in version 2.x of the SageMaker.  Fix bug below.
class FMSerializer(JSONSerializer):
    def serialize(self, data):
       js = {'instances': []}
       for row in data:
              js['instances'].append({'features': row.tolist()})
       return json.dumps(js)



fm_predictor = fm.deploy(endpoint_name = endpoint_name,
                         initial_instance_count=1,
                         instance_type='ml.t2.medium',
                         serializer=FMSerializer(),
                         deserializer= JSONDeserializer())


-----------------!

In [55]:
#Send the first of the test set for prediction
result = fm_predictor.predict(X_test[:3].toarray())
print(result)



{'predictions': [{'score': 3.384589433670044}, {'score': 3.4442405700683594}, {'score': 3.6831166744232178}]}


In [58]:
#Delete endpoint

fm_predictor.delete_endpoint()

In [62]:
import boto3

from sagemaker import image_uris

num_users = 945
num_movies = 1682
num_features = num_users + num_movies
num_ratings_train = 90570
num_ratings_test = 9430

sess = sagemaker.Session()

#Find the name of the factorization machines container, configure the estimator function and hyperparameters

region = boto3.Session().region_name
#define algorythm to use in container
container = image_uris.retrieve('pca', region)
role =  sagemaker.get_execution_role()


pca = sagemaker.estimator.Estimator(container,
                                   role, 
                                   instance_count=1, 
                                   instance_type='ml.c5.xlarge',
                                   output_path=output_prefix
                                   )
#required hyperparameters
pca.set_hyperparameters(feature_dim= num_features,
                      num_components = 64,
                      mini_batch_size=1024,
                      )


In [63]:
#launch the training job
pca.fit({'train': train_data, 'test':test_data})

2020-12-01 02:18:40 Starting - Starting the training job...
2020-12-01 02:18:42 Starting - Launching requested ML instances......
2020-12-01 02:19:57 Starting - Preparing the instances for training.........
2020-12-01 02:21:32 Downloading - Downloading input data
2020-12-01 02:21:32 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/01/2020 02:21:49 INFO 140040044738368] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_num_gpus': u'auto', u'_log_level': u'info', u'subtract_mean': u'true', u'force_dense': u'true', u'epochs': 1, u'algorithm_mode': u'regular', u'extra_components': u'-1', u'_kvstore': u'dist_sync', u'_num_kv_servers': u'auto'}[0m
[34m[12/01/2020 02:21:49 INFO 140040044738368] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {u'feature_dim': u'2627', u'mi

In [None]:
import json


#This appears to be due to the removal of the methods in version 2.x of the SageMaker.  Fix bug below.
class FMSerializer(JSONSerializer):
    def serialize(self, data):
       js = {'instances': []}
       for row in data:
              js['instances'].append({'features': row.tolist()})
       return json.dumps(js)


pca_predictor = pca.deploy(endpoint_name = 'pca-movielens-100k',
                         initial_instance_count=1,
                         instance_type='ml.t2.medium',
                          serializer=FMSerializer(),
                         deserializer= JSONDeserializer())

                         

--

In [None]:
#Send the first of the test set for prediction
result = pca_predictor.predict(X_test[:0].toarray())
print(result)

In [None]:
#Delete endpoint

pca_predictor.delete_endpoint()