In [8]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

--2021-04-06 12:52:53--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: 'ml-100k.zip'

     0K .......... .......... .......... .......... ..........  1%  110K 43s
    50K .......... .......... .......... .......... ..........  2%  217K 32s
   100K .......... .......... .......... .......... ..........  3% 1.55M 22s
   150K .......... .......... .......... .......... ..........  4%  214K 22s
   200K .......... .......... .......... .......... ..........  5% 3.05M 18s
   250K .......... .......... .......... .......... ..........  6%  247K 18s
   300K .......... .......... .......... .......... ..........  7% 2.52M 15s
   350K .......... .......... .......... .......... ..........  8% 1.67M 13s
   400K .......... .........

In [25]:
import pandas as pd
data = pd.read_csv('D:/ml-100k/ua.base',sep="	")
data[:5]

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,6,5,887431973


In [26]:
from sklearn.utils import shuffle
data = shuffle(data)

In [27]:
data[:5]

Unnamed: 0,1,1.1,5,874965758
9836,98,435,5,880498967
35449,354,922,4,891216825
62713,629,729,4,880117852
36299,368,777,2,889783586
66840,664,318,5,876525044


In [28]:
num_users = 943
num_movies = 1682
num_features = num_users+num_movies
num_ratings_train = 90570
num_ratings_test = 9430

In [30]:
import csv
import numpy as np
from scipy.sparse import lil_matrix
def loadDataset(filename, lines, columns):
    X = lil_matrix((lines, columns)).astype('float32')
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(num_users)+int(movieId)-1] = 1
            Y.append(int(rating))
            line=line+1
    Y=np.array(Y).astype('float32')
    return X,Y

In [35]:
X_train, Y_train = loadDataset('D:/ml-100k/ua.base',num_ratings_train,num_features)
X_test, Y_test = loadDataset('D:/ml-100k/ua.test',num_ratings_test,num_features)

In [36]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


In [37]:
import io, boto3
import sagemaker.amazon.common as smac

def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)

In [38]:
import sagemaker
bucket = sagemaker.Session().default_bucket()
prefix = 'fm-movielens'
train_key = 'train.protobuf'
train_prefix = '{}/{}'.format(prefix, 'train')
test_key = 'test.protobuf'
test_prefix = '{}/{}'.format(prefix, 'test')

In [39]:
output_prefix = 's3://{}/{}/output'.format(bucket,prefix)
train_data = writeDatasetToProtobuf(X_train, Y_train,bucket, train_prefix, train_key)
test_data = writeDatasetToProtobuf(X_test, Y_test,bucket, test_prefix, test_key)

In [40]:
import boto3
region = boto3.Session().region_name

def resolve_sm_role():
    client = boto3.client('iam', region_name=region)
    response_roles = client.list_roles(PathPrefix='/',
                                       # Marker='string'
                                       MaxItems=999) 
    for role in response_roles['Roles']:
        if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
               # print('Resolved SageMaker IAM Role to: ' + str(role))
                return role['Arn']
    raise Exception('Could not resolve what should be the SageMaker role to be used')
                        #resolve_sm_role()
                        #role = get_execution_role()
role = resolve_sm_role()
role

'arn:aws:iam::501851762093:role/service-role/AmazonSageMaker-ExecutionRole-20210211T005553'

In [41]:
from sagemaker import image_uris
region=boto3.Session().region_name
container=image_uris.retrieve('factorization-machines',
region)
fm=sagemaker.estimator.Estimator(
container,
role=role,
instance_count=1,
instance_type='ml.m4.xlarge',
output_path=output_prefix)
fm.set_hyperparameters(
feature_dim=num_features,
predictor_type='regressor',
num_factors=64,
epochs=10)

In [42]:
fm.fit({'train': train_data, 'test': test_data})

2021-04-06 08:11:54 Starting - Starting the training job...
2021-04-06 08:12:18 Starting - Launching requested ML instancesProfilerReport-1617696715: InProgress
......
2021-04-06 08:13:18 Starting - Preparing the instances for training......
2021-04-06 08:14:39 Downloading - Downloading input data...
2021-04-06 08:14:59 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[04/06/2021 08:15:26 INFO 140519436588864] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'li


2021-04-06 08:15:42 Uploading - Uploading generated training model
2021-04-06 08:15:42 Completed - Training job completed
Training seconds: 76
Billable seconds: 76


In [48]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer
endpoint_name = 'fm-movielens-100k'
class FMSerializer(JSONSerializer):
    def serialize(self, data):
        js = {'instances': []}
        for row in data:
            js['instances'].append({'features': row.tolist()})
            return json.dumps(js)
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=FMSerializer(),
    deserializer= JSONDeserializer()
)

--------------!

In [49]:
result = fm_predictor.predict(X_test[:3].toarray())
print(result)

NameError: name 'json' is not defined

In [50]:
fm_predictor.delete_endpoint()