# Factorization Machines on MovieLens

### Download ml-25m dataset

In [None]:
%%sh
# Fetch and unpack the MovieLens 25M archive.
# -nc skips the download when ml-25m.zip already exists, and -n keeps unzip
# from prompting to overwrite files, so the cell can be re-run safely.
wget -nc https://files.grouplens.org/datasets/movielens/ml-25m.zip
unzip -n ml-25m.zip

### Build training set and test set

In [15]:
# Dataset dimensions used to size the feature matrix.
num_users = 162_541
num_movies = 62_423
num_ratings = 25_000_095

# Largest movie id; it is larger than num_movies, so movie ids are sparse.
max_movieid = 209_171

# One one-hot column per user, followed by one column per possible movie id.
num_features = num_users + max_movieid

In [16]:
# Show the total width of the one-hot feature space (num_users + max_movieid).
print(num_features)

371712


In [17]:
import csv, sys
import numpy as np
from scipy.sparse import lil_matrix

def loadDataset(filename, lines, columns, user_count=None):
    """Load a ratings CSV into a one-hot sparse feature matrix and a label vector.

    The file must start with a header row, followed by rows of
    ``userId,movieId,rating,timestamp``. Each rating sets two cells to 1 in its
    row: column ``userId - 1`` and column ``user_count + movieId - 1``.

    Args:
        filename: Path of the CSV file to read.
        lines: Number of rows to allocate in the feature matrix.
        columns: Number of columns to allocate in the feature matrix.
        user_count: Column offset at which the movie columns start. When
            omitted, the module-level ``num_users`` is used, which is what the
            function always did before this parameter existed.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a float32 ``lil_matrix`` of shape
        ``(lines, columns)`` and ``Y`` is a float32 numpy array with one rating
        per data row read.
    """
    if user_count is None:
        user_count = num_users
    movie_offset = int(user_count)

    # Allocate directly as float32 instead of building float64 and copying.
    X = lil_matrix((lines, columns), dtype='float32')
    Y = []
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(filename, 'r', newline='') as f:
        samples = csv.reader(f, delimiter=',')
        next(samples, None)  # Skip header; tolerate a completely empty file
        for line, (userId, movieId, rating, timestamp) in enumerate(samples):
            X[line, int(userId) - 1] = 1
            X[line, movie_offset + int(movieId) - 1] = 1
            Y.append(float(rating))
    Y = np.array(Y).astype('float32')
    return X, Y

In [None]:
%%time
X, Y = loadDataset('ml-25m/ratings.csv', num_ratings, num_features)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=59,
)

In [None]:
# Report the shape of each split: training features/labels, then test features/labels.
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

### Convert to protobuf and save to S3

In [3]:
import sagemaker, boto3

In [7]:
# Bucket and common key prefix for everything this notebook stores in S3.
bucket = sagemaker.Session().default_bucket()
prefix = 'fm-movielens-25m'

# Object names of the serialized datasets.
train_key = 'train.protobuf'
test_key = 'test.protobuf'

# Key prefixes under which each dataset is stored.
train_prefix = '{}/{}'.format(prefix, 'train')
test_prefix = '{}/{}'.format(prefix, 'test')

# S3 location passed to the estimator as its output path.
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

In [None]:
%%time
import io
import sagemaker.amazon.common as smac

def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize features and labels in memory and upload the result to S3.

    Args:
        X: Sparse feature matrix handed to ``smac.write_spmatrix_to_sparse_tensor``.
        Y: Label vector written alongside ``X``.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix inside the bucket.
        key: Object name appended to ``prefix``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    object_key = '{}/{}'.format(prefix, key)

    payload = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(payload, X, Y)
    payload.seek(0)  # Rewind so the upload reads from the first byte

    s3_object = boto3.resource('s3').Bucket(bucket).Object(object_key)
    s3_object.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Upload both splits and keep their S3 URIs for the training job.
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)
test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)

for uri in (train_data, test_data):
    print(uri)
print('Output: {}'.format(output_prefix))

### Run training job

In [1]:
# If you want to use existing files
#
# The URIs are rebuilt from the bucket/prefix/key settings defined earlier in
# this notebook instead of being hard-coded to one AWS account's bucket. That
# way this cell points at the same objects the upload step writes, and running
# the whole notebook in another account does not redirect training to a bucket
# it cannot read.
train_data = 's3://{}/{}/{}'.format(bucket, train_prefix, train_key)
test_data  = 's3://{}/{}/{}'.format(bucket, test_prefix, test_key)

In [4]:
from sagemaker import image_uris

# Look up the Factorization Machines training image for the session's region.
region = boto3.Session().region_name
container = image_uris.retrieve(framework='factorization-machines', region=region)
print(container)

438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:1


In [10]:
# Estimator for the Factorization Machines container, running on a single instance.
fm = sagemaker.estimator.Estimator(
    container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.c5.xlarge',
    output_path=output_prefix,
    volume_size=1,
)

# Regression on the rating value, over the full one-hot feature width.
fm.set_hyperparameters(
    feature_dim=num_features,
    predictor_type='regressor',
    num_factors=64,
    epochs=1,
)

# Both channels share the same settings and differ only in their S3 location.
channel_settings = dict(
    distribution='FullyReplicated',
    content_type='application/x-recordio-protobuf',
    s3_data_type='S3Prefix',
    input_mode='Pipe',
)
s3_train_data = sagemaker.TrainingInput(train_data, **channel_settings)
s3_test_data = sagemaker.TrainingInput(test_data, **channel_settings)

fm.fit({'train': s3_train_data, 'test': s3_test_data})

2020-07-24 14:36:24 Starting - Starting the training job...
2020-07-24 14:36:26 Starting - Launching requested ML instances......
2020-07-24 14:37:32 Starting - Preparing the instances for training...
2020-07-24 14:38:13 Downloading - Downloading input data...
2020-07-24 14:38:51 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[07/24/2020 14:38:53 INFO 140594729453376] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning

[34m[07/24/2020 14:39:24 INFO 140594729453376] Iter[0] Batch [3000]#011Speed: 96754.07 samples/sec[0m
[34m[07/24/2020 14:39:24 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=3000 train rmse <loss>=1.08619105479[0m
[34m[07/24/2020 14:39:24 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=3000 train mse <loss>=1.17981100752[0m
[34m[07/24/2020 14:39:24 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=3000 train absolute_loss <loss>=0.857865714338[0m
[34m[07/24/2020 14:39:29 INFO 140594729453376] Iter[0] Batch [3500]#011Speed: 96385.60 samples/sec[0m
[34m[07/24/2020 14:39:29 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=3500 train rmse <loss>=1.08259293534[0m
[34m[07/24/2020 14:39:29 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=3500 train mse <loss>=1.17200746365[0m
[34m[07/24/2020 14:39:29 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=3500 train absolute_l

[34m[07/24/2020 14:40:55 INFO 140594729453376] Iter[0] Batch [11000]#011Speed: 86153.02 samples/sec[0m
[34m[07/24/2020 14:40:55 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=11000 train rmse <loss>=1.06857943605[0m
[34m[07/24/2020 14:40:55 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=11000 train mse <loss>=1.14186201114[0m
[34m[07/24/2020 14:40:55 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=11000 train absolute_loss <loss>=0.84838495915[0m
[34m[07/24/2020 14:41:00 INFO 140594729453376] Iter[0] Batch [11500]#011Speed: 86476.72 samples/sec[0m
[34m[07/24/2020 14:41:00 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=11500 train rmse <loss>=1.06830448283[0m
[34m[07/24/2020 14:41:00 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=11500 train mse <loss>=1.14127446804[0m
[34m[07/24/2020 14:41:00 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=11500 train abs

[34m[07/24/2020 14:42:36 INFO 140594729453376] Iter[0] Batch [19500]#011Speed: 83664.09 samples/sec[0m
[34m[07/24/2020 14:42:36 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=19500 train rmse <loss>=1.06566293783[0m
[34m[07/24/2020 14:42:36 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=19500 train mse <loss>=1.13563749707[0m
[34m[07/24/2020 14:42:36 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=19500 train absolute_loss <loss>=0.846873935623[0m
[34m[07/24/2020 14:42:42 INFO 140594729453376] Iter[0] Batch [20000]#011Speed: 83646.31 samples/sec[0m
[34m[07/24/2020 14:42:42 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=20000 train rmse <loss>=1.06556133732[0m
[34m[07/24/2020 14:42:42 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=20000 train mse <loss>=1.13542096359[0m
[34m[07/24/2020 14:42:42 INFO 140594729453376] #quality_metric: host=algo-1, epoch=0, batch=20000 train ab


2020-07-24 14:45:18 Completed - Training job completed
Training seconds: 425
Billable seconds: 425


### Deploy model

In [11]:
# Deploy the trained model to a single-instance endpoint with a fixed name.
endpoint_name = 'fm-movielens-25m'
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
)

---------------!

In [12]:
import json

def fm_serializer(data):
    """Encode rows of features as a JSON request body.

    Args:
        data: Iterable of rows; each row must provide ``tolist()``.

    Returns:
        A JSON string of the form
        ``{"instances": [{"features": [...]}, ...]}`` with one entry per row.
    """
    instances = [{'features': row.tolist()} for row in data]
    return json.dumps({'instances': instances})

from sagemaker.serializers import JSONSerializer


class FMSerializer(JSONSerializer):
    """Serializer object that delegates to ``fm_serializer``.

    SageMaker Python SDK v2 predictors call ``serializer.serialize(data)`` and
    take the request content type from the serializer, so a bare function can
    no longer be assigned and ``content_type`` cannot be set on the predictor.
    ``JSONSerializer`` supplies the 'application/json' content type.
    """

    def serialize(self, data):
        """Return the JSON request body produced by ``fm_serializer`` for ``data``."""
        return fm_serializer(data)


fm_predictor.serializer = FMSerializer()

### Run predictions

In [None]:
# Densify the first three test rows so each can be serialized as a feature list.
sample_rows = X_test[:3].toarray()
result = fm_predictor.predict(sample_rows)
print(result)

In [None]:
# Tear down the endpoint behind fm_predictor once predictions are done.
fm_predictor.delete_endpoint()