# Recommendation System on Amazon SageMaker - Beginner (Factorization Machine)

In this notebook, we are going to build a simple movie recommendation model with Factorization machine using Amazon Sagemaker

In [11]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

Dataset Description including number of users/moview are in official [Movielens website](https://grouplens.org/datasets/movielens/)

In [20]:
import sagemaker
import boto3

# Ensure to use a valid AWS region
region = 'us-west-2'  # Example region

# Create a SageMaker session with the region
sess = sagemaker.Session(boto_session=boto3.Session(region_name=region))

# Now continue with the rest of the code
sm_boto3 = boto3.client("sagemaker", region_name=region)
bucket = sess.default_bucket()
prefix = 'sagemaker/movielens'
print('Sagemaker session :', sess)
print('S3 bucket :', bucket)
print('Prefix :', prefix)
print('Region selected :', region)


Sagemaker session : <sagemaker.session.Session object at 0x0000026C3B32B350>
S3 bucket : sagemaker-us-west-2-503561413948
Prefix : sagemaker/movielens
Region selected : us-west-2


In [18]:
nbUsers=943
nbMovies=1682
nbFeatures=nbUsers+nbMovies

nbRatingsTrain=90570
nbRatingsTest=9430

In [22]:
# For each user, build a list of rated movies.
# We'd need this to add random negative samples.
moviesByUser = {}
for userId in range(nbUsers):
    moviesByUser[str(userId)]=[]

with open('./ml-100k/ua.base','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,movieId,rating,timestamp in samples:
        moviesByUser[str(int(userId)-1)].append(int(movieId)-1) 

In [23]:
def loadDataset(filename, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(nbUsers)+int(movieId)-1] = 1
            if int(rating) >= 4:
                Y.append(1)
            else:
                Y.append(0)
            line=line+1
            
    Y=np.array(Y).astype('float32')
    return X,Y

In [25]:
X_train, Y_train = loadDataset('./ml-100k/ua.base', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('./ml-100k/ua.test',nbRatingsTest,nbFeatures)

In [26]:
print(X_train.shape)
print(Y_train.shape)

assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
zero_labels = np.count_nonzero(Y_train)

print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print(X_test.shape)
print(Y_test.shape)

assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

(90570, 2625)
(90570,)
Training labels: 49906 zeros, 40664 ones
(9430, 2625)
(9430,)
Test labels: 5469 zeros, 3961 ones


# 3. Convert to protobuf and upload data to S3 bucket

In [27]:
train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [29]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3',region_name='us-west-2').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://sagemaker-us-west-2-503561413948/sagemaker/movielens/train3/train.protobuf
s3://sagemaker-us-west-2-503561413948/sagemaker/movielens/test3/test.protobuf
Output: s3://sagemaker-us-west-2-503561413948/sagemaker/movielens/output


# 4. Start Training

In [43]:
# Specify Docker Contatiner
import sagemaker
from sagemaker import image_uris

# Specify region and algorithm
region = 'us-west-2'
algorithm = 'factorization-machines'
version = 'latest'

# Get the container image URI
container = image_uris.retrieve(
    framework=algorithm,
    region=region,
    version=version
)

print(container)


Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:1


In [51]:
from sagemaker.estimator import Estimator 

fm = Estimator(container,                             # The container that contains the algorithm
               role = 'arn:aws:iam::503561413948:role/blasagerole',  # Your IAM role
               instance_count=1,                       # Updated parameter name
               instance_type='ml.c4.xlarge',           # Updated parameter name
               output_path=output_prefix,
               sagemaker_session=sess)

fm.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      num_factors=64,
                      epochs=10)

fm.fit({'train': train_data, 'test': test_data})


INFO:sagemaker:Creating training-job with name: factorization-machines-2024-12-02-17-57-11-704


2024-12-02 17:57:14 Starting - Starting the training job...
2024-12-02 17:57:29 Starting - Preparing the instances for training...
2024-12-02 17:58:20 Downloading - Downloading the training image...........Docker entrypoint called with argument(s): train
Running default environment configuration script
  if num_device is 1 and 'dist' not in kvstore:
[12/02/2024 18:00:30 INFO 139763737245504] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linear_init_method': 'normal', 'linear_init_sigma': '0.01', 'factors_init_method': 'normal', 'factors_init_sigma': '0.001', 'batch_metrics_publish_interval': '500', '_data_format': 'record', '_kvstore': 'auto', '_learnin

### We should be able to check the logs of how Sagemaker takes cares of training like above

# 5. Model Deploy 

You can easily deploy the model with **.deploy** method. This creates a RESTful HTTP endpoint that can be intergrated to any of the applications that we are trying to use. You can also check whether the model is created in your Sagemaker console.

In [53]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())

fm_predictor = fm.deploy(
    endpoint_name = 'movielens-{}'.format(timestamp),
    initial_instance_count=1,
    instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: factorization-machines-2024-12-02-18-14-33-419
INFO:sagemaker:Creating endpoint-config with name movielens-02-18-14-33
INFO:sagemaker:Creating endpoint with name movielens-02-18-14-33


-------!

# 6. Predicting with the test set

In [55]:
result = fm_predictor.predict(X_test[1000].toarray())
print(result)

ParamValidationError: Parameter validation failed:
Invalid type for parameter Body, value: [[0. 0. 0. ... 0. 0. 0.]], type: <class 'numpy.ndarray'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object