# Factorization Machines - building predictions for Movie Ratings

## Act 1 - Download and prepare the data

In [None]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd

bucket = 'YOURBUCKETFROMLAB1' # TODO: replace with the bucket created in Lab 1 -- bucket name only, without the s3:// scheme
# Key prefix placed in front of every S3 object key this notebook builds.
prefix = 'sagemaker/fm-movielens'

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

In [None]:
%cd ~/SageMaker/ml-100k
!shuf ua.base -o ua.base.shuffled
print 'shuffed base info\n'
!head -10 ua.base.shuffled
print 'shuffed base info\n'
!head -10 ua.test

## Act 2 - Build training set and test set

Here is what the dataset contains.

ml-100k contains multiple text files, but we’re only going to use two of them to build our model:

    ua.base (90,570 samples) will be our training set.
    ua.test (9,430 samples) will be our test set.

Both files have the same tab-separated format:

    user id (integer between 1 and 943)
    movie id (integer between 1 and 1682)
    rating (integer between 1 and 5)
    timestamp (epoch-based integer)

But what if we didn't know this -- how would we discover it? Let's load it into pandas and look:



In [None]:
# The rating files have no header row. pandas expects header=None for that;
# negative integers such as -1 are rejected by current pandas versions.
NO_HEADER = None
COLUMN_NAMES = ['user_id','movie_id','rating','timestamp']

ua_base = pd.read_table('~/SageMaker/ml-100k/ua.base', header=NO_HEADER, names=COLUMN_NAMES)
print('Number of entries in base dataset: ' + str(len(ua_base)))

ua_test = pd.read_table('~/SageMaker/ml-100k/ua.test', header=NO_HEADER, names=COLUMN_NAMES)
print('Number of entries in test dataset: ' + str(len(ua_test)))

print('user count: ' + str(ua_base['user_id'].nunique()))
print('movie count: ' + str(ua_base['movie_id'].nunique()) + '\n')

# Leave the frame as the cell's last expression so the notebook renders it as a table.
ua_base.head()

In [None]:
# Size the user block from the largest user id seen in either file, not from
# the number of distinct ids: the one-hot column of a user is (user id - 1),
# so any gap in the ids would otherwise push user columns into the movie block.
nbUsers = int(max(ua_base['user_id'].max(), ua_test['user_id'].max()))
# Movie ids are integers between 1 and 1682 (see the format description above).
nbMovies = 1682
# One column per user followed by one column per movie.
nbFeatures = nbUsers + nbMovies

nbRatingsTrain = len(ua_base)
nbRatingsTest = len(ua_test)

In [None]:
# Map every zero-based user index (kept as a string key) to the list of
# zero-based movie indices that user rated in the shuffled training file.
# We'd need this to add random negative samples.
moviesByUser = {str(userIndex): [] for userIndex in range(nbUsers)}

with open('ua.base.shuffled','r') as ratingsFile:
    for record in csv.reader(ratingsFile, delimiter='\t'):
        userId, movieId, rating, timestamp = record
        ratedMovies = moviesByUser[str(int(userId) - 1)]
        ratedMovies.append(int(movieId) - 1)

In [None]:
def loadDataset(filename, lines, columns, userCount=None):
    """Read a tab-separated ratings file into a one-hot feature matrix and binary labels.

    Each non-empty row must hold four fields: user id, movie id, rating, timestamp.
    For row i the matrix gets a 1 in column (user id - 1) and a 1 in column
    (userCount + movie id - 1). The label is 1 when the rating is >= 4, else 0.
    The timestamp is read but not used.

    Parameters:
        filename:  path of the ratings file to read.
        lines:     number of rows to allocate in the matrix; the file must not
                   contain more data rows than this.
        columns:   number of columns to allocate in the matrix.
        userCount: width of the user block, i.e. the offset of the first movie
                   column. Defaults to the notebook-level nbUsers when omitted.

    Returns:
        (X, Y): X is a float32 scipy lil_matrix of shape (lines, columns),
        Y is a float32 numpy vector with one label per row read.
    """
    if userCount is None:
        # Backward-compatible default: use the value computed in an earlier cell.
        userCount = nbUsers
    movieOffset = int(userCount)

    # Allocate directly as float32 instead of building float64 and converting.
    X = lil_matrix((lines, columns), dtype='float32')
    labels = []
    line = 0
    with open(filename, 'r') as f:
        samples = csv.reader(f, delimiter='\t')
        for row in samples:
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on the four-way unpack below.
                continue
            userId, movieId, rating, timestamp = row
            X[line, int(userId) - 1] = 1
            X[line, movieOffset + int(movieId) - 1] = 1
            labels.append(1 if int(rating) >= 4 else 0)
            line += 1

    Y = np.array(labels, dtype='float32')
    return X, Y

In [None]:
# Build the feature matrix and label vector for each split. The relative file
# names resolve against the directory selected by the earlier %cd cell.
X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)

In [None]:
# Sanity-check the shapes of both splits and report the label balance.
print('--------------------------------')
print('X Train Shape : ' + str(X_train.shape) + ' should be equal to ' + str((nbRatingsTrain, nbFeatures)))
assert X_train.shape == (nbRatingsTrain, nbFeatures)

print('Y Train Shape : ' + str(Y_train.shape) + ' should be equal to ' + str((nbRatingsTrain, )))
assert Y_train.shape == (nbRatingsTrain, )

# np.count_nonzero counts the 1 labels, so the zeros are the remainder.
zero_labels = nbRatingsTrain - np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print('--------------------------------')
print('X test shape ' + str(X_test.shape))
print('Y test shape ' + str(Y_test.shape))
assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = nbRatingsTest - np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

### Convert to protobuf and save to S3

In [None]:
# Object name and key prefix for the training split.
train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

# Object name and key prefix for the test split.
test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

# S3 URI printed after the upload; presumably where the training job writes
# its output -- confirm in the model-building cell.
output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [None]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize features and labels in memory and upload the result to S3.

    X and Y are written to an in-memory buffer with
    smac.write_spmatrix_to_sparse_tensor, and the buffer is uploaded as the
    object '<prefix>/<key>' in `bucket`.

    Returns the 's3://<bucket>/<prefix>/<key>' URI of the uploaded object.
    """
    object_key = '{}/{}'.format(prefix, key)

    payload = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(payload, X, Y)
    # Rewind so the upload reads from the first byte.
    payload.seek(0)

    target = boto3.resource('s3').Bucket(bucket).Object(object_key)
    target.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Upload both splits and keep the S3 URIs returned for each.
train_data = writeDatasetToProtobuf(
    X_train, Y_train, bucket, train_prefix, train_key)
test_data = writeDatasetToProtobuf(
    X_test, Y_test, bucket, test_prefix, test_key)

for uploaded_uri in (train_data, test_data):
    print(uploaded_uri)
print('Output: {}'.format(output_prefix))

### Run training job

In [None]:
# Look up the factorization-machines container image for the region of the
# default boto3 session.
# NOTE(review): get_image_uri is the SageMaker Python SDK v1 helper; SDK v2
# replaces it with sagemaker.image_uris.retrieve -- confirm the installed SDK version.
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "factorization-machines", "latest")
print('Using SageMaker container: {} ({})'.format(container, region_name))


In [None]:
# BUILD CODE GOES HERE FOR MODEL...
# TODO: create and fit the Factorization Machines estimator in this cell and
# bind it to the name `fm`. The deploy cell below calls fm.deploy(...) and
# raises NameError until `fm` exists.

## Act 3 - Deploy model

In [None]:
# Deploy the model held in `fm` (to be defined in the model-building cell above)
# on a single ml.c4.xlarge instance and keep the returned predictor.
fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)

In [None]:
def fm_serializer(data):
    """Encode rows of feature values as a JSON request string.

    `data` is an iterable of rows, each exposing .tolist() (for example the
    rows of a 2-D numpy array). The result has the form
    {"instances": [{"features": [...]}, ...]} with one entry per row.
    """
    instances = [{'features': row.tolist()} for row in data]
    return json.dumps({'instances': instances})

# Send requests as JSON built by fm_serializer and decode responses with the
# SDK's json_deserializer.
fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

### Run predictions

In [None]:
# Score ten test rows (indices 1000-1009). toarray() densifies the sparse slice
# because fm_serializer calls .tolist() on each row.
result = fm_predictor.predict(X_test[1000:1010].toarray())
print(result)
# Ground-truth labels for the same rows, for comparison with the predictions.
print (Y_test[1000:1010])