In [None]:
# Download and unpack the MovieLens 100k dataset.
# On Windows, install wget first (https://eternallybored.org/misc/wget/);
# an installation walkthrough video is linked from https://www.jcchouinard.com/wget/.
# NOTE(review): the `unzip` command must also be available on PATH, and the
# code below reads the files from D:/ml-100k -- adjust the paths to wherever
# the archive is actually extracted.
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

import pandas as pd
# ua.base is tab-separated and has no header row, so we pass header=None and
# name the columns explicitly; otherwise the first rating would be consumed
# as the column labels.
data = pd.read_csv(
    'D:/ml-100k/ua.base',  # kindly use your own path
    sep='\t',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
data[:5]

# As the dataset is ordered by user ID, we shuffle it as a precaution.
# On a Unix shell this would be done on the file itself, followed by a look
# at the first few lines:
#%cd ml-100k
#!shuf ua.base -o ua.base.shuffled
#!head -5 ua.base.shuffled

# Portable alternative: shuffle the rows of the DataFrame in memory.
# NOTE(review): `data` is not referenced again in the visible code, and
# loadDataset() below re-reads ua.base from disk, so this shuffle does not
# appear to reach the training matrix -- confirm whether that is intended.
from sklearn.utils import shuffle
data = shuffle(data)

# Sizing constants for the ua.base / ua.test files.
num_users, num_movies = 943, 1682
# One one-hot column per user, followed by one per movie.
num_features = num_users + num_movies
# Number of rating rows expected in the training and test files.
num_ratings_train, num_ratings_test = 90570, 9430

#Now, let's write a function to load a dataset into a sparse matrix. Based on the
#previous explanation, we go through the dataset line by line. In the X matrix,
#we set the appropriate user and movie columns to 1. We also store the rating in the
#Y vector:
import csv
import numpy as np
from scipy.sparse import lil_matrix
def loadDataset(filename, lines, columns, user_count=None):
    """Load a tab-separated ratings file into a one-hot sparse matrix.

    Every non-blank row of the file must hold four tab-separated fields:
    user id, movie id, rating and timestamp. For the i-th sample, column
    ``userId - 1`` and column ``user_count + movieId - 1`` of row ``i`` are
    set to 1, and the rating is stored as the i-th label.

    Args:
        filename: Path of the tab-separated ratings file.
        lines: Number of rows to allocate in the sample matrix.
        columns: Number of columns to allocate in the sample matrix.
        user_count: Number of user columns that precede the movie columns.
            Defaults to the module-level ``num_users``.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a float32 ``lil_matrix`` of shape
        ``(lines, columns)``; ``Y`` is a float32 numpy array holding one
        rating per sample read.

    Raises:
        ValueError: If a non-blank row does not have exactly four fields or
            an id/rating is not an integer.
        IndexError: If a sample falls outside the allocated matrix (for
            example when the file holds more than ``lines`` samples).
    """
    if user_count is None:
        user_count = num_users
    movie_offset = int(user_count)

    X = lil_matrix((lines, columns), dtype='float32')
    Y = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as f:
        # csv.reader yields an empty list for blank lines; skip those so a
        # trailing empty line does not break the four-field unpacking.
        samples = (row for row in csv.reader(f, delimiter='\t') if row)
        for line, (userId, movieId, rating, _timestamp) in enumerate(samples):
            X[line, int(userId) - 1] = 1
            X[line, movie_offset + int(movieId) - 1] = 1
            Y.append(int(rating))
    Y = np.array(Y).astype('float32')
    return X, Y

# Build the training and test matrices from the two split files.
X_train, Y_train = loadDataset(
    'D:/ml-100k/ua.base', num_ratings_train, num_features)
X_test, Y_test = loadDataset(
    'D:/ml-100k/ua.test', num_ratings_test, num_features)

# Sanity check: print the shape of each matrix / label vector, in the order
# train samples, train labels, test samples, test labels.
for dataset in (X_train, Y_train, X_test, Y_test):
    print(dataset.shape)

#Now, let's write a function that converts a dataset to RecordIO-wrapped
#protobuf and uploads it to an S3 bucket. We first create an in-memory binary
#stream with io.BytesIO(). Then, we use the life-saving
#write_spmatrix_to_sparse_tensor() function to write the sample matrix and the
#label vector to that buffer in protobuf format. Finally, we use boto3 to
#upload the buffer to S3:
import io, boto3
import sagemaker.amazon.common as smac

def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize a sparse dataset in memory and upload it to S3.

    Args:
        X: Sparse sample matrix handed to write_spmatrix_to_sparse_tensor().
        Y: Label vector written alongside ``X``.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix inside the bucket.
        key: Object name appended to ``prefix``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    object_key = '{}/{}'.format(prefix, key)

    stream = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(stream, X, Y)
    stream.seek(0)  # rewind so the upload starts at the first byte

    target = boto3.resource('s3').Bucket(bucket).Object(object_key)
    target.upload_fileobj(stream)
    return 's3://{}/{}'.format(bucket, object_key)

#Had our data been stored in a numpy array instead of a lil_matrix, we would
#have used the write_numpy_to_dense_tensor() function instead. It has the
#same effect.
#We apply this function to both datasets, and we store their S3 paths:
import sagemaker
# S3 layout: <default bucket>/fm-movielens/{train,test,output}
prefix = 'fm-movielens'
bucket = sagemaker.Session().default_bucket()

train_key, test_key = 'train.protobuf', 'test.protobuf'
train_prefix = '{}/{}'.format(prefix, 'train')
test_prefix = '{}/{}'.format(prefix, 'test')
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

# Upload both datasets and keep the S3 URIs returned for each of them.
train_data = writeDatasetToProtobuf(
    X_train, Y_train, bucket, train_prefix, train_key)
test_data = writeDatasetToProtobuf(
    X_test, Y_test, bucket, test_prefix, test_key)

import boto3
# Region name of the default boto3 session; resolve_sm_role() below passes it
# to the IAM client.
region = boto3.Session().region_name

def resolve_sm_role():
    """Return the ARN of the first ``AmazonSageMaker-ExecutionRole-*`` IAM role.

    The account's roles are listed page by page, so a matching role is found
    even when there are more roles than one ``list_roles`` response returns.

    Returns:
        The ``Arn`` of the first listed role whose name starts with
        ``AmazonSageMaker-ExecutionRole-``.

    Raises:
        Exception: If no role with that name prefix exists.
    """
    client = boto3.client('iam', region_name=region)
    pages = client.get_paginator('list_roles').paginate(PathPrefix='/')
    for page in pages:
        for role in page['Roles']:
            if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
                return role['Arn']
    raise Exception('Could not resolve what should be the SageMaker role to be used')
                        #resolve_sm_role()
                        #role = get_execution_role()
# Resolve the IAM role ARN that is passed to the Estimator below; the bare
# `role` expression echoes it when run as the last line of a notebook cell.
role = resolve_sm_role()
role

#What comes next is SageMaker business as usual. We find the name of the
#Factorization Machines container, configure the Estimator function, and set the
#hyperparameters:
from sagemaker import image_uris
region = boto3.Session().region_name
# Look up the Factorization Machines container image for this region.
container = image_uris.retrieve('factorization-machines', region)

fm = sagemaker.estimator.Estimator(
    container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',  # you can use ml.m5.xlarge both for free tier.
    output_path=output_prefix,
)

# feature_dim matches the number of columns of the one-hot sample matrices.
hyperparameters = {
    'feature_dim': num_features,
    'predictor_type': 'regressor',
    'num_factors': 64,
    'epochs': 10,
}
fm.set_hyperparameters(**hyperparameters)

#We then launch the training job. Did you notice that we didn't configure training
#inputs? We're simply passing the location of the two protobuf files. As protobuf
#is the default format for Factorization Machines (as well as other built-in
#algorithms), we can save a step:
# Channel name -> S3 URI of the corresponding protobuf file.
training_channels = {'train': train_data, 'test': test_data}
fm.fit(training_channels)

# We'll now send samples to the endpoint in JSON format
# (https://docs.aws.amazon.com/sagemaker/latest/dg/fact-machines.html#fminputoutput).
# For this purpose, we write a custom serializer that converts input data to
# JSON. Responses are decoded by a JSONDeserializer instance, which is passed
# explicitly to deploy() below.
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer
# Name intended for the prediction endpoint.
endpoint_name = 'fm-movielens-100k'
class FMSerializer(JSONSerializer):
    """Serializer that wraps feature rows in an ``{"instances": [...]}`` JSON body."""

    def serialize(self, data):
        """Convert an iterable of feature rows into a JSON request string.

        Args:
            data: Iterable of rows; each row must provide ``tolist()`` (for
                example the rows of a 2-D numpy array).

        Returns:
            A JSON string of the form
            ``{"instances": [{"features": [...]}, ...]}`` with one instance
            per row, in input order. Empty input yields an empty
            ``instances`` list.
        """
        # Imported locally so the class does not rely on a module-level
        # ``import json`` being present.
        import json

        # Build the full list first: every row must be included before the
        # payload is serialized.
        instances = [{'features': row.tolist()} for row in data]
        return json.dumps({'instances': instances})
# Deploy the trained model behind a real-time endpoint named by endpoint_name,
# using the custom JSON serializer for requests and JSONDeserializer for
# responses.
fm_predictor = fm.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",  # you can use ml.m5.xlarge both for free tier.
    serializer=FMSerializer(),
    deserializer=JSONDeserializer(),
)

# We send the first three samples of the test set for prediction. The endpoint
# is deleted in a finally clause so that it is not left running when the
# prediction call raises.
try:
    result = fm_predictor.predict(X_test[:3].toarray())
    print(result)
finally:
    # Finally, we delete the endpoint.
    fm_predictor.delete_endpoint()

