In [41]:
import numpy as np
import sys
import time
import pandas as pd
sys.path.append("../../")
sys.path.append("../../reco_utils/recommender/rlrmc/")

from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.dataset import movielens
from reco_utils.recommender.rlrmc.RLRMCdataset import RLRMCdataset 
from reco_utils.recommender.rlrmc.RLRMCalgorithm import RLRMCalgorithm 
# Pymanopt installation is required via
# pip install pymanopt 
from reco_utils.evaluation.python_evaluation import (
    rmse, mae
)
from reco_utils.dataset.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)

# import logging

# %load_ext autoreload
# %autoreload 2

In [2]:
# Record the library and interpreter versions this notebook ran with.
print(f"Pandas version: {pd.__version__}")
print(f"System version: {sys.version}")


Pandas version: 0.25.3
System version: 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:18:16) [MSC v.1916 64 bit (AMD64)]


In [32]:
# Movielens dataset size label: one of '100k', '1m', '10m', or '20m'.
# NOTE(review): no cell visible in this notebook reads this value; the data
# is loaded from fakedata.csv instead -- confirm whether it is still needed.
MOVIELENS_DATA_SIZE = "10m"

# ---- Model parameters ----

# Required: rank of the model, a positive integer (usually small).
rank_parameter = 1

# Required: regularization parameter multiplied to the loss function,
# a positive number (usually small).
regularization_parameter = 0.5

# Optional: initialization option for the model; 'svd' employs singular
# value decomposition (default is 'random').
initialization_flag = "svd"

# Optional: maximum number of iterations for the solver, a positive integer
# (default is 100).
maximum_iteration = 500

# Optional: maximum time in seconds for the solver, a positive integer
# (default is 1000).
maximum_time = 1000

# ---- Reporting ----

# Optional: verbosity of the intermediate results; valid values are 0, 1, 2
# (default is 0).
verbosity = 0

# Optional boolean (default is False): whether to compute the per-iteration
# train RMSE (and test RMSE, if test data is given).
compute_iter_rmse = True

In [4]:
# Read the ratings table from the CSV file in the working directory.
df = pd.read_csv(
    "fakedata.csv",
)

In [5]:
# Display the loaded frame; pandas elides the middle rows in the rendering.
df

Unnamed: 0,userID,itemID,rating,timestamp,Products
0,578,38,3,1015394450,nachos
1,887,67,5,1195846014,cauliflower
2,5330,135,5,781019670,dill
3,3096,196,2,1157315272,salmon
4,1936,171,3,868237049,negroni
...,...,...,...,...,...
1048570,1286,50,2,402681105,garden-tomato
1048571,3740,68,1,407086778,endive
1048572,5230,1,5,842093718,ice-cream
1048573,3271,9,5,174429406,donut


In [33]:
# Random split of df into train and test with ratios [0.8, 0.2]
# (no validation set).
# NOTE(review): the next cell assigns train/test again via
# python_chrono_split, so on a top-to-bottom run this result is replaced.
train, test = python_random_split(df, [0.8, 0.2])

# Other ways to split:
#   both validation and test sets:
#       train, validation, test = python_random_split(df, [0.6, 0.2, 0.2])
#   validation set only (no test set):
#       train, validation = python_random_split(df, [0.8, 0.2])
#   neither (the complete dataset is used for training the model):
#       train = df

In [42]:
# Split df with ratios [0.8, 0.2] using python_chrono_split; this rebinds
# train and test. Presumably the split is by timestamp within each user
# (the displayed train/test rows are consistent with that) -- TODO confirm.
train, test = python_chrono_split(df, [0.8, 0.2])

In [49]:
# Display the training split produced by the cell above.
train

Unnamed: 0,userID,itemID,rating,timestamp,Products
679367,1,119,2,4274891,coconut
60263,1,126,5,15449905,grapefruit
384260,1,143,4,19752316,peppermint
813394,1,187,3,25955515,charoset
157509,1,117,1,29138349,fruit-salad
...,...,...,...,...,...
392055,5999,56,2,1283415553,parsnip
142856,5999,119,2,1285648371,coconut
95395,5999,132,2,1287612126,sweet-cherry
244101,5999,102,1,1291090125,blueberry


In [50]:
# Display the test split produced by the split cell.
test

Unnamed: 0,userID,itemID,rating,timestamp,Products
836384,1,62,1,1307871446,carrot
447495,1,180,3,1309656496,white-wine
708391,1,4,5,1310385758,nougat
472270,1,181,4,1310957521,popcorn
762711,1,155,2,1319189237,asparagus-jp
...,...,...,...,...,...
398,5999,7,4,1545829731,chocolate-mousse
804226,5999,175,5,1546199293,tea
233937,5999,117,1,1552534304,fruit-salad
162195,5999,78,3,1562553436,daiquiri


In [43]:
# Wrap the splits in an RLRMCdataset. Only train and test are passed here
# (no validation set). Other combinations:
#   RLRMCdataset(train=train, validation=validation, test=test)
#   RLRMCdataset(train=train, validation=validation)   # no test set
#   RLRMCdataset(train=train)                          # no validation or test set
data = RLRMCdataset(
    train=train,
    test=test,
)

In [44]:
# Construct the RLRMC model from the settings in the configuration cell and
# the model_param attribute exposed by the dataset object.
model = RLRMCalgorithm(
    rank=rank_parameter,
    C=regularization_parameter,
    model_param=data.model_param,
    initialize_flag=initialization_flag,
    maxiter=maximum_iteration,
    max_time=maximum_time,
)

In [45]:
# Echo the model object (shows its default repr).
model

<reco_utils.recommender.rlrmc.RLRMCalgorithm.RLRMCalgorithm at 0x2260ee99ef0>

In [46]:
# Time the call to model.fit. perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, so the result is not affected by
# system clock adjustments (unlike a difference of time.time() readings).
start_time = time.perf_counter()

model.fit(data, verbosity=verbosity)

# fit_and_evaluate will compute RMSE on the validation set (if given) at every iteration
# model.fit_and_evaluate(data,verbosity=verbosity)

# Elapsed seconds spent inside model.fit. The RLRMCalgorithm object itself is
# constructed in an earlier cell, so that construction is not included here.
# NOTE(review): the original comment said this covers "model initialization
# and model training"; presumably that initialization happens inside fit().
train_time = time.perf_counter() - start_time

print("Took {} seconds for training.".format(train_time))

Took 3.4726624488830566 seconds for training.


In [47]:
# Example of scoring individual (userID, itemID) pairs -- here (60586, 54775)
# and (52681, 36519) in the Movielens 10m dataset:
# output = model.predict([60586, 52681], [54775, 36519])

# Obtain predictions for every (userID, itemID) pair in the test set.
predictions_ndarr = model.predict(
    test["userID"].values,
    test["itemID"].values,
)

In [48]:
# Put each test (userID, itemID) pair next to its predicted rating.
predictions_df = pd.DataFrame(
    {
        "userID": test["userID"].values,
        "itemID": test["itemID"].values,
        "prediction": predictions_ndarr,
    }
)

# Test-set error metrics: RMSE first, then MAE.
eval_rmse = rmse(test, predictions_df)
eval_mae = mae(test, predictions_df)

print(f"RMSE:\t{eval_rmse:f}", f"MAE:\t{eval_mae:f}", sep="\n")

RMSE:	2.384315
MAE:	1.744371


In [61]:
# Largest predicted rating over the test pairs (a single scalar).
# NOTE(review): the ratings visible in the data samples range from 1 to 5, so
# a maximum far above that would suggest predictions are not clipped to the
# rating scale -- confirm whether clipping is wanted before evaluation.
predictions_df['prediction'].max()

15.02004972525179

In [None]:
# Preview the first rows of the prediction table.
predictions_df.head()

In [52]:
# Column-wise maxima of the prediction table (userID, itemID, prediction).
predictions_df.max()

userID        5999.00000
itemID         199.00000
prediction      15.02005
dtype: float64

In [57]:
# Show the first ten rows of the test split.
test.head(10)

Unnamed: 0,userID,itemID,rating,timestamp,Products
836384,1,62,1,1307871446,carrot
447495,1,180,3,1309656496,white-wine
708391,1,4,5,1310385758,nougat
472270,1,181,4,1310957521,popcorn
762711,1,155,2,1319189237,asparagus-jp
891955,1,171,4,1320837145,negroni
152045,1,1,4,1322050828,ice-cream
809884,1,95,4,1331221627,tequila-sunrise
261566,1,139,5,1331566609,ginger
25950,1,82,2,1332011556,mai-tai
