# Neural Collaborative Filtering - Amazon Review Dataset

## Imports and Global Variables

In [1]:
import pandas as pd

from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.utils.timer import Timer

# NOTE(review): `recommenders.datasets.ncd` does not appear in the published
# recommenders package layout; the NCF Dataset class is documented under
# `recommenders.models.ncf.dataset`. Keep the original path first and fall back
# so the notebook survives Restart & Run All either way — confirm and simplify.
try:
    from recommenders.datasets.ncd import Dataset as NCFDataset
except ImportError:
    from recommenders.models.ncf.dataset import Dataset as NCFDataset

In [2]:
# --- Input / output locations ---
# Raw ratings file that the notebook loads.
DATA_PATH = "../data/amazon_reviews/Office_Products.csv"
# CSV files the train / test / leave-one-out splits are written to.
TRAIN_FILE, TEST_FILE, LOO_TEST_FILE = "train.csv", "test.csv", "loo_test.csv"

# --- Filtering and reproducibility ---
MIN_REVIEWS = 15  # per-user review-count threshold used when filtering users
SEED = 42         # shared seed passed to the dataset and the model

# --- Training hyperparameters ---
EPOCHS, BATCH_SIZE = 100, 256

## Data

In [3]:
# Read the ratings CSV, supplying the four column names explicitly.
df = pd.read_csv(
    DATA_PATH,
    names=["itemID", "userID", "rating", "timestamp"],
)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,itemID,userID,rating,timestamp
0,0140503528,A2WJLOXXIB7NF3,3.0,1162512000
1,0140503528,A1RKICUK0GG6VF,5.0,1147132800
2,0140503528,A1QA5E50M398VW,5.0,1142035200
3,0140503528,A3N0HBW8IP8CZQ,5.0,980294400
4,0140503528,A1K1JW1C5CUSUZ,5.0,964915200
...,...,...,...,...
5581308,B01HJF6JIA,A1MZM2RPZNEAV7,5.0,1519084800
5581309,B01HJF6JIA,A3MY8HH0ZQDXIL,5.0,1513641600
5581310,B01HJF6JIA,A1FNE2AS2H4UCN,5.0,1507766400
5581311,B01HJF6JIA,A13K8XJJE5OWBJ,5.0,1507680000


In [4]:
# Count the distinct values in each column (column-wise, i.e. axis=0).
df.nunique(axis=0)

itemID        306800
userID       3404914
rating             5
timestamp       6985
dtype: int64

In [5]:
# Number of reviews written by each user, indexed by userID.
vc = df["userID"].value_counts()

# Keep only the rows whose author has strictly more than MIN_REVIEWS reviews:
# look up each row's user count in `vc` and compare against the threshold.
df = df[df["userID"].map(vc) > MIN_REVIEWS]

A3TTVIQ9RRHNIB    438
AC5HC2TJVWOFT     339
A3OXHLG6DIBRW8    337
A3N77PJ4KP3CJP    285
AVU1ILDDYW301     225
                 ... 
A26XZ0XKDY68TI      1
A3CRLM7USFSS13      1
A17Y39FI2YVK0K      1
A34RN56CII0XPS      1
A13K8XJJE5OWBJ      1
Name: userID, Length: 3404914, dtype: int64

In [7]:
# Chronological train/test split with a 0.75 split ratio.
train, test = python_chrono_split(df, 0.75)

# Drop test rows whose user or item does not appear in the training set.
test = test[
    test["userID"].isin(train["userID"].unique())
    & test["itemID"].isin(train["itemID"].unique())
]

# Leave-one-out evaluation set: a single row per user, taken via GroupBy.last().
# NOTE(review): assumes `test` is ordered by time within each user so the
# "last" row is the most recent interaction — confirm against the splitter.
loo_test = (
    test
    .groupby("userID")
    .last()
    .reset_index()
)

In [None]:
# Write each split to its CSV path, leaving the DataFrame index out of the file.
for _split, _csv_path in (
    (train, TRAIN_FILE),
    (test, TEST_FILE),
    (loo_test, LOO_TEST_FILE),
):
    _split.to_csv(_csv_path, index=False)

In [None]:
# Build the NCF dataset from the saved training file and the leave-one-out
# test file.
# NOTE(review): userID/itemID in this data are alphanumeric strings — confirm
# that NCFDataset accepts non-integer IDs.
data = NCFDataset(
    train_file=TRAIN_FILE,
    test_file=LOO_TEST_FILE,
    seed=SEED,
    overwrite_test_file_full=True,
)

## Model

In [None]:
# Configure the NCF model; user/item counts come from the dataset object.
model = NCF(
    # Problem size
    n_users=data.n_users,
    n_items=data.n_items,
    # Architecture
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    # Optimisation
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    # Logging and reproducibility
    verbose=10,
    seed=SEED,
)

In [None]:
# Fit the model on the prepared dataset and time the run.
# `Timer` comes from recommenders.utils.timer and is imported in the imports
# cell at the top of the notebook; without that import this cell raises
# NameError on a fresh kernel.
with Timer() as train_time:
    model.fit(data)

# `interval` holds the elapsed time measured by the Timer context manager.
print(f"Took {train_time.interval} seconds for training.")