# Neural Collaborative Filtering - Amazon Review Dataset

## Imports and Global Variables

In [1]:
import pandas as pd

from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets.python_splitters import python_chrono_split

In [2]:
DATA_PATH = "../data/amazon_reviews/Instant_Video.csv"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
LOO_TEST_FILE = "loo_test.csv"

MIN_REVIEWS = 15
SEED = 42
EPOCHS = 100 
BATCH_SIZE = 256

## Data

In [3]:
# Load Dataframe
df = pd.read_csv(DATA_PATH, names=["itemID", "userID", "rating", "timestamp"])
df

Unnamed: 0,itemID,userID,rating,timestamp
0,A1EE2E3N7PW666,B000GFDAUG,5.0,1202256000
1,AGZ8SM1BGK3CK,B000GFDAUG,5.0,1198195200
2,A2VHZ21245KBT7,B000GIOPK2,4.0,1215388800
3,ACX8YW2D5EGP6,B000GIOPK2,4.0,1185840000
4,A9RNMO9MUSMTJ,B000GIOPK2,2.0,1281052800
...,...,...,...,...
583928,A9MBSKL8LTFN9,B00LTMHUW4,5.0,1405987200
583929,A3U3RFL0XE4F7V,B00LTMJ29S,4.0,1405382400
583930,AEO407GROR6JB,B00LU8ONBI,1.0,1405468800
583931,A2YXWWVABHWIXN,B00LU8ONBI,5.0,1405728000


In [4]:
# Change user and item identifies to integers
df["userID"] = df.groupby(["userID"]).ngroup()
df["itemID"] = df.groupby(["itemID"]).ngroup()
df

Unnamed: 0,itemID,userID,rating,timestamp
0,45550,0,5.0,1202256000
1,367442,0,5.0,1198195200
2,212056,1,4.0,1215388800
3,354802,1,4.0,1185840000
4,344843,1,2.0,1281052800
...,...,...,...,...
583928,344395,23961,5.0,1405987200
583929,320050,23962,4.0,1405382400
583930,360238,23963,1.0,1405468800
583931,222722,23963,5.0,1405728000


In [5]:
# Look at number of unqiue column values
df.nunique()

itemID       426922
userID        23965
rating            5
timestamp      3027
dtype: int64

In [6]:
# Get value counts per user id 
vc = df.userID.value_counts() 

# Filter only users with more reviews than MIN_REVIEWS 
df = df[df.userID.isin(vc[vc > MIN_REVIEWS].index)] 

In [7]:
# Split into train and test
train, test = python_chrono_split(df, .75)

# Filter out any users or items in test that do note appear in training set
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]

# Leave one out testing
loo_test = test.groupby("userID").last().reset_index()

In [8]:
# Save datasets into csv files
train.to_csv(TRAIN_FILE, index=False)
test.to_csv(TEST_FILE, index=False)
loo_test.to_csv(LOO_TEST_FILE, index=False)

In [9]:
data = NCFDataset(train_file=TRAIN_FILE, test_file=LOO_TEST_FILE, seed=SEED, overwrite_test_file_full=True)

INFO:recommenders.models.ncf.dataset:Indexing train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing loo_test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file loo_test_full.csv ...
100%|███████████████████████████████████████| 3315/3315 [03:36<00:00, 15.35it/s]
INFO:recommenders.models.ncf.dataset:Indexing loo_test_full.csv ...


## Model