In [46]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [47]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [48]:
DATASET = 'Grocery_and_Gourmet_Food'
RAW_PATH = os.path.join('./', DATASET)
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 99

# Load Data

1. Load interaction data and item metadata
2. Filter out unuseful items in metadata
3. Calculate basic statistics

In [49]:
# download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATA_FILE)):
    print('Downloading interaction data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{}_5.json.gz'
        .format(RAW_PATH, DATASET), shell=True)
if not os.path.exists(os.path.join(RAW_PATH, META_FILE)):
    print('Downloading item metadata into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_{}.json.gz'
        .format(RAW_PATH, DATASET), shell=True)

In [50]:
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewerID,helpful,reviewText,reviewerName,summary,asin,overall,reviewTime,unixReviewTime
0,A1VEELTKS8NLZB,"[0, 0]",Just another flavor of Kit Kat but the taste i...,Amazon Customer,Good Taste,616719923X,4.0,"06 1, 2013",1370044800
1,A14R9XMZVJ6INB,"[0, 1]",I bought this on impulse and it comes from Jap...,amf0001,"3.5 stars, sadly not as wonderful as I had hoped",616719923X,3.0,"05 19, 2014",1400457600
2,A27IQHDZFQFNGG,"[3, 4]",Really good. Great gift for any fan of green t...,Caitlin,Yum!,616719923X,4.0,"10 8, 2013",1381190400
3,A31QY5TASILE89,"[0, 0]","I had never had it before, was curious to see ...",DebraDownSth,Unexpected flavor meld,616719923X,5.0,"05 20, 2013",1369008000
4,A2LWK003FFMCI5,"[1, 2]",I've been looking forward to trying these afte...,Diana X.,"Not a very strong tea flavor, but still yummy ...",616719923X,4.0,"05 26, 2013",1369526400


In [51]:
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,title,imUrl,salesRank,asin,description,categories,related,price,brand
0,100 Percent All Natural Vanilla Extract,http://ecx.images-amazon.com/images/I/41gFi5h0...,{'Grocery & Gourmet Food': 374004},0657745316,This is real vanilla extract made with only 3 ...,[[Grocery & Gourmet Food]],{'also_viewed': ['B001GE8N4Y']},,
1,Pure Darjeeling Tea: Loose Leaf,http://ecx.images-amazon.com/images/I/51hs8sox...,{'Grocery & Gourmet Food': 620307},0700026444,"Silverpot Tea, Pure Darjeeling, is an exquisit...",[[Grocery & Gourmet Food]],,,
2,WWE Kids Todler Velvet Slippers featuring John...,http://ecx.images-amazon.com/images/I/518SEST5...,,1403796890,Must have for any WWE Fan\n \n \n \nFeaturing ...,[[Grocery & Gourmet Food]],,3.99,
3,Archer Farms Strawberry Dragonfruit Drink Mix ...,http://ecx.images-amazon.com/images/I/51CFQIis...,{'Grocery & Gourmet Food': 620322},141278509X,Infused with Vitamins and Electrolytes Good So...,[[Grocery & Gourmet Food]],{'also_viewed': ['B0051IETTY']},,
4,Mio Energy Liquid Water Enhancer Black Cherry ...,http://ecx.images-amazon.com/images/I/51EUsMcn...,{'Grocery & Gourmet Food': 268754},1453060375,MiO Energy is your portable energy source givi...,[[Grocery & Gourmet Food]],"{'also_viewed': ['B006MSEOJ2', 'B005VOOQLO', '...",11.99,Mio


In [52]:
# Only retain items that appear in interaction data

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
all_items = set(useful_meta_df['asin'].values.tolist())

def related_filter(related_dict):
    out_dict = dict()
    if related_dict is not np.nan:
        for r in related_dict:
            out_dict[r] = list(all_items & set(related_dict[r]))
    return out_dict

useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

### Statistics

In [53]:
n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()

In [54]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 14681
# Items: 8713
# Interactions: 151254
Time Span: 2000-08-09/2014-07-23


# Build Dataset

### Interaction data

In [55]:
np.random.seed(RANDOM_SEED)

In [56]:
out_df = data_df.rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
out_df = out_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df = out_df.sort_values(by=['time', 'user_id'], kind='mergesort').reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A1KXONFPU2XQ5K,B00004S1C5,965779200
1,A23GFTVIETX7DS,B0000CH39R,1068249600
2,A281NPSIMI1C2R,B0000DBN1H,1073433600
3,A3M174IC0VXOS2,B0000537AF,1075593600
4,A218J1WI08045B,B0001EQN88,1082073600


In [57]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,2177,3,965779200
1,4161,18,1068249600
2,4698,23,1073433600
3,10146,6,1075593600
4,3915,126,1082073600


In [58]:
# leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    n_items = data_df['item_id'].value_counts().size
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [59]:
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(121892, 14681, 14681)

In [60]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,2177,3,965779200
1,4161,18,1068249600
2,4698,23,1073433600
3,10146,6,1075593600
4,3915,126,1082073600


In [61]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
104,6185,762,1149206400,"[2733, 3265, 4860, 7892, 4374, 5875, 6745, 346..."
203,3299,1096,1154044800,"[1209, 2493, 771, 8287, 5996, 2345, 3092, 3913..."
369,12801,319,1158278400,"[5394, 4318, 4606, 2563, 6214, 4653, 6236, 424..."
525,10679,511,1163980800,"[1054, 2978, 5492, 3894, 2680, 4951, 2666, 305..."
528,4501,509,1164067200,"[4516, 6730, 3291, 1563, 8653, 3124, 1839, 696..."


In [62]:
# save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

### Item Metadata

In [63]:
# level-2 category

l2_cate_lst = list()
for cate_lst in useful_meta_df['categories']:
    l2_cate_lst.append(cate_lst[0][2] if len(cate_lst[0]) > 2 else np.nan)
useful_meta_df['l2_category'] = l2_cate_lst  
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = dict(zip(l2_cates, range(1, len(l2_cates) + 1)))
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(lambda x: l2_dict[x] if x == x else 0)

In [64]:
item_meta_data = dict()
for idx in range(len(useful_meta_df)):
    info = useful_meta_df.iloc[idx]['related']
    item_meta_data[idx] = {
        'item_id': item2id[useful_meta_df.iloc[idx]['asin']],
        'i_category': useful_meta_df.iloc[idx]['l2_category'],
        'r_complement': list(map(lambda x: item2id[x], info['also_bought'])) if 'also_bought' in info else [],
        'r_substitute': list(map(lambda x: item2id[x], info['also_viewed'])) if 'also_viewed' in info else [],
    }

item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'i_category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,1,0,"[6285, 194, 6427, 1647, 7856, 287, 3710, 401, ...","[6285, 194, 4169, 6068, 2286, 6426, 6191]"
1,2,0,"[5880, 3844, 5634, 844, 4198, 843, 5666, 5665,...","[8281, 1020, 2192, 844, 841, 843, 1651, 3075, ..."
2,3,21,"[4879, 7578, 161, 2202, 2627, 2901, 141, 2135]","[16, 5828, 109, 4555]"
3,4,0,"[1749, 8528, 6838, 8073, 4851, 7899, 7299, 573...","[4398, 7274, 4851]"
4,5,0,"[365, 351, 364, 690, 696, 368, 371, 475, 370, ...","[365, 364, 363, 354, 353, 7838, 360, 357, 371,..."


In [65]:
# save results

item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)