In [4]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [5]:
def parse(path):
    """Yield one parsed record per line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (the loaders in
    this notebook feed it files whose lines are dict literals).

    Args:
        path: Path of the gzip file to read.

    Yields:
        The Python object described by each line, in file order.

    Raises:
        ValueError / SyntaxError: if a line is not a plain Python literal.
    """
    import ast  # local import: only this function needs it

    # The file content is downloaded data, so it must never be executed.
    # ast.literal_eval accepts only literals (dict, list, str, numbers, ...)
    # and rejects calls, attribute access and names, unlike eval().
    # The `with` block closes the file even if the consumer stops iterating
    # early or a line fails to parse.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # literal_eval needs text; UTF-8 is the default source encoding
            # eval() applies to bytes input, so decoding keeps the same result.
            yield ast.literal_eval(l.decode('utf-8'))

def get_df(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file; with
    ``orient='index'`` those keys become the row labels, so each record
    ends up as one row.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [6]:
# Sampling settings: number of negative items drawn for every dev/test row,
# and the seed handed to NumPy's global random generator.
NEG_ITEMS = 1000
RANDOM_SEED = 0

# Dataset name; it is both the local directory name and part of the file names.
DATASET = 'Beauty'
META_FILE = 'meta_{}.json.gz'.format(DATASET)
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
RAW_PATH = os.path.join('./', DATASET)

# Load Data

1. Load interaction data and item metadata
2. Filter out metadata for items that do not appear in the interaction data
3. Calculate basic statistics

In [7]:
# Download the data files if they are not already present.

# Create the target directory without going through a shell; exist_ok makes
# this a no-op when the directory is already there.
os.makedirs(RAW_PATH, exist_ok=True)

_base_url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'
for _file_name, _description in ((DATA_FILE, 'interaction data'), (META_FILE, 'item metadata')):
    _target = os.path.join(RAW_PATH, _file_name)
    if os.path.exists(_target):
        continue
    print('Downloading {} into {}'.format(_description, RAW_PATH))
    try:
        # Argument list + cwd instead of a shell string: no quoting problems
        # if the path contains spaces or shell metacharacters.
        # --fail makes curl exit non-zero on HTTP errors instead of saving the
        # error page under the data file's name.
        subprocess.check_call(['curl', '--fail', '-O', _base_url + _file_name], cwd=RAW_PATH)
    except subprocess.CalledProcessError:
        # Remove a partial download so the existence check above does not
        # mistake it for a complete file on the next run.
        if os.path.exists(_target):
            os.remove(_target)
        raise

In [8]:
# Load the interaction file (DATA_FILE under RAW_PATH) into a DataFrame
# and display its first rows.
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewTime,asin,reviewText,summary,overall,helpful,reviewerName,unixReviewTime,reviewerID
0,"01 30, 2014",7806397051,Very oily and creamy. Not at all what I expect...,Don't waste your money,1.0,"[3, 4]",Andrea,1391040000,A1YJEY40YUW4SE
1,"04 18, 2014",7806397051,This palette was a decent price and I was look...,OK Palette!,3.0,"[1, 1]",Jessica H.,1397779200,A60XNB876KYML
2,"09 6, 2013",7806397051,The texture of this concealer pallet is fantas...,great quality,4.0,"[0, 1]",Karen,1378425600,A3G6XNM240RMWA
3,"12 8, 2013",7806397051,I really can't tell what exactly this thing is...,Do not work on my face,2.0,"[2, 2]",Norah,1386460800,A1PQFP6SAJ6D80
4,"10 19, 2013",7806397051,"It was a little smaller than I expected, but t...",It's okay.,3.0,"[0, 0]",Nova Amor,1382140800,A38FVHZTNQ271F


In [9]:
# Load the item metadata file (META_FILE under RAW_PATH) into a DataFrame
# and display its first rows.
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,title,asin,categories,description,imUrl,salesRank,price,related,brand
0,Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,205616461,"[[Beauty, Skin Care, Face, Creams & Moisturize...","As we age, our once youthful, healthy skin suc...",http://ecx.images-amazon.com/images/I/41DecrGO...,{'Health & Personal Care': 461765},,,
1,Eco Friendly Ecotools Quality Natural Bamboo C...,558925278,"[[Beauty, Tools & Accessories, Makeup Brushes ...",Mineral Powder Brush--Apply powder or mineral ...,http://ecx.images-amazon.com/images/I/51L%2BzY...,{'Beauty': 402875},,,
2,Mastiha Body Lotion,733001998,"[[Beauty, Skin Care, Body, Moisturizers, Lotio...","From the Greek island of Chios, this Mastiha b...",http://ecx.images-amazon.com/images/I/311WK5y1...,{'Beauty': 540255},,,
3,Hello Kitty Lustre Lipstick (See sellers comme...,737104473,"[[Beauty, Makeup, Lips, Lipstick]]",Limited edition Hello Kitty Lipstick featuring...,http://ecx.images-amazon.com/images/I/31u6Hrzk...,{'Beauty': 931125},,,
4,Stephanie Johnson Mermaid Round Snap Mirror,762451459,"[[Beauty, Tools & Accessories, Mirrors, Makeup...","The mermaid is an elusive (okay, mythical) cre...",http://ecx.images-amazon.com/images/I/41y2%2BF...,,19.98,,


In [10]:
# Only retain items that appear in interaction data

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
all_items = set(useful_meta_df['asin'].values.tolist())

def related_filter(related_dict):
    """Restrict a 'related' entry to items that are present in ``all_items``.

    Args:
        related_dict: Mapping from a relation name to a collection of item
            ids, or a missing value (NaN) when the item has no such entry.

    Returns:
        A new dict with the same keys whose values are lists holding only
        the ids found in ``all_items``; an empty dict for a missing entry.
    """
    # A type check instead of `is not np.nan`: identity only matches the
    # np.nan singleton and would let any other NaN float (or None) fall
    # through to the loop and fail.
    if not isinstance(related_dict, dict):
        return dict()
    # sorted() gives a stable list; plain set iteration order can change
    # from one interpreter run to the next.
    return {r: sorted(all_items & set(ids)) for r, ids in related_dict.items()}

useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

### Statistics

In [11]:
# Basic counts: distinct reviewers, distinct items, and total review rows.
n_users = data_df['reviewerID'].nunique()
n_items = data_df['asin'].nunique()
n_clicks = data_df.shape[0]

# Earliest and latest review timestamps.
review_times = data_df['unixReviewTime']
min_time = review_times.min()
max_time = review_times.max()

In [12]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
# pd.to_datetime(..., unit='s') reads the value as seconds since the Unix
# epoch and returns a naive UTC timestamp. It replaces
# datetime.utcfromtimestamp, which is deprecated since Python 3.12, and
# prints the same dates.
print('Time Span: {}/{}'.format(
    pd.to_datetime(min_time, unit='s').strftime(time_format),
    pd.to_datetime(max_time, unit='s').strftime(time_format))
)

# Users: 22363
# Items: 12101
# Interactions: 198502
Time Span: 2002-06-12/2014-07-23


# Build Dataset

### Interaction data

In [13]:
# Seed NumPy's global random generator so the np.random draws used for
# negative sampling are repeatable between runs.
np.random.seed(RANDOM_SEED)

In [14]:
# Reduce the reviews to (user_id, item_id, time) triples, drop exact repeats,
# order them by time (then user_id) and renumber the rows from 0.
out_df = (
    data_df[['reviewerID', 'asin', 'unixReviewTime']]
    .rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'unixReviewTime': 'time'})
    .drop_duplicates()
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A281NPSIMI1C2R,B0000535UX,1023840000
1,A281NPSIMI1C2R,B0000535UM,1024185600
2,A281NPSIMI1C2R,B0000535UN,1024185600
3,AWIF8AR75LL9L,B000065DK4,1036627200
4,A281NPSIMI1C2R,B000052Y33,1052611200


In [15]:
# Replace the original user / item identifiers with consecutive integers
# starting at 1, assigned in sorted order of the original identifiers.

uids = sorted(out_df['user_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
iids = sorted(out_df['item_id'].unique())
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

# Every value in the column is a key of the mapping built from that column.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,7188,104,1023840000
1,7188,101,1024185600
2,7188,102,1024185600
3,21815,152,1036627200
4,7188,23,1052611200


In [16]:
# leave-one-out splitting

# For each user, the set of every item id that user has interacted with.
clicked_item_set = {
    user_id: set(seq_df['item_id'].values.tolist())
    for user_id, seq_df in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df):
    """Carve two leave-one-out evaluation sets out of ``data_df``.

    Two rounds are run. Each round takes the last remaining row of every
    user (in the frame's current row order), removes those rows from the
    working frame, and attaches NEG_ITEMS randomly drawn item ids that the
    user has not interacted with.

    Args:
        data_df: DataFrame with 'user_id' and 'item_id' columns. Item ids
            are assumed to be integers numbered from 1, since sampling
            starts at 1 -- confirm against the caller.

    Returns:
        (result_dfs, data_df): ``result_dfs`` is a list of two DataFrames
        (first round first), each with an added 'neg_items' column;
        ``data_df`` holds the rows left after both rounds.

    Reads the module-level ``clicked_item_set`` and ``NEG_ITEMS``, and draws
    from NumPy's global random generator.
    """
    result_dfs = []
    # Use the largest item id as the sampling bound rather than the number of
    # distinct items in this frame. The frame may be missing some items, and
    # the distinct count would then make the highest ids impossible to draw.
    n_items = int(data_df['item_id'].max())
    for _ in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                # Redraw until the candidate is an item this user never clicked.
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [17]:
# Hold back each user's first row (out_df row order) before splitting --
# presumably so every user keeps at least one training interaction.
leave_df = out_df.groupby('user_id').head(1)

# NOTE: data_df is rebound here; after this cell it holds the rows left over
# from the split, not the raw review frame.
(test_df, dev_df), data_df = generate_dev_test(out_df.drop(leave_df.index))

# Training set = held-back first rows + leftover rows, restored to out_df order.
train_df = pd.concat([leave_df, data_df]).sort_index()

train_df.shape[0], dev_df.shape[0], test_df.shape[0]

(153776, 22363, 22363)

In [18]:
# Display the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,item_id,time
0,7188,104,1023840000
1,7188,101,1024185600
2,7188,102,1024185600
3,21815,152,1036627200
4,7188,23,1052611200


In [19]:
# Display the first rows of the test split, including its 'neg_items' column.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
41,12076,2747,1084838400,"[2733, 10800, 9846, 3265, 4860, 9226, 7892, 43..."
108,11223,1274,1113264000,"[1170, 4058, 11509, 6614, 184, 11461, 4208, 53..."
132,11386,572,1119830400,"[11064, 4990, 9516, 6468, 512, 2680, 4367, 271..."
208,4428,1078,1131926400,"[11452, 3628, 9168, 9020, 3350, 7653, 7666, 66..."
243,4751,956,1138924800,"[4225, 3501, 7525, 1386, 4784, 10950, 11754, 7..."


In [20]:
# save results

# Write each split as a tab-separated file under RAW_PATH, without the row index.
for split_file, split_df in (('train.csv', train_df), ('dev.csv', dev_df), ('test.csv', test_df)):
    split_df.to_csv(os.path.join(RAW_PATH, split_file), sep='\t', index=False)