In [1]:
import ast
import gzip
import os
import subprocess
from datetime import datetime, timezone

import numpy as np
import pandas as pd

In [2]:
def parse(path):
    """Yield one record per non-empty line of a gzip-compressed dump.

    Every line is evaluated as a single Python literal (the notebook's
    loader has always evaluated lines as Python expressions rather than
    parsing them as JSON).

    Args:
        path: Path of the ``.gz`` file to read.

    Yields:
        The object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # The context manager closes the file even if the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(path, 'rb') as g:
        for l in g:
            text = l.decode('utf-8').strip()
            if not text:
                continue  # tolerate blank lines instead of failing on them
            # literal_eval only accepts literals, so a crafted line in the
            # downloaded file cannot run arbitrary code the way eval() could.
            yield ast.literal_eval(text)

def get_df(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records become rows labelled 0, 1, 2, ... in file order, and the
    record keys become the columns.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [3]:
# Product category to preprocess; it is substituted into the directory and file names below.
DATASET = 'Cell_Phones_and_Accessories'
# Working directory (relative to the current directory) for the raw downloads and the generated CSV files.
RAW_PATH = os.path.join('./', DATASET)
# File name of the review (interaction) dump.
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
# File name of the item metadata dump.
META_FILE = 'meta_{}.json.gz'.format(DATASET)

# Load Data

1. Load interaction data and item metadata
2. Keep only items that have both interactions and relation metadata
3. Calculate basic statistics

In [5]:
# Download the raw files if they are not present yet.

BASE_URL = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'

# Portable replacement for shelling out to `mkdir`; a no-op when the directory exists.
os.makedirs(RAW_PATH, exist_ok=True)

for raw_file, message in (
        (DATA_FILE, 'Downloading interaction data into ' + RAW_PATH),
        (META_FILE, 'Downloading item metadata into ' + RAW_PATH)):
    target_path = os.path.join(RAW_PATH, raw_file)
    if os.path.exists(target_path):
        continue
    print(message)
    # Download to a temporary name and rename only on success, so an interrupted
    # or failed transfer is never mistaken for a complete file on the next run.
    partial_path = target_path + '.part'
    # -f: fail on HTTP errors instead of saving the error page; -L: follow redirects.
    # check=True raises CalledProcessError when curl exits with a non-zero status.
    subprocess.run(['curl', '-f', '-L', '-o', partial_path, BASE_URL + raw_file], check=True)
    os.replace(partial_path, target_path)

Downloading interaction data into ./Cell_Phones_and_Accessories
Downloading item metadata into ./Cell_Phones_and_Accessories


In [6]:
# Load the review dump; each record in the file becomes one row.
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,asin,unixReviewTime,reviewerID,helpful,reviewTime,overall,reviewText,summary,reviewerName
0,120401325X,1400630400,A30TL5EWN6DFXT,"[0, 0]","05 21, 2014",4.0,They look good and stick good! I just don't li...,Looks Good,christina
1,120401325X,1389657600,ASY55RVNIL0UD,"[0, 0]","01 14, 2014",5.0,These stickers work like the review says they ...,Really great product.,emily l.
2,120401325X,1403740800,A2TMXE2AFO7ONB,"[0, 0]","06 26, 2014",5.0,These are awesome and make my phone look so st...,LOVE LOVE LOVE,Erica
3,120401325X,1382313600,AWJ0WZQYMYFQ4,"[4, 4]","10 21, 2013",4.0,Item arrived in great time and was in perfect ...,Cute!,JM
4,120401325X,1359849600,ATX7CZYFXI1KW,"[2, 3]","02 3, 2013",5.0,"awesome! stays on, and looks great. can be use...",leopard home button sticker for iphone 4s,patrice m rogoza


In [7]:
# Load the item metadata dump; each record in the file becomes one row.
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,related,asin,price,brand,title,categories,description,salesRank,imUrl
0,"{'also_bought': ['B00C56IXFG', 'B008ZUQWOK', '...",0110400550,3.33,,Pink &amp; White 3d Melt Ice-cream Skin Hard C...,"[[Cell Phones & Accessories, Cases, Basic Cases]]",Pink & White 3D Melt Ice-Cream Skin Hard Case ...,{'Cell Phones & Accessories': 83460},http://ecx.images-amazon.com/images/I/31zn6SOL...
1,"{'buy_after_viewing': ['B008RU7UL2', 'B00698LY...",011040047X,1.94,,Purple Hard Case Cover for Iphone 4 4s 4g with...,"[[Cell Phones & Accessories, Cases, Basic Cases]]",Purple Hard Case Cover for iPhone 4 4S 4G With...,{'Cell Phones & Accessories': 495795},http://ecx.images-amazon.com/images/I/41WCZc2d...
2,"{'buy_after_viewing': ['B00530RXP2', 'B004SH9B...",0195866479,2.94,,Hello Kitty Light-weighted Chrome Case Black C...,"[[Cell Phones & Accessories, Cases, Basic Cases]]","Thin and light weighted,\nCase's unique design...",{'Cell Phones & Accessories': 371302},http://ecx.images-amazon.com/images/I/41fy1%2B...
3,"{'buy_after_viewing': ['B0042FV2SI', 'B00869D2...",0214514706,0.94,,Cool Summer Breeze in the Ocean Beach Collecti...,"[[Cell Phones & Accessories, Cases, Basic Cases]]",Product Name: Cool Summer Breeze In The Ocean...,{'Cell Phones & Accessories': 778100},http://ecx.images-amazon.com/images/I/415cmp6Q...
4,"{'buy_after_viewing': ['B008EU7HRM', 'B00869D2...",0214714705,5.79,,Cool Summer Breeze In The Ocean Beach Collecti...,"[[Cell Phones & Accessories, Cases, Basic Cases]]",Product Name: Cool Summer Breeze In The Ocean...,{'Cell Phones & Accessories': 654894},http://ecx.images-amazon.com/images/I/41XDwPt2...


In [8]:
# Filter items

# Keep only the metadata rows whose item appears in the interaction data.
useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])]
# Drop items without a 'related' entry. notna() recognises any missing-value
# marker, whereas an identity test against np.nan only matches that one object.
items_with_info = useful_meta_df['related'].notna()
useful_meta_df = useful_meta_df[items_with_info].reset_index(drop=True)

# Ids of the items that survive the filtering.
all_items = set(useful_meta_df['asin'].values.tolist())
def related_filter(related_dict, valid_items=None):
    """Restrict every relation list to items that are kept in the dataset.

    Args:
        related_dict: Mapping from a relation name to a list of item ids.
        valid_items: Collection of ids to keep. When omitted, the
            notebook-level ``all_items`` set is used.

    Returns:
        A new dict with the same keys. Each value lists the retained ids,
        de-duplicated and in the order they had in the input list.
    """
    if valid_items is None:
        valid_items = all_items
    # dict.fromkeys removes duplicates (as the former set intersection did)
    # but keeps the input order. Converting a set of strings to a list gives
    # an order that changes with hash randomisation, which made the saved
    # relation lists differ from one run to the next.
    return {
        relation: [item for item in dict.fromkeys(candidates) if item in valid_items]
        for relation, candidates in related_dict.items()
    }

# Trim every relation list to the retained items, then drop the interactions
# whose item is not among them.
filtered_related = useful_meta_df['related'].apply(related_filter)
useful_meta_df['related'] = filtered_related
has_retained_item = data_df['asin'].isin(all_items)
data_df = data_df[has_retained_item]

### Statistics

In [9]:
# Distinct users and items, number of interactions, and the covered time range.
n_users = data_df['reviewerID'].nunique()
n_items = data_df['asin'].nunique()
n_clicks = data_df.shape[0]
review_times = data_df['unixReviewTime']
min_time = review_times.min()
max_time = review_times.max()

In [10]:
time_format = '%Y-%m-%d'

# Render the Unix timestamps as UTC calendar dates. Timezone-aware
# fromtimestamp() replaces datetime.utcfromtimestamp(), which is deprecated
# since Python 3.12; the printed dates are the same.
first_day = datetime.fromtimestamp(int(min_time), tz=timezone.utc).strftime(time_format)
last_day = datetime.fromtimestamp(int(max_time), tz=timezone.utc).strftime(time_format)

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(first_day, last_day))

# Users: 27878
# Items: 10329
# Interactions: 193228
Time Span: 2001-02-22/2014-07-23


# Build Dataset

### Interaction data

In [11]:
# Seed NumPy's global random generator, which the negative sampling below draws from.
np.random.seed(2019)
# Number of negative item ids sampled for each held-out (dev/test) interaction.
NEG_ITEMS = 99

In [12]:
# Reduce the reviews to (user, item, time) triples, remove exact duplicates
# and order them chronologically (ties broken by user id, then item id).
interaction_cols = ['user_id', 'item_id', 'time']
out_df = (
    data_df
    .rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
    .loc[:, interaction_cols]
    .drop_duplicates(interaction_cols)
    .sort_values(by=['time', 'user_id', 'item_id'])
)
out_df.head()

Unnamed: 0,user_id,item_id,time
357,A3TB9HNQR54B5V,B00002X29G,982800000
417,A2BH04B9G9LOYA,B000056PYW,1033689600
529,A1KD8NJPZ01R37,B0000SX3BK,1070668800
533,A10RMVX6EE90N6,B0000SX3BK,1072051200
531,A5JLAU2ARJ0BO,B0000SX3BK,1074729600


In [13]:
# reindex (start from 1)

# Raw ids are numbered consecutively in sorted order, starting at 1.
uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_uid: new_uid for new_uid, raw_uid in enumerate(uids, start=1)}
item2id = {raw_iid: new_iid for new_iid, raw_iid in enumerate(iids, start=1)}

# Both mappings cover every id in out_df, so the lookups cannot miss.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,20746,28,982800000
1,9642,30,1033689600
2,4134,37,1070668800
3,207,37,1072051200
4,21654,37,1074729600


In [14]:
# leave one out splitting

# For every user, the set of all item ids they interacted with;
# the negative sampling uses it to reject items the user already has.
clicked_item_set = {
    uid: set(user_rows['item_id'].values.tolist())
    for uid, user_rows in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df):
    """Split off two held-out frames (at most one row per user each) with sampled negatives.

    Two rounds are run. In each round the last row of every user, in the
    current row order of ``data_df``, is moved into a held-out frame, and
    ``NEG_ITEMS`` random item ids that are not in the user's entry of the
    global ``clicked_item_set`` are attached in a ``neg_items`` column.

    Relies on the notebook-level names ``iids``, ``NEG_ITEMS`` and
    ``clicked_item_set`` and on NumPy's global random generator.

    Args:
        data_df: Interaction frame with a ``user_id`` column. The caller's
            frame is not modified; dropping rows rebinds the local name.

    Returns:
        ``(result_dfs, data_df)``: the list of the two held-out frames
        (first round first) and the frame of the remaining rows.
    """
    result_dfs = []
    for idx in range(2):  # two rounds; idx itself is not used
        # Last row of each user; users with no rows left contribute nothing.
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        # Draw all candidates at once: ids from 1 to len(iids) inclusive (randint's upper bound is exclusive).
        neg_items = np.random.randint(1, len(iids) + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                # Redraw until the candidate is an item the user never interacted with.
                # Duplicates among one row's negatives are not prevented.
                # NOTE(review): this would never terminate for a user who interacted with every item.
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, len(iids) + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [15]:
# Reserve every user's first row of out_df (the earliest, since out_df is sorted by time)
# so that it can never be held out and always ends up in the training set.
leave_df = out_df.groupby('user_id').head(1)
# NOTE: data_df is rebound here and no longer refers to the review frame loaded earlier.
data_df = out_df.drop(leave_df.index)

# The first held-out round (each user's last remaining row) becomes the test set, the second the dev set.
[test_df, dev_df], data_df = generate_dev_test(data_df)
# Training set = reserved first rows + rows that were not held out, restored to index order.
train_df = pd.concat([leave_df, data_df]).sort_index()

# Sizes of the three splits.
len(train_df), len(dev_df), len(test_df)

(137478, 27873, 27877)

In [16]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,item_id,time
0,20746,28,982800000
1,9642,30,1033689600
2,4134,37,1070668800
3,207,37,1072051200
4,21654,37,1074729600


In [17]:
# Preview the first rows of the test split, including the sampled negatives.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
170,15404,52,1142467200,"[7241, 4287, 2073, 8990, 7440, 2445, 107, 1041..."
510,25372,127,1184371200,"[5901, 5034, 6442, 6091, 4520, 7092, 5569, 991..."
929,8947,317,1212710400,"[5209, 8630, 3382, 1697, 8040, 1012, 7834, 988..."
1562,18576,279,1237248000,"[8958, 7882, 6324, 442, 3395, 210, 7774, 8051,..."
1770,671,393,1244419200,"[6465, 7666, 4410, 8849, 997, 5050, 5692, 1014..."


In [18]:
# Write the three splits as tab-separated files (without the index) into RAW_PATH.

for split_file, split_df in (('train.csv', train_df), ('dev.csv', dev_df), ('test.csv', test_df)):
    split_df.to_csv(os.path.join(RAW_PATH, split_file), sep='\t', index=False)

### Item Metadata

In [19]:
# l2 category

# Take the entry at index 2 of each item's first category path (NaN when that path is shorter).
l2_cate_lst = [
    path_lst[0][2] if len(path_lst[0]) > 2 else np.nan
    for path_lst in useful_meta_df['categories']
]
useful_meta_df['l2_category'] = l2_cate_lst
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
# Category names get the codes 1..K in sorted order; 0 stands for "no category".
l2_dict = {cate: code for code, cate in enumerate(l2_cates, start=1)}
# NaN is the only value that differs from itself, so `cate != cate` detects the missing entries.
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(
    lambda cate: 0 if cate != cate else l2_dict[cate])

In [20]:
# Build one metadata record per item: re-indexed item id, category code and
# the re-indexed ids of its related items.
# The three needed columns are walked in lockstep; looking rows up with
# `.iloc[idx]` builds a whole row object on every access (three times per item).
item_meta_data = dict()
meta_columns = zip(
    useful_meta_df['asin'],
    useful_meta_df['l2_category'],
    useful_meta_df['related'],
)
for idx, (asin, category, info) in enumerate(meta_columns):
    item_meta_data[idx] = {
        'item_id': item2id[asin],
        'category': category,
        # 'also_bought' items are stored as complements, 'also_viewed' items as substitutes;
        # a missing relation yields an empty list.
        'r_complement': [item2id[x] for x in info.get('also_bought', [])],
        'r_substitute': [item2id[x] for x in info.get('also_viewed', [])],
    }

item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,category,r_complement,r_substitute
0,1,4,"[6351, 6475, 8748, 6540, 4520, 7417, 4016, 797...",[]
1,2,5,"[8761, 8135, 9047, 6695, 7977, 7843, 7858, 873...",[]
2,3,9,"[7006, 1421, 932, 9341, 4920, 4778, 8392, 8109...",[]
3,4,4,"[3985, 1184, 1353, 3671, 1085, 2214, 1094, 108...",[]
4,5,4,"[3985, 1184, 2978, 3671, 1085, 2214, 1021, 109...",[]


In [21]:
# save results
# Written as a tab-separated file without the index, next to the split files in RAW_PATH.

item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)