In [91]:
import os
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [92]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [93]:
DATASET = 'Home_and_Kitchen'
RAW_PATH = os.path.join('./', DATASET)
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

# Load Data

1. Load interaction data and item metadata
2. Filter out unuseful items
3. Calculate basic statistics

In [94]:
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewerID,reviewText,reviewTime,reviewerName,overall,summary,asin,unixReviewTime,helpful
0,APYOBQE6M18AA,My daughter wanted this book and the price on ...,"10 19, 2013",Martin Schwartz,5.0,Best Price,615391206,1382140800,"[0, 0]"
1,A1JVQTAGHYOL7F,I bought this zoku quick pop for my daughterr ...,"06 18, 2014",Michelle Dinh,5.0,zoku,615391206,1403049600,"[0, 0]"
2,A3UPYGJKZ0XTU4,There is no shortage of pop recipes available ...,"05 5, 2013",mirasreviews,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",615391206,1367712000,"[26, 27]"
3,A2MHCTX43MIMDZ,This book is a must have if you get a Zoku (wh...,"08 4, 2011","M. Johnson ""Tea Lover""",5.0,Creative Combos,615391206,1312416000,"[14, 18]"
4,AHAI85T5C2DH3,This cookbook is great. I have really enjoyed...,"06 7, 2014",PugLover,4.0,A must own if you own the Zoku maker...,615391206,1402099200,"[0, 0]"


In [95]:
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,title,asin,salesRank,categories,imUrl,description,related,brand,price
0,"Ninjas, Piranhas, and Galileo",76144011,{'Books': 6285595},[[Home & Kitchen]],http://g-ecx.images-amazon.com/images/G/01/x-s...,,,,
1,Le Creuset Kiwi (Green) Butter Dish Stoneware,130350591,{'Kitchen & Dining': 459680},"[[Home & Kitchen, Kitchen & Dining, Dining & E...",http://ecx.images-amazon.com/images/I/21zcx6RC...,Each piece of Le Creuset dinnerware is crafted...,,,
2,Martha Stewart's Wedding Cakes,307394530,"{'Arts, Crafts & Sewing': 3597}","[[Home & Kitchen, Artwork, Posters & Prints]]",http://ecx.images-amazon.com/images/I/51A4FWuj...,Of all the decisions that go into planning a w...,"{'bought_together': ['0789327333', '144630163X...",Random House,14.99
3,,439903491,{'Software': 7065},"[[Home & Kitchen, Artwork, Posters & Prints]]",http://ecx.images-amazon.com/images/I/61tVxcko...,Shiver me timbers! Solve I SPY pirate picture ...,"{'bought_together': ['B000GCBOR0'], 'also_view...",,29.99
4,Build A Maloof Inspired Low Back Dining Chair ...,578060604,,"[[Home & Kitchen, Furniture, Kitchen & Dining ...",http://ecx.images-amazon.com/images/I/41aCELWJ...,The Maloofinspired Low Back Dining Chair is no...,{'also_viewed': ['B004IO6RS8']},,


In [96]:
# Filter items

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])]
items_with_info = useful_meta_df['related'].apply(lambda x: x is not np.nan)
useful_meta_df = useful_meta_df[items_with_info].reset_index(drop=True)

all_items = set(useful_meta_df['asin'].values.tolist())
def related_filter(related_dict):
    out_dict = dict()
    for r in related_dict:
        out_dict[r] = list(all_items & set(related_dict[r]))
    return out_dict

useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)
data_df = data_df[data_df['asin'].isin(all_items)]

### Statistics

In [97]:
n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()

In [98]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 66519
# Items: 27230
# Interactions: 541581
Time Span: 2000-05-02/2014-07-23


# Build Dataset

### Interaction data

In [99]:
np.random.seed(2019)
NEG_ITEMS = 99

In [100]:
out_df = data_df.rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
out_df = out_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id', 'item_id'], inplace=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
8036,A12XRHXJRFNKPT,B00004RFKS,957225600
14750,A1CKLHQQG32AQ9,B00004SPED,957225600
13578,A1LUUO72VAFKFJ,B00004SGE2,957225600
12697,A25PD5YQS1E5Z9,B00004S9EQ,957225600
8023,A3H78NCT3DJMY2,B00004RFKS,957225600


In [101]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,1452,226,957225600
1,6134,482,957225600
2,10582,450,957225600
3,20270,405,957225600
4,43461,226,957225600


In [102]:
# leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, len(iids) + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, len(iids) + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [103]:
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(408571, 66495, 66515)

In [104]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,1452,226,957225600
1,6134,482,957225600
2,10582,450,957225600
3,20270,405,957225600
4,43461,226,957225600


In [105]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
68,12398,362,962841600,"[7241, 26995, 12070, 4287, 18457, 25374, 7440,..."
95,44024,165,965865600,"[12888, 15648, 24337, 16209, 3992, 14595, 2082..."
132,8812,497,971049600,"[15451, 17441, 15148, 21575, 23822, 11071, 109..."
146,66017,425,973468800,"[7072, 12443, 21984, 4696, 10553, 12291, 17334..."
164,33136,19,974937600,"[11439, 24089, 7768, 22349, 1355, 8958, 24266,..."


In [106]:
# save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

### Item Metadata

In [107]:
# l2 category

l2_cate_lst = list()
for cate_lst in useful_meta_df['categories']:
    l2_cate_lst.append(cate_lst[0][2] if len(cate_lst[0]) > 2 else np.nan)
useful_meta_df['l2_category'] = l2_cate_lst  
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = dict(zip(l2_cates, range(1, len(l2_cates) + 1)))
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(lambda x: l2_dict[x] if x == x else 0)

In [108]:
item_meta_data = dict()
for idx in range(len(useful_meta_df)):
    info = useful_meta_df.iloc[idx]['related']
    item_meta_data[idx] = {
        'item_id': item2id[useful_meta_df.iloc[idx]['asin']],
        'category': useful_meta_df.iloc[idx]['l2_category'],
        'r_complement': list(map(lambda x: item2id[x], info['also_bought'])) if 'also_bought' in info else [],
        'r_substitute': list(map(lambda x: item2id[x], info['also_viewed'])) if 'also_viewed' in info else [],
    }

item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,category,r_complement,r_substitute
0,1,57,"[25648, 5965, 17178, 17057, 26020, 3337, 26482...","[15814, 26062, 12463, 25640, 17178, 17057, 260..."
1,2,0,[],[]
2,3,5,"[4588, 22633, 16598, 10732, 22232, 10139, 2568...","[13494, 23706, 16598, 22232, 18442, 369, 25533..."
3,4,74,"[19043, 25755, 25152, 22593, 17254]","[25150, 25152, 12804, 22593]"
4,5,57,[18532],[]


In [109]:
# save results

item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)