In [1]:
import gzip
import zipfile
import random
import os
import re
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime
import json


DATASET = 'Clothing_Shoes_and_Jewelry'
RAW_PATH = os.path.join('../../data', DATASET)
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

RANDOM_SEED = 42

# Load Data

# 1. Load interaction data and item metadata
# 2. Filter out unuseful items
# 3. Calculate basic statistics

In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
# download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATA_FILE)):
    print('Downloading interaction data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{}_5.json.gz'
        .format(RAW_PATH, DATASET), shell=True)
if not os.path.exists(os.path.join(RAW_PATH, META_FILE)):
    print('Downloading item metadata into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_{}.json.gz'
        .format(RAW_PATH, DATASET), shell=True)

In [4]:
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1KLRMWW2FWPL4,31887,"Amazon Customer ""cameramom""","[0, 0]",This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,"02 12, 2011"
1,A2G5TCU2WDFZ65,31887,Amazon Customer,"[0, 0]",I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,"01 19, 2013"
2,A1RLQXYNCMWRWN,31887,Carola,"[0, 0]",What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,"01 4, 2013"
3,A8U3FAMSJVHS5,31887,Caromcg,"[0, 0]","We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,"04 27, 2014"
4,A3GEOILWLK86XM,31887,CJ,"[0, 0]",Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,"03 15, 2014"


In [5]:
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,asin,related,title,price,salesRank,imUrl,brand,categories,description
0,37214,"{'also_viewed': ['B00JO8II76', 'B00DGN4R1Q', '...",Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,6.99,{'Clothing': 1233557},http://ecx.images-amazon.com/images/I/31mCncNu...,Big Dreams,"[[Clothing, Shoes & Jewelry, Girls], [Clothing...",
1,31887,"{'also_bought': ['0000031852', '0000031895', '...",Ballet Dress-Up Fairy Tutu,6.79,{'Sports &amp; Outdoors': 8547},http://ecx.images-amazon.com/images/I/314qZjYe...,Boutique Cutie,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",This adorable basic ballerina tutu is perfect ...
2,123456479,"{'also_bought': ['B000BMTCK6', 'B0006JCGUM', '...",SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,64.98,{'Kitchen & Dining': 16987},http://ecx.images-amazon.com/images/I/413tGhqo...,,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",Elegance par excellence. Hand-crafted of the f...
3,456844570,"{'also_viewed': ['B008MTRT1O', 'B00BUG47S4', '...",RiZ Women's Beautify Crafted &frac12; Rimmed F...,,{'Clothing': 1180499},http://ecx.images-amazon.com/images/I/31QZTHxv...,,"[[Clothing, Shoes & Jewelry, Women, Accessorie...",
4,456808574,"{'also_viewed': ['B00A6JD8EA', 'B00BFL4FTA', '...",Lantin White Visor Wrap Around Ski Style Aviat...,,{'Clothing': 1038084},http://ecx.images-amazon.com/images/I/31UsrgT5...,,"[[Clothing, Shoes & Jewelry, Women, Accessorie...",


In [6]:
# Only retain items that appear in interaction data

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
all_items = set(useful_meta_df['asin'].values.tolist())

def related_filter(related_dict):
    out_dict = dict()
    if related_dict is not np.nan:
        for r in related_dict:
            out_dict[r] = list(all_items & set(related_dict[r]))
    return out_dict

useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

In [7]:
n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()

In [8]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 39387
# Items: 23033
# Interactions: 278677
Time Span: 2003-03-29/2014-07-23


In [9]:
np.random.seed(RANDOM_SEED)

In [10]:
out_df = data_df.rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
out_df = out_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df = out_df.sort_values(by=['time', 'user_id'], kind='mergesort').reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A1X2LENOF84LCQ,B0000ZEPGA,1048896000
1,A1GPGBHBI6T2HJ,B000051SEP,1071187200
2,A1Z54EM24Y40LL,B0007YXUS8,1090022400
3,A2K3J2X8KDY47N,B0000B35EL,1095984000
4,AB2YZA2HLY75H,B0000B35D9,1096416000


In [11]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,9531,182,1048896000
1,4734,12,1071187200
2,10137,728,1090022400
3,16250,102,1095984000
4,32292,98,1096416000


In [12]:
# leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())

In [13]:
def generate_dev_test(data_df, rate=0.2):
    user_groups = data_df.groupby('user_id')
    user_ids = list(user_groups.groups.keys())
    counts, gts = {}, {}
    n_users = data_df['user_id'].max()
    neg_avg = 0
    for i in range(1, n_users+1):
        if i not in user_ids:
            continue

        cur_group = user_groups.get_group(i)
        gt_num = int(np.ceil(rate * cur_group['item_id'].count()))
        counts[i] = gt_num  # 保存每个用户gt的个数
        neg_avg += gt_num
        cur_tail = cur_group.tail(gt_num)
        cur_gt = cur_tail['item_id'].values
        gts[i] = cur_gt.tolist()  # 保存每个用户的ft
    with open('{}/{}_count.json'.format(RAW_PATH, DATASET), 'w') as fs:
        json.dump(counts, fs)
    print("4. gt count保存完毕")
    with open('{}/{}_ground_truth.json'.format(RAW_PATH, DATASET), 'w') as fs:
        json.dump(gts, fs)
    print("5. gt保存完毕")

    neg_num = int(np.ceil(neg_avg/n_users) * 10)
    print('neg items的数量为{}'.format(neg_num))
    result_dfs = []
    for idx in range(2):
        result_df = None
        user_groups = data_df.groupby('user_id')
        user_ids = list(user_groups.groups.keys())
        for i in range(1, n_users + 1):
            if i not in user_ids:
                continue
            cur_tail = user_groups.get_group(i).tail(counts[i])
            if result_df is None:
                result_df = cur_tail.copy()
            else:
                result_df = pd.concat([result_df, cur_tail], axis=0)
        result_df = result_df.copy()
        # result_df = data_df.groupby('user_id').tail(1).copy()
        # 去除形成test和dev的数据
        data_df = data_df.drop(result_df.index)
        
        result_df = result_df.groupby('user_id').head(1)
        neg_items = []
        a_items = set(np.arange(1, n_items + 1, 1))
        for i in range(1, n_users+1):
            if i not in user_ids:
                continue
            # neg_num = min(counts[i] * 200, n_items - len(clicked_item_set[i])) - counts[i]
            gt_len = counts[i]
            if gt_len > 10:
                gt_len = 10
            cneg_num = neg_num - gt_len + 1
            unclicked = list(set(clicked_item_set[i]) ^ a_items)
            if len(unclicked) <= cneg_num:
                neg_items.append(gts[i][1:gt_len] + unclicked)
            else:
                neg_items.append(gts[i][1:gt_len] + random.sample(unclicked, cneg_num))
            # else:
            #     neg_items.append(random.sample(unclicked, cneg_num))
        result_df['neg_items'] = neg_items
        result_dfs.append(result_df)
    return result_dfs, data_df

In [14]:
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

4. gt count保存完毕
5. gt保存完毕
neg items的数量为20


(160007, 39387, 39387)

In [15]:
train_df

Unnamed: 0,user_id,item_id,time
0,9531,182,1048896000
1,4734,12,1071187200
2,10137,728,1090022400
3,16250,102,1095984000
4,32292,98,1096416000
...,...,...,...
278590,11781,22036,1405987200
278591,11781,22524,1405987200
278626,23479,19808,1405987200
278627,23479,20818,1405987200


In [16]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
35842,1,17820,1352332800,"[1884, 22196, 12048, 11504, 327, 4601, 5093, 1..."
144112,2,15785,1383177600,"[20385, 9294, 22761, 6637, 3461, 6933, 15008, ..."
183184,3,20281,1389225600,"[11228, 9389, 5641, 17643, 11208, 6508, 20459,..."
275474,4,21708,1405036800,"[4765, 15201, 13435, 995, 11098, 4974, 2980, 2..."
114642,5,20468,1375920000,"[10352, 19441, 8704, 15690, 12125, 13703, 1378..."


In [17]:
# save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

# Item Metadata

In [55]:
all_class = []

In [56]:
useful_meta_df['categories'].values.tolist()[0]

[['Clothing, Shoes & Jewelry', 'Girls', 'Clothing', 'Active', 'Active Skirts']]

In [57]:
useful_meta_df.iloc[0, 6]

'Boutique Cutie'

In [58]:
# 找到所有类别
t_df = useful_meta_df.copy()
for idx, cur_cl in enumerate(useful_meta_df['categories'].values.tolist()):
    temp = []
    for c in cur_cl:
        for t in c:
            if t not in all_class:
                all_class.append(t)
            if t not in temp:
                temp.append('i_' + t)
    t_df.iat[idx, 7] = temp
print(all_class)

['Clothing, Shoes & Jewelry', 'Girls', 'Clothing', 'Active', 'Active Skirts', 'Novelty, Costumes & More', 'Jewelry Accessories', 'Jewelry Boxes & Organizers', 'Jewelry Boxes', 'Luggage & Travel Gear', 'Software', 'Education & Reference', 'Languages', 'Boys', 'Costumes & Accessories', 'Costumes', 'Kids & Baby', 'Travel Accessories', 'Travel Wallets', 'Shoes & Accessories: International Shipping Available', 'Luggage Straps', 'Electronics', 'GPS & Navigation', 'Sports & Handheld GPS', 'Handheld GPS Units', 'Women', 'Watches', 'Wrist Watches', 'Men', 'Luggage Locks', 'Gifts', 'Jewelry: International Shipping Available', 'Fashion Watches', 'Watch Gifts', 'Available for International Shipping', 'Skagen Watches', 'Backpacks', "Kids' Backpacks", 'Sports & Outdoors', 'Accessories', 'Sport Watches', "Men's Athletic Watches", 'C', 'Converse', 'Street, Surf & Skate', 'Shoes', 'Fashion Sneakers', 'Work Wear & Uniforms', 'Novelty', 'Underwear', 'Bras', 'Shirts', 'Active Shirts & Tees', 'Big & Tall',

In [59]:
t_df

Unnamed: 0,asin,related,title,price,salesRank,imUrl,brand,categories,description
0,0000031887,"{'also_bought': ['B005JJ2762', 'B002GZGI4E', '...",Ballet Dress-Up Fairy Tutu,6.79,{'Sports &amp; Outdoors': 8547},http://ecx.images-amazon.com/images/I/314qZjYe...,Boutique Cutie,"[i_Clothing, Shoes & Jewelry, i_Girls, i_Cloth...",This adorable basic ballerina tutu is perfect ...
1,0123456479,"{'also_bought': ['B000P4DEYU', 'B00194Q262', '...",SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,64.98,{'Kitchen & Dining': 16987},http://ecx.images-amazon.com/images/I/413tGhqo...,,"[i_Clothing, Shoes & Jewelry, i_Novelty, Costu...",Elegance par excellence. Hand-crafted of the f...
2,1608299953,"{'also_bought': ['1617160377'], 'bought_togeth...",,179.00,{'Software': 818},http://ecx.images-amazon.com/images/I/512RG9NT...,,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",
3,1617160377,"{'also_bought': ['1608299953'], 'bought_togeth...",,179.00,{'Software': 1137},http://ecx.images-amazon.com/images/I/51meMRCN...,,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",
4,B00001WRHJ,"{'also_bought': ['B0009ETG02', 'B00551RXJK', '...",Toy Story 2: Woody Deluxe Costume - Variation ...,,{'Clothing': 22838},http://ecx.images-amazon.com/images/I/41BcQ%2B...,,"[i_Clothing, Shoes & Jewelry, i_Boys, i_Clothi...",
...,...,...,...,...,...,...,...,...,...
23028,B00KA602SY,"{'also_bought': ['B00EFT9C56', 'B00E0L0IX4', '...",GURAIO Women's Silvering Cross Loose Vest,,{'Clothing': 991},http://ecx.images-amazon.com/images/I/4193We49...,,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",
23029,B00KCWMG5S,"{'also_bought': ['B00902G8RS'], 'also_viewed':...",Classic Designs Womens Stretch Poplin Cargo Re...,16.50,{'Clothing': 44785},http://ecx.images-amazon.com/images/I/31z3Fwpq...,,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",
23030,B00KF9180W,"{'also_bought': [], 'also_viewed': ['B0012TWSS...",[2 PACK] Multi-Purpose Sports Balaclava - For ...,,,http://ecx.images-amazon.com/images/I/41Ludg7F...,,"[i_Clothing, Shoes & Jewelry, i_Men, i_Accesso...",The Trendy Swede Multi-Purpose Sports Balaclav...
23031,B00KGCLROK,"{'also_viewed': ['B000ILEKUM', 'B00G28ISY4', '...",Mato &amp; Hash Toeless Half Toe Yoga Socks Wi...,,{'Clothing': 39937},http://ecx.images-amazon.com/images/I/51HXxcR2...,,"[i_Sports & Outdoors, i_Exercise & Fitness, i_...",


In [60]:
genres = ['i_' + c for c in all_class]

In [61]:
genres

['i_Clothing, Shoes & Jewelry',
 'i_Girls',
 'i_Clothing',
 'i_Active',
 'i_Active Skirts',
 'i_Novelty, Costumes & More',
 'i_Jewelry Accessories',
 'i_Jewelry Boxes & Organizers',
 'i_Jewelry Boxes',
 'i_Luggage & Travel Gear',
 'i_Software',
 'i_Education & Reference',
 'i_Languages',
 'i_Boys',
 'i_Costumes & Accessories',
 'i_Costumes',
 'i_Kids & Baby',
 'i_Travel Accessories',
 'i_Travel Wallets',
 'i_Shoes & Accessories: International Shipping Available',
 'i_Luggage Straps',
 'i_Electronics',
 'i_GPS & Navigation',
 'i_Sports & Handheld GPS',
 'i_Handheld GPS Units',
 'i_Women',
 'i_Watches',
 'i_Wrist Watches',
 'i_Men',
 'i_Luggage Locks',
 'i_Gifts',
 'i_Jewelry: International Shipping Available',
 'i_Fashion Watches',
 'i_Watch Gifts',
 'i_Available for International Shipping',
 'i_Skagen Watches',
 'i_Backpacks',
 "i_Kids' Backpacks",
 'i_Sports & Outdoors',
 'i_Accessories',
 'i_Sport Watches',
 "i_Men's Athletic Watches",
 'i_C',
 'i_Converse',
 'i_Street, Surf & Skate'

In [62]:
item_df = t_df.copy()

In [63]:
item_df

Unnamed: 0,asin,related,title,price,salesRank,imUrl,brand,categories,description
0,0000031887,"{'also_bought': ['B005JJ2762', 'B002GZGI4E', '...",Ballet Dress-Up Fairy Tutu,6.79,{'Sports &amp; Outdoors': 8547},http://ecx.images-amazon.com/images/I/314qZjYe...,Boutique Cutie,"[i_Clothing, Shoes & Jewelry, i_Girls, i_Cloth...",This adorable basic ballerina tutu is perfect ...
1,0123456479,"{'also_bought': ['B000P4DEYU', 'B00194Q262', '...",SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,64.98,{'Kitchen & Dining': 16987},http://ecx.images-amazon.com/images/I/413tGhqo...,,"[i_Clothing, Shoes & Jewelry, i_Novelty, Costu...",Elegance par excellence. Hand-crafted of the f...
2,1608299953,"{'also_bought': ['1617160377'], 'bought_togeth...",,179.00,{'Software': 818},http://ecx.images-amazon.com/images/I/512RG9NT...,,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",
3,1617160377,"{'also_bought': ['1608299953'], 'bought_togeth...",,179.00,{'Software': 1137},http://ecx.images-amazon.com/images/I/51meMRCN...,,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",
4,B00001WRHJ,"{'also_bought': ['B0009ETG02', 'B00551RXJK', '...",Toy Story 2: Woody Deluxe Costume - Variation ...,,{'Clothing': 22838},http://ecx.images-amazon.com/images/I/41BcQ%2B...,,"[i_Clothing, Shoes & Jewelry, i_Boys, i_Clothi...",
...,...,...,...,...,...,...,...,...,...
23028,B00KA602SY,"{'also_bought': ['B00EFT9C56', 'B00E0L0IX4', '...",GURAIO Women's Silvering Cross Loose Vest,,{'Clothing': 991},http://ecx.images-amazon.com/images/I/4193We49...,,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",
23029,B00KCWMG5S,"{'also_bought': ['B00902G8RS'], 'also_viewed':...",Classic Designs Womens Stretch Poplin Cargo Re...,16.50,{'Clothing': 44785},http://ecx.images-amazon.com/images/I/31z3Fwpq...,,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",
23030,B00KF9180W,"{'also_bought': [], 'also_viewed': ['B0012TWSS...",[2 PACK] Multi-Purpose Sports Balaclava - For ...,,,http://ecx.images-amazon.com/images/I/41Ludg7F...,,"[i_Clothing, Shoes & Jewelry, i_Men, i_Accesso...",The Trendy Swede Multi-Purpose Sports Balaclav...
23031,B00KGCLROK,"{'also_viewed': ['B000ILEKUM', 'B00G28ISY4', '...",Mato &amp; Hash Toeless Half Toe Yoga Socks Wi...,,{'Clothing': 39937},http://ecx.images-amazon.com/images/I/51HXxcR2...,,"[i_Sports & Outdoors, i_Exercise & Fitness, i_...",


In [64]:
item_df['asin']

0        0000031887
1        0123456479
2        1608299953
3        1617160377
4        B00001WRHJ
            ...    
23028    B00KA602SY
23029    B00KCWMG5S
23030    B00KF9180W
23031    B00KGCLROK
23032    B00KKXCJQU
Name: asin, Length: 23033, dtype: object

In [65]:
drop_item_df = item_df.take([0, 7], axis=1)

In [66]:
drop_item_df

Unnamed: 0,asin,categories
0,0000031887,"[i_Clothing, Shoes & Jewelry, i_Girls, i_Cloth..."
1,0123456479,"[i_Clothing, Shoes & Jewelry, i_Novelty, Costu..."
2,1608299953,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav..."
3,1617160377,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav..."
4,B00001WRHJ,"[i_Clothing, Shoes & Jewelry, i_Boys, i_Clothi..."
...,...,...
23028,B00KA602SY,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth..."
23029,B00KCWMG5S,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth..."
23030,B00KF9180W,"[i_Clothing, Shoes & Jewelry, i_Men, i_Accesso..."
23031,B00KGCLROK,"[i_Sports & Outdoors, i_Exercise & Fitness, i_..."


In [67]:
genres

['i_Clothing, Shoes & Jewelry',
 'i_Girls',
 'i_Clothing',
 'i_Active',
 'i_Active Skirts',
 'i_Novelty, Costumes & More',
 'i_Jewelry Accessories',
 'i_Jewelry Boxes & Organizers',
 'i_Jewelry Boxes',
 'i_Luggage & Travel Gear',
 'i_Software',
 'i_Education & Reference',
 'i_Languages',
 'i_Boys',
 'i_Costumes & Accessories',
 'i_Costumes',
 'i_Kids & Baby',
 'i_Travel Accessories',
 'i_Travel Wallets',
 'i_Shoes & Accessories: International Shipping Available',
 'i_Luggage Straps',
 'i_Electronics',
 'i_GPS & Navigation',
 'i_Sports & Handheld GPS',
 'i_Handheld GPS Units',
 'i_Women',
 'i_Watches',
 'i_Wrist Watches',
 'i_Men',
 'i_Luggage Locks',
 'i_Gifts',
 'i_Jewelry: International Shipping Available',
 'i_Fashion Watches',
 'i_Watch Gifts',
 'i_Available for International Shipping',
 'i_Skagen Watches',
 'i_Backpacks',
 "i_Kids' Backpacks",
 'i_Sports & Outdoors',
 'i_Accessories',
 'i_Sport Watches',
 "i_Men's Athletic Watches",
 'i_C',
 'i_Converse',
 'i_Street, Surf & Skate'

In [68]:
for g in genres:
    drop_item_df[g] = 0
drop_item_df

  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_item_df[g] = 0
  drop_ite

Unnamed: 0,asin,categories,"i_Clothing, Shoes & Jewelry",i_Girls,i_Clothing,i_Active,i_Active Skirts,"i_Novelty, Costumes & More",i_Jewelry Accessories,i_Jewelry Boxes & Organizers,...,i_Suit Pants,i_Jewelry Chests,i_Leisure Sports & Game Room,i_French Connection,i_COACH,i_Skirt Sets,i_BLVD,i_Electronics & Gadgets,i_Belly Chains,i_Boy Meets Girl
0,0000031887,"[i_Clothing, Shoes & Jewelry, i_Girls, i_Cloth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0123456479,"[i_Clothing, Shoes & Jewelry, i_Novelty, Costu...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1608299953,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1617160377,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B00001WRHJ,"[i_Clothing, Shoes & Jewelry, i_Boys, i_Clothi...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23028,B00KA602SY,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23029,B00KCWMG5S,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23030,B00KF9180W,"[i_Clothing, Shoes & Jewelry, i_Men, i_Accesso...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23031,B00KGCLROK,"[i_Sports & Outdoors, i_Exercise & Fitness, i_...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
drop_item_df

Unnamed: 0,asin,categories,"i_Clothing, Shoes & Jewelry",i_Girls,i_Clothing,i_Active,i_Active Skirts,"i_Novelty, Costumes & More",i_Jewelry Accessories,i_Jewelry Boxes & Organizers,...,i_Suit Pants,i_Jewelry Chests,i_Leisure Sports & Game Room,i_French Connection,i_COACH,i_Skirt Sets,i_BLVD,i_Electronics & Gadgets,i_Belly Chains,i_Boy Meets Girl
0,0000031887,"[i_Clothing, Shoes & Jewelry, i_Girls, i_Cloth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0123456479,"[i_Clothing, Shoes & Jewelry, i_Novelty, Costu...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1608299953,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1617160377,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B00001WRHJ,"[i_Clothing, Shoes & Jewelry, i_Boys, i_Clothi...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23028,B00KA602SY,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23029,B00KCWMG5S,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23030,B00KF9180W,"[i_Clothing, Shoes & Jewelry, i_Men, i_Accesso...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23031,B00KGCLROK,"[i_Sports & Outdoors, i_Exercise & Fitness, i_...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
for idx in range(len(drop_item_df)):
    # 添加新列
    for g in genres:
        if g in drop_item_df.loc[idx, 'categories']:
            drop_item_df.loc[idx, g] = 1
    # 将asin换成正确的item_id
    drop_item_df.loc[idx, 'asin'] = item2id[useful_meta_df.iloc[idx]['asin']]

In [71]:
drop_item_df

Unnamed: 0,asin,categories,"i_Clothing, Shoes & Jewelry",i_Girls,i_Clothing,i_Active,i_Active Skirts,"i_Novelty, Costumes & More",i_Jewelry Accessories,i_Jewelry Boxes & Organizers,...,i_Suit Pants,i_Jewelry Chests,i_Leisure Sports & Game Room,i_French Connection,i_COACH,i_Skirt Sets,i_BLVD,i_Electronics & Gadgets,i_Belly Chains,i_Boy Meets Girl
0,1,"[i_Clothing, Shoes & Jewelry, i_Girls, i_Cloth...",1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,"[i_Clothing, Shoes & Jewelry, i_Novelty, Costu...",1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,3,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,"[i_Clothing, Shoes & Jewelry, i_Luggage & Trav...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,"[i_Clothing, Shoes & Jewelry, i_Boys, i_Clothi...",1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23028,23029,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23029,23030,"[i_Clothing, Shoes & Jewelry, i_Women, i_Cloth...",1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23030,23031,"[i_Clothing, Shoes & Jewelry, i_Men, i_Accesso...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23031,23032,"[i_Sports & Outdoors, i_Exercise & Fitness, i_...",1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
for idx in range(len(drop_item_df)):
    # 将asin换成正确的item_id
    drop_item_df.loc[idx, 'asin'] = item2id[useful_meta_df.iloc[idx]['asin']]

In [73]:
drop_item_df = drop_item_df.drop('categories', axis=1)

In [74]:
drop_item_df.rename(columns={'asin':'item_id'}, inplace=True)
drop_item_df

Unnamed: 0,item_id,"i_Clothing, Shoes & Jewelry",i_Girls,i_Clothing,i_Active,i_Active Skirts,"i_Novelty, Costumes & More",i_Jewelry Accessories,i_Jewelry Boxes & Organizers,i_Jewelry Boxes,...,i_Suit Pants,i_Jewelry Chests,i_Leisure Sports & Game Room,i_French Connection,i_COACH,i_Skirt Sets,i_BLVD,i_Electronics & Gadgets,i_Belly Chains,i_Boy Meets Girl
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23028,23029,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23029,23030,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23030,23031,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23031,23032,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# save results

drop_item_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)

In [76]:
print(genres)

['i_Clothing, Shoes & Jewelry', 'i_Girls', 'i_Clothing', 'i_Active', 'i_Active Skirts', 'i_Novelty, Costumes & More', 'i_Jewelry Accessories', 'i_Jewelry Boxes & Organizers', 'i_Jewelry Boxes', 'i_Luggage & Travel Gear', 'i_Software', 'i_Education & Reference', 'i_Languages', 'i_Boys', 'i_Costumes & Accessories', 'i_Costumes', 'i_Kids & Baby', 'i_Travel Accessories', 'i_Travel Wallets', 'i_Shoes & Accessories: International Shipping Available', 'i_Luggage Straps', 'i_Electronics', 'i_GPS & Navigation', 'i_Sports & Handheld GPS', 'i_Handheld GPS Units', 'i_Women', 'i_Watches', 'i_Wrist Watches', 'i_Men', 'i_Luggage Locks', 'i_Gifts', 'i_Jewelry: International Shipping Available', 'i_Fashion Watches', 'i_Watch Gifts', 'i_Available for International Shipping', 'i_Skagen Watches', 'i_Backpacks', "i_Kids' Backpacks", 'i_Sports & Outdoors', 'i_Accessories', 'i_Sport Watches', "i_Men's Athletic Watches", 'i_C', 'i_Converse', 'i_Street, Surf & Skate', 'i_Shoes', 'i_Fashion Sneakers', 'i_Work W

In [77]:
len(genres)

1193