In [4]:
import  pandas as pd
from utils import ProdParser, EntityIndexer

In [5]:
# Purchase lines from the "prior" and "train" files, stacked into one table.
order_prod_temp1 = pd.read_csv('../data/instacart/order_products__prior.csv')
order_prod_temp2 = pd.read_csv('../data/instacart/order_products__train.csv')
order_prod = pd.concat([order_prod_temp1, order_prod_temp2], axis=0)

# Only the order -> user mapping is needed from orders.csv, so parse just those
# two columns instead of loading every column and discarding the rest.
order_info = pd.read_csv('../data/instacart/orders.csv', usecols=['order_id', 'user_id'])
# Default (inner) merge: each purchase line gains the user_id of its order.
order_prod = pd.merge(left=order_prod, right=order_info, on='order_id')

# Product catalogue joined with the aisle and department lookup tables.
aisle = pd.read_csv('../data/instacart/aisles.csv')
dept = pd.read_csv('../data/instacart/departments.csv')
prod_info = pd.read_csv('../data/instacart/products.csv')
prod_info = pd.merge(prod_info, aisle, on='aisle_id')
prod_info = pd.merge(prod_info, dept, on='department_id')
# Overwrite product_name with "<name> <aisle> <department>" so the product text
# also carries its aisle and department names.
prod_info['product_name'] = prod_info['product_name'] + ' ' + prod_info['aisle'] + ' ' + prod_info['department']

In [6]:
# Number of non-null order_id entries per product_id in order_prod.
purchases_per_product = order_prod.groupby('product_id')['order_id'].count()
prod_cnt = purchases_per_product.reset_index(name='count')

# Popular products: a one-column frame of the ids whose count is at least 10.
is_popular = prod_cnt['count'] >= 10
pop_prod = prod_cnt.loc[is_popular, ['product_id']]

In [7]:
tokenizer = ProdParser()

# Run every product name through the parser and attach the result as a new
# 'tokens' column on a copy of prod_info (prod_info itself is left untouched).
product_tokens = prod_info['product_name'].map(tokenizer.process)
parsed_prod_info = prod_info.assign(tokens=product_tokens)

# Keep only the products whose parse produced at least one token.
has_tokens = parsed_prod_info['tokens'].map(len) > 0
parsed_prod_info = parsed_prod_info[has_tokens]

In [8]:
# Keep purchase lines whose product is popular (listed in pop_prod) and has a
# non-empty token list (listed in parsed_prod_info). Both steps are inner joins.
tokenized_prod_ids = parsed_prod_info[['product_id']]
valid_prod_order = order_prod.merge(pop_prod, on='product_id')
valid_prod_order = valid_prod_order.merge(tokenized_prod_ids, on='product_id')

# Drop orders that are left with fewer than two such purchase lines.
order_size = (
    valid_prod_order.groupby('order_id')['product_id']
    .count()
    .reset_index(name='size')
)
valid_order = order_size.loc[order_size['size'] >= 2, ['order_id']]
valid_prod_order = valid_prod_order.merge(valid_order, on='order_id')

# valid_prod_id = valid_prod_order['product_id'].drop_duplicates()
# valid_prod_info = pd.merge(left=prod_info, right=valid_prod_id, on='product_id')

In [9]:
prod_indexer = EntityIndexer('prod')
token_indexer = EntityIndexer('token')
user_indexer = EntityIndexer('user')

# Pass every token list, product id and user id to the matching indexer, in
# that order. The mapped result is discarded; the calls matter only for the
# state index() records -- presumably the entity2ind lookup that the dump cells
# read (NOTE(review): confirm against utils.EntityIndexer).
for column, indexer in (
    (parsed_prod_info['tokens'], token_indexer),
    (parsed_prod_info['product_id'], prod_indexer),
    (valid_prod_order['user_id'], user_indexer),
):
    _ = column.map(indexer.index)

In [10]:
# Persist the three indexers through their own save() method; where they are
# written is decided inside EntityIndexer and is not visible from this notebook.
for indexer in (prod_indexer, token_indexer, user_indexer):
    indexer.save()

In [11]:
# Write one tab-separated line per parsed product:
#   <product index> TAB <token index> TAB <token index> ...
with open('../data/instacart/ml_context.tsv', 'w') as context_file:
    product_rows = zip(parsed_prod_info['product_id'], parsed_prod_info['tokens'])
    for product_id, token_list in product_rows:
        fields = [prod_indexer.entity2ind[product_id]]
        fields.extend(token_indexer.entity2ind[token] for token in token_list)
        context_file.write('\t'.join(map(str, fields)) + '\n')


In [12]:
# Group the valid purchase lines into baskets keyed by (order_id, user_id).
from collections import defaultdict

print('collecting order info')
basket = defaultdict(list)
order_columns = (
    valid_prod_order[name]
    for name in ('order_id', 'user_id', 'add_to_cart_order', 'product_id')
)
for order_id, user_id, atc_order, product_id in zip(*order_columns):
    basket[(order_id, user_id)].append((atc_order, product_id))

# Freeze into a plain dict and order each basket by (add_to_cart_order, product_id).
basket = dict(basket)
for entries in basket.values():
    entries.sort()


collecting order info


In [13]:
print('dumping data into file')
# Write one line per basket, with three tab-separated fields:
#   <user index> TAB <comma-joined add_to_cart_order values> TAB <comma-joined product indices>
with open('../data/instacart/ml_history.tsv', 'w') as history_file:
    for (order_id, user_id), items in basket.items():
        user_field = str(user_indexer.entity2ind[user_id])
        position_field = ','.join(str(position) for position, _pid in items)
        product_field = ','.join(
            str(prod_indexer.entity2ind[pid]) for _pos, pid in items
        )
        history_file.write('\t'.join((user_field, position_field, product_field)) + '\n')

dumping data into file
