In [None]:
## This notebook samples product pairs from the Amazon product networks and generates the data used for representation learning.

In [None]:
import ast
import functools
import itertools
import json
import math
import os
import pickle
import random
from collections import defaultdict
from copy import deepcopy
from typing import List, Dict, Set, Tuple

import numpy as np
from tqdm import tqdm
# Seed both Python's and NumPy's global random generators with the same fixed
# value so that any later randomized step gives the same result on every run.
random.seed(42)
np.random.seed(42)

In [None]:
def load_data(data_root:str, dataset:str, sub_dataset:str) -> Dict:
    """
    Load the raw product records and attach a same-brand ('cobrand') relation.

    Reads `<data_root>/<dataset>/<sub_dataset>/product.json`, which is parsed
    as one Python-literal dict per line, and indexes the records by 'asin'.

    data_root: path to the directory that contains the data
    dataset: name of the dataset directory (e.g. amazon)
    sub_dataset: name of the sub dataset directory (e.g. sports)

    Returns:
    data: Dict, key is the doc id ('asin'), and value is the data entry.
        Every entry has a 'related' dict whose 'cobrand' value is the set of
        doc ids that share the entry's 'brand' (an empty set when the entry
        has no 'brand'). That set is shared by all entries of one brand and
        also contains the entry's own id.

    Raises:
    ValueError, SyntaxError: a non-blank line is not a plain Python literal.
    KeyError: a record has no 'asin' field.
    """
    data_path = os.path.join(data_root, dataset, sub_dataset, 'product.json')
    data = {}
    brand_dict = defaultdict(set)
    with open(data_path) as f:
        # Iterate the file handle directly so the raw text is never held in
        # memory alongside the parsed records.
        for line in tqdm(f, desc="Loading Data..."):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            # The file content is external input: literal_eval only accepts
            # literals, whereas eval() would execute any expression in it.
            record = ast.literal_eval(line)
            doc_id = record['asin']
            data[doc_id] = record
            if 'brand' in record:
                brand_dict[record['brand']].add(doc_id)

    # Second pass: the brand groups are only complete once every line is read.
    for record in data.values():
        related = record.setdefault('related', {})
        if 'brand' in record:
            related['cobrand'] = brand_dict[record['brand']]
        else:
            related['cobrand'] = set()
    return data

In [None]:
def _has_title(data, doc_id):
    """Tell whether `doc_id` is a known entry with a non-blank 'title'."""
    return doc_id in data and 'title' in data[doc_id] and data[doc_id]['title'].strip() != ''


def _entry_text(entry):
    """Return the entry's title, followed by its description when it has one."""
    text = entry['title']
    if 'description' in entry:
        text += ' ' + entry['description']
    return text


def convert_and_dump(data: Dict, tuples: Set[Tuple[str, str]], path: str) -> None:
    """
    Dump the sampled pairs into a jsonl file, one JSON object per line.

    Each written object has two fields, 'q_text' and 'k_text', holding the
    title (plus the description, when present) of the first and second id of
    a pair. A pair is skipped when either id is missing from `data` or has no
    non-blank title. Pairs are written in sorted order so that the output
    file is identical from run to run.

    data: Dataset returned by `load_data`
    tuples: Sampled tuples of doc ids
    path: path to save the jsonl file (overwritten if it exists)
    """
    print("Dump data to %s" % path)
    cnt = 0
    with open(path, 'w') as fout:
        # Iterating a set of strings directly gives an order that changes
        # between interpreter runs (hash randomization); sorting pins it.
        for q, k in tqdm(sorted(tuples), desc="Processing %s" % os.path.basename(path)):
            if not (_has_title(data, q) and _has_title(data, k)):
                continue
            cur = {'q_text': _entry_text(data[q]), 'k_text': _entry_text(data[k])}
            fout.write(json.dumps(cur)+'\n')
            cnt += 1
    print('%d entries written' % cnt)

In [None]:
def build_no_intermediate(data: Dict, type: List[str])-> Set[Tuple[str, str]]:
    """
    Build relationship by type, no intermediate node

    For every entry, each id listed under `entry['related'][type[0]]` yields
    one undirected pair with the entry's own id. Pairs are stored with the
    smaller id first, so (a, b) and (b, a) collapse into a single tuple, and
    an id is never paired with itself.

    data: Dataset returned by `load_data`
    type: relation names; only the first element is used as the key looked up
        in each entry's 'related' dict

    Returns:
    pairs: Set of (smaller_id, larger_id) tuples
    """
    pairs = set()
    for node in tqdm(data):
        entry = data[node]
        if 'related' not in entry or type[0] not in entry['related']:
            continue
        for neighbor in entry['related'][type[0]]:
            if neighbor == node:
                continue
            # Order the pair in a fresh tuple: the loop variables must stay
            # untouched so the remaining neighbors still pair with `node`.
            pairs.add((node, neighbor) if node < neighbor else (neighbor, node))
    return pairs

In [None]:
# Relation name -> function that builds the pairs for that relation.
# Every relation listed here uses the same direct (no intermediate node) builder.
GENERATOR_DICT = {
    relation: build_no_intermediate
    for relation in ('also_viewed', 'also_bought', 'bought_together', 'cobrand')
}

In [None]:
# Run configuration: which data to read and where the sampled pairs go.
datasets = 'amazon'
sub_datasets = 'sports'  # one of: 'cloth', 'home', 'sports'
# NOTE(review): 'xxx' looks like a placeholder for the local data root - confirm
# and replace in both paths before running.
base_dir = 'xxx/data/'
save_dir = f'xxx/data/{sub_datasets}/raw'

print(save_dir)
# exist_ok=True creates the directory only when needed, without a separate
# existence check that could go stale before the directory is created.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Parse the raw product file once; the cells below all work on `cur_d`.
cur_d = load_data(data_root=base_dir, dataset=datasets, sub_dataset=sub_datasets)
# Number of entries loaded.
print(len(cur_d))

In [None]:
# Sanity check: show the id and the full record of the first five entries.
for k in itertools.islice(cur_d, 5):
    print(k)
    print(cur_d[k])

In [None]:
# 'also_viewed' relation: build the direct pairs, then write them as JSON lines.
also_viewed = GENERATOR_DICT['also_viewed'](cur_d, ['also_viewed'])
also_viewed_path = os.path.join(save_dir, 'also_viewed.jsonl')
convert_and_dump(data=cur_d, tuples=also_viewed, path=also_viewed_path)

In [None]:
# 'also_bought' relation: build the direct pairs, then write them as JSON lines.
also_bought = GENERATOR_DICT['also_bought'](cur_d, ['also_bought'])
also_bought_path = os.path.join(save_dir, 'also_bought.jsonl')
convert_and_dump(data=cur_d, tuples=also_bought, path=also_bought_path)

In [None]:
# 'bought_together' relation: build the direct pairs, then write them as JSON lines.
bought_together = GENERATOR_DICT['bought_together'](cur_d, ['bought_together'])
bought_together_path = os.path.join(save_dir, 'bought_together.jsonl')
convert_and_dump(data=cur_d, tuples=bought_together, path=bought_together_path)

In [None]:
# 'cobrand' relation (same-brand sets attached by `load_data`): build the
# direct pairs, then write them as JSON lines.
cobrand = GENERATOR_DICT['cobrand'](cur_d, ['cobrand'])
cobrand_path = os.path.join(save_dir, 'cobrand.jsonl')
convert_and_dump(data=cur_d, tuples=cobrand, path=cobrand_path)