In [None]:
import ast
import json
import os
import pickle

from tqdm import tqdm

In [None]:
## nodes: items, brands
## item features: title, description, category, price, img
## item edges: also_viewed, buy_after_viewing, also_bought, bought_together
## brand features: name

In [None]:
# Input / output locations. Each one defaults to the original path and can be
# overridden through an environment variable, so the notebook can run on
# another machine without editing this cell.
raw_data_dir = os.environ.get(
    "AMAZON_RAW_DATA_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/raw_data/amazon",
)
save_dir = os.environ.get(
    "AMAZON_SAVE_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/processed_data/amazon",
)

In [None]:
## read raw data files

def read_json_lines(file, id_key):
    """Load a line-delimited record file into a dict keyed by ``id_key``.

    Every non-blank line must contain one dict written as a Python literal.
    (The original code evaluated lines as Python rather than calling
    ``json.loads``, presumably because the file is not strict JSON.)

    Args:
        file: Path of the file to read.
        id_key: Key whose value in each record becomes that record's key in
            the returned dict. If two records share a value, the later one
            replaces the earlier one.

    Returns:
        dict mapping ``record[id_key]`` to the full record.

    Raises:
        ValueError, SyntaxError: a line is not a valid Python literal.
        KeyError: a record has no ``id_key`` entry.
    """
    data = {}
    with open(file) as f:
        # Stream the file instead of materialising every line with readlines();
        # the progress bar therefore shows a running count without a total.
        for line in tqdm(f):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # literal_eval accepts only literals, so a crafted line cannot
            # execute code the way eval() would.
            tmp = ast.literal_eval(line)
            data[tmp[id_key]] = tmp
    return data

# Load every record from metadata.json, indexed by its 'asin' value.
item_raw_data = read_json_lines(
    file=os.path.join(raw_data_dir, 'metadata.json'),
    id_key='asin',
)

In [None]:
# Show the first five record keys, in load order.
[key for _, key in zip(range(5), item_raw_data)]

In [None]:
# Display one raw record to inspect its fields; raises KeyError if this key
# is not present in the loaded data.
item_raw_data['0000143561']

In [None]:
# Collect every distinct key that appears inside any record's 'related' dict.
related_name = set()
for record in tqdm(item_raw_data.values()):
    # Records without a 'related' entry contribute nothing.
    related_name.update(record.get('related', {}))
print(related_name)

In [None]:
## construct node dictionary
## item features: title, description, category, price, img
## item edges: also_viewed, buy_after_viewing, also_bought, bought_together
## item neighbors: item, brand

## brand features: name
## brand neighbors: item

item_nodes = {}
brand_nodes = {}

# brand name -> generated brand node id ('brand_0', 'brand_1', ...)
brand_name2id = {}

# neighbor key stored on the item node -> key inside the raw 'related' dict
related_keys = {
    'also_viewed_item': 'also_viewed',
    'buy_after_viewing_item': 'buy_after_viewing',
    'also_bought_item': 'also_bought',
    'bought_together_item': 'bought_together',
}

for item_id in tqdm(item_raw_data):
    raw = item_raw_data[item_id]
    # A missing brand and an empty-string brand are both treated as "no brand".
    brand = raw.get('brand', '')

    # brand nodes: create one on first sight, otherwise attach the item to it
    if brand != '':
        if brand not in brand_name2id:
            idd = f'brand_{len(brand_nodes)}'
            brand_name2id[brand] = idd
            brand_nodes[idd] = {
                'features': {'name': brand},
                'neighbors': {'item': [item_id]},
            }
        else:
            brand_nodes[brand_name2id[brand]]['neighbors']['item'].append(item_id)

    # item features; absent fields fall back to '' (or [] for categories)
    features = {
        'title': raw.get('title', ''),
        'description': raw.get('description', ''),
        'price': raw.get('price', ''),
        'img': raw.get('imUrl', ''),
        # each category entry is a sequence of strings, flattened to one comma-joined string
        'category': [','.join(c) for c in raw.get('categories', [])],
    }

    # item neighbors; list(...) copies each list so that later in-place edits
    # of the graph never modify item_raw_data (keeps this cell re-runnable)
    related = raw.get('related', {})
    neighbors = {
        out_key: list(related.get(raw_key, []))
        for out_key, raw_key in related_keys.items()
    }
    neighbors['brand'] = [brand_name2id[brand]] if brand != '' else []

    item_nodes[item_id] = {'features': features, 'neighbors': neighbors}

# make the edges bidirectional
# For each symmetric relation, drop neighbor ids that have no node of their
# own, then make sure every remaining edge also exists in the reverse
# direction. 'buy_after_viewing_item' is not in the list, so it stays directed.
for item_id in tqdm(item_nodes):
    for rel in ['also_viewed_item', 'also_bought_item', 'bought_together_item']:
        # Build the filtered list first: calling remove() on a list while
        # iterating over it skips the element after each removal, which left
        # dangling ids behind and missed some reverse edges.
        kept = [nid for nid in item_nodes[item_id]['neighbors'][rel] if nid in item_nodes]
        item_nodes[item_id]['neighbors'][rel] = kept
        for nid in kept:
            back = item_nodes[nid]['neighbors'][rel]
            if item_id not in back:
                back.append(item_id)

In [None]:
## save graph
#pickle.dump({
#    'item_nodes': item_nodes,
#    'brand_nodes': brand_nodes,
#}, open(os.path.join(save_dir, 'graph.pkl'),"wb"))

# Create the output directory if it is missing, and write through a context
# manager so the file is flushed and closed even if serialisation fails.
os.makedirs(save_dir, exist_ok=True)
with open(os.path.join(save_dir, 'graph.json'), "w") as f:
    json.dump({
        'item_nodes': item_nodes,
        'brand_nodes': brand_nodes,
    }, f, indent=4)