In [None]:
import os
import math
import random
import json
import pickle
import itertools
import functools
from copy import deepcopy
from tqdm import tqdm
from collections import defaultdict
from typing import List, Dict, Set, Tuple
import numpy as np
import seaborn as sns
# Seed both random number generators once, using a single named value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Load data

In [None]:
def load_data(data_root:str, dataset:str, sub_dataset:str) -> Dict:
    """
    Read ``<data_root>/<dataset>/<sub_dataset>/product.json`` into a dict.

    Every line of the file is evaluated as a Python expression producing one
    record. A record is kept only if it has a non-blank 'title' or a
    non-blank 'description'. Afterwards each kept record receives
    ``record['related']['cobrand']``: the set of asins collected for its
    'brand' (its own asin included), or an empty set when it has no 'brand'.
    Records of the same brand share one set object. The number of kept
    records is printed.

    data_root: path to directory contains the data file.
    dataset: dataset directory name (e.g. amazon)
    sub_dataset: sub dataset name (e.g. sports)

    Returns:
    data: Dict, key is the record's 'asin', and value is data entry
    """
    product_path = os.path.join(data_root, dataset, sub_dataset, 'product.json')
    asins_by_brand = defaultdict(set)
    data = {}

    def _has_text(record, field):
        # True when the field exists and is more than whitespace.
        return field in record and len(record[field].strip()) != 0

    with open(product_path) as f:
        for raw_line in tqdm(f.readlines(), desc="Loading Data..."):
            # NOTE(security): eval() runs whatever expression the line holds;
            # only use this loader on product dumps you trust.
            record = eval(raw_line.strip())
            if not _has_text(record, 'title') and not _has_text(record, 'description'):
                continue
            asin = record['asin']
            data[asin] = record
            if 'brand' in record:
                asins_by_brand[record['brand']].add(asin)

    for record in data.values():
        related = record.setdefault('related', {})
        if 'brand' in record:
            related['cobrand'] = asins_by_brand[record['brand']]
        else:
            related['cobrand'] = set()

    print(len(data))
    return data

In [None]:
def load_new_data(data_root, sub_dataset):
    """
    Read ``<data_root>/<sub_dataset>/product.json`` into a dict keyed by asin.

    Every line of the file is evaluated as a Python expression producing one
    record; no filtering is applied. When two lines carry the same 'asin',
    the later one wins.

    Returns:
    data: dict mapping each record's 'asin' to the record itself
    """
    product_path = os.path.join(data_root, sub_dataset, 'product.json')
    with open(product_path) as f:
        raw_lines = f.readlines()
    # NOTE(security): eval() runs whatever expression the line holds;
    # only use this loader on product dumps you trust.
    records = (eval(raw.strip()) for raw in tqdm(raw_lines, desc="Loading Data..."))
    return {record['asin']: record for record in records}

In [None]:
def load_label_oracle(data_root, sub_dataset, dataset='amazon'):
    """
    Load the set of coarse class names for one sub dataset.

    Reads ``<data_root>/<dataset>/<sub_dataset>/coarse_class.txt``. Each
    non-empty line must consist of exactly three tab-separated fields; the
    middle field is collected as a label name and the other two are ignored.
    Empty lines (for example a trailing blank line) are skipped.

    data_root: directory that contains the dataset directory
    sub_dataset: sub dataset name (e.g. sports)
    dataset: dataset directory name; defaults to 'amazon'

    Returns:
    label_oracle: set of label names

    Raises:
    ValueError: if a non-empty line does not have exactly three fields
    """
    label_oracle_path = os.path.join(data_root, dataset, sub_dataset, 'coarse_class.txt')
    label_oracle = set()
    with open(label_oracle_path) as f:
        for line in f:
            fields_text = line.rstrip('\r\n')
            if not fields_text:
                # A bare newline has no fields to unpack; skip it instead of crashing.
                continue
            _, label_name, _ = fields_text.split('\t')
            label_oracle.add(label_name)
    return label_oracle

In [None]:
def load_both(old_base_dir, new_base_dir, dataset, subdataset):
    """
    Load the old and the new product dumps for the same sub dataset.

    The old dump is read with ``load_data`` (which takes the dataset name),
    the new one with ``load_new_data`` (which does not).

    Returns:
    (old_data, new_data): the two loaded dicts, old first
    """
    return (
        load_data(old_base_dir, dataset, subdataset),
        load_new_data(new_base_dir, subdataset),
    )

In [None]:
def link_feature(old, new):
    """
    Copy non-empty 'feature' values from ``new`` onto matching ``old`` records.

    For every key of ``old`` that also exists in ``new`` and whose new record
    has a 'feature' value of non-zero length, that value is assigned to
    ``old[key]['feature']``. ``old`` is modified in place and also returned.
    """
    for asin in tqdm(old):
        if asin not in new:
            continue
        new_record = new[asin]
        if 'feature' not in new_record or len(new_record['feature']) == 0:
            continue
        old[asin]['feature'] = new_record['feature']
    return old

In [None]:
def text_process(text):
    """
    Flatten a piece of text onto one line and drop '$' and '*' characters.

    The following separators are each replaced by a single space, in this
    order: CR LF, LF CR, LF, tab, CR followed by the letter 'm', CR. After
    that every '$' and every '*' is removed.

    The replacements are chained, so a CR LF pair becomes one space. The
    previous version restarted from the raw input for the first three
    replacements, which discarded the CR LF and LF CR passes and turned such
    pairs into two spaces.

    text: input string

    Returns:
    the cleaned string
    """
    p_text = text
    # NOTE(review): '\rm' is a carriage return followed by the letter m, so
    # that 'm' is replaced too. Kept exactly as before; confirm the intent.
    for separator in ('\r\n', '\n\r', '\n', '\t', '\rm', '\r'):
        p_text = ' '.join(p_text.split(separator))
    for dropped in ('$', '*'):
        p_text = ''.join(p_text.split(dropped))

    return p_text

In [None]:
def get_title_description(data, k):
    """
    Build the text of record ``data[k]`` from its title and description.

    The cleaned 'title' (if present) and the cleaned 'description' (if
    present) are joined with one space; cleaning is done by ``text_process``.
    Leading and trailing whitespace is stripped from the result, so a record
    with neither field yields an empty string.
    """
    record = data[k]
    parts = [text_process(record['title']) if 'title' in record else '']
    if 'description' in record:
        parts.append(text_process(record['description']))
    return ' '.join(parts).strip()

In [None]:
# Dataset family name.
datasets = 'amazon'

# The trailing index picks the sub dataset: 0 = cloth, 1 = sports, 2 = home.
data_name = ('cloth', 'sports', 'home')[2]

# Placeholder locations of the old raw dump, the new raw dump and the output root.
old_raw_base_dir = 'xxx/data/'
new_raw_base_dir = 'xxx/data/'
save_base_dir = 'xxx/data/'

In [None]:
data = load_both(old_raw_base_dir, new_raw_base_dir, 'amazon', data_name)

In [None]:
data = link_feature(data[0], data[1])

In [None]:
# Set of coarse class names used to filter labels in the classification section.
label_oracle = load_label_oracle(data_root=old_raw_base_dir, sub_dataset=data_name)

## regression task (price prediction)

In [None]:
def generate_regression(data, kw):
    """
    Collect (text, value) pairs for a regression task.

    For every record that has the key ``kw``, the record's text is built with
    ``get_title_description`` and paired with ``data[k][kw]``. Records whose
    text is empty are skipped, matching ``generate_classification``.

    data: dict of records keyed by id
    kw: name of the record field holding the regression target (e.g. 'price')

    Returns:
    ret: set of (text, value) tuples; the value must be hashable
    """
    ret = set()
    for k in tqdm(data, desc="Generate %s" % kw):
        record = data[k]
        if kw not in record:
            continue
        tt = get_title_description(data, k)
        # get_title_description always returns a str, so the former
        # `tt is not None` test never filtered anything; test for blank text.
        if len(tt) == 0:
            continue
        ret.add((tt, record[kw]))
    return ret

In [None]:
def write_regression(save_base_dir, data_name, task_name, data, theshold):
    """
    Write regression samples to ``<save_base_dir>/<data_name>/downstream/<task_name>/data.jsonl``.

    The directory is created when missing. Each (text, value) pair becomes
    one JSON line ``{"q_text": text, "label": value}``; pairs whose value is
    greater than ``theshold`` are left out.
    """
    out_dir = os.path.join(save_base_dir, data_name, 'downstream', task_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    data_path = os.path.join(out_dir, 'data.jsonl')
    print("Write to %s" % data_path)
    with open(data_path, 'w') as f:
        for text, value in tqdm(data, desc="Write %s" % task_name):
            if not value > theshold:
                f.write(json.dumps({"q_text":text, "label":value})+'\n')

In [None]:
# Build the (text, price) pairs for price prediction and report how many there are.
res = generate_regression(data, kw='price')
print(len(res))

In [None]:
## statistics of price: density plot over the second element of every pair
prices_list = [price for _, price in res]
sns.kdeplot(prices_list)

In [None]:
### !! select the price threshold based on the density function above

theshold = 100
write_regression(save_base_dir, data_name, task_name='price', data=res, theshold=theshold)

## brand prediction

In [None]:
def generate_brand(data, max_items=100):
    """
    Build the brand prediction data.

    Records are grouped by their 'brand' value; records without a brand or
    with the brand 'Unknown' are ignored. For every brand:
      * one (text, brand) row per record is appended to the row list;
      * a metadata dict ``{'name': brand, 'items': [texts]}`` is built from
        at most ``max_items`` of its records, drawn with ``random.sample``
        when the brand has ``max_items`` or more records.

    Record ids are sorted before use. Iterating a set of strings directly
    gives an order that can differ between interpreter runs, which made both
    the row order and the sampled items irreproducible despite the seed.
    This assumes the ids of one brand are mutually comparable (e.g. all str).

    data: dict of records keyed by id
    max_items: cap on the number of item texts stored per brand

    Returns:
    (brand_data_tsv, brand_json, brand_json_dict): the list of (text, brand)
    rows, the list of metadata dicts, and a defaultdict mapping brand name to
    the same metadata dict objects
    """
    brand_dict = defaultdict(set)
    for k in tqdm(data, desc="Generate Brand Data"):
        if 'brand' in data[k] and data[k]['brand'] != 'Unknown':
            brand_dict[data[k]['brand']].add(k)

    brand_data_tsv = []
    brand_json = []
    brand_json_dict = defaultdict(dict)
    for b in tqdm(brand_dict, desc="Generate Brand Dict"):
        item_ids = sorted(brand_dict[b])
        brand_data_tsv.extend((get_title_description(data, iid), b) for iid in item_ids)
        if len(item_ids) >= max_items:
            sampled = random.sample(item_ids, max_items)
        else:
            sampled = item_ids
        cur = {'name': b, 'items': [get_title_description(data, x) for x in sampled]}
        brand_json.append(cur)
        brand_json_dict[b] = cur
    return brand_data_tsv, brand_json, brand_json_dict

In [None]:
def write_brand(save_base_dir, data_name, data, brand_info):
    """
    Write the brand rows and brand metadata under
    ``<save_base_dir>/<data_name>/downstream/brand`` (created when missing).

    ``data.tsv`` gets one ``text<TAB>brand`` line per pair in ``data``;
    ``brand.jsonl`` gets one JSON line per entry of ``brand_info``.
    """
    out_dir = os.path.join(save_base_dir, data_name, 'downstream', 'brand')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    data_path = os.path.join(out_dir, 'data.tsv')
    meta_data_path = os.path.join(out_dir, 'brand.jsonl')
    for target in (data_path, meta_data_path):
        print("Write to %s" % target)
    with open(data_path, 'w') as f:
        f.writelines("%s\t%s\n" % (text, str(brand)) for text, brand in data)
    with open(meta_data_path, 'w') as f:
        f.writelines(json.dumps(meta) + '\n' for meta in brand_info)

In [None]:
def write_brand_advanced(save_base_dir, data_name, data, brand_info_dict):
    """
    Write the two brand matching files under
    ``<save_base_dir>/<data_name>/downstream/brand`` (created when missing).

    ``data_bn.jsonl`` pairs every item text with the name of its brand.
    ``data_bi.jsonl`` pairs every item text with the space-joined, shuffled
    texts stored for its brand, after removing one occurrence of the item's
    own text; brands whose stored item list has exactly one entry are skipped.
    ``data`` is iterated twice, so it must be re-iterable.
    """
    out_dir = os.path.join(save_base_dir, data_name, 'downstream', 'brand')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # brand name matching
    name_match_path = os.path.join(out_dir, 'data_bn.jsonl')
    with open(name_match_path, 'w') as f:
        for item_text, brand in tqdm(data):
            record = {'q_text': item_text, 'k_text': brand_info_dict[brand]['name']}
            f.write(json.dumps(record) + '\n')

    # brand item matching
    item_match_path = os.path.join(out_dir, 'data_bi.jsonl')
    with open(item_match_path, 'w') as f:
        for item_text, brand in tqdm(data):
            # Work on a copy so the stored item list is left untouched.
            others = list(brand_info_dict[brand]['items'])
            if len(others) != 1:
                if item_text in others:
                    others.remove(item_text)
                random.shuffle(others)
                record = {'q_text': item_text, 'k_text': ' '.join(others)}
                f.write(json.dumps(record) + '\n')

In [None]:
# Generate the brand data once, then write the plain and the matching variants.
res_brand = generate_brand(data)
brand_rows, brand_meta, brand_meta_by_name = res_brand
print(len(brand_rows), len(brand_meta))
write_brand(save_base_dir, data_name, brand_rows, brand_meta)
write_brand_advanced(save_base_dir, data_name, brand_rows, brand_meta_by_name)

## feature prediction

In [None]:
def generate_feature(data):
    """
    Collect (item text, feature text) pairs for feature prediction.

    For every record with a 'feature' key, each feature string that is
    non-empty and does not start with '<span class' is cleaned with
    ``text_process`` and paired with the record's text from
    ``get_title_description``. The average number of accepted features per
    record with a 'feature' key is printed; it is reported as 0.0 when no
    record has that key (previously this raised ZeroDivisionError).

    data: dict of records keyed by id

    Returns:
    ret: set of (item text, feature text) tuples
    """
    ret = set()
    cnt = 0
    item_cnt = 0
    for k in tqdm(data, desc="Generate Feature"):
        if 'feature' not in data[k]:
            continue
        item_cnt += 1
        tt = get_title_description(data, k)
        for c in data[k]['feature']:
            if len(c) != 0 and not c.startswith('<span class'):
                ret.add((tt, text_process(c)))
                cnt += 1
    ratio = cnt / item_cnt if item_cnt else 0.0
    print(f'feature/item = {ratio}')
    return ret

In [None]:
def write_feature(save_base_dir, data_name, task_name, data):
    """
    Write feature pairs to ``<save_base_dir>/<data_name>/downstream/<task_name>/data.jsonl``.

    The directory is created when missing. Each (item text, feature text)
    pair becomes one JSON line ``{"q_text": ..., "k_text": ...}``.
    """
    out_dir = os.path.join(save_base_dir, data_name, 'downstream', task_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    data_path = os.path.join(out_dir, 'data.jsonl')
    print("Write to %s" % data_path)
    with open(data_path, 'w') as f:
        for item_text, feature_text in tqdm(data, desc="Write Feature"):
            f.write(json.dumps({'q_text': item_text, 'k_text': feature_text}) + '\n')

In [None]:
# Build the feature pairs, report their number, and write them out.
f_res = generate_feature(data)
print(len(f_res))
write_feature(save_base_dir, data_name, task_name='feature_pred', data=f_res)

## classification

In [None]:
def generate_classification(data, label_oracle, threshold=10000):
    """
    Build single-label classification samples from the category paths.

    Only records with exactly one category path of length greater than one
    are considered; their label is the second element of that path. A label
    is kept when it occurs at least ``threshold`` times among those records
    and is a member of ``label_oracle``. Kept labels are numbered in sorted
    order. Records with empty text are counted but not emitted.

    The label count, the label-to-index map and the per-label record counts
    are printed.

    Returns:
    (res, label2idx): set of (text, label index) tuples and the label map
    """
    def _single_path(record):
        # Exactly one category path, and that path has a second level.
        return ('categories' in record
                and len(record['categories']) == 1
                and len(record['categories'][0]) > 1)

    label_cnt = defaultdict(int)
    for k in tqdm(data):
        if _single_path(data[k]):
            label_cnt[data[k]['categories'][0][1]] += 1

    kept_names = sorted(
        name for name in label_cnt
        if label_cnt[name] >= threshold and name in label_oracle
    )
    label2idx = {name: idx for idx, name in enumerate(kept_names)}

    res = set()
    for k in tqdm(data):
        tt = get_title_description(data, k).strip()
        if len(tt) > 0 and _single_path(data[k]):
            label_name = data[k]['categories'][0][1]
            if label_name in label2idx:
                res.add((tt, label2idx[label_name]))

    print(len(label2idx))
    print(label2idx)
    print({x: label_cnt[x] for x in label2idx})
    return res, label2idx

In [None]:
def write_classification(save_base_dir, data_name, task_name, data):
    """
    Write classification samples to ``<save_base_dir>/<data_name>/downstream/<task_name>/data.jsonl``.

    The directory is created when missing. Each (text, label) pair becomes
    one JSON line ``{"q_text": text, "label": label}``.
    """
    out_dir = os.path.join(save_base_dir, data_name, 'downstream', task_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    data_path = os.path.join(out_dir, 'data.jsonl')
    print("Write to %s" % data_path)
    with open(data_path, 'w') as f:
        f.writelines(
            json.dumps({"q_text":text, "label":label})+'\n'
            for text, label in tqdm(data, desc="Write %s" % task_name)
        )

In [None]:
def write_json(save_base_dir, data_name, task_name, data):
    """
    Dump ``data`` as one indented JSON document to
    ``<save_base_dir>/<data_name>/downstream/<task_name>/label.jsonl``.

    The target directory is created when missing, so this function no longer
    depends on another writer having been called first. Despite the
    ``.jsonl`` extension, the file holds a single JSON value, not one value
    per line.
    """
    tmp_base = os.path.join(save_base_dir, data_name, 'downstream', task_name)
    os.makedirs(tmp_base, exist_ok=True)
    data_path = os.path.join(tmp_base, 'label.jsonl')
    with open(data_path, 'w') as fout:
        json.dump(data, fout, indent=4)

In [None]:
# A threshold of 0 keeps every label that is in the oracle, however rare.
classification_data, label2idx = generate_classification(data, label_oracle, threshold=0)
print(len(classification_data))
classification_task = 'coarse_classification'
write_classification(save_base_dir, data_name, classification_task, classification_data)
write_json(save_base_dir, data_name, classification_task, label2idx)

## retrieval

In [None]:
def init_retrieval_label(data, max_doc_ratio=0.5):
    """
    Build the label-name -> label-id map for the retrieval task.

    Every distinct name appearing in a record's 'categories' paths counts
    once for that record. Names that occur in more than
    ``int(max_doc_ratio * len(data))`` records are dropped; the rest are
    numbered 0, 1, 2, ... in the order they were first seen.

    Compared with the previous version:
      * records without a 'categories' key are treated as having no labels
        instead of raising KeyError;
      * names are de-duplicated in first-seen order rather than through a
        set, so the assigned ids no longer depend on set iteration order;
      * the paths are flattened with itertools instead of ``sum(..., [])``.

    data: dict of records keyed by id
    max_doc_ratio: fraction of records above which a label is discarded

    Returns:
    label_name2id_dict: dict mapping label name to integer id
    """
    # statistics on label names
    label_name_stat = defaultdict(int)
    for did in tqdm(data):
        category_paths = data[did].get('categories', [])
        for c in dict.fromkeys(itertools.chain.from_iterable(category_paths)):
            label_name_stat[c] += 1

    max_docs = int(max_doc_ratio * len(data))
    label_name2id_dict = {}
    for n, doc_cnt in label_name_stat.items():
        if doc_cnt > max_docs:
            continue
        label_name2id_dict[n] = len(label_name2id_dict)

    print(f'Num of unique labels:{len(label_name2id_dict)}')

    return label_name2id_dict

In [None]:
def write_retrieval_base(save_base_dir, data_name, task_name, data, label_name2id_dict):
    """
    Write ``node_classification.jsonl`` under
    ``<save_base_dir>/<data_name>/downstream/<task_name>`` (created when missing).

    For every record, the distinct names in its 'categories' paths that are
    keys of ``label_name2id_dict`` are collected in first-seen order. Records
    with at least one such label produce one JSON line with 'q_text' (from
    ``get_title_description``), 'labels' (ids) and 'label_names'.

    Compared with the previous version, records without a 'categories' key
    are skipped instead of raising KeyError, and label order within a line no
    longer depends on set iteration order.
    """
    tmp_base = os.path.join(save_base_dir, data_name, 'downstream', task_name)
    os.makedirs(tmp_base, exist_ok=True)

    # Kept from the original: re-seeds the global RNG as a side effect,
    # although nothing in this function draws random numbers.
    random.seed(0)
    data_path = os.path.join(tmp_base, 'node_classification.jsonl')
    with open(data_path,'w') as fout:
        for q in tqdm(data):
            category_paths = data[q].get('categories', [])
            distinct_names = dict.fromkeys(itertools.chain.from_iterable(category_paths))
            label_names_list = [n for n in distinct_names if n in label_name2id_dict]
            if not label_names_list:
                continue
            label_ids_list = [label_name2id_dict[n] for n in label_names_list]

            fout.write(json.dumps({
                'q_text':get_title_description(data, q),
                'labels':label_ids_list,
                'label_names':label_names_list
            })+'\n')

In [None]:
def write_retrieval(save_base_dir, data_name, task_name, label_name2id_dict):
    """
    Write the retrieval corpus and ground truth under
    ``<save_base_dir>/<data_name>/downstream/<task_name>`` (created when missing).

    Files written:
      * ``documents.json``: indented JSON list of ``{'id', 'contents'}``, one per label;
      * ``documents.txt``: ``id<TAB>label name`` per line;
      * ``node_text.tsv``: ``docid<TAB>q_text`` per query;
      * ``truth.trec``: ``docid 0 label 1`` per (query, label) pair.
    The label named 'null' is left out of both document files. Queries are
    read from ``node_classification.jsonl`` in the same directory and
    numbered from 0 in file order.

    The label JSON is now written inside a ``with`` block; previously the
    file object passed to ``json.dump`` was never closed explicitly.

    Raises:
    FileNotFoundError: if ``node_classification.jsonl`` does not exist yet
    """
    tmp_base = os.path.join(save_base_dir, data_name, 'downstream', task_name)
    os.makedirs(tmp_base, exist_ok=True)

    documents = [(lid, lname) for lname, lid in label_name2id_dict.items() if lname != 'null']

    label_json_path = os.path.join(tmp_base, 'documents.json')
    print("Write to %s" % label_json_path)
    with open(label_json_path, 'w') as fout:
        json.dump([{'id': lid, 'contents': lname} for lid, lname in documents], fout, indent=4)

    label_path = os.path.join(tmp_base, 'documents.txt')
    print("Write to %s" % label_path)
    with open(label_path, 'w') as fout:
        for lid, lname in documents:
            fout.write(str(lid)+'\t'+lname+'\n')

    data_class_path = os.path.join(tmp_base, 'node_classification.jsonl')
    print("Read from %s" % data_class_path)
    node_text_path = os.path.join(tmp_base, 'node_text.tsv')
    print("Write to %s" % node_text_path)
    trec_path = os.path.join(tmp_base, 'truth.trec')
    print("Write to %s" % trec_path)
    with open(data_class_path) as f, open(node_text_path, 'w') as fout1, open(trec_path, 'w') as fout2:
        readin = f.readlines()
        for docid, line in enumerate(tqdm(readin)):
            tmp = json.loads(line)
            fout1.write(str(docid) + '\t' + tmp['q_text'] + '\n')
            for label in tmp['labels']:
                fout2.write(str(docid)+' '+str(0)+' '+str(label)+' '+str(1)+'\n')

In [None]:
# Build the label map, then write the per-record labels followed by the retrieval files
# (the second writer reads the file produced by the first).
label_name2id_dict = init_retrieval_label(data)
retrieval_task = 'retrieval'
write_retrieval_base(save_base_dir, data_name, retrieval_task, data, label_name2id_dict)
write_retrieval(save_base_dir, data_name, retrieval_task, label_name2id_dict)