In [1]:
import os
import random
import json
import pickle
from copy import deepcopy
from tqdm import tqdm
from collections import defaultdict

import numpy as np
from transformers import BertTokenizerFast

In [6]:
def load_data(dataset, sub_dataset):
    """Load the paper records of one sub-dataset into a dict keyed by paper id.

    Every non-blank line of ``papers_bert.json`` is evaluated as a Python
    expression and is expected to yield a mapping that has a ``'paper'`` key.

    Args:
        dataset: Unused. Only ``sub_dataset`` is interpolated into the path.
        sub_dataset: Directory name below the hard-coded data root.

    Returns:
        dict mapping ``record['paper']`` to the whole record. When a paper id
        occurs on several lines, the last occurrence wins.

    Raises:
        FileNotFoundError: If the hard-coded path does not exist.
    """
    # read raw data
    with open(f'/home/yuhanli/GLBench/models/alignment/Patton/data/{sub_dataset}/papers_bert.json') as f:
        data = {}
        readin = f.readlines()
        for line in tqdm(readin, desc="Loading Data..."):
            line = line.strip()
            if not line:
                # eval('') raises SyntaxError, so tolerate blank/trailing lines.
                continue
            # SECURITY NOTE(review): eval() executes whatever expression the
            # file contains. Only run this on trusted data; if every line is a
            # plain literal, ast.literal_eval would be a safer parser.
            tmp = eval(line)
            data[tmp['paper']] = tmp
    return data

In [4]:
def generate_co_author_data(data):
    """Index the records by author and by paper.

    Args:
        data: Mapping from paper id to a record. Records without an
            ``'author'`` key, or with an empty/falsy author value, are skipped.

    Returns:
        Tuple ``(co_author, id2author)`` of ``defaultdict(set)`` objects:
        ``co_author[author]`` is the set of paper ids that list that author,
        and ``id2author[paper_id]`` is the set of authors of that paper.
    """
    papers_by_author = defaultdict(set)
    authors_by_paper = defaultdict(set)

    for paper_id in tqdm(data, desc="Generating co author..."):
        record = data[paper_id]
        has_authors = 'author' in record and record['author']
        if has_authors:
            for name in record['author']:
                papers_by_author[name].add(paper_id)
                authors_by_paper[paper_id].add(name)

    return papers_by_author, authors_by_paper

In [5]:
def generate_pairs(co_author):
    """Enumerate every pair of papers that share at least one author.

    Args:
        co_author: Mapping from author to the collection of that author's
            paper ids.

    Returns:
        list of ``(paper_a, paper_b)`` tuples. Each unordered pair appears
        once: if ``(b, a)`` was already recorded through another shared
        author, ``(a, b)`` is not added again. The list order follows set
        iteration order and is therefore not guaranteed to be stable.
    """
    from itertools import combinations

    co_author_pairs = set()
    for author in tqdm(co_author, desc="Generating pairs..."):
        for first, second in combinations(list(co_author[author]), 2):
            if first == second:
                continue
            # Two papers with several common authors may be visited in either
            # orientation; keeping both would let the same link be sampled
            # into two different splits downstream.
            if (second, first) in co_author_pairs:
                continue
            co_author_pairs.add((first, second))
    return list(co_author_pairs)

In [6]:
def sample_pairs(co_author_pair_list, n_train=128, n_val=128, n_test=10000, seed=None):
    """Draw disjoint train/val/test subsets from a list of pairs.

    The list is shuffled through a random permutation of its indices and the
    first ``n_train`` entries go to train, the next ``n_val`` to val and the
    next ``n_test`` to test.

    Args:
        co_author_pair_list: Sequence of pairs to sample from.
        n_train: Number of training pairs.
        n_val: Number of validation pairs.
        n_test: Number of test pairs.
        seed: Optional seed for a private ``np.random.RandomState``. When
            ``None`` (the default) the global ``np.random`` state is used.

    Returns:
        Tuple ``(train_pairs, val_pairs, test_pairs)`` of lists.

    Raises:
        IndexError: If fewer than ``n_train + n_val + n_test`` pairs exist.
    """
    available = len(co_author_pair_list)
    requested = n_train + n_val + n_test
    if requested > available:
        # Fail before any sampling instead of running out of indices halfway.
        raise IndexError(
            "requested %d pairs (train=%d, val=%d, test=%d) but only %d are available"
            % (requested, n_train, n_val, n_test, available)
        )

    def do_sample(num, pos, tgt):
        # Copy `num` pairs, starting at permutation offset `pos`, into `tgt`.
        for _ in tqdm(range(num), desc="Sampling pairs..."):
            tgt.append(co_author_pair_list[idx[pos]])
            pos += 1
        return pos

    train_pairs = []
    val_pairs = []
    test_pairs = []
    if seed is None:
        idx = np.random.permutation(available)
    else:
        idx = np.random.RandomState(seed).permutation(available)
    pos = 0
    pos = do_sample(n_train, pos, train_pairs)
    pos = do_sample(n_val, pos, val_pairs)
    do_sample(n_test, pos, test_pairs)
    return train_pairs, val_pairs, test_pairs

In [11]:
def sample_neighbors(author2ids, id2author, train_pairs, val_pairs, test_pairs, n_neigbhor=5):
    """Attach up to ``n_neigbhor`` sampled neighbor papers to both ends of each pair.

    A paper's candidate neighbors are all papers written by any of its authors.

    * Train pairs: both papers of the pair are removed from both candidate
      sets, as is every paper that is linked to the endpoint by a val/test
      pair.
    * Val/test pairs: only the paper itself is removed from its own
      candidates.
      NOTE(review): the partner paper can therefore be sampled as a neighbor
      in val/test, unlike in train - confirm this is intended.

    Args:
        author2ids: Mapping author -> collection of paper ids.
        id2author: Mapping paper id -> collection of authors.
        train_pairs: Iterable of ``(p, q)`` paper-id pairs.
        val_pairs: Iterable of ``(p, q)`` paper-id pairs.
        test_pairs: Iterable of ``(p, q)`` paper-id pairs.
        n_neigbhor: Maximum number of neighbors kept per paper.

    Returns:
        Three lists (train, val, test) of ``(p, q, p_neighbors, q_neighbors)``
        tuples, where the neighbor entries are lists of paper ids.
    """

    def do_sample(candidates):
        # Small candidate lists are returned unchanged; otherwise draw indices
        # from np.random until n_neigbhor distinct papers were collected.
        if len(candidates) <= n_neigbhor:
            return candidates
        ret = set()
        while len(ret) < n_neigbhor:
            cur = np.random.randint(len(candidates))
            ret.add(candidates[cur])
        return list(ret)

    def get_possible_neighbor(authors):
        # Union of the paper sets of all given authors.
        ret = set()
        for a in authors:
            for paper in author2ids[a]:
                ret.add(paper)
        return ret

    def attach_neighbors(pairs, desc, is_train):
        # Shared loop for the three splits; returns (p, q, p_n, q_n) tuples.
        out = []
        for p, q in tqdm(pairs, desc=desc):
            cand_p = get_possible_neighbor(id2author[p])
            cand_q = get_possible_neighbor(id2author[q])
            if is_train:
                # Set difference instead of .remove(): when the reversed pair
                # (q, p) sits in val/test, q is already part of no_sample[p]
                # and a second .remove(q) would raise KeyError.
                cand_p -= no_sample[p] | {p, q}
                cand_q -= no_sample[q] | {p, q}
            else:
                cand_p.discard(p)
                cand_q.discard(q)
            out.append((p, q, do_sample(list(cand_p)), do_sample(list(cand_q))))
        return out

    no_sample = defaultdict(set)
    for cur_pairs in [val_pairs, test_pairs]:
        for p in cur_pairs:
            no_sample[p[0]].add(p[1]) # We don't want to sample these in train
            no_sample[p[1]].add(p[0])

    train_with_neigbhor = attach_neighbors(train_pairs, "Sample Train Neigbhors", True)
    val_with_neighbor = attach_neighbors(val_pairs, "Sample Val Neigbhors", False)
    test_with_neighbor = attach_neighbors(test_pairs, "Sample Test Neigbhors", False)
    return train_with_neigbhor, val_with_neighbor, test_with_neighbor

In [8]:
def convert_and_dump(data, tuples, path):
    """Write one JSON line per tuple, replacing paper ids with their titles.

    Args:
        data: Mapping paper id -> record; only ``record['title']`` is read.
        tuples: Iterable of ``(q, k, q_neighbors, k_neighbors)`` where the
            first two entries are paper ids and the last two are iterables of
            paper ids.
        path: Output file path; the file is opened in ``'w'`` mode.

    Each output line is a JSON object with the keys ``q_text``, ``q_n_text``,
    ``k_text`` and ``k_n_text``.
    """
    def title_of(paper_id):
        return data[paper_id]['title']

    print("Dump data to %s" % path)
    with open(path, 'w') as fout:
        # The progress bar is labelled with the last component of the path.
        progress = tqdm(tuples, desc="Processing %s" % path.split('/')[-1])
        for query, key, query_neighbors, key_neighbors in progress:
            record = {
                'q_text': title_of(query),
                'q_n_text': [title_of(paper) for paper in query_neighbors],
                'k_text': title_of(key),
                'k_n_text': [title_of(paper) for paper in key_neighbors],
            }
            fout.write(json.dumps(record) + '\n')

In [9]:
# Seed both generators: the sampling helpers in this notebook draw from
# np.random (permutation / randint), which random.seed() does not affect.
random.seed(0)
np.random.seed(0)

In [10]:
# Run configuration. Despite the plural names, `datasets` and `sub_datasets`
# each hold a single string.
datasets, sub_datasets = 'MAG', 'Mathematics'
base_dir = '/shared/data3/wentao4/transfernet/data/'  # root of the output tree
task_name = 'co-author'
n_neighbor = 5  # neighbor cap per paper
cur_d = load_data(datasets, sub_datasets)

FileNotFoundError: [Errno 2] No such file or directory: '/home/yuhanli/GLBench/models/alignment/Patton/data/MAG/Mathematics/papers_bert.json'

In [25]:
# Build the two lookup tables: author -> paper ids and paper id -> authors.
author2ids, id2author = generate_co_author_data(cur_d)

Generating co author...: 100%|██████████| 178670/178670 [00:03<00:00, 57977.93it/s] 


In [26]:
# Every pair of papers that share an author is a candidate positive link.
pairs = generate_pairs(author2ids)

Generating pairs...: 100%|██████████| 135247/135247 [00:01<00:00, 121495.48it/s]


In [27]:
# Split with the function's default sizes (128 train / 128 val / 10000 test).
train, val, test = sample_pairs(pairs)

Sampling pairs...: 100%|██████████| 128/128 [00:00<00:00, 434713.29it/s]
Sampling pairs...: 100%|██████████| 128/128 [00:00<00:00, 409825.12it/s]
Sampling pairs...: 100%|██████████| 10000/10000 [00:00<00:00, 482092.83it/s]


In [28]:
# Pass the configured cap explicitly so that changing `n_neighbor` in the
# configuration cell actually takes effect (the keyword keeps the function's
# original spelling, `n_neigbhor`).
train_n, val_n, test_n = sample_neighbors(author2ids, id2author, train, val, test, n_neigbhor=n_neighbor)

Sample Train Neigbhors: 100%|██████████| 128/128 [00:00<00:00, 8623.60it/s]
Sample Val Neigbhors: 100%|██████████| 128/128 [00:00<00:00, 10772.54it/s]
Sample Test Neigbhors: 100%|██████████| 10000/10000 [00:00<00:00, 15125.45it/s]


In [29]:
# Output layout: <base_dir>/<dataset>/<sub_dataset>/<task_name>/{train,val,test}.jsonl
save_dir = os.path.join(base_dir, datasets, sub_datasets, task_name)
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(save_dir, exist_ok=True)
convert_and_dump(cur_d, train_n, os.path.join(save_dir, 'train.jsonl'))
convert_and_dump(cur_d, val_n, os.path.join(save_dir, 'val.jsonl'))
convert_and_dump(cur_d, test_n, os.path.join(save_dir, 'test.jsonl'))

Dump data to /shared/data3/wentao4/transfernet/data/MAG/Economics/co-author/train.jsonl


Processing train.jsonl: 100%|██████████| 128/128 [00:00<00:00, 21529.09it/s]


Dump data to /shared/data3/wentao4/transfernet/data/MAG/Economics/co-author/val.jsonl


Processing val.jsonl: 100%|██████████| 128/128 [00:00<00:00, 30384.91it/s]


Dump data to /shared/data3/wentao4/transfernet/data/MAG/Economics/co-author/test.jsonl


Processing test.jsonl: 100%|██████████| 10000/10000 [00:00<00:00, 45218.00it/s]
