In [5]:
import os
import pickle
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict

## Read file

In [3]:
# read dblp_whole_data
with open('DBLP_f_1990_now.json') as f:
    data = json.load(f)

In [11]:
# generate title_fos_dict: key title, value fos_list

title_fos_dict = {}
cnt = 0

for p in tqdm(data):
    if p['title'] in title_fos_dict:
        cnt += 1
    if 'fos' in p:
        title_fos_dict[p['title']] = p['fos']
    else:
        title_fos_dict[p['title']] = []
    
print(cnt)

100%|███████████████████████████████████████████████████████████████████████████| 4692018/4692018 [00:13<00:00, 350775.16it/s]

187944





## Generate data for paper classification

## generation from link prediction train set

In [16]:
## obtain fos_list for the query in each line

# read train data
train_texts = []
with open('DBLP_homo/train_p.tsv') as f:
    data = f.readlines()
    for line in tqdm(data):
        query, _ = line.strip().split('\$\$')
        query = query.split('\t')
        assert len(query) == 6
        train_texts.append(query[0])

100%|████████████████████████████████████████████████████████| 2518148/2518148 [00:05<00:00, 436911.00it/s]


In [5]:
# generate fos_list for texts_list

fos_list = []
fos_cnt_dict = defaultdict(int)

for text in tqdm(train_texts):
    fos_list.append(title_fos_dict[text])
    for f in title_fos_dict[text]:
        fos_cnt_dict[f['name']] += 1

print(len(fos_cnt_dict))

100%|█████████████████████████████████████████████████████████| 2518148/2518148 [00:26<00:00, 93835.68it/s]

108449





In [6]:
len(fos_list)

2518148

In [7]:
# generate fos & cnt pair
fos_cnt_list = []

for f in tqdm(fos_cnt_dict):
    fos_cnt_list.append((f,fos_cnt_dict[f]))

fos_cnt_list = sorted(fos_cnt_list, key=lambda x:-x[1])

100%|███████████████████████████████████████████████████████████| 108449/108449 [00:08<00:00, 13319.15it/s]


In [8]:
# select topk for as classification types

type_list = []
k = 30

for i in range(k):
    type_list.append(fos_cnt_list[i+1][0])

print(type_list)

['Artificial intelligence', 'Mathematics', 'Machine learning', 'Computer vision', 'Computer network', 'Mathematical optimization', 'Pattern recognition', 'Distributed computing', 'Data mining', 'Real-time computing', 'Algorithm', 'Control theory', 'Discrete mathematics', 'Engineering', 'Electronic engineering', 'Theoretical computer science', 'Combinatorics', 'Knowledge management', 'Multimedia', 'Computer security', 'World Wide Web', 'Human–computer interaction', 'Control engineering', 'Parallel computing', 'Information retrieval', 'Software', 'Artificial neural network', 'Communication channel', 'Simulation', 'Natural language processing']


In [9]:
# see how many paper do not have any fos

cnt = 0

labels = []

for fos in tqdm(fos_list):
    tmp_fos = [f['name'] for f in fos]
    if len(set(tmp_fos) & set(type_list)) == 0:
        cnt += 1
        
print(f'{cnt} paper do not have label')

100%|█████████████████████████████████████████████████████████| 2518148/2518148 [00:37<00:00, 67586.58it/s]

251185 paper do not have label





In [10]:
# generate label

labels = []

for fos in tqdm(fos_list):
    tmp_label = []
    tmp_fos = set([f['name'] for f in fos])
    for i in range(len(type_list)):
        if type_list[i] in tmp_fos:
            tmp_label.append(1)
        else:
            tmp_label.append(0)
    labels.append(tmp_label)
    assert len(tmp_label) == len(type_list)
assert len(labels) == len(fos_list)

np.save('/shared/data2/bowenj4/textnet/HeterformerClassifier/DBLP_embed/transductive_label.npy', np.array(labels))

100%|█████████████████████████████████████████████████████████| 2518148/2518148 [00:59<00:00, 42265.43it/s]


In [22]:
# save type_list

with open('/shared/data2/bowenj4/textnet/HeterformerClassifier/DBLP_embed/class_name.txt','w') as fout:
    for i, t in enumerate(type_list):
        fout.write(str(i)+'\t'+t+'\n')

## generation from link prediction test set

In [17]:
## obtain fos_list for the query in each line

# read test data
test_texts = []
with open('DBLP_homo/test_p.tsv') as f:
    data = f.readlines()
    for line in tqdm(data):
        query, _ = line.strip().split('\$\$')
        query = query.split('\t')
        assert len(query) == 6
        test_texts.append(query[0])

100%|██████████████████████████████████████████████████████████| 719445/719445 [00:01<00:00, 447108.30it/s]


In [12]:
# generate fos_list for texts_list

fos_list = []
fos_cnt_dict = defaultdict(int)

for text in tqdm(test_texts):
    fos_list.append(title_fos_dict[text])
    for f in title_fos_dict[text]:
        fos_cnt_dict[f['name']] += 1

print(len(fos_cnt_dict))

100%|███████████████████████████████████████████████████████████| 719445/719445 [00:08<00:00, 83653.39it/s]

79890





In [13]:
# generate fos & cnt pair
fos_cnt_list = []

for f in tqdm(fos_cnt_dict):
    fos_cnt_list.append((f,fos_cnt_dict[f]))

fos_cnt_list = sorted(fos_cnt_list, key=lambda x:-x[1])

100%|████████████████████████████████████████████████████████████| 79890/79890 [00:00<00:00, 185712.00it/s]


In [14]:
# see how many paper do not have any fos

cnt = 0

labels = []

for fos in tqdm(fos_list):
    tmp_fos = [f['name'] for f in fos]
    if len(set(tmp_fos) & set(type_list)) == 0:
        cnt += 1
        
print(f'{cnt} paper do not have label')

100%|██████████████████████████████████████████████████████████| 719445/719445 [00:03<00:00, 181505.66it/s]

72017 paper do not have label





In [15]:
# generate label

labels = []

for fos in tqdm(fos_list):
    tmp_label = []
    tmp_fos = set([f['name'] for f in fos])
    for i in range(len(type_list)):
        if type_list[i] in tmp_fos:
            tmp_label.append(1)
        else:
            tmp_label.append(0)
    labels.append(tmp_label)
    assert len(tmp_label) == len(type_list)
assert len(labels) == len(fos_list)

np.save('/shared/data2/bowenj4/textnet/HeterformerClassifier/DBLP_embed/inductive_label.npy', np.array(labels))

100%|███████████████████████████████████████████████████████████| 719445/719445 [00:07<00:00, 89948.93it/s]


## save papers (train & test set together)

In [18]:
print(len(train_texts), len(test_texts))

2518148 719445


In [19]:
# save train_texts concate with test_texts
with open('/shared/data2/bowenj4/textnet/HeterformerClassifier/DBLP_embed/train_test_text.tsv','w') as fout:
    for train_text in tqdm(train_texts):
        fout.write(train_text+'\n')
    for test_text in tqdm(test_texts):
        fout.write(test_text+'\n')

100%|███████████████████████████████████████████████████████| 2518148/2518148 [00:01<00:00, 1810504.14it/s]
100%|█████████████████████████████████████████████████████████| 719445/719445 [00:00<00:00, 1783010.80it/s]


## Generate data for author classification

In [6]:
train_authors_id2idx = pickle.load(open('DBLP_neighbour/random_train_authors_id2idx.pkl','rb'))
author_neighbours = pickle.load(open('DBLP_neighbour/author_neighbour.pkl','rb'))

In [9]:
train_authors_idx2id = {train_authors_id2idx[a]:a for a in tqdm(train_authors_id2idx)}

100%|██████████████████████████████████████████████████████████████████████████| 2717797/2717797 [00:02<00:00, 1224110.47it/s]


In [16]:
type_list = ['Artificial intelligence', 'Mathematics', 'Machine learning', 'Computer vision', 'Computer network', 'Mathematical optimization', 'Pattern recognition', 'Distributed computing', 'Data mining', 'Real-time computing', 'Algorithm', 'Control theory', 'Discrete mathematics', 'Engineering', 'Electronic engineering', 'Theoretical computer science', 'Combinatorics', 'Knowledge management', 'Multimedia', 'Computer security', 'World Wide Web', 'Human–computer interaction', 'Control engineering', 'Parallel computing', 'Information retrieval', 'Software', 'Artificial neural network', 'Communication channel', 'Simulation', 'Natural language processing']

In [7]:
author_neighbours

{'2312688602\tMakoto Satoh': ['Preliminary Design of a Network Protocol Learning Tool Based on the Comprehension of High School Students: Design by an Empirical Study Using a Simple Mind Map',
  'Algorithmic Thinking Learning Support System with eAssessment Function'],
 '2482909946\tRyo Muramatsu': ['Preliminary Design of a Network Protocol Learning Tool Based on the Comprehension of High School Students: Design by an Empirical Study Using a Simple Mind Map'],
 '2128134587\tMizue Kayama': ['Preliminary Design of a Network Protocol Learning Tool Based on the Comprehension of High School Students: Design by an Empirical Study Using a Simple Mind Map',
  'A drawing learning support system with auto-evaluating function based on the drawing process model',
  'A Collaborative Environment for New Learning Ecology and E-Pedagogy',
  'LAPCHAT: A Contents-Sharable Management System for Computer Supported Collaborative Learning',
  'The Collaborative Learning Support in the INTERNET Learning Spac

In [8]:
train_authors_id2idx

{'2659758194\tZheng-Yang Liu': 0,
 '2305927056\tSabine McConnell': 1,
 '2313570484\tO. I. Tacha': 2,
 '2974106686\tA. Lippman': 3,
 '2009237637\tSandra Ayache': 4,
 '2973913207\tT.E. LaPorta': 5,
 '2095592176\tAleksandra Wolanin': 6,
 '2636659414\tH. Marmanis': 7,
 '2228548924\tOscar Cuadros': 8,
 '1830286447\tHector Vazquez-Leal': 9,
 '2039895514\tClaude Castille': 10,
 '2490552613\tJoao Carlos E. Ferreira': 11,
 '2580274271\tKaichun Chang': 12,
 '2794837709\tShreejoy J. Tripathy': 13,
 '2394564507\tMaurizio Tortorici': 14,
 '2116921010\tYeu-Horng Shiau': 15,
 '2148289271\tGraciela Gonzalez': 16,
 '2714578088\tFiras Y Omar': 17,
 '2318301650\tJ. L. Summers': 18,
 '2389930721\tP. N. Chetty': 19,
 '2422756567\tWeidong Gu': 20,
 '157665041\tR. Cozot': 21,
 '2034032166\tShinji Abe': 22,
 '2794052373\tLubartTodd': 23,
 '334277605\tMichael Capps': 24,
 '2398856008\tChristian Hilbes': 25,
 '684468196\tLars Diening': 26,
 '2698836078\tAllison Mankin': 27,
 '2968844304\tZengguang Hou': 28,
 '2

In [10]:
train_authors_idx2id

{0: '2659758194\tZheng-Yang Liu',
 1: '2305927056\tSabine McConnell',
 2: '2313570484\tO. I. Tacha',
 3: '2974106686\tA. Lippman',
 4: '2009237637\tSandra Ayache',
 5: '2973913207\tT.E. LaPorta',
 6: '2095592176\tAleksandra Wolanin',
 7: '2636659414\tH. Marmanis',
 8: '2228548924\tOscar Cuadros',
 9: '1830286447\tHector Vazquez-Leal',
 10: '2039895514\tClaude Castille',
 11: '2490552613\tJoao Carlos E. Ferreira',
 12: '2580274271\tKaichun Chang',
 13: '2794837709\tShreejoy J. Tripathy',
 14: '2394564507\tMaurizio Tortorici',
 15: '2116921010\tYeu-Horng Shiau',
 16: '2148289271\tGraciela Gonzalez',
 17: '2714578088\tFiras Y Omar',
 18: '2318301650\tJ. L. Summers',
 19: '2389930721\tP. N. Chetty',
 20: '2422756567\tWeidong Gu',
 21: '157665041\tR. Cozot',
 22: '2034032166\tShinji Abe',
 23: '2794052373\tLubartTodd',
 24: '334277605\tMichael Capps',
 25: '2398856008\tChristian Hilbes',
 26: '684468196\tLars Diening',
 27: '2698836078\tAllison Mankin',
 28: '2968844304\tZengguang Hou',
 29

In [17]:
## construct author fos dict
author_fos_dict = {} # key: author, value: fos count dict

for a in tqdm(author_neighbours):
    assert a not in author_fos_dict
    author_fos_dict[a] = {f:0 for f in type_list}
    for p in author_neighbours[a]:
        for fos in title_fos_dict[p]:
            if fos['name'] in type_list:
                author_fos_dict[a][fos['name']] += 1

100%|████████████████████████████████████████████████████████████████████████████| 4883413/4883413 [03:07<00:00, 26015.00it/s]


In [18]:
author_fos_dict

{'2312688602\tMakoto Satoh': {'Artificial intelligence': 1,
  'Mathematics': 0,
  'Machine learning': 0,
  'Computer vision': 0,
  'Computer network': 0,
  'Mathematical optimization': 0,
  'Pattern recognition': 0,
  'Distributed computing': 0,
  'Data mining': 0,
  'Real-time computing': 0,
  'Algorithm': 0,
  'Control theory': 0,
  'Discrete mathematics': 0,
  'Engineering': 0,
  'Electronic engineering': 0,
  'Theoretical computer science': 0,
  'Combinatorics': 0,
  'Knowledge management': 0,
  'Multimedia': 1,
  'Computer security': 0,
  'World Wide Web': 0,
  'Human–computer interaction': 1,
  'Control engineering': 0,
  'Parallel computing': 0,
  'Information retrieval': 0,
  'Software': 0,
  'Artificial neural network': 0,
  'Communication channel': 0,
  'Simulation': 0,
  'Natural language processing': 0},
 '2482909946\tRyo Muramatsu': {'Artificial intelligence': 0,
  'Mathematics': 0,
  'Machine learning': 0,
  'Computer vision': 0,
  'Computer network': 0,
  'Mathematical o

In [20]:
# generate label

labels = []

for aidx in tqdm(range(2717797)):
    aid = train_authors_idx2id[aidx]
    fos = author_fos_dict[aid]

    tmp_label = []
    for i in range(len(type_list)):
        if fos[type_list[i]] != 0:
            tmp_label.append(1)
        else:
            tmp_label.append(0)
    labels.append(tmp_label)
    assert len(tmp_label) == len(type_list)
assert len(labels) == len(train_authors_idx2id)

np.save('/shared/data2/bowenj4/textnet/HeterformerClassifier/DBLP_embed/author_label.npy', np.array(labels))

100%|████████████████████████████████████████████████████████████████████████████| 2717797/2717797 [01:09<00:00, 38879.55it/s]


In [21]:
labels

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,