In [None]:
import json
import os
import pickle
import random
from copy import deepcopy
from itertools import islice

from tqdm import tqdm

## Read DBLP

In [None]:
# Parse the raw DBLP v12 dump into `data` in one go.
with open('data/dblp/dblp.v12.json') as raw_file:
    data = json.loads(raw_file.read())

In [None]:
## simplify the data
# Build simp_data: paper id -> dict holding 'title' and 'year' plus whichever of
# 'authors', 'references', 'fos' and 'venue' the raw record provides.
simp_data = {}

# How many raw records lack each optional field.
no_ref = 0
no_fos = 0
no_authors = 0
no_ven = 0

for p in tqdm(data):
    assert p['id'] not in simp_data  # paper ids must be unique
    tmp_dict = {}
    tmp_dict['title'] = p['title']
    tmp_dict['year'] = p['year']

    if 'authors' in p:
        tmp_dict['authors'] = p['authors']
    else:
        no_authors += 1

    if 'references' in p:
        tmp_dict['references'] = p['references']
    else:
        no_ref += 1

    if 'fos' in p:
        tmp_dict['fos'] = p['fos']
    else:
        no_fos += 1

    if 'venue' in p:
        tmp_dict['venue'] = p['venue']
    else:
        no_ven += 1

    simp_data[p['id']] = tmp_dict

# DBLP.json: a dictionary (paper id -> simplified record).
# Written through a context manager so the handle is flushed and closed as soon
# as the dump finishes; json.dump writes non-string keys as strings.
with open('data/dblp/DBLP.json', "w") as f:
    json.dump(simp_data, f)

In [None]:
# Write the simplified dictionary to disk; the context manager closes the handle.
with open('data/dblp/DBLP.json', "w") as f:
    json.dump(simp_data, f)

In [None]:
# Report how many raw records lack each optional field, next to the total record count.
print(
    f'no_ref:{no_ref}, no_fos:{no_fos}, no_authors:{no_authors}, no_ven:{no_ven}, all:{len(data)}'
)

In [None]:
## transfer to same format with origin
# Turn the id-keyed dictionary back into a list of records that each carry their own 'id'.
# Iterates simp_data: `data` is still the raw list of records at this point, so
# indexing it with its own elements fails when the notebook is run top to bottom.
# Ids are stored as strings because the reference filter and the paper_dict
# lookups further down compare against str(ref).
simp_data_origin_format = []

for k, v in tqdm(simp_data.items()):
    record = dict(v)  # shallow copy: re-running the cell must not alter simp_data
    record['id'] = str(k)
    simp_data_origin_format.append(record)

with open('DBLP_f.json', "w") as f:
    json.dump(simp_data_origin_format, f)

In [None]:
## read simplified data
# Replace `data` with the list of simplified records written to DBLP_f.json.
with open('DBLP_f.json') as simplified_file:
    data = json.loads(simplified_file.read())
'''
DBLP_f.json: a list
'''

## Data Split based on Year

In [None]:
# write
# Keep only papers published after 1990. Each record is deep-copied because its
# 'references' list is rewritten below and `data` should stay untouched.
data_list = [deepcopy(record) for record in tqdm(data) if record['year'] > 1990]
paper_id_list = {record['id'] for record in data_list}  # a set, despite the name

# Drop references that point at papers removed by the year filter.
# str(ref): membership is tested against the string form of each reference —
# presumably ids are strings here while references are not; verify against the data.
for record in tqdm(data_list):
    if 'references' not in record:
        continue
    kept_refs = []
    for ref in record['references']:
        if str(ref) in paper_id_list:
            kept_refs.append(ref)
    record['references'] = kept_refs

In [None]:
# Persist the year-filtered papers; the context manager closes the handle.
with open('DBLP_f_1990_now.json', "w") as f:
    json.dump(data_list, f)

In [None]:
# Number of papers kept after the year filter.
len(data_list)

In [None]:
# read
# Replace `data` with the year-filtered paper list stored on disk.
with open('DBLP_f_1990_now.json') as filtered_file:
    data = json.loads(filtered_file.read())

## Statistics

In [None]:
# Aggregates collected in a single pass over the papers.
author_fos = {}  # (author_name, author_id) -> set of fos names seen on that author's papers
venue = set()    # raw venue names
author = set()   # (author_name, author_id) pairs
fos = set()      # fos names

for paper in tqdm(data):
    if 'venue' in paper:
        venue.add(paper['venue']['raw'])

    has_fos = 'fos' in paper
    paper_fos = {topic['name'] for topic in paper['fos']} if has_fos else set()

    for person in paper.get('authors', []):
        author_key = (person['name'], person['id'])
        author.add(author_key)
        if has_fos:
            # setdefault registers the author even when the paper's fos list is empty
            author_fos.setdefault(author_key, set()).update(paper_fos)

    fos.update(paper_fos)

In [None]:
# Sizes of the distinct author, venue and fos sets.
print(
    f'author.num:{len(author)}, venue.num:{len(venue)}, fos.num:{len(fos)}'
)

In [None]:
# Preview a few entries only: the mapping holds one entry per author with fos
# information, and rendering all of it can bloat or stall the notebook.
dict(islice(author_fos.items(), 5))

## Construct paper, author, venue dict

In [None]:
# paper_dict: paper id -> title
# author_dict: author id (author num_id \t author name) -> fos text

paper_dict = {}
author_dict = {}

for p in tqdm(data):
    assert p['id'] not in paper_dict  # paper ids must be unique
    paper_dict[p['id']] = p['title']

# Dedicated, seeded generator so the shuffled fos text is the same on every run
# and the global `random` state is left alone.
fos_rng = random.Random(0)

for a in tqdm(author_fos):
    # A set of strings iterates in a hash-dependent order that can change between
    # interpreter runs; sorting gives the shuffle a stable starting point.
    tmp_fos = sorted(author_fos[a])
    fos_rng.shuffle(tmp_fos)
    # a is (author_name, author_id); the key puts the id first.
    author_dict[str(a[1])+'\t'+a[0]] = ' '.join(tmp_fos)

In [None]:
### reindex
#'''
#paper_id2idx: paper id -> paper idx
#author_id2idx: author id -> author idx
#venue_dict: venue idx -> venue text
#'''

#paper_id2idx = {}
#author_id2idx = {}
#venue_dict = {}

#for p in tqdm(paper_dict):
#    assert p not in paper_id2idx
#    paper_id2idx[p] = len(paper_id2idx)

#for a in tqdm(author_dict):
#    assert a not in author_id2idx
#    author_id2idx[a] = len(author_id2idx)
    
#for v in tqdm(venue):
#    if v == '': continue
#    venue_dict[len(venue_dict)] = v

In [None]:
# Show a handful of entries instead of rendering the whole paper id -> title mapping.
dict(islice(paper_dict.items(), 5))

In [None]:
# Show a handful of entries instead of rendering the whole author -> fos text mapping.
dict(islice(author_dict.items(), 5))

## Center Node

In [None]:
## paper center
# paper_neighbour: paper id -> dict with optional keys
#   'paper'  : titles of the referenced papers (list)
#   'author' : author keys "<id>\t<name>" that exist in author_dict (list)
#   'venue'  : raw venue name (str), only when it is non-empty
paper_neighbour = {}
author_miss = 0  # authors skipped because they have no author_dict entry

for paper in tqdm(data):
    paper_id = paper['id']
    assert paper_id not in paper_neighbour
    neighbours = {}

    # referenced papers, stored by title
    if 'references' in paper:
        neighbours['paper'] = [paper_dict[str(ref)] for ref in paper["references"]]

    # authors, restricted to those present in author_dict
    if 'authors' in paper:
        author_keys = [str(person['id'])+'\t'+person['name'] for person in paper['authors']]
        known_keys = [key for key in author_keys if key in author_dict]
        author_miss += len(author_keys) - len(known_keys)
        neighbours['author'] = known_keys

    # venue, skipped when the raw name is empty
    if 'venue' in paper and paper['venue']['raw'] != '':
        neighbours['venue'] = paper['venue']['raw']

    paper_neighbour[paper_id] = neighbours

print(f'author_miss:{author_miss}')

In [None]:
# Preview a few papers only; the mapping has one entry per paper.
dict(islice(paper_neighbour.items(), 5))

In [None]:
## author center
# author_neighbour: author id (author num_id \t author name) -> titles of that author's papers
author_neighbour = {}

for paper in tqdm(data):
    for person in paper.get('authors', []):
        author_key = str(person['id'])+'\t'+person['name']
        # setdefault creates the empty list the first time an author is seen
        author_neighbour.setdefault(author_key, []).append(paper['title'])

In [None]:
# Preview a few authors and at most three titles each; the full mapping has one
# entry per author and is too large to render comfortably.
{key: titles[:3] for key, titles in islice(author_neighbour.items(), 5)}

In [None]:
## venue center
# venue_neighbour: venue name -> titles of the papers published there
venue_neighbour = {}

for paper in tqdm(data):
    if 'venue' not in paper:
        continue
    venue_name = paper['venue']['raw']
    if venue_name == '':
        continue  # papers with an empty raw venue name are left out
    venue_neighbour.setdefault(venue_name, []).append(paper['title'])

In [None]:
# Preview a few venues and at most three titles each; a venue's full title list
# can be very long.
{name: titles[:3] for name, titles in islice(venue_neighbour.items(), 5)}

## Save neighbour files

In [None]:
# Persist every lookup table as its own pickle under DBLP_neighbour/.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('DBLP_neighbour', exist_ok=True)

for name, obj in [
    ('paper_dict', paper_dict),
    ('author_dict', author_dict),
    ('paper_neighbour', paper_neighbour),
    ('author_neighbour', author_neighbour),
    ('venue_neighbour', venue_neighbour),
]:
    # The context manager flushes and closes each file before the next one is opened.
    with open(f'DBLP_neighbour/{name}.pkl', 'wb') as f:
        pickle.dump(obj, f)