In [None]:
import os
import json
from tqdm import tqdm

import pickle

In [None]:
## nodes: papers, authors, venues
## paper features: title, abstract, year, label
## author features: name
## venue features: name

In [None]:
# Research domain to process. Options noted by the author:
# Medicine, Chemistry, Biology, Physics, Materials_Science
domain = "Physics"

In [None]:
# Directory the raw dump files are read from, and directory the graph is written to.
raw_data_dir = "Paths/raw_data"
save_dir = "Paths/save"

In [None]:
## read raw data files

def read_json_lines(file, id_key):
    """Load a JSON-Lines file into a dict keyed by one field of each record.

    Args:
        file: Path of the file to read; one JSON value per line.
        id_key: Field of each record whose value becomes the dictionary key.

    Returns:
        dict mapping ``record[id_key]`` to the full decoded record, in file
        order. If an id occurs more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a decoded record has no ``id_key`` entry.
    """
    data = {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, encoding="utf-8") as f:
        readin = f.readlines()
    # The whole file is read first so tqdm can show a total line count.
    for line in tqdm(readin):
        # Blank or whitespace-only lines are not records; json.loads would
        # raise on them, so skip them instead of aborting the whole load.
        if not line.strip():
            continue
        tmp = json.loads(line)
        data[tmp[id_key]] = tmp
    return data

def read_txt_lines(file, target=1):
    """Load one column of a tab-separated text file, keyed by the first column.

    Args:
        file: Path of the tab-separated file to read.
        target: Zero-based index of the column to use as the value.

    Returns:
        dict mapping the first column of each line to its ``target`` column.
        Keys and values have surrounding whitespace removed. If a key occurs
        more than once, the last occurrence wins.

    Raises:
        IndexError: If a non-blank line has fewer than ``target + 1`` columns.
    """
    data = {}
    # Explicit encoding so non-ASCII text is read the same on every platform.
    with open(file, encoding="utf-8") as f:
        readin = f.readlines()
    # The whole file is read first so tqdm can show a total line count.
    for line in tqdm(readin):
        # Nothing to parse on blank or whitespace-only lines.
        if not line.strip():
            continue
        # Remove only the line terminator before splitting. Stripping all
        # whitespace would also drop leading/trailing tabs, which shifts the
        # columns when the first field is empty and loses the last column
        # when it is empty.
        tmp = line.rstrip('\r\n').split('\t')
        data[tmp[0].strip()] = tmp[target].strip()
    return data

# Load the four raw tables. `book_raw_data` holds the paper records keyed by
# their 'paper' field; the other three map an id (first column) to the text
# found in the selected column of each file.
book_raw_data = read_json_lines(
    file=os.path.join(raw_data_dir, "papers.json"),
    id_key="paper",
)
author_raw_data = read_txt_lines(
    file=os.path.join(raw_data_dir, "authors.txt"),
    target=2,
)
venue_raw_data = read_txt_lines(
    file=os.path.join(raw_data_dir, "venues.txt"),
    target=1,
)
category_raw_data = read_txt_lines(
    file=os.path.join(raw_data_dir, "labels.txt"),
    target=1,
)

In [None]:
## construct node dictionaries
## Every node has the shape {'features': {...}, 'neighbors': {...}}.
##
## paper features:  title, abstract, year, label (label ids mapped through category_raw_data)
## paper neighbors: author, venue (list of zero or one id), reference, cited_by
##
## author features:  name
## author neighbors: paper
##
## venue features:  name
## venue neighbors: paper

paper_nodes = {}
author_nodes = {}
venue_nodes = {}

venue_name2id = {}

for paper_id, paper in tqdm(book_raw_data.items()):
    venue_id = paper['venue']
    has_venue = venue_id != ''

    # venue nodes: created the first time a venue is seen, extended afterwards
    if has_venue:
        if venue_id in venue_nodes:
            venue_nodes[venue_id]['neighbors']['paper'].append(paper["paper"])
        else:
            venue_nodes[venue_id] = {
                'features': {'name': venue_raw_data[venue_id]},
                'neighbors': {'paper': [paper["paper"]]},
            }

    # paper nodes: 'cited_by' starts empty and is filled by the second pass below
    paper_nodes[paper_id] = {
        'features': {
            'title': paper['title'],
            'abstract': paper['abstract'],
            'year': paper['year'],
            'label': [category_raw_data[lb] for lb in paper['label']],
        },
        'neighbors': {
            'author': paper['author'],
            'venue': [venue_id] if has_venue else [],
            'reference': paper['reference'],
            'cited_by': [],
        },
    }

    # author nodes: same create-or-extend pattern as venues
    for aid in paper['author']:
        if aid in author_nodes:
            author_nodes[aid]['neighbors']['paper'].append(paper["paper"])
        else:
            author_nodes[aid] = {
                'features': {'name': author_raw_data[aid]},
                'neighbors': {'paper': [paper["paper"]]},
            }

## add 'cited_by' for paper nodes
## (references pointing at papers that are not in paper_nodes are ignored here,
##  although they remain listed under 'reference')
for paper_id, node in tqdm(paper_nodes.items()):
    for ref_pid in node['neighbors']['reference']:
        if ref_pid in paper_nodes:
            paper_nodes[ref_pid]['neighbors']['cited_by'].append(paper_id)


In [None]:
## save graph
## The three node dictionaries are written together as one JSON document,
## <save_dir>/graph.json, with top-level keys 'paper_nodes', 'author_nodes'
## and 'venue_nodes'.

# Create the output directory if needed so open() does not fail on a fresh checkout.
os.makedirs(save_dir, exist_ok=True)

# The context manager closes (and therefore flushes) the file even if
# serialization raises part-way through.
with open(os.path.join(save_dir, 'graph.json'), "w") as f:
    json.dump({
        'paper_nodes': paper_nodes,
        'author_nodes': author_nodes,
        'venue_nodes': venue_nodes,
    }, f, indent=4)