In [None]:
import os
import json
from tqdm import tqdm

import pickle

In [None]:
## nodes: papers, authors, venues
## paper features: title, abstract, keywords, lang, year
## author features: name, org
## venue features: name

In [None]:
# Input/output locations for the DBLP graph build.
# The defaults are the original absolute paths; set DBLP_RAW_DATA_DIR and/or
# DBLP_SAVE_DIR in the environment to run the notebook on another machine
# without editing this cell.
raw_data_dir = os.environ.get(
    "DBLP_RAW_DATA_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/raw_data/dblp",
)
save_dir = os.environ.get(
    "DBLP_SAVE_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/processed_data/dblp",
)

In [None]:
## read raw data files

def read_json(file):
    """Parse a JSON file and return the decoded Python object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value
        (e.g. a list or a dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's locale default (which would corrupt or reject non-ASCII text).
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    return data

# Parse the whole DBLP v14 dump (dblp_v14.json under raw_data_dir) in one call.
# The result is later iterated as a collection of per-paper records.
paper_raw_data = read_json(os.path.join(raw_data_dir, 'dblp_v14.json'))

In [None]:
## construct node dictionaries (one pass over the raw paper records)
## every node has the shape {'features': {...}, 'neighbors': {...}}
##
## paper features: title, abstract, keywords, lang, year
## paper neighbors: author, venue, reference, cited_by (left empty here)
##
## author features: name, organization
## author neighbors: paper
##
## venue features: name
## venue neighbors: paper

paper_nodes = {}
author_nodes = {}
venue_nodes = {}

# venue raw name -> generated venue id ('pub_<running index>')
venue_name2id = {}

for paper in tqdm(paper_raw_data):
    venue_name = paper['venue']['raw']
    pid = paper["id"]

    # venue node: papers with an empty venue name get no venue neighbor
    if venue_name == '':
        venue_ids = []
    else:
        if venue_name in venue_name2id:
            vid = venue_name2id[venue_name]
            venue_nodes[vid]['neighbors']['paper'].append(pid)
        else:
            # first time this venue is seen: allocate the next id
            vid = f'pub_{len(venue_nodes)}'
            venue_name2id[venue_name] = vid
            venue_nodes[vid] = {
                'features': {'name': venue_name},
                'neighbors': {'paper': [pid]},
            }
        venue_ids = [vid]

    # paper node
    paper_nodes[pid] = {
        'features': {
            'title': paper['title'],
            'abstract': paper['abstract'],
            'keywords': paper['keywords'],
            'lang': paper['lang'],
            'year': paper['year'],
        },
        'neighbors': {
            'author': [author['id'] for author in paper['authors']],
            'venue': venue_ids,
            # 'references' is optional in the raw records
            'reference': paper['references'] if 'references' in paper else [],
            'cited_by': [],
        },
    }

    # author nodes: features come from the first record mentioning the author
    for author in paper['authors']:
        aid = author["id"]
        known_author = author_nodes.get(aid)
        if known_author is None:
            author_nodes[aid] = {
                'features': {
                    'name': author['name'],
                    'organization': author['org'],
                },
                'neighbors': {'paper': [pid]},
            }
        else:
            known_author['neighbors']['paper'].append(pid)

## add 'cited_by' for paper nodes
## (reverse of 'reference'; references to papers not present in paper_nodes are skipped)
for citing_id in tqdm(paper_nodes):
    for cited_id in paper_nodes[citing_id]['neighbors']['reference']:
        cited_node = paper_nodes.get(cited_id)
        if cited_node is not None:
            cited_node['neighbors']['cited_by'].append(citing_id)

In [None]:
## save graph
# Serialise the three node dictionaries to <save_dir>/graph.json.

# The output directory may not exist on a fresh checkout/machine.
os.makedirs(save_dir, exist_ok=True)

# Use a context manager so the file is flushed and closed deterministically,
# even if json.dump raises part-way through.
with open(os.path.join(save_dir, 'graph.json'), "w", encoding="utf-8") as graph_file:
    json.dump({
        'paper_nodes': paper_nodes,
        'author_nodes': author_nodes,
        'venue_nodes': venue_nodes,
    }, graph_file, indent=4)