In [1]:
import os
import pickle
import random
import json

from collections import defaultdict, Counter

### Pay attention to the reproducibility !!

In [2]:
# read processed graph
with open('./graph.json', 'r') as fp:
    graph = json.load(fp)
print(graph.keys())

dict_keys(['paper_nodes', 'author_nodes', 'venue_nodes'])


In [3]:
k = 10
all_generated_data = {} # key: triple (question (str), answer (str)), value: generated data (List)

In [4]:
list(graph['paper_nodes'].keys())[:10]

['53e99784b7602d9701f3ffdd',
 '53e99785b7602d9701f41492',
 '53e99785b7602d9701f414a7',
 '53e99785b7602d9701f414a8',
 '53e99785b7602d9701f414cb',
 '53e99785b7602d9701f427b4',
 '53e99785b7602d9701f427df',
 '53e99785b7602d9701f4283f',
 '53e99785b7602d9701f43ee8',
 '53e99785b7602d9701f43f95']

In [5]:
graph['paper_nodes']['53e99784b7602d9701f3ffdd']

{'features': {'title': 'Flow.',
  'abstract': '',
  'keywords': [],
  'lang': 'en',
  'year': 2006},
 'neighbors': {'author': ['53f43776dabfaee0d9b6e75b'],
  'venue': ['pub_0'],
  'reference': [],
  'cited_by': []}}

In [6]:
list(graph['author_nodes'].keys())[:10]

['53f43776dabfaee0d9b6e75b',
 '53f42e86dabfaee43ebd375a',
 '53f4671bdabfaeee22a56233',
 '53f4d423dabfaeedd9781e23',
 '54055a2fdabfae91d3fd878b',
 '53f433e6dabfaeee229920b6',
 '541011e7dabfae450f4ccacd',
 '53f385b2dabfae4b34a115df',
 '53f485b0dabfaec09f2a93c5',
 '53f460a7dabfaee4dc83702a']

In [7]:
graph['author_nodes']['53f43776dabfaee0d9b6e75b']

{'features': {'name': 'Masa Inakage', 'organization': ''},
 'neighbors': {'paper': ['573696d56e3b12023e5d5238',
   '53e99ca1b7602d9702553336',
   '53e9978ab7602d9701f47e0d',
   '53e9b96eb7602d970455d077',
   '573696d46e3b12023e5d42c4',
   '53e9ad34b7602d9703718839',
   '53e9a974b7602d97032cd76b',
   '5c04961717c44a2c74705b26',
   '53e99792b7602d9701f580c5',
   '5b8c9f1617c44af36f8b34bd',
   '53e99792b7602d9701f544e1',
   '53e9a937b7602d9703289c3e',
   '53e9b289b7602d9703d31bbe',
   '573696d46e3b12023e5d3f3e',
   '53e9a645b7602d9702f75374',
   '53e9ba2db7602d9704638c63',
   '573696d46e3b12023e5d376d',
   '53e9b4e0b7602d97040063c2',
   '599c7f3c601a182cd28fe775',
   '53e99dfdb7602d97026bddf0',
   '53e99c60b7602d970251137d',
   '53e9acbcb7602d970369bb05',
   '53e99ae7b7602d97023726a6',
   '53e9a1bdb7602d9702ab5136',
   '53e9a432b7602d9702d4b821',
   '53e99d58b7602d9702613db8',
   '53e9b457b7602d9703f540ed',
   '53e9978ab7602d9701f458e1',
   '53e99ac4b7602d9702345eb1',
   '53e9b815b7602d97

### Design questions (one type of question in one cell)

1-hop question (EASY):
1. Who are the authors of paper xxx?
2. What organization is researcher xxx from?
3. Where is paper xxx published?


In [5]:
## question (easy): who are the authors of paper xxx?

random.seed(2023)

question = 'Who are the authors of paper "{paper_title}?" '
answer = "{authors}"
generated_data = []

paper_ids = list(graph['paper_nodes'].keys())
random.shuffle(paper_ids)

for paper_id in paper_ids:
    paper_title = graph['paper_nodes'][paper_id]['features']['title']
    author_ids = graph['paper_nodes'][paper_id]['neighbors']['author']
    author_names = [graph['author_nodes'][author_id]['features']['name'] for author_id in author_ids]
    generated_data.append({"paper_title":paper_title, "authors": ', '.join(author_names)})

    if(len(generated_data) == k):
        break

all_generated_data[(question, answer)] = generated_data

In [8]:
random.seed(2024)

question = "What organization is researcher {author_name} affiliated with?"
answer = "{org_name}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(paper_ids)

#TO-DO: Does an author have multiple affiliation?

name_cnt = Counter([graph['author_nodes'][author_id]['features']['name'] for author_id in author_ids])

for author_id in author_ids:
    author_name = graph['author_nodes'][author_id]['features']['name']
    org_name = graph['author_nodes'][author_id]['features']['organization']
    if len(org_name) <= 0 or name_cnt[author_name] > 1:
        continue
        
    generated_data.append({"author_name":author_name, "org_name": org_name})
    print(author_id)
    
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

53f433e6dabfaeee229920b6
53f460a7dabfaee4dc83702a
53f47148dabfaec09f269c9e
53f45c8ddabfaeb22f517906
53f4612adabfaeee22a40808
53f474d7dabfaee0d9c6706d
53f4757ddabfaee4dc88a39f
53f42ba8dabfaedce54aa6c8
53f39e34dabfae4b34aa885d
53f43057dabfaee4dc73e9c8


In [10]:
random.seed(2025)

question = 'Where is the paper "{paper_title}" published?'
answer = "{venue}"
generated_data = []

paper_ids = list(graph['paper_nodes'].keys())
random.shuffle(paper_ids)

for paper_id in paper_ids:
    paper_title = graph['paper_nodes'][paper_id]['features']['title']

    assert len(graph['paper_nodes'][paper_id]['neighbors']['venue']) == 1
    venue_id = graph['paper_nodes'][paper_id]['neighbors']['venue'][0]
    venue_name = graph['venue_nodes'][venue_id]['features']['name']
    generated_data.append({"paper_title":paper_title, "venue": venue_name})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

Multi-hop Reasoning Question (Medium)


1. Who collaborates with author xxx to write paper xxx?
2. What is the intersection author of paper xxx and paper xxx?
3. Who is the closest collaborator with author xxx?
4. How many collaborators does author xxx have in xxx?
5. How many papers did xxx and xxx write together?

In [11]:
## question (medium): Who collaborates with author xxx to write paper xxx?

random.seed(2026)

question = 'Who collaborate with author {author_name} in {org_name} to write paper "{paper_title}"?'
answer = "{collaborators}"
generated_data = []

paper_ids = list(graph['paper_nodes'].keys())
random.shuffle(paper_ids)

for paper_id in paper_ids:
    paper_title = graph['paper_nodes'][paper_id]['features']['title']
    author_ids = graph['paper_nodes'][paper_id]['neighbors']['author']
    random.shuffle(author_ids)

    org_name = graph['author_nodes'][author_ids[0]]['features']['organization']
    if org_name == '':
        continue

    author_names = [graph['author_nodes'][author_id]['features']['name'] for author_id in author_ids]
    if len(author_names) <= 1:
        continue

    generated_data.append({"author_name": author_names[0],
                           "org_name": org_name,
                       "paper_title": paper_title,
                       "collaborators": ', '.join(author_names[1:])})
    
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [12]:
## question (medium): who writed both paper xxx and paper xxx?

random.seed(2027)

question = 'Who writed both the paper "{paper1_title}" and paper "{paper2_title}"?'
answer = "{authors}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    paper_ids = list(graph['author_nodes'][author_id]['neighbors']['paper'])
    random.shuffle(paper_ids)
    if len(paper_ids) < 2:
        continue

    author_list1 = graph['paper_nodes'][paper_ids[0]]['neighbors']['author']
    author_list2 = graph['paper_nodes'][paper_ids[1]]['neighbors']['author']

    if len(set(author_list1) & set(author_list2)) > 1:
        continue

    generated_data.append({"paper1_title": graph['paper_nodes'][paper_ids[0]]['features']['title'],
                            "paper2_title": graph['paper_nodes'][paper_ids[1]]['features']['title'],
                            "authors": graph['author_nodes'][author_id]['features']['name']})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [13]:
# Question (medium):Who is the closest collaborator with author xxx?

'''
Closeness is defined in terms of the number of collaboration together. 
The most number of collaboration a pair has, the most closest they are
'''

random.seed(2028)

question = "Who is the closest collaborator with author {author_name} in {org_name}? Closeness is defined in terms of the number of collaboration together."
answer = "{collaborator_name}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    paper_ids = graph['author_nodes'][author_id]['neighbors']['paper']
    collaborators_by_count = {} #key: collaborator_name, value: paper_counts

    for paper_id in paper_ids:
        collaborator_ids = graph['paper_nodes'][paper_id]['neighbors']['author']
        collaborator_names = [graph['author_nodes'][cid]['features']['name'] for cid in collaborator_ids if cid != author_id]
        
        for collab in collaborator_names:
            if collab not in collaborators_by_count:
                collaborators_by_count[collab] = 0
            collaborators_by_count[collab] += 1

    if len(collaborators_by_count) == 0:
        continue

    sorted_collaborators = sorted(collaborators_by_count.items(), key = lambda item: item[1], reverse = True)
    
    if len(sorted_collaborators) > 1 and sorted_collaborators[0][1] == sorted_collaborators[1][1]:
        continue
    
    author_name = graph['author_nodes'][author_id]['features']['name']
    org_name = graph['author_nodes'][author_id]['features']['organization']
    
    if org_name == '':
        continue
    
    generated_data.append({"author_name": author_name,
                           "org_name": org_name,
                        "collaborator_name": sorted_collaborators[0][0],
                          })

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [14]:
# Question (medium): How many collaborators does author xxx have in xxx?

random.seed(2029)

question = "How many collaborators does author {author_name} in {org_name} have in {year}"
answer = "{number}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    paper_ids = graph['author_nodes'][author_id]['neighbors']['paper']
    collaborators_by_year = defaultdict(set) #key: year, value: author_names

    for paper_id in paper_ids:
        year = graph['paper_nodes'][paper_id]['features']['year']
        collaborator_ids = graph['paper_nodes'][paper_id]['neighbors']['author']
        collaborator_names = [graph['author_nodes'][cid]['features']['name'] for cid in collaborator_ids]
        collaborators_by_year[year].update(collaborator_names)

    author_name = graph['author_nodes'][author_id]['features']['name']
    
    #randomly shuffling and selecting year at 0-index as intened year for this question
    years = [y for y in collaborators_by_year]
    random.shuffle(years)
    
    org_name = graph['author_nodes'][author_id]['features']['organization']
    if org_name == '':
        continue
    
    generated_data.append({"author_name": author_name,
                        "year": years[0],
                        "org_name": org_name,
                        "number": len(collaborators_by_year[years[0]])-1})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

#Related Q: Who did author xxx collaborate with in xxx?

In [15]:
# Question: How many papers did xxx and xxx write together?

random.seed(2030)

question = "How many papers did {author_name1} in {org_name1} and {author_name2} in {org_name2} write together?"
answer = "{number}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id1 in author_ids:
    curr_author_ids = list(graph['author_nodes'].keys())
    random.shuffle(curr_author_ids)
    for author_id2 in curr_author_ids:

        if author_id1 == author_id2: 
            continue
        
        paper_ids1 = graph['author_nodes'][author_id1]['neighbors']['paper']
        paper_ids2 = graph['author_nodes'][author_id2]['neighbors']['paper']

        if len(set(paper_ids1) & set(paper_ids2)) < 2:
            continue

        author_name1 = graph['author_nodes'][author_id1]['features']['name']
        author_name2 = graph['author_nodes'][author_id2]['features']['name']

        org_name1 = graph['author_nodes'][author_id1]['features']['organization']
        if org_name1 == '':
            continue
        org_name2 = graph['author_nodes'][author_id2]['features']['organization']
        if org_name2 == '':
            continue

        generated_data.append({"author_name1": author_name1,
                            "author_name2": author_name2,
                            "org_name1": org_name1,
                            "org_name2": org_name2,
                            "number": len(set(paper_ids1) & set(paper_ids2))})
        break

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

# Related Q: How many papers did xxx and xxx write together in xxx?

Degree-based reasoning (easy)
1. How many papers cite paper xxx?
2. How many papers do paper xxx cite?
3. Which is the most cited paper by author xxx?
4. How many papers did author xxx write?

In [16]:
## question (medium): how many paper cite paper xxx?

random.seed(2031)

question = 'How many papers cite the paper "{paper_title}"?'
answer = "{num}"
generated_data = []

paper_ids = list(graph['paper_nodes'].keys())
random.shuffle(paper_ids)

for paper_id in paper_ids:
    paper_title = graph['paper_nodes'][paper_id]['features']['title']
    cited_by_id = graph['paper_nodes'][paper_id]['neighbors']['cited_by']
    if len(cited_by_id)  == 0:
        continue
        
    generated_data.append({"paper_title": paper_title, "num": len(cited_by_id)})
    
    if len(generated_data) == k:
        break
        
all_generated_data[(question, answer)] = generated_data

In [17]:
#Question: How many papers do paper xxx cite?

random.seed(2032)

question = 'How many papers does paper "{paper_title}" cite?'
answer = "{num}"
generated_data = []

paper_ids = list(graph['paper_nodes'].keys())
random.shuffle(paper_ids)

for paper_id in paper_ids:
    paper_title = graph['paper_nodes'][paper_id]['features']['title']
    referred_by_id = graph['paper_nodes'][paper_id]['neighbors']['reference']
    if len(referred_by_id) == 0:
        continue
    generated_data.append({"paper_title": paper_title, "num": len(referred_by_id)})
    
    if len(generated_data) == k:
        break
        
all_generated_data[(question, answer)] = generated_data

In [18]:
#Question: Which is the most cited paper by author xxx?
random.seed(2033)

question = "Which is the most cited paper by author {author_name} in {org_name}?"
answer = "{paper_title}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    paper_ids = graph['author_nodes'][author_id]['neighbors']['paper']
    max_count = -1
    max_paper_id = None
    random.shuffle(paper_ids)
    for paper_id in paper_ids:
        
        cited_by_id = graph['paper_nodes'][paper_id]['neighbors']['cited_by']

        if len(cited_by_id) > max_count:
            max_count = len(cited_by_id)
            max_paper_id = paper_id
    
    paper_title = graph['paper_nodes'][max_paper_id]['features']['title']
    author_name = graph['author_nodes'][author_id]['features']['name']

    org_name = graph['author_nodes'][author_id]['features']['organization']
    if org_name == '':
        continue

    generated_data.append({"author_name": author_name,
                           "org_name": org_name,
                        "paper_title": paper_title})

    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [19]:
# Question: 4. How many papers did author xxx write?
random.seed(2034)

question = "How many papers did author {author_name} in {org_name} write?"
answer = "{num}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    paper_ids = graph['author_nodes'][author_id]['neighbors']['paper']
    author_name = graph['author_nodes'][author_id]['features']['name']
    org_name = graph['author_nodes'][author_id]['features']['organization']
    if org_name == '':
        continue
    
    generated_data.append({"author_name": author_name,
                           "org_name": org_name,
                        "num": len(paper_ids)})

    if len(generated_data) == k:
        break
    
all_generated_data[(question, answer)] = generated_data

#Related Question How many papers does author xxx in xxx venue?
#Related Question How many papers does author xxx in xxx year?

Complex structure reasoning (medium)
1. Which venue did author xxx and author xxx collaborate most?
2. How many people does author xxx need to know at least to know author xxx?
3. What is the research interests of author xxx?

In [None]:
# Question: 5. Which venue did author xxx and author xxx collaborate most?

random.seed(2035)

question = "Which venue did {author_name1} in {org_name1} and {author_name2} in {org_name2} collaborate most?"
answer = "{venue}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id1 in author_ids:
    curr_author_ids = list(graph['author_nodes'].keys())
    random.shuffle(curr_author_ids)
    for author_id2 in curr_author_ids:

        if author_id1 == author_id2: 
            continue
        paper_ids1 = graph['author_nodes'][author_id1]['neighbors']['paper']
        paper_ids2 = graph['author_nodes'][author_id2]['neighbors']['paper']

        if len(set(paper_ids1) & set(paper_ids2)) < 1:
            continue

        count_per_venue = {}
        max_count = -1
        max_venue = None
        common_paper_ids = list(set(paper_ids1) & set(paper_ids2))
        
        for paper_id in common_paper_ids:
            venue = graph['paper_nodes'][paper_id]['neighbors']['venue'][0]
            if venue not in count_per_venue:
                count_per_venue[venue] = 0
            
            count_per_venue[venue] += 1
            if max_count < count_per_venue[venue]:
                max_count = count_per_venue[venue]
                max_venue = venue

        author_name1 = graph['author_nodes'][author_id1]['features']['name']
        author_name2 = graph['author_nodes'][author_id2]['features']['name']

        org_name1 = graph['author_nodes'][author_id1]['features']['organization']
        if org_name1 == '':
            continue
        org_name2 = graph['author_nodes'][author_id2]['features']['organization']
        if org_name2 == '':
            continue

        generated_data.append({"author_name1": author_name1,
                            "author_name2": author_name2,
                            "org_name1": org_name1,
                            "org_name2": org_name2,
                            "venue": graph['venue_nodes'][max_venue]['features']['name']})
        break
        
    if len(generated_data) == k:
            break

all_generated_data[(question, answer)] = generated_data

#Related Question: Which year did author xxx and author xxx collaborate most in?

In [21]:
# Question: 1.  How many people does author xxx need to know at least to know author xxx?

random.seed(2036)

question = "How many people does author {author_name1} in {org_name1} need to know at least to know author {author_name2} in {org_name2}?"
answer = "{number}"
generated_data = []
max_hop_length = 5 # setting the maximum hop distance between two asked authors in the graph

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

def get_k_hop_neighbor(cur_author, hop, dist):
    
    queue = [cur_author]
    dist[cur_author] = 0
    
    while(len(queue)):
        cia = queue.pop(0)
        cur_papers = graph['author_nodes'][cia]['neighbors']['paper']
        cur_nids = []
        for pid in cur_papers:
            nids = graph['paper_nodes'][pid]['neighbors']['author']
            cur_nids.extend(nids)
        
        for cin in cur_nids:
            if cin in dist:
                continue
            dist[cin] = dist[cia] + 1
            if dist[cin] == hop:
                return cin
            queue.append(cin)
            
    return -1

for author_id in author_ids:
    cur_hop = random.randint(1, max_hop_length)
    neighbor = get_k_hop_neighbor(author_id, cur_hop, dict())
    if (neighbor == -1) or (neighbor is None):
        continue
    
    author_name1 = graph['author_nodes'][author_id]['features']['name']
    author_name2 = graph['author_nodes'][neighbor]['features']['name']

    org_name1 = graph['author_nodes'][author_id]['features']['organization']
    if org_name1 == '':
        continue
    org_name2 = graph['author_nodes'][neighbor]['features']['organization']
    if org_name2 == '':
        continue

    generated_data.append({"author_name1": author_name1,
                        "author_name2": author_name2,
                        "org_name1": org_name1,
                        "org_name2": org_name2,
                        "number": cur_hop})
                               
    if len(generated_data) == k:
            break

all_generated_data[(question, answer)] = generated_data

#Related Question: Which year did author xxx and author xxx collaborate most in?

In [22]:
# Question: 2. What is the research interests of author xxx?

random.seed(2037)

topk = 3 # how many top keywords will be considered as research interests. 

question = "What is the research interests (top 3 keywords) of author {author_name} in {org_name}?"
answer = "{keywords}"
generated_data = []

author_ids = list(graph['author_nodes'].keys())
random.shuffle(author_ids)

for author_id in author_ids:
    paper_ids = graph['author_nodes'][author_id]['neighbors']['paper']
    keywords_by_count = {} # key: keyword, value: paper_counts

    for paper_id in paper_ids:
        keywords = graph['paper_nodes'][paper_id]['features']['keywords']
        
        for key in keywords:
            if key not in keywords_by_count:
                keywords_by_count[key] = 0
            keywords_by_count[key] += 1

    sorted_keywords = sorted(keywords_by_count.items(), key = lambda item: item[1], reverse = True)
    
    author_name = graph['author_nodes'][author_id]['features']['name']
    
    top_keywords = [key[0] for key in sorted_keywords[:topk]]
    org_name = graph['author_nodes'][author_id]['features']['organization']
    if org_name == '':
        continue
        
    generated_data.append({"author_name": author_name,
                           "org_name": org_name,
                        "keywords": ', '.join(top_keywords),
                          })
    
    if len(generated_data) == k:
        break
    
all_generated_data[(question, answer)] = generated_data

#Related Question: Which year did author xxx and author xxx collaborate most in?

In [23]:
pickle.dump(all_generated_data, open(os.path.join(f'preprocess_samples.pkl'), 'wb'))

print('Saving file of #questions, ', len(all_generated_data))

Saving file of #questions,  15
