In [1]:
import json
with open('concepts/concepts.json', 'r', encoding='utf-8') as fp:
    final_result = json.load(fp)

In [2]:
import networkx as nx

In [3]:
ancestors_set = dict()
node_set = set()
G = nx.DiGraph()

for item in final_result:
    # print(item.keys())
    # step 1 添加节点
    ancestors = item['ancestors']
    myself = (item['display_name'], item['level'])
    ancestors = [(item['display_name'], item['level']) for item in ancestors]
    node_set.add(myself)
    for anc in ancestors:
        node_set.add(anc)
        # step 2 添加边
        # print(anc[1] - myself[1])
        if anc[1] - myself[1] == -1: # 只保留直系关系，ancestors字段会记录所有层级的祖先
            G.add_edge(anc[0], myself[0])

# 添加节点属性
node_levels = {item[0]: item[1]  for item in node_set}
# {'A': 'red', 'B': 'blue', 'C': 'green', 'D': 'yellow'}
nx.set_node_attributes(G, node_levels, 'level')

# 导出图
nx.write_gml(G, "concepts/concepts_tree.gml")

In [6]:
for n in G.nodes.data():
    if n[1]['level'] == 0:
        print(n)
    

('Business', {'level': 0})
('Economics', {'level': 0})
('Biology', {'level': 0})
('Environmental science', {'level': 0})
('Medicine', {'level': 0})
('Chemistry', {'level': 0})
('Physics', {'level': 0})
('Computer science', {'level': 0})
('Materials science', {'level': 0})
('Political science', {'level': 0})
('Psychology', {'level': 0})
('Mathematics', {'level': 0})
('Geology', {'level': 0})
('Philosophy', {'level': 0})
('Engineering', {'level': 0})
('Sociology', {'level': 0})
('Geography', {'level': 0})
('History', {'level': 0})
('Art', {'level': 0})


In [9]:
# 获取节点 Computer Science 能够到达的所有节点
reachable_nodes = nx.single_source_shortest_path(G, 'Computer science')
# print("节点 'Medicine' 能够到达的所有节点：", reachable_nodes) # node -> shortest path
nodes_of_medicine = [(item, G.nodes[item]['level']) for item in reachable_nodes.keys()]
print(len(nodes_of_medicine))
nodes_of_medicine = sorted(nodes_of_medicine, key=lambda x:x[1])
level_count = dict()
for item in nodes_of_medicine:
    level_count[item[1]] = level_count.get(item[1], 0) + 1
print(level_count)
# nodes_of_medicine[-10:]

# 打印level 1 的节点
for item in nodes_of_medicine:
    if item[1] == 1:
        # print(item)
        pass

selected_level_1 = ['Artificial intelligence', 'Machine learning', 'Computer vision', 'Data mining', 'Natural language processing', 'Speech recognition', 'Computer graphics (images)']
# not_sure = ['Data science', 'Information retrieval', '']

9266
{0: 1, 1: 33, 2: 3164, 3: 3253, 4: 1480, 5: 1335}


In [10]:
# AI生成两份，
reachable_nodes = nx.single_source_shortest_path(G, 'Computer science')
# print("节点 'Medicine' 能够到达的所有节点：", reachable_nodes) # node -> shortest path
nodes_of_AI = [(item, G.nodes[item]['level']) for item in reachable_nodes.keys()]
print(len(nodes_of_AI))
nodes_of_AI = sorted(nodes_of_AI, key=lambda x:x[1])
level_count = dict()
for item in nodes_of_AI:
    level_count[item[1]] = level_count.get(item[1], 0) + 1
print(level_count)
# nodes_of_medicine[-10:]

9266
{0: 1, 1: 33, 2: 3164, 3: 3253, 4: 1480, 5: 1335}


In [12]:
with open('D:\SciMig\concepts/concepts_AI_full.json', 'w') as fp:
    json.dump(nodes_of_AI, fp)

In [13]:
nodes_of_AI = []
for level1 in selected_level_1:
    reachable_nodes = nx.single_source_shortest_path(G, level1)
    # print("节点 'Medicine' 能够到达的所有节点：", reachable_nodes) # node -> shortest path
    nodes_of_AI += [(item, G.nodes[item]['level']) for item in reachable_nodes.keys()]
    print(len(nodes_of_AI))
nodes_of_AI = sorted(nodes_of_AI, key=lambda x:x[1])
level_count = dict()
for item in nodes_of_AI:
    level_count[item[1]] = level_count.get(item[1], 0) + 1
print(level_count)
# nodes_of_medicine[-10:]

2502
3252
4325
4556
4777
4907
5010
{1: 7, 2: 1573, 3: 1963, 4: 842, 5: 625}


In [14]:
with open('D:\SciMig\concepts/concepts_AI_refined.json', 'w') as fp:
    json.dump(nodes_of_AI, fp)

In [None]:
# 获取节点 Medicine 能够到达的所有节点
reachable_nodes = nx.single_source_shortest_path(G, 'Medicine')
# print("节点 'Medicine' 能够到达的所有节点：", reachable_nodes) # node -> shortest path
nodes_of_medicine = [(item, G.nodes[item]['level']) for item in reachable_nodes.keys()]
print(len(nodes_of_medicine))
nodes_of_medicine = sorted(nodes_of_medicine, key=lambda x:x[1])
level_count = dict()
for item in nodes_of_medicine:
    level_count[item[1]] = level_count.get(item[1], 0) + 1
print(level_count)
# nodes_of_medicine[-10:]

In [None]:
with open('D:\SciMig\concepts/concepts_Medicine_l.json', 'w') as fp:
    json.dump(nodes_of_medicine, fp)

In [None]:
selected_medicine_concepts = []
for item in nodes_of_medicine:
    if item[1] == 1:
        selected_medicine_concepts.append(item)
    # level_count[item[1]] = level_count.get(item[1], 0) + 1
print(*selected_medicine_concepts, sep='\n')

In [None]:
# 获取节点 Medicine 能够到达的所有节点
reachable_nodes = nx.single_source_shortest_path(G, 'Physics')
# print("节点 'Medicine' 能够到达的所有节点：", reachable_nodes) # node -> shortest path
nodes_of_physic = [(item, G.nodes[item]['level']) for item in reachable_nodes.keys()]
nodes_of_physic = sorted(nodes_of_physic, key=lambda x:x[1])
print(len(nodes_of_physic))
level_count = dict()
for item in nodes_of_physic:
    level_count[item[1]] = level_count.get(item[1], 0) + 1
print(level_count)
nodes_of_physic[-10:]

In [None]:
selected_Physics_concepts = []
for item in nodes_of_physic:
    if item[1] == 4:
        selected_Physics_concepts.append(item)
    # level_count[item[1]] = level_count.get(item[1], 0) + 1
print(*selected_Physics_concepts, sep='\n')

In [None]:
with open('D:\SciMig\concepts/concepts_Physic_l.json', 'w') as fp:
    json.dump(nodes_of_physic, fp)

In [29]:
# 接下来获取，管理学的子图
reachable_nodes_management = nx.single_source_shortest_path(G, 'Economics')
reachable_nodes_business = nx.single_source_shortest_path(G, 'Business')
# reachable_nodes_Politiacl = nx.single_source_shortest_path(G, 'Political science')

In [30]:
subgraph_nodes = [i[0] for i in reachable_nodes_management.items()] + [i[0] for i in reachable_nodes_business.items()] # + [i[0] for i in reachable_nodes_Politiacl.items()]
print(len(subgraph_nodes))
management_related_subgraph = G.subgraph(subgraph_nodes)
# management_related_subgraph.to
nx.write_gml(management_related_subgraph, "concepts/concepts_tree_management.gml")

6590


In [31]:
sorted(subgraph_nodes, key=len)

['Debt',
 'Bond',
 'Cash',
 'Loan',
 'Lien',
 'ROWE',
 'XBRL',
 'Wage',
 'Lira',
 'BRIC',
 'Tort',
 'CDIO',
 'Clef',
 'Oboe',
 'Abia',
 'CVAR',
 'XPDL',
 'OPM3',
 'PDCA',
 'PEVQ',
 'Debt',
 'Bond',
 'Cash',
 'Loan',
 'Lien',
 'ROWE',
 'Dove',
 'XBRL',
 'Nike',
 'Reel',
 'Lira',
 'BRIC',
 'Tort',
 'CVAR',
 'PDCA',
 'XPDL',
 'PEVQ',
 'Lease',
 'LEAPS',
 'Logit',
 'Audit',
 'Piano',
 'Blues',
 'Flute',
 'Tying',
 'Bluff',
 'Purge',
 'CUSUM',
 'DPSIR',
 'COBIT',
 'Rupee',
 'Libor',
 'Crore',
 'Plant',
 'Viola',
 'Cello',
 'NAIRU',
 'HRHIS',
 'DMAIC',
 'Lease',
 'LEAPS',
 'Audit',
 'Bleed',
 'Foley',
 'COBIT',
 'Rupee',
 'Libor',
 'Chilo',
 'Media',
 'NAIRU',
 'Dacus',
 'DMAIC',
 'Alate',
 'Estate',
 'Issuer',
 'Collar',
 'Coupon',
 'Ledger',
 'Escrow',
 'Surety',
 'Excise',
 'Barter',
 'Probit',
 'Sprint',
 'Lyrics',
 'Violin',
 'Guitar',
 'Eponym',
 'Fyodor',
 'Supply',
 'Salary',
 'Utopia',
 'TOPSIS',
 'Greeks',
 'Tariff',
 'Layoff',
 'Kanban',
 'CarSim',
 'Cartel',
 'Kaizen',
 'Brexit',

In [32]:
nodes_of_management = [(item, G.nodes[item]['level']) for item in subgraph_nodes]
nodes_of_management = sorted(nodes_of_management, key=lambda x:x[1])

with open('D:\SciMig\concepts/concepts_Management_l.json', 'w') as fp:
    json.dump(subgraph_nodes, fp)

In [33]:

print(len(nodes_of_management))
level_count = dict()
for item in nodes_of_management:
    level_count[item[1]] = level_count.get(item[1], 0) + 1
print(level_count)

6590
{0: 2, 1: 53, 2: 2566, 3: 2572, 4: 861, 5: 536}


In [40]:
selected_management_concepts = []
for item in nodes_of_management:
    if item[1] == 1:
        selected_management_concepts.append(item)
    # level_count[item[1]] = level_count.get(item[1], 0) + 1
print(*selected_management_concepts, sep='\n')

('Finance', 1)
('Economic growth', 1)
('Macroeconomics', 1)
('Econometrics', 1)
('Management', 1)
('Microeconomics', 1)
('Accounting', 1)
('Market economy', 1)
('Operations management', 1)
('Industrial organization', 1)
('Environmental resource management', 1)
('Monetary economics', 1)
('Mathematical economics', 1)
('Management science', 1)
('Financial economics', 1)
('Political economy', 1)
('Demographic economics', 1)
('Natural resource economics', 1)
('Economy', 1)
('Environmental economics', 1)
('Public economics', 1)
('Development economics', 1)
('Law and economics', 1)
('Economic geography', 1)
('Labour economics', 1)
('International trade', 1)
('Positive economics', 1)
('Actuarial science', 1)
('Economic system', 1)
('Socioeconomics', 1)
('Agricultural economics', 1)
('International economics', 1)
('Financial system', 1)
('Neoclassical economics', 1)
('Economic history', 1)
('Economic policy', 1)
('Keynesian economics', 1)
('Commerce', 1)
('Welfare economics', 1)
('Classical eco

## 测试concept读取

In [42]:
##
import json
with open('D:\SciMig\concepts/concepts_Medicine_l.json', 'r') as fp:
    physic_concept = json.load(fp)

In [44]:
len(set(physic_concept_2))

18426

In [43]:
physic_concept_2 = [item[0].lower() for item in physic_concept]
sorted(physic_concept_2,key=len)

['l1',
 'sed',
 'yak',
 'nop',
 'd-1',
 'u87',
 'cd5',
 'rod',
 'nod',
 'cd8',
 'q10',
 'erg',
 'abl',
 'syk',
 'fyn',
 'lyn',
 'e2f',
 'apx',
 'cd3',
 'iif',
 'ns3',
 'hbx',
 'p3a',
 'p3b',
 'nls',
 'bus',
 'cd1',
 'lung',
 'copd',
 'meal',
 'cats',
 'vein',
 'rash',
 'dose',
 'gout',
 'pons',
 'toad',
 'mole',
 'pco2',
 'rana',
 'sss*',
 'peri',
 'tasa',
 'cyst',
 'lobe',
 'toll',
 'dept',
 'deet',
 'nose',
 'cord',
 'cuff',
 'ulna',
 'anus',
 'pacu',
 'limp',
 'stab',
 'drug',
 'mood',
 'mdma',
 'tics',
 'cbcl',
 'oxon',
 'kava',
 'khat',
 'coca',
 'slug',
 'pill',
 'axon',
 'heel',
 'club',
 'chin',
 'loin',
 'hoof',
 'spur',
 'rump',
 'fist',
 'hock',
 'tick',
 'diva',
 'gait',
 'hook',
 'acne',
 'prom',
 'twig',
 'herd',
 'ovis',
 'flea',
 'zebu',
 'tlr4',
 'cd14',
 'smad',
 'cd36',
 'erbb',
 'ed50',
 'cd47',
 'cd64',
 'muc1',
 'bmi1',
 'bap1',
 'ptx3',
 'slpi',
 'chop',
 'nefa',
 'scad',
 'ards',
 'cd20',
 'cd31',
 'cd68',
 'hccs',
 'neun',
 'cd30',
 'odds',
 'egta',
 'romk',
 '

In [37]:
import json
with open('D:\SciMig\concepts/concepts_Management_l.json', 'r') as fp:
    mana_concept = json.load(fp)

In [41]:
len(set(mana_concept))

4810

test merged_author_id

In [16]:
# 简单统计下5开头的和非5开头的author数量
from utils import scan_list, extract_certain_suffix
import pandas as pd
fl = extract_certain_suffix(scan_list('merged_authors'), 'txt')

start_with_5_count = 0
start_with_else_count = 0
for fpath in fl:
    df = pd.read_csv(fpath, header=0)
    ids = df['id'].tolist() + df['merge_into_id'].tolist()
    for one_id in ids:
        if one_id[1] == '5':
            start_with_5_count += 1
        else:
            start_with_else_count += 1

print(start_with_5_count, start_with_else_count)


7376100 0


# 测试按照 source 寻找 work 的方法

In [17]:
import json

test_snapshot_fp = r'works\updated_date=2023-11-09\part_000.txt'
with open(test_snapshot_fp, 'r', encoding='utf-8') as fp:
    for line in fp:
        json_obj = json.loads(line)
        break

In [18]:
json_obj.keys()

dict_keys(['id', 'doi', 'doi_registration_agency', 'display_name', 'title', 'publication_year', 'publication_date', 'language', 'ids', 'primary_location', 'best_oa_location', 'type', 'type_crossref', 'open_access', 'authorships', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'cited_by_count', 'summary_stats', 'biblio', 'is_retracted', 'is_paratext', 'concepts', 'mesh', 'locations_count', 'locations', 'referenced_works', 'referenced_works_count', 'sustainable_development_goals', 'keywords', 'grants', 'apc_list', 'apc_paid', 'cited_by_percentile_year', 'related_works', 'abstract_inverted_index', 'counts_by_year', 'cited_by_api_url', 'updated_date', 'created_date', 'updated', 'authors_count', 'concepts_count', 'has_fulltext'])

In [20]:
json_obj['primary_location']['source']

{'id': 'https://openalex.org/S4306463673',
 'issn_l': None,
 'issn': None,
 'display_name': 'Nouveau Monde eBooks',
 'publisher': 'Nouveau Monde',
 'host_organization': None,
 'host_organization_name': None,
 'host_organization_lineage': [],
 'host_organization_lineage_names': [],
 'is_oa': False,
 'is_in_doaj': False,
 'host_institution_lineage': [],
 'host_institution_lineage_names': [],
 'publisher_lineage': [],
 'publisher_lineage_names': [],
 'publisher_id': None,
 'type': 'ebook platform'}