In [1]:
import os
import json

dataDir = "../data/"
relationDir = os.path.join(dataDir, "relations")
structureFile = os.path.join(relationDir, "structure_relations.json")
semanticFile = os.path.join(relationDir, "semantic_relations.json")
locationFile = os.path.join(relationDir, "location.json")

# read file from structureFile
relations = set()
with open(structureFile, "r") as f:
    structureRelations = json.load(f)
    for idx, structureRelation in enumerate(structureRelations):
        if structureRelation[1] == "目录":
            structureRelations[idx][1] = "parent_content"
        elif structureRelation[1] == "前置":
            structureRelations[idx][1] = "prerequisites"
        relations.add(structureRelations[idx][1])


print("# of relations: ", len(relations))
print(relations)

print("# of structure relations: ", len(structureRelations))

# of relations:  4
{'include', 'co_presence', 'prerequisites', 'parent_content'}
# of structure relations:  22737


In [2]:
relations = set()
with open(semanticFile, "r") as f:
    semanticRelations = json.load(f)
    for idx, semanticRelation in enumerate(semanticRelations):
        relations.add(semanticRelations[idx][1])

print("# of relations: ", len(relations))
print(relations)

print("# of semantic relations: ", len(semanticRelations))

# of relations:  10
{'instance of', 'based on', 'subclass of', 'part of', 'use', 'subject of', 'facet of', 'is a', 'defined by', 'contains'}
# of semantic relations:  903


In [5]:
import networkx as nx

graphDir = os.path.join(dataDir, "graph")


def build_graph(relations):
    graph = nx.DiGraph()
    for relation in relations:
        graph.add_edge(relation[0], relation[2], relation=relation[1])
    return graph


graph = build_graph(structureRelations)
# print # of nodes and edges
print("# of nodes: ", graph.number_of_nodes())
print("# of edges: ", graph.number_of_edges())

semanticGraph = build_graph(semanticRelations)
# print # of nodes and edges
print("# of nodes: ", semanticGraph.number_of_nodes())
print("# of edges: ", semanticGraph.number_of_edges())

# export graph to json file
import json

graphJsonFile = os.path.join(graphDir, "structure_graph.json")
with open(graphJsonFile, "w", encoding='utf-8') as f:
    json.dump(nx.node_link_data(graph), f, ensure_ascii=False, indent=4)

semanticGraphJsonFile = os.path.join(graphDir, "semantic_graph.json")
with open(semanticGraphJsonFile, "w", encoding='utf-8') as f:
    json.dump(nx.node_link_data(semanticGraph), f, ensure_ascii=False, indent=4)

# of nodes:  1782
# of edges:  22737
# of nodes:  754
# of edges:  886


In [6]:
def validate_graph(graph):
    import re
    import collections

    links = graph.edges(data=True)
    # define a regular expression to match "1.1 Who Should Read This Book?"
    reg = r"^\d+\.\d+.*"
    sections = sorted(
        [
            link[0]
            for link in links
            if re.match(reg, link[0]) and link[2]["relation"] == "include"
        ]
    )

    entities = [
        link[1]
        for link in links
        if re.match(reg, link[0]) and link[2]["relation"] == "include"
    ]

    # count all unique values in sections
    # sections = collections.Counter(sections)

    sections = set(sections)

    print("# of sections: ", len(sections))
    # print(sections)

    location = json.load(open(locationFile, "r"))
    print("# of location: ", len(location))
    # print(location)

    for section in sections:
        print("checking section: ", section)
        # filter all edges with section as source and with relation "include"
        section_entities = [
            link[1]
            for link in links
            if link[0] == section and link[2]["relation"] == "include"
        ]
        # check if section_entities in location
        for entity in section_entities:
            if entity not in location:
                print("entity: ", entity)
                print("not in location")
        print("===========================================")


validate_graph(graph)

# of sections:  160
# of location:  1597
checking section:  12.2 Computer Vision
checking section:  18.5 Denoising Score Matching
checking section:  2.10 The Trace Operator
checking section:  11.4 Selecting Hyperparameters
checking section:  7.14 Tangent Distance, Tangent Prop, and Manifold Tangent Classifier
checking section:  14.6 Learning Manifolds with Autoencoders
checking section:  5.7 Supervised Learning Algorithms
checking section:  6.6 Historical Notes
checking section:  7.2 Norm Penalties as Constrained Optimization
checking section:  2.12 Example: Principal Components Analysis
checking section:  3.11 Bayes’ Rule
checking section:  11.1 Performance Metrics
checking section:  2.7 Eigendecomposition
checking section:  3.1 Why Probability?
checking section:  8.2 Challenges in Neural Network Optimization
checking section:  20.10 Directed Generative Nets
checking section:  2.2 Multiplying Matrices and Vectors
checking section:  18.1 The Log-Likelihood Gradient
checking section:  1

In [7]:
# find all links connected to the node named "Deep Learning"
links = graph.edges(data=True)
print(len(links))
for link in links:
    if link[2]["relation"] == "include" and link[0] == "1.1 Who Should Read This Book?":
        print(link)

22737
('1.1 Who Should Read This Book?', 'architecture', {'relation': 'include'})
('1.1 Who Should Read This Book?', 'AI systems', {'relation': 'include'})
('1.1 Who Should Read This Book?', 'computer program', {'relation': 'include'})
