In [2]:
import os
import json

dataDir = "../data/"
relationDir = os.path.join(dataDir, "relations")
structureFile = os.path.join(relationDir, "structure_relations.json")
semanticFile = os.path.join(relationDir, "semantic_relations.json")
locationFile = os.path.join(relationDir, "location.json")

# read file from structureFile
relations = set()
with open(structureFile, "r") as f:
    structureRelations = json.load(f)
    for idx, structureRelation in enumerate(structureRelations):
        if structureRelation[1] == "目录":
            structureRelations[idx][1] = "parent_content"
        elif structureRelation[1] == "前置":
            structureRelations[idx][1] = "prerequisites"
        relations.add(structureRelations[idx][1])


print("# of relations: ", len(relations))
print(relations)

print("# of structure relations: ", len(structureRelations))

# of relations:  4
{'co_presence', 'parent_content', 'prerequisites', 'include'}
# of structure relations:  78415


In [3]:
relations = set()
with open(semanticFile, "r") as f:
    semanticRelations = json.load(f)
    for idx, semanticRelation in enumerate(semanticRelations):
        relations.add(semanticRelations[idx][1])

print("# of relations: ", len(relations))
print(relations)

print("# of semantic relations: ", len(semanticRelations))

# of relations:  9
{'part of', 'use', 'is a', 'subject of', 'facet of', 'defined by', 'subclass of', 'based on', 'instance of'}
# of semantic relations:  2734


In [4]:
import networkx as nx

graphDir = os.path.join(dataDir, "graph")


def build_graph(relations):
    graph = nx.DiGraph()
    for relation in relations:
        graph.add_edge(relation[0], relation[2], relation=relation[1])
    return graph


graph = build_graph(structureRelations)
# print # of nodes and edges
print("# of nodes: ", graph.number_of_nodes())
print("# of edges: ", graph.number_of_edges())

semanticGraph = build_graph(semanticRelations)
# print # of nodes and edges
print("# of nodes: ", semanticGraph.number_of_nodes())
print("# of edges: ", semanticGraph.number_of_edges())

# export graph to json file
import json

graphJsonFile = os.path.join(graphDir, "structure_graph.json")
with open(graphJsonFile, "w") as f:
    json.dump(nx.node_link_data(graph), f, ensure_ascii=False, indent=4)

semanticGraphJsonFile = os.path.join(graphDir, "semantic_graph.json")
with open(semanticGraphJsonFile, "w") as f:
    json.dump(nx.node_link_data(semanticGraph), f, ensure_ascii=False, indent=4)

# of nodes:  3832
# of edges:  78415
# of nodes:  2196
# of edges:  2693


In [28]:
def validate_graph(graph):
    import re
    import collections

    links = graph.edges(data=True)
    # define a regular expression to match "1.1 Who Should Read This Book?"
    reg = r"^\d+\.\d+.*"
    sections = sorted(
        [
            link[0]
            for link in links
            if re.match(reg, link[0]) and link[2]["relation"] == "include"
        ]
    )

    entities = [
        link[1]
        for link in links
        if re.match(reg, link[0]) and link[2]["relation"] == "include"
    ]

    # count all unique values in sections
    # sections = collections.Counter(sections)

    sections = set(sections)

    print("# of sections: ", len(sections))
    # print(sections)

    location = json.load(open(locationFile, "r"))
    print("# of location: ", len(location))
    # print(location)

    for section in sections:
        print("checking section: ", section)
        # filter all edges with section as source and with relation "include"
        section_entities = [
            link[1]
            for link in links
            if link[0] == section and link[2]["relation"] == "include"
        ]
        # check if section_entities in location
        for entity in section_entities:
            if entity not in location:
                print("entity: ", entity)
                print("not in location")
        print("===========================================")


validate_graph(graph)

# of sections:  164
# of location:  3263
checking section:  2.4 Linear Dependence and Span
entity:  singular matrix
not in location
checking section:  13.4 Sparse Coding
entity:  decoder training
not in location
entity:  model training
not in location
entity:  data encoding
not in location
checking section:  16.7 The Deep Learning Approach to Structured Probabilistic Models
entity:  unknown quantities
not in location
checking section:  7.6 Semi-Supervised Learning
entity:  output targets
not in location
entity:  injecting noise
not in location
checking section:  12.2 Computer Vision
entity:  image preprocessing
not in location
checking section:  5.9 Stochastic Gradient Descent
checking section:  9.1 The Convolution Operation
entity:  Loss Functions
not in location
entity:  Model Evaluation
not in location
checking section:  3.8 Expectation, Variance and Covariance
entity:  Pb,c
not in location
entity:  E [fx] = Pxfx
not in location
entity:  Pa,b,c
not in location
entity:  Pb c
not in l

In [None]:
# find all links connected to the node named "Deep Learning"
links = graph.edges(data=True)
print(len(links))
for link in links:
    if link[2]["relation"] == "include" and link[0] == "1.1 Who Should Read This Book?":
        print(link)