In [2]:
import json
import os

import pandas as pd
from neo4j import GraphDatabase

In [3]:
# Load the pre-processed dataset.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('../building-dataset/json/processed_data_mvd.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [84]:
# Connection to the Neo4j database.
# Settings are read from the environment (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
# so real credentials never have to be written into the notebook. The fallbacks
# are the local development values and should not be used for a shared database.
driver = GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "password"),
    ),
)

In [5]:
# Build a DataFrame from the parsed JSON payload
df = pd.DataFrame(data=data)

In [6]:
# Show the column names of the loaded DataFrame
df.columns

Index(['id', 'normalized_name', 'start', 'end', 'text', 'labels',
       'document_id', 'parent_id', 'publishdate', 'title', 'docsource'],
      dtype='object')

In [7]:
# Bucket the rows by the document they belong to
grouped = df.groupby(by='document_id')

In [10]:
#Method to print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar.

    When ``total`` is 0 there is no work to do, so the bar is drawn as fully
    complete instead of raising ZeroDivisionError.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\\r", "\\r\\n") (Str)
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Empty job: nothing to iterate over, so report it as done
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Leading "\r" rewinds to the start of the line so the bar redraws in place
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [68]:
# Build the sub-text map: document_id -> set of JSON strings, one per distinct
# (text, parent_id) pair found in that document's rows. The pairs are stored as
# JSON strings because dicts are not hashable and the set de-duplicates them.
documentIDToSentences = {}

for name, group in grouped:
    # Every row in the group shares the same document_id, so the first row is enough.
    # No shuffling is needed: the rows end up in a set, so their order is irrelevant.
    doc_id = group.iloc[0]['document_id']
    documentIDToSentences[doc_id] = {
        json.dumps({"text": row['text'], "parentId": row['parent_id']})
        for _, row in group.iterrows()
    }

# Show only the number of documents instead of dumping the whole mapping
len(documentIDToSentences)

In [115]:
# Wipe the graph: remove every node together with its relationships
wipeQuery = "MATCH (n) DETACH DELETE n"
with driver.session() as dbSession:
    dbSession.run(wipeQuery)

In [116]:
#Add case nodes to database
# One :Case node is created per document, using the title and document_id of
# the document's first row.
count = 0
total = len(grouped)
print("CREATING CASE NODES: ")
printProgressBar(0, total, prefix='Progress:', suffix='Complete', length=total)

with driver.session() as session:
    for count, (name, group) in enumerate(grouped, start=1):
        # Take the first row as-is (no unseeded shuffle) so the chosen title
        # is the same on every run.
        firstRow = group.iloc[0]
        caseName = firstRow['title']
        caseId = firstRow['document_id']
        session.run("CREATE (n:Case {name: $name, type: $type, case_id: $case_id})", name=caseName, type='CASE',
                    case_id=caseId)

        printProgressBar(count, total, prefix='Progress:', suffix='Complete', length=total)

CREATING CASE NODES: 
Progress: |███████████████████████████████████████████████| 100.0% Complete


In [117]:
#Add case sub-text nodes to database and add relationship between them
# For every document, each distinct (text, parent_id) pair becomes a :SubText
# node that is linked to the document's :Case node through SUBTEXT_OF.
count = 0
total = len(grouped)
print("CONNECTING SUB-TEXT TO CASE NODES")
printProgressBar(0, total, prefix='Progress:', suffix='Complete', length=total)

with driver.session() as session:
    for name, group in grouped:
        caseId = group.iloc[0]['document_id']
        caseSubTextObjects = documentIDToSentences[caseId]
        for subText in caseSubTextObjects:
            # Decode the JSON string once and read both fields from it
            subTextObject = json.loads(subText)
            text = subTextObject['text']
            parentId = subTextObject['parentId']

            # The lookup must use the same property layout as the CREATE below
            # (the sentence text is stored in `name`); otherwise it never matches
            # and re-running the cell duplicates every node.
            # LIMIT 1 keeps .single() valid even if duplicates already exist.
            existing = session.run(
                "MATCH (n:SubText {name: $text, type: $type, parent_id: $parent_id, case_id: $case_id}) "
                "RETURN n LIMIT 1",
                text=text,
                type='SUB-TEXT', parent_id=parentId, case_id=caseId).single()

            if existing is None:
                session.run("CREATE (n:SubText {name: $text, type: $type, parent_id: $parent_id, case_id: $case_id})",
                            text=text,
                            type='SUB-TEXT', parent_id=parentId, case_id=caseId)

                # MERGE keeps the relationship unique per (SubText, Case) pair
                session.run("MATCH (a:SubText),(b:Case) WHERE a.case_id = $case_id AND b.case_id = "
                            "$case_id AND b.type = 'CASE' AND a.parent_id = $parent_id "
                            "MERGE (a)-[ r:SUBTEXT_OF {relationship: $relationship}]->(b)",
                            case_id=caseId,
                            relationship="Sub-Text with ID: " + str(parentId) + ", of case with ID " + str(caseId), parent_id=parentId)

        count = count + 1
        printProgressBar(count, total, prefix='Progress:', suffix='Complete', length=total)

CONNECTING SUB-TEXT TO CASE NODES
Progress: |███████████████████████████████████████████████| 100.0% Complete


In [118]:
#Add end nodes to the database and connect them to sub-text nodes
# Each row yields a :Node keyed by (upper-cased normalized name, label). The node
# collects the parent_ids of every sub-text it occurs in and is linked to the
# matching :SubText node with a "<label>_OF" relationship.
count = 0
total = len(grouped)
print("CONNECTING END-NODES TO CASE NODES")
printProgressBar(0, total, prefix='Progress:', suffix='Complete', length=total)

with driver.session() as session:
    for name, group in grouped:
        caseId = group.iloc[0]['document_id']

        for index, row in group.iterrows():
            normalizedName = row['normalized_name'].upper()
            label = row['labels']
            parent_id = row['parent_id']

            if session.run("MATCH (n:Node {name: $name, type: $type}) RETURN n", name=normalizedName,
                           type=label).single() is None:

                # First occurrence: start the parent_id list with this sub-text
                session.run("CREATE (n:Node {name: $name, type: $type, parent_id: $parent_id})", name=normalizedName,
                            type=label, parent_id=[parent_id])

            else:
                # Known node: add this sub-text to its parent_id list (requires the APOC plugin)
                session.run("MATCH (n:Node {name: $name, type: $type}) SET n.parent_id = apoc.coll.union(n.parent_id, "
                            "$parent_id)", name=normalizedName,
                            type=label, parent_id=[parent_id])

            relationship = "Normalized Name found in subtext: " + str(parent_id) + ", of case with ID " + str(caseId)

            # The relationship type is spliced into the query text. Backtick-quote it
            # (doubling any embedded backtick) so a label with spaces or punctuation
            # can neither break nor alter the query.
            relationshipType = "`" + (label + "_OF").replace("`", "``") + "`"

            session.run("MATCH (a:Node),(b:SubText) WHERE a.name = $end_node AND b.parent_id = $parent_id"
                        " AND b.case_id = $case_id"
                        " AND a.type = $type"
                        " CREATE (a)-[ r:" + relationshipType + " {relationship: $relationship}]->(b)",
                        end_node=normalizedName, parent_id=parent_id,
                        case_id=caseId,
                        relationship=relationship,
                        type=label
                        )

        count = count + 1
        printProgressBar(count, total, prefix='Progress:', suffix='Complete', length=total)

CONNECTING END-NODES TO CASE NODES
Progress: |███████████████████████████████████████████████| 100.0% Complete


In [114]:
# A normalized name can occur several times in the same sentence, which leaves
# several identical relationships between one Node and one SubText.
# Keep the first relationship of each group and delete the rest.
dedupeQuery = (
    "MATCH (a:Node)-[r]->(b:SubText)"
    " WITH a, b, type(r) AS rel_type, r.name AS name, COLLECT(r) AS rels, COUNT(*) AS count"
    " WHERE count > 1"
    " UNWIND rels[1..] AS duplicate"
    " DELETE duplicate"
)
with driver.session() as dbSession:
    dbSession.run(dedupeQuery)

In [94]:
# Query template for selecting specific cases; replace "[ids here]" with the wanted case ids
caseQueryTemplate = "MATCH (a:Node),(b:SubText),(c:Case) WHERE c.case_id in [ids here] AND b.case_id = c.case_id AND b.parent_id in a.parent_id"
print(caseQueryTemplate)