In [36]:
import json
import os

import networkx as nx
import pandas as pd
from neo4j import GraphDatabase

In [37]:
# Load the pre-processed records from disk.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('../building-dataset/json/processed_data_mvd.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

In [None]:
# Connection to the hosted Neo4j instance.
# The password is read from the environment so that no secret is stored in the
# notebook. NOTE(review): the password that used to be committed on this line
# should be treated as compromised and rotated.
auraPassword = os.environ.get("NEO4J_AURA_PASSWORD")
if not auraPassword:
    raise RuntimeError(
        "Set the NEO4J_AURA_PASSWORD environment variable before connecting to the hosted database."
    )
driver = GraphDatabase.driver(
    uri=os.environ.get("NEO4J_AURA_URI", "neo4j+s://5bdc6a9f.databases.neo4j.io"),
    auth=(os.environ.get("NEO4J_AURA_USER", "neo4j"), auraPassword),
)

In [74]:
# Connection to a local Neo4j instance over Bolt.
# Each setting can be overridden through the environment; the defaults are the
# values this cell originally hardcoded, so running it unchanged behaves as before.
driver = GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "password"),
    ),
)

In [62]:
# Build a DataFrame from the loaded JSON content.
df = pd.DataFrame(data=data)

In [63]:
# Display the column index of the frame (rendered as this cell's output).
df.columns

Index(['id', 'normalized_name', 'start', 'end', 'text', 'labels',
       'document_id', 'parent_id', 'publishdate', 'title', 'docsource'],
      dtype='object')

In [64]:
# Bucket the rows by their 'document_id' value; later cells iterate over these buckets.
grouped = df.groupby(by='document_id')

In [65]:
# Maps an entity label to the text stored in the `relationship` property of the
# edge that links that entity to its case node.
relationShipToCase = {'ORG': 'ORGANIZATION INVOLVED IN CASE',
                      'JUDGE': 'JUDGE OF CASE',
                      'WITNESS': 'WITNESS APPEALED IN CASE',
                      # Fixed casing typo ('CAsE'); every value in this table is upper-case.
                      'LAWYER': 'LAWYER IN CASE',
                      'GPE': 'GPE OF CASE',
                      'PRECEDENT': 'PRECEDENT OF CASE',
                      'RESPONDENT': 'RESPONDENT OF CASE',
                      'CASE_NUMBER': 'CASE NUMBER',
                      'PETITIONER': 'PETITIONER OF CASE',
                      'COURT': 'COURT OF CASE',
                      'OTHER_PERSON': 'OTHER PERSON INVOLVED',
                      'DATE': 'DATE OF TRIAL',
                      'PROVISION': 'PROVISION OF CASE',
                      'STATUTE': 'STATUTE OF CASE'
                      }

In [66]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar

    A non-positive `total` is rendered as a completed bar instead of raising
    ZeroDivisionError. The filled part of the bar is clamped to [0, length],
    so an `iteration` beyond `total` never widens the bar; the percentage text
    is not clamped.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    if total > 0:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to iterate over: show the bar as complete.
        fraction = 1.0
        filledLength = length
    # Keep the bar exactly `length` characters wide even for out-of-range input.
    filledLength = max(0, min(length, filledLength))
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The leading "\r" returns to the start of the line so each call redraws the bar in place.
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [67]:
# WARNING: destructive. Removes every node and relationship from the connected
# database so the import below starts from an empty graph.
with driver.session() as session:
    # consume() drives the statement to completion and raises any server error
    # in this cell.
    session.run("MATCH (n) DETACH DELETE n").consume()

In [68]:
# Import every document group into Neo4j: one Case node per document, one Node
# per distinct upper-cased entity name in that document, and one RELATIONSHIP
# edge from each entity to its case.

# Seed for the per-document shuffle, so that repeated runs pick the same label
# when one entity name appears with several labels (the last row seen wins).
SHUFFLE_SEED = 42
# Relationship text used for labels that are missing from relationShipToCase.
FALLBACK_RELATIONSHIP = "IS IN"

count = 0
total = len(grouped)
# Cap the bar width so a large number of documents does not wrap the output.
barLength = min(total, 100)
if total > 0:
    printProgressBar(0, total, prefix = 'Progress:', suffix = 'Complete', length = barLength)

# A single session is reused for all documents; every statement is consumed so
# a failing query raises at the statement that caused it.
with driver.session() as session:
    for name, group in grouped:

        group = group.sample(frac=1, random_state=SHUFFLE_SEED)

        # The graph is used as a name -> label map: adding an existing name
        # again only updates its 'type' attribute.
        G = nx.Graph()
        for rawName, entityLabel in zip(group['normalized_name'], group['labels']):
            G.add_node(rawName.upper(), type=entityLabel)

        caseName = group.iloc[0]['title']
        caseId = group.iloc[0]['document_id']
        session.run(
            "CREATE (n:Case {name: $name, type: $type, case_id: $case_id})",
            name=caseName, type='CASE', case_id=caseId,
        ).consume()

        for node in G.nodes:
            label = G.nodes[node]['type']
            session.run(
                "CREATE (n:Node {name: $name, type: $type, case_id: $case_id})",
                name=node, type=label, case_id=caseId,
            ).consume()

            # .get() makes the fallback reachable; plain indexing raised
            # KeyError for labels missing from the table.
            relationship = relationShipToCase.get(label, FALLBACK_RELATIONSHIP)

            # The entity is matched on case_id as well as name, so only the
            # node created for this document is linked. Without that filter,
            # same-name nodes from documents imported earlier were linked to
            # this case too, producing duplicate edges. Same-name nodes are
            # unified by the duplicate-merge cell that follows.
            session.run(
                "MATCH (a:Node),(b:Case) "
                "WHERE a.name = $from_node AND a.case_id = $case_id "
                "AND b.name = $to_node AND b.case_id = $case_id AND b.type = 'CASE' "
                "CREATE (a)-[r:RELATIONSHIP {relationship: $relationship}]->(b)",
                from_node=node, to_node=caseName, case_id=caseId,
                relationship=relationship,
            ).consume()

        # Report progress once the document has actually been written.
        count = count + 1
        printProgressBar(count, total, prefix = 'Progress:', suffix = 'Complete', length = barLength)

Progress: |███████████████████████████████████████████████| 100.0% Complete


In [75]:
# Merges Node entries that share the same name into a single node, retaining
# all relations. The query calls apoc.refactor.mergeNodes, so the APOC plugin
# must be available on the server.
with driver.session() as session:
    # consume() drives the query to completion and raises any server error
    # (for example an unknown procedure) in this cell.
    session.run(
        "MATCH (n:Node) WITH n.name AS name, COLLECT(n) AS nodes, COUNT(*) AS count "
        "WHERE count > 1 "
        "CALL apoc.refactor.mergeNodes(nodes) YIELD node "
        "RETURN node"
    ).consume()

In [76]:
# Collapses duplicate RELATIONSHIP edges between the same Node and Case into one.
# The query calls apoc.refactor.mergeRelationships, so the APOC plugin must be
# available on the server.
with driver.session() as session:
    # consume() drives the query to completion and raises any server error
    # (for example an unknown procedure) in this cell.
    session.run(
        "MATCH (n:Node)-[r:RELATIONSHIP]->(c:Case) WITH n, c, COLLECT(r) AS rels, COUNT(*) AS count "
        "WHERE count > 1 "
        "CALL apoc.refactor.mergeRelationships(rels) YIELD rel "
        "RETURN rel"
    ).consume()