In [23]:
## Import sound alert dependencies
from IPython.display import Audio, display

def allDone(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'):
  """Play an audio clip in the notebook output to signal that work has finished.

  Args:
      url: Location of the audio file to play. Defaults to the clip this
          notebook has always used, so existing ``allDone()`` calls are
          unaffected.
  """
  # autoplay=True starts playback as soon as the widget is rendered.
  display(Audio(url=url, autoplay=True))
## Pass a different `url` to allDone() to play whatever audio file you want

In [27]:
from neo4j import GraphDatabase
import pandas as pd

class Neo4jConnection:
    """Loads passage cross-references from a CSV file into a Neo4j graph.

    Every write method catches exceptions and prints them instead of raising,
    so one bad row or query does not abort the whole load.

    The instance can be used as a context manager; ``close()`` is called on exit.
    """

    # Database used for every session unless overridden in the constructor.
    database = "DocDatabase"

    # Columns process_csv() reads from every row. 'TargetID' is only read for
    # "Internal"/"External" rows, so it is not required up front.
    REQUIRED_COLUMNS = ("SourceID", "ReferenceText", "ReferenceType")

    def __init__(self, uri, user, password, database=None):
        """Create the driver.

        Args:
            uri: Bolt URI of the Neo4j server.
            user: Neo4j user name.
            password: Neo4j password.
            database: Optional database name; defaults to "DocDatabase".
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        if database is not None:
            self.database = database
        print("Connected to Neo4j.")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress the caller's exception

    def close(self):
        """Close the underlying driver."""
        self.driver.close()
        print("Connection to Neo4j closed.")

    def clear_relationships(self):
        """Removes all CITES and CITED_BY relationships.

        Errors are printed, not raised.
        """
        with self.driver.session(database=self.database) as session:
            try:
                session.run("MATCH ()-[r:CITES]->() DELETE r")
                session.run("MATCH ()-[r:CITED_BY]->() DELETE r")
                print("Removed all CITES and CITED_BY relationships.")
            except Exception as e:
                print(f"Error clearing relationships: {e}")

    def create_outsource_node(self, session, uid, reference_text):
        """Links the source passage to an Outsource node carrying ``reference_text``.

        The whole pattern is MERGEd from the matched source passage, so calling
        this again for the same (uid, reference_text) reuses the existing
        Outsource node instead of adding a duplicate on every run. Nothing is
        written when no Passage with the given UID exists.

        Args:
            session: Open Neo4j session used to run the query.
            uid: UID of the source Passage.
            reference_text: Text of the external reference.
        """
        try:
            session.run(
                """
                MATCH (source:Passage {UID: $uid})
                MERGE (source)-[:OUTSOURCE]->(o:Outsource {reference_text: $reference_text})
                """,
                uid=uid,
                reference_text=reference_text
            )
        except Exception as e:
            print(f"Error creating outsource node for UID {uid}: {e}")

    def create_cites_relationship(self, session, source_uid, target_uid, reference_text):
        """Creates CITES and CITED_BY relationships between source and target.

        Both relationships are MERGEd, so repeating a call with the same
        arguments does not duplicate them. Nothing is written unless both
        Passage nodes are found.

        Args:
            session: Open Neo4j session used to run the query.
            source_uid: UID of the citing Passage.
            target_uid: UID of the cited Passage.
            reference_text: Stored as a property on both relationships.
        """
        try:
            session.run(
                """
                MATCH (source:Passage {UID: $source_uid}), (target:Passage {UID: $target_uid})
                MERGE (source)-[:CITES {reference_text: $reference_text}]->(target)
                MERGE (target)-[:CITED_BY {reference_text: $reference_text}]->(source)
                """,
                source_uid=source_uid,
                target_uid=target_uid,
                reference_text=reference_text
            )
        except Exception as e:
            print(f"Error creating CITES relationship from {source_uid} to {target_uid}: {e}")

    def process_csv(self, csv_path):
        """Processes the CSV file to create relationships in the Neo4j database.

        Rows whose ReferenceType is "Outsource" produce an Outsource node; rows
        typed "Internal" or "External" produce CITES/CITED_BY relationships to
        the row's TargetID. Rows with any other ReferenceType are ignored.
        Rows with a missing SourceID (or a missing TargetID where one is
        needed) are skipped with a message.

        Args:
            csv_path: Path of the CSV file to load.

        Returns:
            None. Read/validation problems are printed and end the call early.
        """
        try:
            data = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error reading CSV file {csv_path}: {e}")
            return

        # Fail once, before touching the database, instead of printing a
        # KeyError for every single row when a column is absent.
        missing_columns = [name for name in self.REQUIRED_COLUMNS if name not in data.columns]
        if missing_columns:
            print(f"Error reading CSV file {csv_path}: missing required column(s) {missing_columns}")
            return

        with self.driver.session(database=self.database) as session:
            for index, row in data.iterrows():
                try:
                    source_uid = row['SourceID']
                    reference_text = row['ReferenceText']
                    reference_type = row['ReferenceType']

                    # Skip rows with missing source UID (empty cells load as NaN)
                    if not source_uid or pd.isna(source_uid):
                        print(f"Skipping row {index}: Missing SourceID.")
                        continue

                    if reference_type == "Outsource":
                        self.create_outsource_node(session, source_uid, reference_text)

                    elif reference_type in ["Internal", "External"]:
                        target_uid = row['TargetID']

                        # Skip rows with missing target UID
                        if not target_uid or pd.isna(target_uid):
                            print(f"Skipping row {index}: Missing TargetID for SourceID {source_uid}.")
                            continue

                        self.create_cites_relationship(session, source_uid, target_uid, reference_text)
                except Exception as e:
                    # Best effort: report the row and keep loading the rest.
                    print(f"Error processing row {index}: {e}")

if __name__ == "__main__":
    import os

    # Neo4j connection details. Environment variables take precedence so real
    # credentials need not be stored in the notebook; the fallbacks are the
    # values this notebook has always used.
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    user = os.environ.get("NEO4J_USER", "neo4j")
    password = os.environ.get("NEO4J_PASSWORD", "12345678")

    # Initialize Neo4j connection
    conn = Neo4jConnection(uri, user, password)

    try:
        # Clear old relationships
        conn.clear_relationships()

        # Path to the CSV file
        csv_path = 'CrossReferenceData.csv'

        # Process the CSV file
        conn.process_csv(csv_path)
    finally:
        # Close the Neo4j connection even if one of the steps above raised
        conn.close()


Connected to Neo4j.
Removed all CITES and CITED_BY relationships.
Connection to Neo4j closed.


In [28]:
# Play the alert sound (allDone is defined in the first cell) to signal that the preceding cell has finished.
allDone()

In [17]:
from neo4j import GraphDatabase

import os

# Neo4j connection details. Environment variables take precedence so real
# credentials need not be stored in the notebook; the fallbacks are the
# values this notebook has always used.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")

class Neo4jConnection:
    """Maintains HAS_SHARED_DEFINED_TERM links between DefinedTerm nodes.

    DefinedTerm nodes that carry the same non-empty ``term`` value are linked
    to each other in both directions. Work is done in batches so that a single
    transaction never has to touch the whole graph.

    The instance can be used as a context manager; ``close()`` is called on exit.
    """

    # Database used for every session unless overridden in the constructor.
    database = "DocDatabase"

    def __init__(self, uri, user, password, database=None):
        """Create the driver.

        Args:
            uri: Bolt URI of the Neo4j server.
            user: Neo4j user name.
            password: Neo4j password.
            database: Optional database name; defaults to "DocDatabase".
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        if database is not None:
            self.database = database
        print("Connected to Neo4j.")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress the caller's exception

    def close(self):
        """Close the underlying driver."""
        self.driver.close()
        print("Connection to Neo4j closed.")

    @staticmethod
    def _validate_batch_size(batch_size):
        """Reject batch sizes that would make the batch loops spin forever."""
        # With batch_size == 0 every query handles 0 rows and the
        # "fewer rows than batch_size" exit test (0 < 0) never becomes true.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    def delete_old_defined_term_relationships(self, batch_size=10000):
        """
        Delete all existing HAS_SHARED_DEFINED_TERM relationships in batches.
        Reduces memory usage for large datasets.

        Args:
            batch_size: Maximum number of relationships deleted per query.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        self._validate_batch_size(batch_size)
        # The limit is passed as a query parameter rather than formatted into
        # the Cypher text.
        query = """
            MATCH ()-[r:HAS_SHARED_DEFINED_TERM]->()
            WITH r LIMIT $batch_size
            DELETE r
            RETURN COUNT(r) AS deleted_count
        """
        with self.driver.session(database=self.database) as session:
            while True:
                # Delete a limited number of relationships at a time
                result = session.run(query, batch_size=batch_size)
                deleted_count = result.single()["deleted_count"]
                print(f"Deleted {deleted_count} relationships in this batch.")
                # A short batch means nothing is left to delete
                if deleted_count < batch_size:
                    break
        print("Deleted all old HAS_SHARED_DEFINED_TERM relationships.")

    def get_duplicated_defined_terms(self):
        """Retrieve defined terms that occur on more than one node, with their counts.

        Also prints every duplicated term, most frequent first.

        Returns:
            A list of ``{"term": ..., "count": ...}`` dicts; empty when no
            term is duplicated.
        """
        with self.driver.session(database=self.database) as session:
            query = """
                MATCH (n:DefinedTerm)
                WHERE n.term IS NOT NULL AND n.term <> ""
                WITH n.term AS term, COUNT(*) AS count
                WHERE count > 1
                RETURN term, count
                ORDER BY count DESC
            """
            result = session.run(query)
            duplicated_defined_terms = result.data()
            if duplicated_defined_terms:
                print("Duplicated Defined Terms and their counts:")
                for record in duplicated_defined_terms:
                    print(f"Term: '{record['term']}', Count: {record['count']}")
            else:
                print("No duplicated Defined Term found.")
            return duplicated_defined_terms

    def merge_duplicate_defined_terms(self, batch_size=10000):
        """
        Create bidirectional HAS_SHARED_DEFINED_TERM relationships in batches.
        Increases transaction efficiency by processing in smaller chunks.

        Every pair of DefinedTerm nodes sharing a non-empty term is visited
        once (``id(n1) < id(n2)``) and linked in both directions with MERGE,
        so re-running does not duplicate relationships.

        Args:
            batch_size: Maximum number of node pairs handled per query.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        self._validate_batch_size(batch_size)
        # ORDER BY gives SKIP/LIMIT a stable sequence to page through; without
        # it the row order is unspecified and consecutive pages may overlap or
        # leave pairs out.
        query = """
            MATCH (n1:DefinedTerm), (n2:DefinedTerm)
            WHERE n1.term = n2.term AND n1.term IS NOT NULL AND n1.term <> "" AND id(n1) < id(n2)
            WITH n1, n2 ORDER BY id(n1), id(n2) SKIP $offset LIMIT $batch_size
            MERGE (n1)-[:HAS_SHARED_DEFINED_TERM]->(n2)
            MERGE (n2)-[:HAS_SHARED_DEFINED_TERM]->(n1)
            RETURN COUNT(*) AS processed
        """
        with self.driver.session(database=self.database) as session:
            offset = 0
            while True:
                result = session.run(query, offset=offset, batch_size=batch_size)
                # `processed` counts node pairs; each pair receives two relationships.
                processed = result.single()["processed"]
                print(f"Processed {processed} relationships in batch {offset // batch_size + 1}")
                # A short page is the last page
                if processed < batch_size:
                    break
                offset += batch_size

if __name__ == "__main__":
    # Initialize Neo4j connection
    conn = Neo4jConnection(uri, user, password)

    try:
        # Step 1: Delete old HAS_SHARED_DEFINED_TERM relationships
        conn.delete_old_defined_term_relationships()

        # Step 2: Get and print duplicated defined terms
        duplicated_defined_terms = conn.get_duplicated_defined_terms()

        # Step 3: Link nodes that share a defined term, if any duplicates exist
        if duplicated_defined_terms:
            conn.merge_duplicate_defined_terms()
    finally:
        # Close the connection even if one of the steps above raised
        conn.close()


Connected to Neo4j.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in this batch.
Deleted 10000 relationships in thi



Processed 10000 relationships in batch 1




Processed 10000 relationships in batch 2




Processed 10000 relationships in batch 3




Processed 10000 relationships in batch 4




Processed 10000 relationships in batch 5




Processed 10000 relationships in batch 6




Processed 10000 relationships in batch 7




Processed 10000 relationships in batch 8




Processed 10000 relationships in batch 9




Processed 10000 relationships in batch 10




Processed 10000 relationships in batch 11




Processed 10000 relationships in batch 12




Processed 10000 relationships in batch 13




Processed 10000 relationships in batch 14




Processed 10000 relationships in batch 15




Processed 10000 relationships in batch 16




Processed 10000 relationships in batch 17




Processed 10000 relationships in batch 18




Processed 10000 relationships in batch 19




Processed 10000 relationships in batch 20




Processed 10000 relationships in batch 21




Processed 10000 relationships in batch 22




Processed 10000 relationships in batch 23




Processed 10000 relationships in batch 24




Processed 10000 relationships in batch 25




Processed 10000 relationships in batch 26




Processed 10000 relationships in batch 27




Processed 10000 relationships in batch 28




Processed 10000 relationships in batch 29




Processed 10000 relationships in batch 30




Processed 10000 relationships in batch 31




Processed 10000 relationships in batch 32




Processed 10000 relationships in batch 33




Processed 10000 relationships in batch 34




Processed 10000 relationships in batch 35




Processed 10000 relationships in batch 36




Processed 10000 relationships in batch 37




Processed 10000 relationships in batch 38




Processed 10000 relationships in batch 39




Processed 10000 relationships in batch 40




Processed 10000 relationships in batch 41




Processed 10000 relationships in batch 42




Processed 10000 relationships in batch 43




Processed 10000 relationships in batch 44




Processed 10000 relationships in batch 45




Processed 10000 relationships in batch 46




Processed 10000 relationships in batch 47




Processed 10000 relationships in batch 48




Processed 10000 relationships in batch 49




Processed 10000 relationships in batch 50




Processed 10000 relationships in batch 51




Processed 10000 relationships in batch 52




Processed 10000 relationships in batch 53




Processed 10000 relationships in batch 54




Processed 10000 relationships in batch 55




Processed 10000 relationships in batch 56




Processed 10000 relationships in batch 57




Processed 10000 relationships in batch 58




Processed 10000 relationships in batch 59




Processed 10000 relationships in batch 60




Processed 10000 relationships in batch 61




Processed 10000 relationships in batch 62




Processed 10000 relationships in batch 63




Processed 10000 relationships in batch 64




Processed 10000 relationships in batch 65




Processed 10000 relationships in batch 66




Processed 10000 relationships in batch 67




Processed 10000 relationships in batch 68




Processed 10000 relationships in batch 69




Processed 10000 relationships in batch 70




Processed 10000 relationships in batch 71




Processed 10000 relationships in batch 72




Processed 10000 relationships in batch 73




Processed 10000 relationships in batch 74




Processed 10000 relationships in batch 75




Processed 10000 relationships in batch 76




Processed 10000 relationships in batch 77




Processed 10000 relationships in batch 78




Processed 10000 relationships in batch 79




Processed 10000 relationships in batch 80




Processed 10000 relationships in batch 81




Processed 10000 relationships in batch 82




Processed 10000 relationships in batch 83




Processed 10000 relationships in batch 84




Processed 10000 relationships in batch 85




Processed 10000 relationships in batch 86




Processed 10000 relationships in batch 87




Processed 10000 relationships in batch 88




Processed 10000 relationships in batch 89




Processed 10000 relationships in batch 90




Processed 10000 relationships in batch 91




Processed 10000 relationships in batch 92




Processed 10000 relationships in batch 93




Processed 10000 relationships in batch 94




Processed 10000 relationships in batch 95




Processed 10000 relationships in batch 96




Processed 10000 relationships in batch 97




Processed 10000 relationships in batch 98




Processed 10000 relationships in batch 99




Processed 10000 relationships in batch 100




Processed 10000 relationships in batch 101




Processed 10000 relationships in batch 102




Processed 10000 relationships in batch 103




Processed 10000 relationships in batch 104




Processed 10000 relationships in batch 105




Processed 10000 relationships in batch 106




Processed 10000 relationships in batch 107




Processed 10000 relationships in batch 108




Processed 10000 relationships in batch 109




Processed 10000 relationships in batch 110




Processed 10000 relationships in batch 111




Processed 10000 relationships in batch 112




Processed 10000 relationships in batch 113




Processed 10000 relationships in batch 114




Processed 10000 relationships in batch 115




Processed 10000 relationships in batch 116




Processed 10000 relationships in batch 117




Processed 10000 relationships in batch 118




Processed 10000 relationships in batch 119




Processed 10000 relationships in batch 120




Processed 10000 relationships in batch 121




Processed 10000 relationships in batch 122




Processed 10000 relationships in batch 123




Processed 10000 relationships in batch 124




Processed 10000 relationships in batch 125




Processed 10000 relationships in batch 126




Processed 10000 relationships in batch 127




Processed 10000 relationships in batch 128




Processed 10000 relationships in batch 129




Processed 10000 relationships in batch 130




Processed 10000 relationships in batch 131




Processed 10000 relationships in batch 132




Processed 10000 relationships in batch 133




Processed 10000 relationships in batch 134




Processed 10000 relationships in batch 135




Processed 10000 relationships in batch 136




Processed 10000 relationships in batch 137




Processed 10000 relationships in batch 138




Processed 10000 relationships in batch 139




Processed 10000 relationships in batch 140




Processed 10000 relationships in batch 141




Processed 10000 relationships in batch 142




Processed 10000 relationships in batch 143




Processed 10000 relationships in batch 144




Processed 10000 relationships in batch 145




Processed 10000 relationships in batch 146




Processed 10000 relationships in batch 147




Processed 10000 relationships in batch 148




Processed 10000 relationships in batch 149




Processed 10000 relationships in batch 150




Processed 10000 relationships in batch 151




Processed 10000 relationships in batch 152




Processed 10000 relationships in batch 153




Processed 10000 relationships in batch 154




Processed 10000 relationships in batch 155




Processed 10000 relationships in batch 156




Processed 10000 relationships in batch 157




Processed 10000 relationships in batch 158




Processed 10000 relationships in batch 159




Processed 10000 relationships in batch 160




Processed 10000 relationships in batch 161




Processed 10000 relationships in batch 162




Processed 10000 relationships in batch 163




Processed 10000 relationships in batch 164




Processed 10000 relationships in batch 165




Processed 10000 relationships in batch 166




Processed 10000 relationships in batch 167




Processed 10000 relationships in batch 168




Processed 10000 relationships in batch 169




Processed 10000 relationships in batch 170




Processed 10000 relationships in batch 171




Processed 10000 relationships in batch 172




Processed 10000 relationships in batch 173




Processed 10000 relationships in batch 174




Processed 10000 relationships in batch 175




Processed 10000 relationships in batch 176




Processed 10000 relationships in batch 177




Processed 10000 relationships in batch 178




Processed 10000 relationships in batch 179




Processed 10000 relationships in batch 180




Processed 10000 relationships in batch 181




Processed 10000 relationships in batch 182




Processed 10000 relationships in batch 183




Processed 10000 relationships in batch 184




Processed 10000 relationships in batch 185




Processed 10000 relationships in batch 186




Processed 10000 relationships in batch 187




Processed 10000 relationships in batch 188




Processed 10000 relationships in batch 189




Processed 10000 relationships in batch 190




Processed 10000 relationships in batch 191




Processed 10000 relationships in batch 192




Processed 10000 relationships in batch 193




Processed 10000 relationships in batch 194




Processed 10000 relationships in batch 195




Processed 10000 relationships in batch 196




Processed 10000 relationships in batch 197




Processed 10000 relationships in batch 198




Processed 10000 relationships in batch 199




Processed 10000 relationships in batch 200




Processed 10000 relationships in batch 201




Processed 10000 relationships in batch 202




Processed 10000 relationships in batch 203




Processed 10000 relationships in batch 204




Processed 10000 relationships in batch 205




Processed 10000 relationships in batch 206




Processed 10000 relationships in batch 207




Processed 10000 relationships in batch 208




Processed 10000 relationships in batch 209




Processed 10000 relationships in batch 210




Processed 10000 relationships in batch 211




Processed 10000 relationships in batch 212




Processed 10000 relationships in batch 213




Processed 10000 relationships in batch 214




Processed 10000 relationships in batch 215




Processed 10000 relationships in batch 216




Processed 10000 relationships in batch 217




Processed 10000 relationships in batch 218




Processed 10000 relationships in batch 219




Processed 10000 relationships in batch 220




Processed 10000 relationships in batch 221




Processed 10000 relationships in batch 222




Processed 10000 relationships in batch 223




Processed 10000 relationships in batch 224




Processed 10000 relationships in batch 225




Processed 10000 relationships in batch 226




Processed 10000 relationships in batch 227




Processed 10000 relationships in batch 228




Processed 10000 relationships in batch 229




Processed 10000 relationships in batch 230




Processed 10000 relationships in batch 231




Processed 10000 relationships in batch 232




Processed 10000 relationships in batch 233




Processed 10000 relationships in batch 234




Processed 10000 relationships in batch 235




Processed 10000 relationships in batch 236




Processed 10000 relationships in batch 237




Processed 10000 relationships in batch 238




Processed 10000 relationships in batch 239




Processed 10000 relationships in batch 240




Processed 10000 relationships in batch 241




Processed 10000 relationships in batch 242




Processed 10000 relationships in batch 243




Processed 10000 relationships in batch 244




Processed 10000 relationships in batch 245




Processed 10000 relationships in batch 246




Processed 10000 relationships in batch 247




Processed 10000 relationships in batch 248




Processed 10000 relationships in batch 249




Processed 10000 relationships in batch 250




Processed 10000 relationships in batch 251




Processed 10000 relationships in batch 252




Processed 10000 relationships in batch 253




Processed 10000 relationships in batch 254




Processed 10000 relationships in batch 255




Processed 10000 relationships in batch 256




Processed 10000 relationships in batch 257




Processed 10000 relationships in batch 258




Processed 10000 relationships in batch 259




Processed 10000 relationships in batch 260




Processed 10000 relationships in batch 261




Processed 10000 relationships in batch 262




Processed 10000 relationships in batch 263




Processed 10000 relationships in batch 264




Processed 10000 relationships in batch 265




Processed 10000 relationships in batch 266




Processed 10000 relationships in batch 267




Processed 10000 relationships in batch 268




Processed 10000 relationships in batch 269




Processed 10000 relationships in batch 270




Processed 10000 relationships in batch 271




Processed 10000 relationships in batch 272




Processed 10000 relationships in batch 273




Processed 10000 relationships in batch 274




Processed 10000 relationships in batch 275




Processed 10000 relationships in batch 276




Processed 10000 relationships in batch 277




Processed 10000 relationships in batch 278




Processed 10000 relationships in batch 279




Processed 10000 relationships in batch 280




Processed 10000 relationships in batch 281




Processed 10000 relationships in batch 282




Processed 10000 relationships in batch 283




Processed 10000 relationships in batch 284




Processed 10000 relationships in batch 285




Processed 10000 relationships in batch 286




Processed 10000 relationships in batch 287




Processed 10000 relationships in batch 288




Processed 10000 relationships in batch 289




Processed 10000 relationships in batch 290




Processed 10000 relationships in batch 291




Processed 10000 relationships in batch 292




Processed 10000 relationships in batch 293




Processed 10000 relationships in batch 294




Processed 10000 relationships in batch 295




Processed 10000 relationships in batch 296




Processed 10000 relationships in batch 297




Processed 10000 relationships in batch 298




Processed 10000 relationships in batch 299




Processed 10000 relationships in batch 300




Processed 10000 relationships in batch 301




Processed 10000 relationships in batch 302




Processed 10000 relationships in batch 303




Processed 10000 relationships in batch 304




Processed 10000 relationships in batch 305




Processed 10000 relationships in batch 306




Processed 10000 relationships in batch 307




Processed 10000 relationships in batch 308




Processed 10000 relationships in batch 309




Processed 10000 relationships in batch 310




Processed 10000 relationships in batch 311




Processed 10000 relationships in batch 312




Processed 10000 relationships in batch 313




Processed 10000 relationships in batch 314




Processed 10000 relationships in batch 315




Processed 10000 relationships in batch 316




Processed 10000 relationships in batch 317




Processed 10000 relationships in batch 318




Processed 10000 relationships in batch 319




Processed 10000 relationships in batch 320




Processed 10000 relationships in batch 321




Processed 10000 relationships in batch 322




Processed 10000 relationships in batch 323




Processed 10000 relationships in batch 324




Processed 10000 relationships in batch 325




Processed 10000 relationships in batch 326




Processed 10000 relationships in batch 327




Processed 10000 relationships in batch 328




Processed 10000 relationships in batch 329




Processed 10000 relationships in batch 330




Processed 10000 relationships in batch 331




Processed 10000 relationships in batch 332




Processed 10000 relationships in batch 333




Processed 10000 relationships in batch 334




Processed 10000 relationships in batch 335




Processed 10000 relationships in batch 336




Processed 10000 relationships in batch 337




Processed 10000 relationships in batch 338




Processed 10000 relationships in batch 339




Processed 10000 relationships in batch 340




Processed 10000 relationships in batch 341




Processed 10000 relationships in batch 342




Processed 10000 relationships in batch 343




Processed 10000 relationships in batch 344




Processed 10000 relationships in batch 345




Processed 10000 relationships in batch 346




Processed 10000 relationships in batch 347




Processed 10000 relationships in batch 348




Processed 10000 relationships in batch 349




Processed 10000 relationships in batch 350




Processed 10000 relationships in batch 351




Processed 10000 relationships in batch 352




Processed 10000 relationships in batch 353




Processed 10000 relationships in batch 354




Processed 10000 relationships in batch 355




Processed 10000 relationships in batch 356




Processed 10000 relationships in batch 357




Processed 10000 relationships in batch 358




Processed 10000 relationships in batch 359




Processed 10000 relationships in batch 360




Processed 10000 relationships in batch 361




Processed 10000 relationships in batch 362




Processed 10000 relationships in batch 363




Processed 10000 relationships in batch 364




Processed 10000 relationships in batch 365




Processed 10000 relationships in batch 366




Processed 10000 relationships in batch 367




Processed 10000 relationships in batch 368




Processed 10000 relationships in batch 369




Processed 10000 relationships in batch 370




Processed 10000 relationships in batch 371




Processed 10000 relationships in batch 372




Processed 10000 relationships in batch 373




Processed 10000 relationships in batch 374




Processed 10000 relationships in batch 375




Processed 10000 relationships in batch 376




Processed 10000 relationships in batch 377




Processed 10000 relationships in batch 378




Processed 10000 relationships in batch 379




Processed 10000 relationships in batch 380




Processed 10000 relationships in batch 381




Processed 10000 relationships in batch 382




Processed 10000 relationships in batch 383




Processed 10000 relationships in batch 384




Processed 10000 relationships in batch 385




Processed 10000 relationships in batch 386




Processed 10000 relationships in batch 387




Processed 10000 relationships in batch 388




Processed 10000 relationships in batch 389




Processed 10000 relationships in batch 390




Processed 10000 relationships in batch 391




Processed 10000 relationships in batch 392




Processed 10000 relationships in batch 393




Processed 10000 relationships in batch 394




Processed 10000 relationships in batch 395




Processed 10000 relationships in batch 396




Processed 10000 relationships in batch 397




Processed 10000 relationships in batch 398




Processed 10000 relationships in batch 399




Processed 10000 relationships in batch 400




Processed 10000 relationships in batch 401




Processed 10000 relationships in batch 402




Processed 10000 relationships in batch 403




Processed 10000 relationships in batch 404




Processed 10000 relationships in batch 405




Processed 10000 relationships in batch 406




Processed 10000 relationships in batch 407




Processed 10000 relationships in batch 408




Processed 10000 relationships in batch 409




Processed 10000 relationships in batch 410




Processed 10000 relationships in batch 411




Processed 10000 relationships in batch 412




Processed 10000 relationships in batch 413




Processed 10000 relationships in batch 414




Processed 10000 relationships in batch 415




Processed 10000 relationships in batch 416




Processed 10000 relationships in batch 417




Processed 10000 relationships in batch 418




Processed 10000 relationships in batch 419




Processed 10000 relationships in batch 420




Processed 10000 relationships in batch 421




Processed 10000 relationships in batch 422




Processed 10000 relationships in batch 423




Processed 10000 relationships in batch 424




Processed 10000 relationships in batch 425




Processed 10000 relationships in batch 426




Processed 10000 relationships in batch 427




Processed 10000 relationships in batch 428




Processed 10000 relationships in batch 429




Processed 10000 relationships in batch 430




Processed 10000 relationships in batch 431




Processed 10000 relationships in batch 432




Processed 10000 relationships in batch 433




Processed 10000 relationships in batch 434




Processed 10000 relationships in batch 435




Processed 10000 relationships in batch 436




Processed 10000 relationships in batch 437




Processed 10000 relationships in batch 438




Processed 10000 relationships in batch 439




Processed 10000 relationships in batch 440




Processed 10000 relationships in batch 441




Processed 10000 relationships in batch 442




Processed 10000 relationships in batch 443




Processed 10000 relationships in batch 444




Processed 10000 relationships in batch 445




Processed 10000 relationships in batch 446




Processed 10000 relationships in batch 447




Processed 10000 relationships in batch 448




Processed 10000 relationships in batch 449




Processed 10000 relationships in batch 450




Processed 10000 relationships in batch 451




Processed 10000 relationships in batch 452




Processed 10000 relationships in batch 453




Processed 10000 relationships in batch 454




Processed 10000 relationships in batch 455




Processed 10000 relationships in batch 456




Processed 10000 relationships in batch 457




Processed 10000 relationships in batch 458




Processed 10000 relationships in batch 459




Processed 10000 relationships in batch 460




Processed 10000 relationships in batch 461




Processed 10000 relationships in batch 462




Processed 10000 relationships in batch 463




Processed 10000 relationships in batch 464




Processed 10000 relationships in batch 465




Processed 10000 relationships in batch 466




Processed 10000 relationships in batch 467




Processed 10000 relationships in batch 468




Processed 10000 relationships in batch 469




Processed 10000 relationships in batch 470




Processed 10000 relationships in batch 471




Processed 10000 relationships in batch 472




Processed 10000 relationships in batch 473




Processed 10000 relationships in batch 474




Processed 10000 relationships in batch 475




Processed 10000 relationships in batch 476




Processed 10000 relationships in batch 477




Processed 10000 relationships in batch 478




Processed 10000 relationships in batch 479




Processed 10000 relationships in batch 480




Processed 10000 relationships in batch 481




Processed 10000 relationships in batch 482




Processed 10000 relationships in batch 483




Processed 10000 relationships in batch 484




Processed 10000 relationships in batch 485




Processed 10000 relationships in batch 486




Processed 10000 relationships in batch 487




Processed 10000 relationships in batch 488




Processed 10000 relationships in batch 489




Processed 10000 relationships in batch 490




Processed 10000 relationships in batch 491




Processed 10000 relationships in batch 492




Processed 10000 relationships in batch 493




Processed 10000 relationships in batch 494




Processed 10000 relationships in batch 495




Processed 10000 relationships in batch 496




Processed 10000 relationships in batch 497




Processed 10000 relationships in batch 498




Processed 10000 relationships in batch 499




Processed 10000 relationships in batch 500




Processed 10000 relationships in batch 501




Processed 10000 relationships in batch 502




Processed 10000 relationships in batch 503




Processed 10000 relationships in batch 504




Processed 10000 relationships in batch 505




Processed 10000 relationships in batch 506




Processed 10000 relationships in batch 507




Processed 10000 relationships in batch 508




Processed 10000 relationships in batch 509




Processed 10000 relationships in batch 510




Processed 10000 relationships in batch 511




Processed 10000 relationships in batch 512




Processed 10000 relationships in batch 513




Processed 10000 relationships in batch 514




Processed 10000 relationships in batch 515




Processed 10000 relationships in batch 516




Processed 10000 relationships in batch 517




Processed 10000 relationships in batch 518




Processed 10000 relationships in batch 519




Processed 10000 relationships in batch 520




Processed 10000 relationships in batch 521




Processed 10000 relationships in batch 522




Processed 10000 relationships in batch 523




Processed 10000 relationships in batch 524




Processed 10000 relationships in batch 525




Processed 10000 relationships in batch 526




Processed 10000 relationships in batch 527




Processed 10000 relationships in batch 528




Processed 10000 relationships in batch 529




Processed 10000 relationships in batch 530




Processed 10000 relationships in batch 531




Processed 10000 relationships in batch 532




Processed 10000 relationships in batch 533




Processed 10000 relationships in batch 534




Processed 10000 relationships in batch 535




Processed 10000 relationships in batch 536




Processed 10000 relationships in batch 537




Processed 10000 relationships in batch 538




Processed 10000 relationships in batch 539




Processed 10000 relationships in batch 540




Processed 10000 relationships in batch 541




Processed 10000 relationships in batch 542




Processed 10000 relationships in batch 543




Processed 10000 relationships in batch 544




Processed 10000 relationships in batch 545




Processed 10000 relationships in batch 546




Processed 10000 relationships in batch 547




Processed 10000 relationships in batch 548




Processed 10000 relationships in batch 549




Processed 10000 relationships in batch 550




Processed 10000 relationships in batch 551




Processed 10000 relationships in batch 552




Processed 10000 relationships in batch 553




Processed 10000 relationships in batch 554




Processed 10000 relationships in batch 555




Processed 10000 relationships in batch 556




Processed 10000 relationships in batch 557




Processed 10000 relationships in batch 558




Processed 10000 relationships in batch 559




Processed 10000 relationships in batch 560




Processed 10000 relationships in batch 561




Processed 10000 relationships in batch 562




Processed 10000 relationships in batch 563




Processed 10000 relationships in batch 564




Processed 10000 relationships in batch 565




Processed 10000 relationships in batch 566




Processed 10000 relationships in batch 567




Processed 10000 relationships in batch 568




Processed 10000 relationships in batch 569




Processed 10000 relationships in batch 570




Processed 10000 relationships in batch 571




Processed 10000 relationships in batch 572




Processed 10000 relationships in batch 573




Processed 10000 relationships in batch 574




Processed 10000 relationships in batch 575




Processed 10000 relationships in batch 576




Processed 10000 relationships in batch 577




Processed 10000 relationships in batch 578




Processed 10000 relationships in batch 579




Processed 10000 relationships in batch 580




Processed 10000 relationships in batch 581




Processed 10000 relationships in batch 582




Processed 10000 relationships in batch 583




Processed 10000 relationships in batch 584




Processed 10000 relationships in batch 585




Processed 10000 relationships in batch 586




Processed 10000 relationships in batch 587




Processed 10000 relationships in batch 588




Processed 10000 relationships in batch 589




Processed 10000 relationships in batch 590




Processed 10000 relationships in batch 591




Processed 10000 relationships in batch 592




Processed 10000 relationships in batch 593




Processed 10000 relationships in batch 594




Processed 10000 relationships in batch 595




Processed 10000 relationships in batch 596




Processed 10000 relationships in batch 597




Processed 10000 relationships in batch 598




Processed 10000 relationships in batch 599




Processed 10000 relationships in batch 600




Processed 10000 relationships in batch 601




Processed 10000 relationships in batch 602




Processed 10000 relationships in batch 603




Processed 10000 relationships in batch 604




Processed 10000 relationships in batch 605




Processed 10000 relationships in batch 606




Processed 10000 relationships in batch 607




Processed 10000 relationships in batch 608




Processed 10000 relationships in batch 609




Processed 10000 relationships in batch 610




Processed 10000 relationships in batch 611




Processed 10000 relationships in batch 612




Processed 10000 relationships in batch 613




Processed 10000 relationships in batch 614




Processed 10000 relationships in batch 615




Processed 10000 relationships in batch 616




Processed 10000 relationships in batch 617




Processed 10000 relationships in batch 618




Processed 10000 relationships in batch 619




Processed 10000 relationships in batch 620




Processed 10000 relationships in batch 621




Processed 10000 relationships in batch 622




Processed 10000 relationships in batch 623




Processed 10000 relationships in batch 624




Processed 10000 relationships in batch 625




Processed 10000 relationships in batch 626




Processed 10000 relationships in batch 627




Processed 10000 relationships in batch 628




Processed 10000 relationships in batch 629




Processed 10000 relationships in batch 630




Processed 10000 relationships in batch 631




Processed 10000 relationships in batch 632




Processed 10000 relationships in batch 633




Processed 10000 relationships in batch 634




Processed 10000 relationships in batch 635




Processed 10000 relationships in batch 636




Processed 10000 relationships in batch 637




Processed 10000 relationships in batch 638




Processed 10000 relationships in batch 639




Processed 10000 relationships in batch 640




Processed 10000 relationships in batch 641




Processed 10000 relationships in batch 642




Processed 10000 relationships in batch 643




Processed 10000 relationships in batch 644




Processed 10000 relationships in batch 645




Processed 10000 relationships in batch 646




Processed 10000 relationships in batch 647




Processed 10000 relationships in batch 648




Processed 10000 relationships in batch 649




Processed 10000 relationships in batch 650




Processed 10000 relationships in batch 651




Processed 10000 relationships in batch 652




Processed 10000 relationships in batch 653




Processed 10000 relationships in batch 654




Processed 10000 relationships in batch 655




Processed 10000 relationships in batch 656




Processed 10000 relationships in batch 657




Processed 10000 relationships in batch 658




Processed 10000 relationships in batch 659




Processed 10000 relationships in batch 660




Processed 10000 relationships in batch 661




Processed 10000 relationships in batch 662




Processed 10000 relationships in batch 663




Processed 10000 relationships in batch 664




Processed 10000 relationships in batch 665




Processed 10000 relationships in batch 666




Processed 10000 relationships in batch 667




Processed 10000 relationships in batch 668




Processed 10000 relationships in batch 669




Processed 10000 relationships in batch 670




Processed 10000 relationships in batch 671




Processed 10000 relationships in batch 672




Processed 10000 relationships in batch 673




Processed 10000 relationships in batch 674




Processed 10000 relationships in batch 675




Processed 10000 relationships in batch 676




Processed 10000 relationships in batch 677




Processed 10000 relationships in batch 678




Processed 10000 relationships in batch 679




Processed 10000 relationships in batch 680




Processed 10000 relationships in batch 681




Processed 10000 relationships in batch 682




Processed 10000 relationships in batch 683




Processed 10000 relationships in batch 684




Processed 10000 relationships in batch 685




Processed 4835 relationships in batch 686
Connection to Neo4j closed.


In [None]:
# Sound alert: allDone() (defined earlier in the notebook) displays an autoplaying
# audio clip, so reaching this cell gives an audible cue that the cell above has completed.
allDone()

In [19]:
from neo4j import GraphDatabase

# Neo4j connection details.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks keep
# the previous local-development defaults, so behaviour is unchanged when no
# variables are set.
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")

class Neo4jConnection:
    """Wrapper around a Neo4j driver for linking NamedEntity nodes that share a term.

    Every session opened by this class targets the database named by
    ``DATABASE``.
    """

    # Database every session in this class is opened against.
    DATABASE = "DocDatabase"

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` using basic auth (``user``, ``password``)."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        print("Connected to Neo4j.")

    def close(self):
        """Close the underlying driver."""
        self.driver.close()
        print("Connection to Neo4j closed.")

    def delete_old_named_entity_relationships(self):
        """Delete all existing HAS_SHARED_NAMED_ENTITY relationships."""
        with self.driver.session(database=self.DATABASE) as session:
            # consume() waits for the query to finish, so a server-side failure
            # raises here instead of being lost after the success message.
            session.run("MATCH ()-[r:HAS_SHARED_NAMED_ENTITY]->() DELETE r").consume()
            print("Deleted all old HAS_SHARED_NAMED_ENTITY relationships.")

    def get_duplicated_named_entities(self):
        """Retrieve duplicated named entities and their counts.

        Prints one line per duplicated term (or a notice when there are none).

        Returns:
            list[dict]: one ``{"term": ..., "count": ...}`` record per term
            carried by more than one NamedEntity node, highest count first.
        """
        with self.driver.session(database=self.DATABASE) as session:
            query = """
                MATCH (n:NamedEntity)
                WHERE n.term IS NOT NULL AND n.term <> ""
                WITH n.term AS term, COUNT(*) AS count
                WHERE count > 1
                RETURN term, count
                ORDER BY count DESC
            """
            result = session.run(query)
            duplicated_named_entities = result.data()
            if duplicated_named_entities:
                print("Duplicated named entities and their counts:")
                for record in duplicated_named_entities:
                    print(f"Term: '{record['term']}', Count: {record['count']}")
            else:
                print("No duplicated named entities found.")
            return duplicated_named_entities

    def merge_duplicate_named_entities(self, batch_size=10000):
        """Link every pair of NamedEntity nodes sharing a term, in both directions.

        Pairs are processed in pages of ``batch_size``; one progress line is
        printed per page. Paging stops after the first page that comes back
        with fewer than ``batch_size`` pairs.

        Args:
            batch_size: number of node pairs handled per query (must be >= 1).

        Raises:
            ValueError: if ``batch_size`` is less than 1 (a zero-sized page
                would never advance the offset and loop forever).
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # ORDER BY makes SKIP/LIMIT paging deterministic across the separate
        # query executions; without it pairs could be skipped between pages.
        # offset/batch_size are passed as query parameters rather than being
        # formatted into the Cypher text.
        query = """
            MATCH (n1:NamedEntity), (n2:NamedEntity)
            WHERE n1.term = n2.term AND n1.term IS NOT NULL AND n1.term <> "" AND id(n1) < id(n2)
            WITH n1, n2 ORDER BY id(n1), id(n2) SKIP $offset LIMIT $batch_size
            MERGE (n1)-[:HAS_SHARED_NAMED_ENTITY]->(n2)
            MERGE (n2)-[:HAS_SHARED_NAMED_ENTITY]->(n1)
            RETURN COUNT(*) AS processed
        """
        with self.driver.session(database=self.DATABASE) as session:
            offset = 0
            while True:
                result = session.run(query, offset=offset, batch_size=batch_size)
                processed = result.single()["processed"]
                print(f"Processed {processed} relationships in batch {offset // batch_size + 1}")
                if processed < batch_size:
                    break
                offset += batch_size

if __name__ == "__main__":
    # Initialize Neo4j connection
    conn = Neo4jConnection(uri, user, password)

    try:
        # Step 1: Delete old HAS_SHARED_NAMED_ENTITY relationships
        conn.delete_old_named_entity_relationships()

        # Step 2: Get and print duplicated named entities
        duplicated_named_entities = conn.get_duplicated_named_entities()

        # Step 3: Merge duplicate named entities if any duplicates exist
        if duplicated_named_entities:
            conn.merge_duplicate_named_entities()
    finally:
        # Close the connection even when one of the steps above raised,
        # so a failed run does not leave the driver open in the kernel.
        conn.close()


Connected to Neo4j.
Deleted all old HAS_SHARED_NAMED_ENTITY relationships.
Duplicated named entities and their counts:
Term: 'account', Count: 415
Term: 'exchange', Count: 358
Term: 'clearing', Count: 339
Term: 'interest', Count: 336
Term: 'exposure', Count: 333
Term: 'group', Count: 294
Term: 'position', Count: 288
Term: 'price', Count: 257
Term: 'jurisdiction', Count: 245
Term: 'funds', Count: 239
Term: 'control', Count: 223
Term: 'collateral', Count: 216
Term: 'contract', Count: 214
Term: 'insurer', Count: 211
Term: 'security', Count: 209
Term: 'clearing house', Count: 206
Term: 'issuer', Count: 205
Term: 'liquidity', Count: 203
Term: 'agreement', Count: 182
Term: 'counterparty', Count: 181
Term: 'financial institution', Count: 175
Term: 'fund manager', Count: 163
Term: 'policy', Count: 162
Term: 'fee', Count: 145
Term: 'law', Count: 141
Term: 'principal', Count: 138
Term: 'trade', Count: 132
Term: 'balance', Count: 132
Term: 'future', Count: 131
Term: 'currency', Count: 127
Term: '



Processed 10000 relationships in batch 1




Processed 10000 relationships in batch 2




Processed 10000 relationships in batch 3




Processed 10000 relationships in batch 4




Processed 10000 relationships in batch 5




Processed 10000 relationships in batch 6




Processed 10000 relationships in batch 7




Processed 10000 relationships in batch 8




Processed 10000 relationships in batch 9




Processed 10000 relationships in batch 10




Processed 10000 relationships in batch 11




Processed 10000 relationships in batch 12




Processed 10000 relationships in batch 13




Processed 10000 relationships in batch 14




Processed 10000 relationships in batch 15




Processed 10000 relationships in batch 16




Processed 10000 relationships in batch 17




Processed 10000 relationships in batch 18




Processed 10000 relationships in batch 19




Processed 10000 relationships in batch 20




Processed 10000 relationships in batch 21




Processed 10000 relationships in batch 22




Processed 10000 relationships in batch 23




Processed 10000 relationships in batch 24




Processed 10000 relationships in batch 25




Processed 10000 relationships in batch 26




Processed 10000 relationships in batch 27




Processed 10000 relationships in batch 28




Processed 10000 relationships in batch 29




Processed 10000 relationships in batch 30




Processed 10000 relationships in batch 31




Processed 10000 relationships in batch 32




Processed 10000 relationships in batch 33




Processed 10000 relationships in batch 34




Processed 10000 relationships in batch 35




Processed 10000 relationships in batch 36




Processed 10000 relationships in batch 37




Processed 10000 relationships in batch 38




Processed 10000 relationships in batch 39




Processed 10000 relationships in batch 40




Processed 10000 relationships in batch 41




Processed 10000 relationships in batch 42




Processed 10000 relationships in batch 43




Processed 10000 relationships in batch 44




Processed 10000 relationships in batch 45




Processed 10000 relationships in batch 46




Processed 10000 relationships in batch 47




Processed 10000 relationships in batch 48




Processed 10000 relationships in batch 49




Processed 10000 relationships in batch 50




Processed 10000 relationships in batch 51




Processed 10000 relationships in batch 52




Processed 10000 relationships in batch 53




Processed 10000 relationships in batch 54




Processed 10000 relationships in batch 55




Processed 10000 relationships in batch 56




Processed 10000 relationships in batch 57




Processed 10000 relationships in batch 58




Processed 10000 relationships in batch 59




Processed 10000 relationships in batch 60




Processed 10000 relationships in batch 61




Processed 10000 relationships in batch 62




Processed 10000 relationships in batch 63




Processed 10000 relationships in batch 64




Processed 10000 relationships in batch 65




Processed 10000 relationships in batch 66




Processed 10000 relationships in batch 67




Processed 10000 relationships in batch 68




Processed 10000 relationships in batch 69




Processed 10000 relationships in batch 70




Processed 10000 relationships in batch 71




Processed 10000 relationships in batch 72




Processed 10000 relationships in batch 73




Processed 10000 relationships in batch 74




Processed 10000 relationships in batch 75




Processed 10000 relationships in batch 76




Processed 10000 relationships in batch 77




Processed 10000 relationships in batch 78




Processed 10000 relationships in batch 79




Processed 10000 relationships in batch 80




Processed 10000 relationships in batch 81




Processed 10000 relationships in batch 82




Processed 10000 relationships in batch 83




Processed 10000 relationships in batch 84




Processed 10000 relationships in batch 85




Processed 10000 relationships in batch 86




Processed 10000 relationships in batch 87




Processed 10000 relationships in batch 88




Processed 10000 relationships in batch 89
Processed 1563 relationships in batch 90
Connection to Neo4j closed.


In [25]:
# Sound alert: allDone() (defined earlier in the notebook) displays an autoplaying
# audio clip, so reaching this cell gives an audible cue that the cell above has completed.
allDone()