<a href="https://colab.research.google.com/github/RegNLP/ContextAware-Regulatory-GraphRAG-ObliQAMP/blob/main/1_build_regulatory_knowledge_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the project paths
# configured below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 📘 01_build_regulatory_knowledge_graph.ipynb
# Purpose: Build a multi-level regulatory knowledge graph from JSON documents and cross-reference data.

import os
import json
import pickle
import networkx as nx
import pandas as pd
from collections import Counter

# --- Configuration ---
# Project root on the mounted Google Drive; every other path is derived from it.
base_path = "/content/drive/MyDrive/Colab Notebooks/RIRAG-MultiPassage-NLLP"

# In order: folder holding the per-document JSON files, the cross-reference CSV,
# and the destination file for the pickled graph.
documents_folder, crossref_path, graph_output_path = (
    os.path.join(base_path, leaf)
    for leaf in ("Documents", "CrossReferenceData.csv", "graph.gpickle")
)

# Registry of the source documents that are loaded into the graph.
# Each entry is a dict with:
#   - "DocumentID":     integer id; the graph-building cell turns it into the node id "D<DocumentID>"
#   - "title":          display title stored on the document node
#   - "version"/"date": free-text metadata; empty strings for several entries
#   - "json_file_path": path of the document's passage JSON file inside documents_folder
# Entries are not sorted by DocumentID, and DocumentID 24 does not appear in the list.
documents = [
    # NOTE(review): not every entry has version/date filled in (the first one, CoBs, has neither).
    {
        "DocumentID": 3,
        "title": "CoBs",
        "version": "",
        "date": "",
        "json_file_path":  os.path.join(documents_folder, "COBs_updated.json")
    },
    {
        "DocumentID": 1,
        "title": "AML",
        "version": "VER09.211223",
        "date": "21/12/2023",
        "json_file_path":  os.path.join(documents_folder, "AML_VER09.211223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 2,
        "title": "CIB",
        "version": "VER04.030220",
        "date": "03/02/2020",
        "json_file_path":  os.path.join(documents_folder, "CIB_VER04.030220_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 4,
        "title": "FEES",
        "version": "VER16.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "FEES_VER16.181223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 5,
        "title": "FP",
        "version": "VER01.110319",
        "date": "11/03/2019",
        "json_file_path":  os.path.join(documents_folder, "FP_VER01.110319_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 6,
        "title": "FUNDS",
        "version": "VER08.040723",
        "date": "04/07/2023",
        "json_file_path":  os.path.join(documents_folder, "FUNDS_VER08.040723_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 7,
        "title": "GEN",
        "version": "VER08.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "GEN_VER08.181223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 8,
        "title": "GLO",
        "version": "VER19.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "GLO_VER19.181223.json")
    },
    {
        "DocumentID": 9,
        "title": "IFR",
        "version": "VER07.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "IFR_VER07.181223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 10,
        "title": "MIR",
        "version": "VER07.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "MIR_VER07.181223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 11,
        "title": "MKT",
        "version": "VER08.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "MKT_VER08.181223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 13,
        "title": "PRU",
        "version": "VER13.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "PRU_VER13.181223_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 12,
        "title": "PIN",
        "version": "",
        "date": "",
        "json_file_path":  os.path.join(documents_folder, "PIN.json")
    },
    {
        "DocumentID": 14,
        "title": "BRR Regulations",
        "version": "",
        "date": "December 2018",
        "json_file_path":  os.path.join(documents_folder, "BRR Regulations (December 2018)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 15,
        "title": "CRS Regulations 2017",
        "version": "(Consolidated October 2023) v6",
        "date": "01/01/2017",
        "json_file_path":  os.path.join(documents_folder, "CRS Regulations 2017 (Consolidated_October 2023) v6_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 16,
        "title": "Foreign Tax Account Compliance Regulations 2022",
        "version": "",
        "date": "01/01/2022",
        "json_file_path":  os.path.join(documents_folder, "Foreign Tax Account Compliance Regulations 2022_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 17,
        "title": "FSMR (Consolidated December 2023)",
        "version": "",
        "date": "December 2023",
        "json_file_path":  os.path.join(documents_folder, "FSMR (Consolidated_December 2023)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 18,
        "title": "Guidance – Regulatory Framework for Fund Managers of Venture Capital Funds",
        "version": "VER03.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "Guidance – Regulatory Framework for Fund Managers of Venture Capital Funds (VER03.181223)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 19,
        "title": "Guidance - Virtual Asset Activities in ADGM",
        "version": "VER05.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Virtual Asset Activities in ADGM (VER05.181223)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 20,
        "title": "ADGM Guidance - Application of English Laws",
        "version": "",
        "date": "",
        "json_file_path":  os.path.join(documents_folder, "ADGM_Guidance_-_Application_of_English_Laws_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 21,
        "title": "API - Guidance Note",
        "version": "",
        "date": "14/10/2019",
        "json_file_path":  os.path.join(documents_folder, "API - Guidance Note_Final 14 October 2019 Eng_obligations_named_entities_defined_terms_modified.json")
    },
    {
        "DocumentID": 22,
        "title": "CMC",
        "version": "VER03.270922",
        "date": "27/09/2022",
        "json_file_path":  os.path.join(documents_folder, "CMC_VER03.270922_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 23,
        "title": "CONF",
        "version": "VER03.18042019",
        "date": "18/04/2019",
        "json_file_path":  os.path.join(documents_folder, "CONF_VER03.18042019_obligations_named_entities_defined_terms.json")
    },
        {
        "DocumentID": 25,
        "title": "Environmental Social and Governance Disclosures Guidance",
        "version": "VER01.040723",
        "date": "04/07/2023",
        "json_file_path":  os.path.join(documents_folder, "Environmental Social and Governance Disclosures Guidance_VER01.040723_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 26,
        "title": "FinTech RegLab Guidance",
        "version": "VER01.31082016",
        "date": "31/08/2016",
        "json_file_path":  os.path.join(documents_folder, "FinTech RegLab Guidance_VER01.31082016_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 27,
        "title": "GPM",
        "version": "VER03.120623",
        "date": "12/06/2023",
        "json_file_path":  os.path.join(documents_folder, "GPM VER03.120623_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 28,
        "title": "Guidance - Continuous Disclosure",
        "version": "VER01.280922",
        "date": "28/09/2022",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Continuous Disclosure_VER01.280922_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 29,
        "title": "Guidance - Digital Securities Offerings and Virtual  Assets under the Financial Services and Markets Regulations",
        "version": "240220",
        "date": "24/02/2020",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Digital Securities Offerings and Virtual  Assets under the Financial Services and Markets Regulations_240220_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 30,
        "title": "Guidance - Disclosure Requirements for Mining Reporting Entities",
        "version": "VER01.280922",
        "date": "28/09/2022",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Disclosure Requirements for Mining Reporting Entities_VER01.280922_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 31,
        "title": "Guidance - Disclosure Requirements for Petroleum Reporting Entities",
        "version": "VER01.280922",
        "date": "28/09/2022",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Disclosure Requirements for Petroleum Reporting Entities_VER01.280922_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 32,
        "title": "Guidance - Private Credit Funds",
        "version": "VER01.040523",
        "date": "04/05/2023",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Private Credit Funds_VER01.040523_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 33,
        "title": "Guidance - Regulation of Digital Securities Activities in ADGM",
        "version": "",
        "date": "24/02/2024",
        "json_file_path":  os.path.join(documents_folder, "Guidance  Regulation of Digital Securities Activities in ADGM_240224_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 34,
        "title": "Guidance - Regulation of Spot Commodities Activities in ADGM",
        "version": "VER02.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "Guidance - Regulation of Spot Commodities Activities in ADGM (VER02.181223)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 35,
        "title": "Guidance - Regulatory Framework for PFP and Multilateral Trading Facilities dealing with Private Capital Markets",
        "version": "VER02.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "Guidance_Regulatory Framework for PFP and Multilateral Trading Facilities dealing with Private Capital Markets (VER02.181223)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 36,
        "title": "SFWG Guidance on Principles for the Effective Management of Climate-related Financial Risks",
        "version": "",
        "date": "",
        "json_file_path":  os.path.join(documents_folder, "SFWG_Guidance on Principles for the Effective Management of Climate-related Financial Risks_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 37,
        "title": "Supplementary Guidance - Authorisation of Digital Investment Management (Robo-advisory) Activities",
        "version": "",
        "date": "",
        "json_file_path":  os.path.join(documents_folder, "Supplementary Guidance  Authorisation of Digital Investment Management (Robo-advisory) Activities_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 38,
        "title": "Supplementary Guidance OTCLPs",
        "version": "VER02.181223",
        "date": "18/12/2023",
        "json_file_path":  os.path.join(documents_folder, "Supplementary Guidance OTCLPs (VER02.181223)_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 39,
        "title": "Sustainable Finance Supplementary Guidance",
        "version": "VER01.040723",
        "date": "04/07/2023",
        "json_file_path":  os.path.join(documents_folder, "Sustainable Finance Supplementary Guidance_VER01.040723_obligations_named_entities_defined_terms.json")
    },
    {
        "DocumentID": 40,
        "title": "UAE CRS Guidance Notes",
        "version": "June 2020",
        "date": "17/06/2020",
        "json_file_path":  os.path.join(documents_folder, "UAE_CRS_Guidance_Notes_17 June 2020 (002)_obligations_named_entities_defined_terms.json")
    }
]

In [None]:
# --- Helpers ---
def _parse_term_entry(entry):
    """Normalise one NamedEntities / DefinedTerms entry to a ``(term, description)`` pair.

    Entries are either dicts (term under "Term", "ContextID" or "ID"; description
    under "Description" or "Meaning") or bare strings (no description).

    Returns:
        ``(term, description)``, or ``None`` when the entry has an unsupported
        type or no non-empty term, so the caller can skip it.
    """
    if isinstance(entry, dict):
        term = entry.get("Term") or entry.get("ContextID") or entry.get("ID")
        desc = entry.get("Description") or entry.get("Meaning", "")
    elif isinstance(entry, str):
        term, desc = entry, ""
    else:
        return None
    if not term:
        return None
    return term, desc


def _add_typed_edge(graph, source, target, edge_type):
    """Add ``source -> target`` with ``type=edge_type``.

    A DiGraph keeps a single edge per ordered node pair, so adding an edge that
    already exists only updates its attributes. Returns ``True`` when an existing
    edge of a *different* type was overwritten, so the caller can report it.
    """
    replaced = (
        graph.has_edge(source, target)
        and graph[source][target].get("type") != edge_type
    )
    graph.add_edge(source, target, type=edge_type)
    return replaced


# --- Initialize Graph ---
G = nx.DiGraph()

# --- Add Documents and Passages ---
for doc in documents:
    doc_id = doc["DocumentID"]
    doc_node_id = f"D{doc_id}"
    G.add_node(doc_node_id, type="Document", title=doc["title"])

    print(f"\n📄 Processing {doc['title']}")

    with open(doc["json_file_path"], "r", encoding="utf-8") as f:
        passages = json.load(f)

    # Maps the normalised dotted passage number -> passage node id, for this document only.
    # If two passages share a number, the later one wins for hierarchy purposes.
    passages_by_id = {}
    skipped_passages = 0

    for p in passages:
        raw_id = p.get("PassageID") or p.get("ID") or p.get("ContextID")
        uid = p.get("ID") or p.get("ContextID")
        # A passage needs a unique id to become a node. Without this guard a passage that
        # only has "PassageID" would be inserted under the node id None, and every such
        # passage would collapse into that single node.
        if not raw_id or not uid:
            skipped_passages += 1
            continue

        # Node ids are always strings so that the cross-reference lookup below
        # (which compares against stringified CSV values) can match them.
        uid = str(uid)
        # Collapse any run of dots and drop leading/trailing dots ("1..2." -> "1.2").
        passage_id = ".".join(part for part in str(raw_id).split(".") if part)
        text = p.get("Text") or p.get("Passage") or "[EMPTY TEXT]"

        G.add_node(uid, type="Passage", passage_id=passage_id, document_id=doc_id, text=text)
        passages_by_id[passage_id] = uid
        G.add_edge(doc_node_id, uid, type="CONTAINS")

        # Named Entities (`or []` also covers an explicit JSON null)
        for ent in p.get("NamedEntities") or []:
            parsed = _parse_term_entry(ent)
            if parsed is None:
                continue
            term, desc = parsed
            ne_uid = f"NE_{term}_{uid}"
            G.add_node(ne_uid, type="NamedEntity", term=term, description=desc)
            G.add_edge(uid, ne_uid, type="MENTIONS")

        # Defined Terms (`or []` also covers an explicit JSON null)
        for entry in p.get("DefinedTerms") or []:
            parsed = _parse_term_entry(entry)
            if parsed is None:
                continue
            t, desc = parsed
            dt_uid = f"DT_{t}_{uid}"
            G.add_node(dt_uid, type="DefinedTerm", term=t, description=desc)
            G.add_edge(uid, dt_uid, type="USES_TERM")

    if skipped_passages:
        print(f"⚠️ Skipped {skipped_passages} passage(s) without a usable ID")

    # --- Parent Hierarchy ---
    # For each passage, walk up its dotted number ("1.2.3" -> "1.2" -> "1") and link it
    # to the nearest ancestor that exists in this document.
    for passage_id, uid in passages_by_id.items():
        parts = passage_id.split('.')
        for depth in range(len(parts) - 1, 0, -1):
            parent_uid = passages_by_id.get('.'.join(parts[:depth]))
            if parent_uid:
                G.add_edge(parent_uid, uid, type="PARENT_OF")
                break

# --- Add Cross-References ---
# Loading is best-effort: if the CSV cannot be read, the graph is still saved without
# CITES / CITED_BY edges. Only the read is guarded, so errors in the edge-building
# logic itself are not hidden.
crossref_df = None
try:
    crossref_df = pd.read_csv(crossref_path)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
    print(f"⚠️ Error loading cross-reference file: {e}")

if crossref_df is not None:
    missing_columns = {"SourceID", "TargetID"} - set(crossref_df.columns)
    if missing_columns:
        print(f"⚠️ Error loading cross-reference file: missing column(s) {sorted(missing_columns)}")
    else:
        linked_rows = 0
        replaced_edges = 0
        # Column-wise astype(str) keeps each column's own dtype; iterrows() would build a
        # mixed-dtype row Series and could turn an integer id into "12.0".
        id_pairs = zip(
            crossref_df["SourceID"].astype(str),
            crossref_df["TargetID"].astype(str),
        )
        for src_uid, tgt_uid in id_pairs:
            if src_uid in G.nodes and tgt_uid in G.nodes:
                replaced_edges += _add_typed_edge(G, src_uid, tgt_uid, "CITES")
                replaced_edges += _add_typed_edge(G, tgt_uid, src_uid, "CITED_BY")
                linked_rows += 1
        print(f"\n🔗 Cross-reference edges added for {linked_rows} of {len(crossref_df)} rows.")
        if replaced_edges:
            # NOTE(review): with a DiGraph, mutual citations and citations between a parent
            # and its child overwrite the previous edge type. Switching to MultiDiGraph would
            # keep both, but changes the pickled graph's API for downstream notebooks.
            print(f"⚠️ {replaced_edges} existing edge(s) had their type overwritten by cross-reference edges.")

# --- Save Graph ---
with open(graph_output_path, "wb") as f:
    pickle.dump(G, f)
print(f"\n✅ Graph saved to: {graph_output_path}")

# --- Summary ---
print("\n📊 Graph Summary")
print(f"🔹 Nodes: {G.number_of_nodes()}")
print(f"🔹 Edges: {G.number_of_edges()}")
node_types = Counter(nx.get_node_attributes(G, "type").values())
for t, count in node_types.items():
    print(f"   - {t}: {count}")



📄 Processing CoBs

📄 Processing AML

📄 Processing CIB

📄 Processing FEES

📄 Processing FP

📄 Processing FUNDS

📄 Processing GEN

📄 Processing GLO

📄 Processing IFR

📄 Processing MIR

📄 Processing MKT

📄 Processing PRU

📄 Processing PIN

📄 Processing BRR Regulations

📄 Processing CRS Regulations 2017

📄 Processing Foreign Tax Account Compliance Regulations 2022

📄 Processing FSMR (Consolidated December 2023)

📄 Processing Guidance – Regulatory Framework for Fund Managers of Venture Capital Funds

📄 Processing Guidance - Virtual Asset Activities in ADGM

📄 Processing ADGM Guidance - Application of English Laws

📄 Processing API - Guidance Note

📄 Processing CMC

📄 Processing CONF

📄 Processing Environmental Social and Governance Disclosures Guidance

📄 Processing FinTech RegLab Guidance

📄 Processing GPM

📄 Processing Guidance - Continuous Disclosure

📄 Processing Guidance - Digital Securities Offerings and Virtual  Assets under the Financial Services and Markets Regulations

📄 Processi

In [None]:
# Recap of the graph statistics. Relies on `G`, `nx` and `Counter` from the
# import and graph-building cells above (no imports are repeated here).
print("📊 Graph Summary")
print(f"🔹 Number of nodes: {G.number_of_nodes()}")
print(f"🔹 Number of edges: {G.number_of_edges()}")

# Optional: count node types (only nodes that carry a "type" attribute are counted)
node_types = Counter(nx.get_node_attributes(G, "type").values())
print("🔸 Node types count:")
for t, count in node_types.items():
    print(f"   - {t}: {count}")

📊 Graph Summary
🔹 Number of nodes: 46677
🔹 Number of edges: 59058
🔸 Node types count:
   - Document: 39
   - Passage: 13729
   - NamedEntity: 9730
   - DefinedTerm: 23179
