In [1]:
import pickle


# Load the version-2 graph. The file is opened in a context manager so the
# handle is closed as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('AWPR Version 2_text_graph.pickle', 'rb') as graph_file:
    G = pickle.load(graph_file)

In [5]:
import os
import requests

from google.oauth2 import service_account
from google.auth.transport.requests import Request
from langchain_core.callbacks import CallbackManagerForLLMRun
from typing import Any, Optional, ClassVar
from langchain_core.language_models.llms import LLM


# Route outbound HTTP and HTTPS traffic through the DB network proxy by
# setting both proxy environment variables (existing values are overwritten).
os.environ.update({
    'HTTP_PROXY': 'http://sp-surf-proxy.intranet.db.com:8080',
    'HTTPS_PROXY': 'http://sp-surf-proxy.intranet.db.com:8080',
})


class VertexAILangchainLLM(LLM):
    """
    LangChain LLM that calls the Vertex AI ``generateContent`` REST endpoint
    directly over HTTP, authenticating with a service-account key file.

    Project, location, model name and key-file path are fixed class-level
    settings; only the sampling temperature is configurable per instance.
    """
    project_id: ClassVar[str] = "db-dev-ny3a-flare-dev-1"
    location: ClassVar[str] = "europe-west3"
    model_name: ClassVar[str] = "gemini-1.5-pro-002"
    credentials_path: ClassVar[str] = 'keyfile_new.json'
    # Proxy mapping read from the environment when the class is defined.
    # Annotated as ClassVar so pydantic does not treat it as a model field.
    # It is not passed to requests.post below; requests reads the proxy
    # environment variables itself.
    proxies: ClassVar[dict] = (
        {'http': os.getenv('HTTP_PROXY'), 'https': os.getenv('HTTPS_PROXY')}
        if os.getenv('HTTP_PROXY') else {}
    )
    # Seconds to wait on the HTTP call before giving up instead of hanging.
    request_timeout: ClassVar[int] = 300

    temperature: float = 0
    # Holds the google.oauth2 service-account Credentials set by authenticate().
    credentials: Any = None

    def __init__(self, model_params: dict):
        """
        Initialize the VertexAILangchainLLM and load credentials.

        Args:
            model_params: A dictionary of model parameters. Only
                ``temperature`` is read; when it is absent the class default
                (0) is kept. The dictionary is never modified.

        Raises:
            ValueError: If no credentials path is available (see authenticate).
        """
        super().__init__()
        # NOTE(review): the previous code wrote 0.5 into the caller's dict when
        # 'temperature' was missing but never applied it, so the effective
        # default has always been 0. That behaviour is kept here.
        params = model_params or {}
        if 'temperature' in params:
            self.temperature = params['temperature']

        self.authenticate()

    @property
    def _llm_type(self) -> str:
        return "custom"

    def authenticate(self):
        """
        Load service-account credentials for Vertex AI.

        Exports ``credentials_path`` as ``GOOGLE_APPLICATION_CREDENTIALS``
        (when set) and builds cloud-platform-scoped credentials from that
        file, storing them on ``self.credentials``.

        Raises:
            ValueError: If ``GOOGLE_APPLICATION_CREDENTIALS`` is not set.
        """
        if self.credentials_path:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_path

        key_file = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
        if not key_file:
            raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not set")

        self.credentials = service_account.Credentials.from_service_account_file(
            key_file,
            scopes=['https://www.googleapis.com/auth/cloud-platform']
        )

    def _call(self,
              prompt: str,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any
              ) -> str:
        """
        Send ``prompt`` as a single user turn and return the generated text.

        Args:
            prompt: The text to send to the model.
            run_manager: Unused; accepted for LangChain compatibility.
            **kwargs: Unused; accepted for LangChain compatibility.

        Returns:
            The text of the first part of the first candidate.

        Raises:
            requests.exceptions.RequestException: On transport failures,
                including timeouts.
            RuntimeError: On a non-200 response or a response without text.
        """
        try:
            # Refresh the access token when it is missing or expired.
            if not self.credentials.valid:
                self.credentials.refresh(Request())

            url = f"https://{self.location}-aiplatform.googleapis.com/v1/projects/{self.project_id}/locations/{self.location}/publishers/google/models/{self.model_name}:generateContent"

            headers = {
                "Authorization": f"Bearer {self.credentials.token}",
                "Content-Type": "application/json",
                "x-goog-request-params": "vpc-sc-bypass=true",
                "x-goog-user-project": self.project_id,
                "x-goog-vpc-service-controls": "true"
            }

            data = {
                "contents": [{
                    "role": "user",
                    "parts": [{
                        "text": prompt
                    }]
                }],
                "generationConfig": {
                    "temperature": self.temperature
                }
            }

            response = requests.post(
                url, headers=headers, json=data, timeout=self.request_timeout
            )

            if response.status_code != 200:
                error_message = response.text
                print(f"Full error: {error_message}")
                if "VPC_SERVICE_CONTROLS" in error_message:
                    raise RuntimeError("Please ensure you're connected to DB VPN")
                if "PERMISSION_DENIED" in error_message:
                    raise RuntimeError("Check service account permissions")
                raise RuntimeError(f"API Error: {error_message}")

            candidates = response.json().get('candidates') or []
            try:
                return candidates[0]['content']['parts'][0]['text']
            except (IndexError, KeyError, TypeError):
                # A candidate can lack content/parts/text; report that
                # uniformly instead of leaking a bare KeyError.
                raise RuntimeError("No text found in response") from None

        except requests.exceptions.RequestException as e:
            print(f"Error making API request: {str(e)}")
            raise
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            raise

In [41]:
def get_prompt():
    """
    Return the prompt template for summarising a graph diff as two tables.

    The result is a format-style template with a single placeholder,
    ``{difference_json}``. Every literal brace in the embedded JSON example
    is doubled so that ``str.format`` (and f-string style prompt templates)
    render it as a literal brace instead of parsing it as a placeholder.
    """
    return """Generate a detailed summary of the differences between two versions of a graph, presented in a tabular format. The input is a JSON object that outlines the changes in nodes and edges.

        Here is the JSON input to process:
        -------------------
        {difference_json}
        

        The JSON input will be in the following format:

        {{
        "nodes_diff": {{ // Differences in nodes
            "added_in_v2": [ // Nodes added in version 2
            {{
                "label": "Node Label",
                "description": "Node Description"
                // ... other attributes
            }}
            ],
            "removed_from_v1": [ // Nodes removed from version 1
            {{
                "label": "Node Label",
                "description": "Node Description"
                // ... other attributes
            }}
            ],
            "modified": [ // Nodes modified between versions
            {{
                "label": "Node Label",
                "v1_description": "Version 1 Description",
                "v2_description": "Version 2 Description"
                // ... other changed attributes like "other_changed_attributes": {{"attribute_name": {{"v1_value": "...", "v2_value": "..."}}}}
            }}
            ]
        }},
        "edges_diff": {{ // Differences in edges
            "added_in_v2": [ // Edges added in version 2
            {{
                "source_label": "Source Node Label",
                "target_label": "Target Node Label",
                "relationship_label": "Relationship Label",
                "details": "Edge Details"
                // ... other attributes
            }}
            ],
            "removed_from_v1": [ // Edges removed from version 1
            {{
                "source_label": "Source Node Label",
                "target_label": "Target Node Label",
                "relationship_label": "Relationship Label",
                "details": "Edge Details"
                // ... other attributes
            }}
            ],
            "modified": [ // Edges modified between versions
            {{
                "source_label": "Source Node Label",
                "target_label": "Target Node Label",
                "v1_relationship_label": "Version 1 Relationship",
                "v2_relationship_label": "Version 2 Relationship",
                "v1_details": "Version 1 Details",
                "v2_details": "Version 2 Details"
                // ... other changed attributes
            }}
            ]
        }}
        }}

        Your task is to process this JSON and produce two tables: one for node differences and one for edge differences.

        **Node Differences Table:**
        The table should have the following columns:
        - "Change Type" (e.g., Added, Removed, Modified)
        - "Label"
        - "Attribute Changed" (e.g., Description, or specific attribute from `other_changed_attributes`)
        - "Value in v1" (empty if Added)
        - "Value in v2" (empty if Removed)

        **Edge Differences Table:**
        The table should have the following columns:
        - "Change Type" (e.g., Added, Removed, Modified)
        - "Source Label"
        - "Target Label"
        - "Attribute Changed" (e.g., Relationship Label, Details)
        - "Value in v1" (empty if Added)
        - "Value in v2" (empty if Removed)

        For modified items, clearly show the old (v1) and new (v2) values for each changed attribute. If an item has multiple attributes changed (e.g., both description and another property for a node, or both relationship label and details for an edge), list each changed attribute as a separate row or clearly delineate them for that item.



        Please provide the summary as two distinct markdown tables.
        """


In [42]:
import networkx as nx
import pickle
import json
import PyPDF2
import re
import json 
from langchain.prompts.chat import ChatPromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate


def create_diff_report(graph_difference):
    """
    Ask the Vertex AI model to summarise a graph diff as markdown tables.

    Args:
        graph_difference: The diff payload substituted for
            ``{difference_json}`` in the template from ``get_prompt()``.

    Returns:
        The model's response text, or ``None`` when the model call fails
        (the error is printed rather than raised).
    """
    print("--- Sending to Vertex AI (Placeholder) ---")

    human_template = HumanMessagePromptTemplate.from_template(get_prompt())
    chat_prompt = ChatPromptTemplate.from_messages([human_template])
    request = chat_prompt.format_prompt(difference_json=graph_difference).to_messages()

    request_dicts = [{"role": msg.type, "content": msg.content} for msg in request]
    print(f"request_dicts: {request_dicts}")

    # Send the rendered message text itself. The previous code sent
    # str(request_dicts), i.e. a Python repr with escaped newlines and dict
    # syntax wrapped around the actual instructions.
    prompt_text = "\n\n".join(msg.content for msg in request)

    llm = VertexAILangchainLLM({})
    try:
        response = llm.invoke(prompt_text)
    except Exception as e:
        print(f"Some error occurred: {e}")
        return None

    print(response)
    return response

def get_entity_signature(node_id, node_attributes):
    """
    Build the key used to match a node across the two graph versions.

    When the attributes carry a 'label' that is not None, the signature is
    ``("label", str(label))``; otherwise it falls back to ``("id", node_id)``
    so the raw node id acts as the key.
    """
    node_label = node_attributes.get('label')
    return ("id", node_id) if node_label is None else ("label", str(node_label))

# --- Helper: Get Node Display Name ---
def get_node_display_name(graph: nx.MultiDiGraph, node_id):
    """
    Return a human-readable name for ``node_id``.

    Uses the node's 'label' attribute when the node exists in ``graph`` and
    has one; in every other case the stringified node id is returned.
    """
    fallback_name = str(node_id)
    if not graph.has_node(node_id):
        return fallback_name
    node_attrs = graph.nodes[node_id]
    return node_attrs.get('label', fallback_name)


def compare_graphs_by_page_number_with_signatures(g1: nx.MultiDiGraph, g2: nx.MultiDiGraph):
    """
    Compare two graphs page by page, matching nodes by entity signature.

    Page numbers are taken from the 'page_numbers' attribute of every node
    and edge in both graphs. For each page:

    - nodes on the page are matched across graphs with
      ``get_entity_signature`` and their attributes (other than
      'page_numbers') are compared;
    - each g1 edge on the page is matched to the g2 edge that joins the
      signature-mapped endpoints under the same edge key, and the edge
      attributes (other than 'page_numbers') are compared;
    - g2 edges on the page that no g1 edge matched are reported as g2-only.

    Category checking has been removed.

    Args:
        g1: The first (version 1) graph.
        g2: The second (version 2) graph.

    Returns:
        A dict keyed by page number. Each value is a report dict with the
        keys 'page_number', 'status' and six lists of node/edge differences.
        Pages present in both graphs with no differences are omitted.
    """
    differences_by_page = {}

    # Collect every page number referenced by any node or edge in either graph.
    all_page_numbers = set()
    for _, data in g1.nodes(data=True): all_page_numbers.update(data.get('page_numbers', []))
    for _, data in g2.nodes(data=True): all_page_numbers.update(data.get('page_numbers', []))
    for _, _, data in g1.edges(data=True): all_page_numbers.update(data.get('page_numbers', []))
    for _, _, data in g2.edges(data=True): all_page_numbers.update(data.get('page_numbers', []))

    # Sort key is (is_digit_string, value): values that are not digit strings
    # come first, then digit strings ordered by their integer value.
    # NOTE(review): a mix of ints and non-digit strings would compare int with
    # str here and raise TypeError - confirm page numbers are homogeneous.
    sorted_page_numbers = sorted(list(all_page_numbers), key=lambda x: (isinstance(x, str) and x.isdigit(), int(x) if isinstance(x, str) and x.isdigit() else x))

    for page_num_str in sorted_page_numbers:
        page_report = {
            "page_number": page_num_str, "status": "",
            "nodes_on_page_only_in_g1_by_signature": [],
            "nodes_on_page_only_in_g2_by_signature": [],
            "nodes_on_page_attributes_diff": [],
            "edges_on_page_only_in_g1": [],
            "edges_on_page_only_in_g2": [],
            "edges_on_page_attributes_diff": []
        }

        # Node ids whose 'page_numbers' attribute contains this page.
        g1_nodes_on_page_ids = {n for n, d in g1.nodes(data=True) if page_num_str in d.get('page_numbers', [])}
        g2_nodes_on_page_ids = {n for n, d in g2.nodes(data=True) if page_num_str in d.get('page_numbers', [])}

        # Signature -> node id, restricted to nodes on this page. If two nodes
        # on the page share a signature, the later one overwrites the earlier.
        sig_to_g1_id = {get_entity_signature(n, g1.nodes[n]): n for n in g1_nodes_on_page_ids}
        sig_to_g2_id = {get_entity_signature(n, g2.nodes[n]): n for n in g2_nodes_on_page_ids}

        g1_sigs_on_page = set(sig_to_g1_id.keys())
        g2_sigs_on_page = set(sig_to_g2_id.keys())

        # A page counts as present in a graph when any node or edge carries it.
        page_in_g1 = bool(g1_nodes_on_page_ids or any(page_num_str in d.get('page_numbers', []) for _,_,d in g1.edges(data=True)))
        page_in_g2 = bool(g2_nodes_on_page_ids or any(page_num_str in d.get('page_numbers', []) for _,_,d in g2.edges(data=True)))

        if page_in_g1 and page_in_g2:
            page_report["status"] = "page_content_compared"

            # Node differences based on signatures
            for sig in g1_sigs_on_page - g2_sigs_on_page:
                node_id_g1 = sig_to_g1_id[sig]
                page_report["nodes_on_page_only_in_g1_by_signature"].append(get_node_display_name(g1, node_id_g1))
            for sig in g2_sigs_on_page - g1_sigs_on_page:
                node_id_g2 = sig_to_g2_id[sig]
                page_report["nodes_on_page_only_in_g2_by_signature"].append(get_node_display_name(g2, node_id_g2))

            # Attribute differences for common nodes (by signature)
            common_node_sigs = g1_sigs_on_page & g2_sigs_on_page
            for sig in common_node_sigs:
                id1, id2 = sig_to_g1_id[sig], sig_to_g2_id[sig]
                attrs1, attrs2 = g1.nodes[id1], g2.nodes[id2]
                node_attr_diffs = {}
                all_keys = set(attrs1.keys()) | set(attrs2.keys())
                # Exclude 'page_numbers' from attribute comparison
                rel_keys = [k for k in all_keys if k != 'page_numbers']
                for key in rel_keys:
                    # An attribute missing on one side is compared as None.
                    v1, v2 = attrs1.get(key), attrs2.get(key)
                    if v1 != v2: node_attr_diffs[key] = {"g1_value": v1, "g2_value": v2}
                if node_attr_diffs:
                    page_report["nodes_on_page_attributes_diff"].append({
                        "node_signature": str(sig),
                        "g1_node_id": id1, "g2_node_id": id2,
                        "g1_display_name": get_node_display_name(g1,id1),
                        "g2_display_name": get_node_display_name(g2,id2),
                        "differences": node_attr_diffs
                    })

            # Edge differences using signature-mapped nodes
            g1_edges_on_page_details = []
            for u,v,k,d in g1.edges(keys=True, data=True):
                if page_num_str in d.get('page_numbers', []):
                    g1_edges_on_page_details.append( (u,v,k,d) )
            
            g2_edges_on_page_raw = set()
            for u,v,k,d in g2.edges(keys=True, data=True):
                 if page_num_str in d.get('page_numbers', []):
                    g2_edges_on_page_raw.add( (u,v,k) )

            # g2 edges (u, v, key) that some g1 edge on this page matched.
            processed_g2_edges = set()

            for u1, v1, k1, data1 in g1_edges_on_page_details:
                sig_u1 = get_entity_signature(u1, g1.nodes[u1])
                sig_v1 = get_entity_signature(v1, g1.nodes[v1])

                # The lookup only covers g2 nodes on this page, so an endpoint
                # that exists in g2 but is not tagged with this page maps to None.
                u2_mapped_id = sig_to_g2_id.get(sig_u1)
                v2_mapped_id = sig_to_g2_id.get(sig_v1)

                edge_in_g1_display = (get_node_display_name(g1, u1), get_node_display_name(g1, v1), k1)
                
                if u2_mapped_id is not None and v2_mapped_id is not None:
                    # The g2 edge must use the same edge key as the g1 edge.
                    if g2.has_edge(u2_mapped_id, v2_mapped_id, k1):
                        processed_g2_edges.add((u2_mapped_id, v2_mapped_id, k1))
                        data2 = g2.get_edge_data(u2_mapped_id, v2_mapped_id, k1)
                        edge_attr_diffs = {}
                        all_edge_keys = set(data1.keys()) | set(data2.keys())
                        # Exclude 'page_numbers' from attribute comparison
                        rel_edge_keys = [ek for ek in all_edge_keys if ek != 'page_numbers']
                        for ek in rel_edge_keys:
                            ev1, ev2 = data1.get(ek), data2.get(ek)
                            if ev1 != ev2: edge_attr_diffs[ek] = {"g1_value": ev1, "g2_value": ev2}
                        if edge_attr_diffs:
                            page_report["edges_on_page_attributes_diff"].append({
                                "edge_g1": edge_in_g1_display,
                                "edge_g2": (get_node_display_name(g2, u2_mapped_id), get_node_display_name(g2, v2_mapped_id), k1),
                                "differences": edge_attr_diffs
                            })
                    else:
                        # Both endpoints exist on this page in g2, but no edge
                        # with this key joins them there.
                        page_report["edges_on_page_only_in_g1"].append({
                            "edge": edge_in_g1_display, 
                            "attributes": {key: val for key, val in data1.items() if key != 'page_numbers'}
                        })
                else:
                    # At least one endpoint has no signature match on this page in g2.
                    page_report["edges_on_page_only_in_g1"].append({
                        "edge": edge_in_g1_display, 
                        "attributes": {key: val for key, val in data1.items() if key != 'page_numbers'},
                        "reason": "endpoint_signature_not_found_in_g2"
                    })
            
            # Any g2 edge on this page that was not matched above is g2-only.
            for u2, v2, k2 in g2_edges_on_page_raw:
                if (u2, v2, k2) not in processed_g2_edges:
                    data2 = g2.get_edge_data(u2,v2,k2)
                    page_report["edges_on_page_only_in_g2"].append({
                        "edge": (get_node_display_name(g2, u2), get_node_display_name(g2, v2), k2),
                        "attributes": {key: val for key, val in data2.items() if key != 'page_numbers'}
                    })

        elif page_in_g1:
            # Page exists only in g1: report all of its nodes and edges as g1-only.
            page_report["status"] = "content_on_page_only_in_g1"
            for node_id_g1 in g1_nodes_on_page_ids:
                 page_report["nodes_on_page_only_in_g1_by_signature"].append(get_node_display_name(g1, node_id_g1))
            for u,v,k,d_edge in g1.edges(keys=True, data=True):
                if page_num_str in d_edge.get('page_numbers', []):
                    page_report["edges_on_page_only_in_g1"].append({"edge": (get_node_display_name(g1,u), get_node_display_name(g1,v), k), "attributes": {key: val for key, val in d_edge.items() if key != 'page_numbers'}})

        elif page_in_g2:
            # Page exists only in g2: report all of its nodes and edges as g2-only.
            page_report["status"] = "content_on_page_only_in_g2"
            for node_id_g2 in g2_nodes_on_page_ids:
                 page_report["nodes_on_page_only_in_g2_by_signature"].append(get_node_display_name(g2, node_id_g2))
            for u,v,k,d_edge in g2.edges(keys=True, data=True):
                if page_num_str in d_edge.get('page_numbers', []):
                    page_report["edges_on_page_only_in_g2"].append({"edge": (get_node_display_name(g2,u), get_node_display_name(g2,v), k), "attributes": {key: val for key, val in d_edge.items() if key != 'page_numbers'}})
        else:
            page_report["status"] = "no_elements_on_page_in_either_graph"

        # True when any of the six difference lists is non-empty.
        has_element_diffs = any([
            page_report["nodes_on_page_only_in_g1_by_signature"],
            page_report["nodes_on_page_only_in_g2_by_signature"],
            page_report["nodes_on_page_attributes_diff"],
            page_report["edges_on_page_only_in_g1"],
            page_report["edges_on_page_only_in_g2"],
            page_report["edges_on_page_attributes_diff"]
        ])

        if page_report["status"] == "page_content_compared" and not has_element_diffs:
            page_report["status"] = "no_substantive_diffs_found_on_page"

        # Only add to report if there's a meaningful status or differences
        if page_report["status"] not in ["", "no_elements_on_page_in_either_graph"] or has_element_diffs :
             # The inner check drops pages whose status was downgraded to
             # "no_substantive_diffs_found_on_page" just above.
             if not (page_report["status"] == "no_substantive_diffs_found_on_page" and not has_element_diffs) : # Avoid reporting "no diffs" if it truly has no diffs
                differences_by_page[page_num_str] = page_report
            
    return differences_by_page


if __name__ == "__main__":
    # Load both graph versions, compare them page by page, and print a
    # plain-text report of the differences.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    try:
        with open('AWPR Version 1_text_graph.pickle', 'rb') as f1, \
                open('AWPR Version 2_text_graph.pickle', 'rb') as f2:
            graph1_data = pickle.load(f1)
            graph2_data = pickle.load(f2)
    except FileNotFoundError:
        print("Error: One or both pickle files not found. Please ensure the paths are correct.")
        raise SystemExit(1)
    except Exception as e:
        print(f"Error loading pickle files: {e}")
        raise SystemExit(1)

    if not isinstance(graph1_data, nx.MultiDiGraph) or not isinstance(graph2_data, nx.MultiDiGraph):
        print("Error: Loaded data is not of type NetworkX MultiDiGraph.")
        raise SystemExit(1)

    # Assuming compare_graphs_by_page_number_with_signatures is the version WITHOUT category checks
    print("Comparing graphs by page number (with signatures, no category check)...")
    differences_by_page = compare_graphs_by_page_number_with_signatures(graph1_data, graph2_data)

    report_parts = ["\n--- Graph Differences Report by Page Number (Signatures Only) ---"]
    if not differences_by_page:
        report_parts.append("\nNo significant differences found across any page numbers, or no pages with content in both graphs to compare.")
    else:
        for page_num, page_diffs in differences_by_page.items():
            report_parts.append(f"\n\n--- Page Number: {page_diffs['page_number']} ---")
            report_parts.append(f"  Status: {page_diffs['status']}")

            # Flipped to False as soon as any difference section is written.
            no_element_diffs_reported_for_this_page = True

            if page_diffs.get("nodes_on_page_only_in_g1_by_signature"):
                report_parts.append(f"  Nodes (by signature) on this page only in Graph 1 (Version 1): {', '.join(map(str, page_diffs['nodes_on_page_only_in_g1_by_signature']))}")
                no_element_diffs_reported_for_this_page = False
            if page_diffs.get("nodes_on_page_only_in_g2_by_signature"):
                report_parts.append(f"  Nodes (by signature) on this page only in Graph 2 (Version 2): {', '.join(map(str, page_diffs['nodes_on_page_only_in_g2_by_signature']))}")
                no_element_diffs_reported_for_this_page = False

            if page_diffs.get("nodes_on_page_attributes_diff"):
                report_parts.append("  Nodes on this page with attribute differences (matched by signature):")
                for diff_item in page_diffs["nodes_on_page_attributes_diff"]:
                    report_parts.append(
                        f"    Node Signature '{diff_item['node_signature']}' "
                        f"(G1 Display: '{diff_item['g1_display_name']}', G2 Display: '{diff_item['g2_display_name']}')"
                        f": {json.dumps(diff_item['differences'], indent=2)}"
                    )
                no_element_diffs_reported_for_this_page = False

            if page_diffs.get("edges_on_page_only_in_g1"):
                report_parts.append("  Edges on this page only in Graph 1 (Version 1) (endpoints matched by signature):")
                for diff_item in page_diffs["edges_on_page_only_in_g1"]:
                    reason = f" (Reason: {diff_item['reason']})" if 'reason' in diff_item else ""
                    report_parts.append(f"    Edge {diff_item['edge']}{reason} with attributes: {json.dumps(diff_item['attributes'], indent=2)}")
                no_element_diffs_reported_for_this_page = False
            if page_diffs.get("edges_on_page_only_in_g2"):
                report_parts.append("  Edges on this page only in Graph 2 (Version 2) (endpoints matched by signature):")
                for diff_item in page_diffs["edges_on_page_only_in_g2"]:
                    report_parts.append(f"    Edge {diff_item['edge']} with attributes: {json.dumps(diff_item['attributes'], indent=2)}")
                no_element_diffs_reported_for_this_page = False
            if page_diffs.get("edges_on_page_attributes_diff"):
                report_parts.append("  Edges on this page with attribute differences (endpoints matched by signature):")
                for diff_item in page_diffs["edges_on_page_attributes_diff"]:
                    report_parts.append(f"    Edge G1: {diff_item['edge_g1']} to G2: {diff_item['edge_g2']}: {json.dumps(diff_item['differences'], indent=2)}")
                no_element_diffs_reported_for_this_page = False

            if no_element_diffs_reported_for_this_page and \
               page_diffs["status"] not in ["content_on_page_only_in_g1", "content_on_page_only_in_g2"] and \
               page_diffs["status"] != "no_substantive_diffs_found_on_page":
                report_parts.append("  No other element differences found for this specific page number under current conditions.")
            elif page_diffs["status"] == "no_substantive_diffs_found_on_page" and no_element_diffs_reported_for_this_page:
                report_parts.append("  No substantive element differences were found on this page.")

    # Kept inside the __main__ guard: report_parts only exists on this path.
    diff_summary_str_by_page = "\n".join(report_parts)
    print(diff_summary_str_by_page)

    # create_diff_report(diff_summary_str_by_page)



# print("Nodes:", g1.nodes(data=True))
# print("Edges:", g1.edges(data=True))

# import matplotlib.pyplot as plt
# pos = nx.spring_layout(g1, k=0.9, iterations=20)
# labels = {node: g1.nodes[node]['label'] for node in g1.nodes()}
# edge_labels = {(u, v): d['relationship_type'] for u, v, d in g1.edges(data=True)}

# plt.figure(figsize=(12, 10))
# nx.draw(g1, pos, with_labels=False, node_size=200, node_color="skyblue", font_size=8, width=1.5, alpha=0.8)
# nx.draw_networkx_labels(g1, pos, labels=labels, font_size=9)
# nx.draw_networkx_edge_labels(g1, pos, edge_labels=edge_labels, font_color='red', font_size=7)
# plt.title("Financial Regulation Graph")
# plt.show()


Comparing graphs by page number (with signatures, no category check)...

--- Graph Differences Report by Page Number (Signatures Only) ---


--- Page Number: 1 ---
  Status: page_content_compared
  Nodes (by signature) on this page only in Graph 2 (Version 2): Revision of AWPR Guidelines, Discontinuation of SLIBOR
  Nodes on this page with attribute differences (matched by signature):
    Node Signature '('label', 'Sri Lanka Inter Bank Offered Rate (SLIBOR)')' (G1 Display: 'Sri Lanka Inter Bank Offered Rate (SLIBOR)', G2 Display: 'Sri Lanka Inter Bank Offered Rate (SLIBOR)'): {
  "description": {
    "g1_value": "former benchmark rate",
    "g2_value": "former benchmark interest"
  },
  "category": {
    "g1_value": [
      "Rationale for Revised"
    ],
    "g2_value": [
      "Revision of"
    ]
  }
}
    Node Signature '('label', 'Reporting Week')' (G1 Display: 'Reporting Week', G2 Display: 'Reporting Week'): {
  "description": {
    "g1_value": "period for reporting rates",
    "g2

In [27]:
import networkx as nx
from langchain_google_vertexai import VertexAI
from google.oauth2 import service_account
from langchain.chains.graph_qa.base import GraphQAChain
import json



# Load both graph versions, build one GraphQAChain per graph, ask each chain
# the same question, then ask the LLM to compare the two answers.
# NOTE: `pickle` is not imported in this cell; it relies on an earlier import.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('AWPR Version 1_text_graph.pickle', 'rb') as f1, \
    open('AWPR Version 2_text_graph.pickle', 'rb') as f2:
    graph1 = pickle.load(f1)
    graph2 = pickle.load(f2)

print(type(graph1))
print(type(graph2))


# Stay None when chain construction fails; checked before querying below.
chain1 = None
chain2 = None

llm = VertexAILangchainLLM({})

# NOTE(review): the output recorded for this cell shows both from_llm calls
# failing with "instance of NetworkxEntityGraph expected": the loaded objects
# are networkx MultiDiGraph instances. As written, chain1 and chain2 stay None
# and the comparison branch below never runs. The graphs presumably need to be
# converted to NetworkxEntityGraph first - TODO confirm the edge schema.
try:
    chain1 = GraphQAChain.from_llm(
        llm=llm,
        graph=graph1, # Use the first graph
        verbose=True, # Set to False if intermediate outputs are too noisy
    )
    print("GraphQAChain for graph1 created successfully!")
except Exception as e:
    print(f"Error creating GraphQAChain for graph1: {e}")
    # Consider how to handle this error, e.g., exit or skip comparison

try:
    chain2 = GraphQAChain.from_llm(
        llm=llm,
        graph=graph2, # Use the second graph
        verbose=True, # Set to False if intermediate outputs are too noisy
    )
    print("GraphQAChain for graph2 created successfully!")
except Exception as e:
    print(f"Error creating GraphQAChain for graph2: {e}")
    # Consider how to handle this error

if chain1 and chain2:
    try:
        # Define a query to extract relevant information from each graph
        # This query should be designed to get comparable information
        # For example, to get a general overview:
        query = "Describe the main types of nodes and relationships in this graph and their counts."
        # Or, for more specific comparisons:
        # query = "List all nodes with label 'User' and their properties."

        print(f"\nQuerying graph1: {query}")
        response1_raw = chain1.invoke({"query": query}) # Or chain1.run(query)
        # Fall back to the stringified raw response when there is no "result" key.
        response1 = response1_raw.get("result", str(response1_raw)) # Adapt based on actual response structure
        print(f"Response from graph1:\n{response1}")

        print(f"\nQuerying graph2: {query}")
        response2_raw = chain2.invoke({"query": query}) # Or chain2.run(query)
        response2 = response2_raw.get("result", str(response2_raw)) # Adapt based on actual response structure
        print(f"Response from graph2:\n{response2}")

        # Now, use the LLM to compare the two responses
        comparison_prompt_template = """
        You are an expert graph analyst. Based on the information provided below from two different graphs,
        identify and summarize the key differences between them.

        Information from Graph 1:
        ---
        {graph1_info}
        ---

        Information from Graph 2:
        ---
        {graph2_info}
        ---

        What are the main differences between Graph 1 and Graph 2 regarding their node types, relationship types, counts, or any other salient features mentioned?
        Be specific and clear.
        """
        
        # NOTE(review): str.format will raise if this template ever gains
        # literal braces; the chain responses are inserted as values and are
        # not re-parsed.
        comparison_prompt = comparison_prompt_template.format(graph1_info=response1, graph2_info=response2)

        print("\nComparing responses using LLM...")
        # Assuming your llm object can be invoked directly for chat/completion
        # This might be llm.invoke(comparison_prompt) or llm(comparison_prompt)
        # depending on the LLM library (e.g., LangChain)
        comparison_result = llm.invoke(comparison_prompt) 
        # If llm.invoke returns a complex object, extract the content:
        # e.g., comparison_output = comparison_result.content if hasattr(comparison_result, 'content') else str(comparison_result)
        
        print("\n--- Comparison Result ---")
        # Adjust access to content based on your LLM object's response structure
        if hasattr(comparison_result, 'content'):
            print(comparison_result.content)
        else:
            print(comparison_result)
        print("--- End of Comparison ---")

    except Exception as e:
        # Any failure while querying or comparing is printed, not raised.
        print(f"Error during graph querying or comparison: {e}")
else:
    print("One or both GraphQAChains could not be initialized. Comparison aborted.")

<class 'networkx.classes.multidigraph.MultiDiGraph'>
<class 'networkx.classes.multidigraph.MultiDiGraph'>
Error creating GraphQAChain for graph1: 1 validation error for GraphQAChain
graph
  instance of NetworkxEntityGraph expected (type=type_error.arbitrary_type; expected_arbitrary_type=NetworkxEntityGraph)
Error creating GraphQAChain for graph2: 1 validation error for GraphQAChain
graph
  instance of NetworkxEntityGraph expected (type=type_error.arbitrary_type; expected_arbitrary_type=NetworkxEntityGraph)
One or both GraphQAChains could not be initialized. Comparison aborted.


In [45]:
def get_diff_prompt():
    """Return the prompt template asking for a tabular summary of a graph diff.

    The returned string is a ``str.format``-style template whose only
    placeholder is ``{difference_json}``. All other braces (the example JSON
    schema shown to the model) are doubled, so formatting the template --
    directly or through an f-string based prompt-template class -- renders
    them as literal ``{`` / ``}`` instead of trying to parse them as
    replacement fields.

    Returns:
        str: The template text; format it with ``difference_json=<diff JSON>``.
    """
    # NOTE: only `{difference_json}` is a real placeholder; keep every other
    # brace doubled when editing the schema below.
    return """Generate a detailed summary of the differences between two versions of a graph, presented in a tabular format. The input is a JSON object that outlines the changes in nodes and edges.

        Here is the JSON input to process:
        -------------------
        {difference_json}
        

        The JSON input will be in the following format:

        {{
        "nodes_diff": {{ // Differences in nodes
            "added_in_v2": [ // Nodes added in version 2
            {{
                "label": "Node Label",
                "description": "Node Description"
                // ... other attributes
            }}
            ],
            "removed_from_v1": [ // Nodes removed from version 1
            {{
                "label": "Node Label",
                "description": "Node Description"
                // ... other attributes
            }}
            ],
            "modified": [ // Nodes modified between versions
            {{
                "label": "Node Label",
                "v1_description": "Version 1 Description",
                "v2_description": "Version 2 Description"
                // ... other changed attributes like "other_changed_attributes": {{"attribute_name": {{"v1_value": "...", "v2_value": "..."}}}}
            }}
            ]
        }},
        "edges_diff": {{ // Differences in edges
            "added_in_v2": [ // Edges added in version 2
            {{
                "source_label": "Source Node Label",
                "target_label": "Target Node Label",
                "relationship_label": "Relationship Label",
                "details": "Edge Details"
                // ... other attributes
            }}
            ],
            "removed_from_v1": [ // Edges removed from version 1
            {{
                "source_label": "Source Node Label",
                "target_label": "Target Node Label",
                "relationship_label": "Relationship Label",
                "details": "Edge Details"
                // ... other attributes
            }}
            ],
            "modified": [ // Edges modified between versions
            {{
                "source_label": "Source Node Label",
                "target_label": "Target Node Label",
                "v1_relationship_label": "Version 1 Relationship",
                "v2_relationship_label": "Version 2 Relationship",
                "v1_details": "Version 1 Details",
                "v2_details": "Version 2 Details"
                // ... other changed attributes
            }}
            ]
        }}
        }}

        Your task is to process this JSON and produce two tables: one for node differences and one for edge differences.

        **Node Differences Table:**
        The table should have the following columns:
        - "Change Type" (e.g., Added, Removed, Modified)
        - "Label"
        - "Attribute Changed" (e.g., Description, or specific attribute from `other_changed_attributes`)
        - "Value in v1" (empty if Added)
        - "Value in v2" (empty if Removed)

        **Edge Differences Table:**
        The table should have the following columns:
        - "Change Type" (e.g., Added, Removed, Modified)
        - "Source Label"
        - "Target Label"
        - "Attribute Changed" (e.g., Relationship Label, Details)
        - "Value in v1" (empty if Added)
        - "Value in v2" (empty if Removed)

        For modified items, clearly show the old (v1) and new (v2) values for each changed attribute. If an item has multiple attributes changed (e.g., both description and another property for a node, or both relationship label and details for an edge), list each changed attribute as a separate row or clearly delineate them for that item.



        Please provide the summary as two distinct markdown tables.
        """

<networkx.classes.multidigraph.MultiDiGraph at 0x14b57c940>

In [3]:


def get_diff_prompt(graph_difference):
    """Fill the diff prompt with ``graph_difference`` and print the LLM reply.

    Args:
        graph_difference: Value substituted for the ``difference_json``
            placeholder of the prompt template.

    Returns:
        None. The LLM response, or the caught error, is only printed.

    NOTE(review): this ``def`` rebinds the name ``get_diff_prompt``. Unless a
    cell executed afterwards rebinds it to the zero-argument template
    provider, the ``get_diff_prompt()`` call below resolves to this very
    function and raises ``TypeError`` (missing ``graph_difference``) before
    any request is built. The template provider and this sender need
    distinct names.
    """


    print(f"--- Sending to Vertex AI (Placeholder) ---")
    messages = []

    # Expected to yield the prompt template string (see NOTE in the docstring).
    template = get_diff_prompt()
    human_template = HumanMessagePromptTemplate.from_template(template)
    messages.append(human_template)
    chat_prompt = ChatPromptTemplate.from_messages(messages)
    # Substitute the diff into the template and materialise the chat messages.
    request = chat_prompt.format_prompt(difference_json= graph_difference, 
                                        ).to_messages()
    
    # Flatten each message to a plain {"role", "content"} dict.
    request_dicts = [{"role": msg.type, "content": msg.content} for msg in request]

    print(f"request_dicts: {request_dicts}")

    llm = VertexAILangchainLLM({})
    try:
        # The prompt sent is the str() of the list of dicts, not the bare
        # message content.
        response = llm._call(prompt=str(request_dicts))
        print(response)

    except Exception as e:
        # Best-effort: any failure of the call is reported, not re-raised.
        print("Some error occered"+str(e))

In [10]:
import json

def get_diff_prompt():
    """Return the prompt template for a narrative report on a graph diff.

    The template has one placeholder, ``{difference_json}``, which the caller
    fills with the JSON text describing the v1 -> v2 changes.

    Returns:
        str: The unformatted prompt template.
    """
    return """Please analyze the provided JSON data, which outlines the changes between two versions of a graph (v1 and v2). Your task is to generate a comprehensive report in a flowing, human-readable paragraph style.

        For each identified difference, provide:
        1   Give a specific ID to each title so it is easy extract it 
        2   The overall shift in the graph's structure and meaning.
        3   **[Title Describing the Key Change, e.g., "Introduction of Reporting Week Definition" or "Discontinuation of SLIBOR's Role with Task Force"]**
        4   Explain how relationships and entities have transformed, highlighting the introduction of new concepts, the phasing out of previous ones, and changes in the characteristics of existing elements.
        5   The introduction of new concepts or connections and the phasing out of previous ones.
        6  Integrate the underlying reasons or implications for these changes, as suggested by the 'details' provided.
        7   The underlying reasons or implications for these changes, as suggested by the 'details' provided.
        8.  A "Confidence Rating" for the significance of the change (High, Medium, Low).
            *   **Low Confidence:** Assign this to changes that are primarily rephrasing, minor spelling corrections, or slight alterations in 'details' or 'description' fields that do not fundamentally change the meaning or structure. For example, changing "Regulates" to "regulates" or minor wording adjustments in details.
            *   **Medium Confidence:** Assign this to changes in relationship labels that might alter the nuance of a connection but not its core existence, or more substantial changes to 'details' that provide new, but not critical, information.
            *   **High Confidence:** Assign this to the addition or removal of nodes or edges, or modifications to relationship labels or critical attributes that fundamentally alter the graph's structure or the core meaning of a connection.

        9.  **Crucially suggest potential next steps, considerations, or solutions that might be relevant in response to these changes.** For example, if a process is newly defined, what might need to be implemented? If a key component is removed, what are the implications to address?

        Finally, provide an overall concise summary highlighting the most significant (High/Medium confidence) changes.

        Input JSON:
        {difference_json}

        The goal is a cohesive, insightful narrative that not only explains what changed but also why it matters and what might need to be done in response.
        """

def send_diff_to_llm(graph_difference_json_string):
    """Render the diff prompt with the given JSON text and query the LLM.

    Args:
        graph_difference_json_string: Text substituted for the
            ``difference_json`` placeholder of the prompt template.

    Returns:
        The value returned by the LLM call, or ``None`` when the call raised
        (the error is printed, not propagated).
    """
    print("--- Preparing to send to LLM ---")

    # One human message, built from the prompt template.
    human_message = HumanMessagePromptTemplate.from_template(get_diff_prompt())
    prompt_value = ChatPromptTemplate.from_messages([human_message]).format_prompt(
        difference_json=graph_difference_json_string
    )

    # The LLM wrapper is called with a single string, so concatenate the
    # content of the rendered messages.
    prompt_text = "".join(message.content for message in prompt_value.to_messages())

    # Only the first 500 characters are echoed.
    print(f"Formatted Prompt for LLM: {prompt_text[:500]}...")

    client = VertexAILangchainLLM({})
    try:
        answer = client._call(prompt=prompt_text)
        print("\n--- LLM Response ---")
        print(answer)
    except Exception as e:
        print(f"An error occurred while calling LLM: {e}")
        return None
    return answer

if __name__ == "__main__":
    # Merged graph-diff JSON that will be summarised.
    merged_json_path = "/Users/shirsama/dtcc-hackathon/dtcc-ai-hackathon-2025/merged_graph_output.json"

    with open(merged_json_path, 'r') as f:
        graph_diff_json_content_string = f.read()

        # Guard first: an empty file means there is nothing to send.
        if not graph_diff_json_content_string:
            print(f"Error: The file '{merged_json_path}' is empty or could not be read.")
        else:
            llm_summary = send_diff_to_llm(graph_diff_json_content_string)
            # A falsy result from the sender is reported as a failure.
            print(
                "\nSummary generation successful."
                if llm_summary
                else "\nSummary generation failed."
            )

    # except FileNotFoundError:
    #     print(f"Error: The file '{merged_json_path}' was not found.")
    # except json.JSONDecodeError:
    #     print(f"Error: The file '{merged_json_path}' does not contain valid JSON.")
    # except Exception as e:
    #     print(f"An unexpected error occurred: {e}")



--- Preparing to send to LLM ---
Formatted Prompt for LLM: Please analyze the provided JSON data, which outlines the changes between two versions of a graph (v1 and v2). Your task is to generate a comprehensive report in a flowing, human-readable paragraph style.

        For each identified difference, provide:
        1   Give a specific ID to each title so it is easy extract it 
        2   The overall shift in the graph's structure and meaning.
        3   **[Title Describing the Key Change, e.g., "Introduction of Reporting Week Definition" or "Disc...

--- LLM Response ---
## Analysis of AWPR Guidelines Changes (v1 to v2)

The changes from v1 to v2 of the AWPR guidelines represent a significant restructuring of the reporting process, introducing greater clarity and specificity.  Several new entities and relationships have been added, reflecting a more detailed and formalized approach to AWPR calculation and reporting.

**(1) Organizational Oversight of AWPR Reporting (High Confide

In [None]:
import json
import re
from langchain.prompts.chat import ChatPromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from docx import Document


def get_diff_prompt():
    """Return the prompt template for deriving a KOP from a graph diff.

    The template has one placeholder, ``{difference_json}``, which the caller
    fills with the JSON text describing the v1 -> v2 changes.

    Returns:
        str: The unformatted prompt template.
    """
    return """Please analyze the provided JSON data, which outlines the changes between two versions of a graph (v1 and v2). Your task is to generate a clear Key Operating Procedures (KOP) out of it

        For each identified difference, provide:
        1   The overall shift in the graph's structure and meaning.
        2   **[Title Describing the Key Change, e.g., "Introduction of Reporting Week Definition" or "Discontinuation of SLIBOR's Role with Task Force"]**
        3   Explain how relationships and entities have transformed, highlighting the introduction of new concepts, the phasing out of previous ones, and changes in the characteristics of existing elements.
        4   The introduction of new concepts or connections and the phasing out of previous ones.
        5.  Integrate the underlying reasons or implications for these changes, as suggested by the 'details' provided.
        6   The underlying reasons or implications for these changes, as suggested by the 'details' provided.
       

        8.  **Crucially suggest potential next steps, considerations, or solutions that might be relevant in response to these changes.** For example, if a process is newly defined, what might need to be implemented? If a key component is removed, what are the implications to address?

        Finally, provide an overall concise summary highlighting the most significant (High/Medium confidence) changes.

        Input JSON:
        {difference_json}

            Generate a KOP document with step wise instruction for operational personnel.        """

def send_diff_to_llm(graph_difference_json_string):
    """Ask the LLM for a KOP based on the graph diff and save it as a .docx.

    The prompt template is filled with the given JSON text, sent to the LLM,
    and the reply is converted from markdown into ``./KOP_AWPR.docx``.

    Args:
        graph_difference_json_string: Text substituted for the
            ``difference_json`` placeholder of the prompt template.

    Returns:
        The LLM response on success (also when only saving the document
        failed), or ``None`` when the LLM call or the document conversion
        raised. Errors are printed, not propagated.
    """
    print("--- Preparing to send to LLM ---")
    messages = []

    template = get_diff_prompt()  # Get the prompt string
    human_template = HumanMessagePromptTemplate.from_template(template)
    messages.append(human_template)

    chat_prompt = ChatPromptTemplate.from_messages(messages)

    prompt_messages = chat_prompt.format_prompt(difference_json=graph_difference_json_string).to_messages()

    # The LLM wrapper is called with a single string prompt.
    request_content_str = "".join(msg.content for msg in prompt_messages)

    print(f"Formatted Prompt for LLM: {request_content_str[:500]}...")  # Print start of the prompt

    llm = VertexAILangchainLLM({})  # Replace with your actual LLM initialization
    try:
        response = llm._call(prompt=request_content_str)
        print("\n--- LLM Response ---")
        print(response)

        doc = Document()
        doc.add_heading("Key Operating Procedure (KOP)", 0)
        markdown_to_docx(doc, response)

        path = '.'
        filename = "KOP_AWPR.docx"

        full_path = os.path.join(path, filename)

        try:
            doc.save(full_path)
            print(f"Document saved successfully to: {full_path}")
        except Exception as e:
            # A failed save is reported but does not discard the response.
            print(f"Error saving document: {e}")

        # Hand the response back so callers can tell success from failure;
        # without this the function returned None on every path.
        return response

    except Exception as e:
        print(f"An error occurred while calling LLM: {e}")
        return None
    

def _add_runs_with_bold(para, text):
    """Append ``text`` to ``para`` as runs, bolding spans wrapped in ``**``."""
    segments = text.split("**")
    if len(segments) % 2 == 0:
        # Odd number of markers: the last one has no closing partner. Keep it
        # as literal text instead of failing on it.
        segments[-2:] = ["**".join(segments[-2:])]
    # Segments alternate plain, bold, plain, ... starting with plain.
    for index, segment in enumerate(segments):
        run = para.add_run(segment)
        if index % 2 == 1:
            run.bold = True


def markdown_to_docx(doc, text: str):
    """Append a small subset of markdown to a python-docx style document.

    Each line of ``text`` is stripped and mapped as follows:

    * blank line           -> empty paragraph
    * ``#`` / ``##`` / ``###`` (or more) -> heading level 1 / 2 / 3
    * ``- item``           -> paragraph with style ``List Bullet``
    * ``1. item``          -> paragraph with style ``List Number``
    * line containing ``**`` -> paragraph whose ``**...**`` spans are bold
    * anything else        -> plain paragraph

    An unpaired ``**`` is kept as literal text.

    Args:
        doc: Object providing ``add_paragraph`` and ``add_heading``; modified
            in place.
        text: Markdown text to convert.

    Returns:
        None.
    """
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            doc.add_paragraph()
        elif stripped.startswith("#"):
            title = stripped.lstrip("#")
            depth = len(stripped) - len(title)
            # Four or more '#' are treated as level 3.
            doc.add_heading(title.strip(), level=min(depth, 3))
        elif stripped.startswith("- "):
            doc.add_paragraph(stripped[2:], style='List Bullet')
        elif re.match(r'^\d+\.\s', stripped):
            doc.add_paragraph(re.sub(r'^\d+\.\s', '', stripped), style='List Number')
        elif "**" in stripped:
            _add_runs_with_bold(doc.add_paragraph(), stripped)
        else:
            doc.add_paragraph(stripped)

if __name__ == "__main__":
    # Merged graph-diff JSON from which the KOP is generated.
    merged_json_path = "/Users/shirsama/dtcc-hackathon/dtcc-ai-hackathon-2025/merged_graph_output.json"

    with open(merged_json_path, 'r') as f:
        graph_diff_json_content_string = f.read()

        # Guard first: an empty file means there is nothing to send.
        if not graph_diff_json_content_string:
            print(f"Error: The file '{merged_json_path}' is empty or could not be read.")
        else:
            llm_summary = send_diff_to_llm(graph_diff_json_content_string)
            # A falsy result from the sender is reported as a failure.
            print(
                "\nSummary generation successful."
                if llm_summary
                else "\nSummary generation failed."
            )

