In [1]:
# ==========================================
# CELL 1: SETUP
# ==========================================
import pickle
import networkx as nx
import unicodedata
from collections import defaultdict, Counter
from rapidfuzz import process, fuzz
import numpy as np
import pandas as pd
import re
from typing import List, Any

# --- CONFIG ---
GRAPH_PATH = "./models/knowledge_graph.pkl"
FUZZY_THRESHOLD = 90  # Similarity score (0-100) to flag as potential duplicate

print("‚úÖ MRI Scanner Ready.")

‚úÖ MRI Scanner Ready.


In [2]:
# ==========================================
# CELL 2: MRI SCANNER LOGIC
# ==========================================

class GraphMRI:
    def __init__(self, graph_path):
        print(f"üìÇ Loading Graph from {graph_path}...")
        with open(graph_path, "rb") as f:
            self.G = pickle.load(f)
        self.nodes = list(self.G.nodes())
        print(f"   üìä Graph Loaded. Total Nodes: {len(self.nodes):,}")

    def _normalize_key(self, text):
        """
        Aggressive normalizer: lowercases, removes all non-alphanumeric chars.
        Examples:
        'Jay-Z' -> 'jayz'
        'JAY Z' -> 'jayz'
        'Beyonc√©' -> 'beyonce'
        """
        # Convert non-string nodes (like integers) to string first
        text = str(text)
        
        # NFKC normalizes compatibility characters (e.g., fancy hyphens, accents)
        text = unicodedata.normalize('NFKC', text)
        
        # Keep only alphanumeric and lowercase
        return "".join([c.lower() for c in text if c.isalnum()])

    def scan_normalization_collisions(self):
        """
        Finds nodes that are effectively the same name but formatted differently.
        """
        print("\nüîç SCAN 1: Normalization Collisions")
        print("   (Detects: 'Jay-Z' vs 'Jay Z' vs 'TIDAL' vs 'Tidal')")
        print("-" * 50)
        
        # Map normalized_key -> [list of actual nodes]
        normalization_map = defaultdict(list)
        for node in self.nodes:
            key = self._normalize_key(node)
            if key: 
                normalization_map[key].append(node)
        
        collision_count = 0
        clusters = []

        for key, candidates in normalization_map.items():
            if len(candidates) > 1:
                # We found a collision!
                collision_count += 1
                
                # Sort by degree (connectivity) to see which is the "Main" node
                candidates_with_degree = [(n, self.G.degree(n)) for n in candidates]
                # Sort descending by degree
                candidates_with_degree.sort(key=lambda x: x[1], reverse=True)
                
                clusters.append(candidates_with_degree)

        # Report findings
        if collision_count == 0:
            print("   ‚úÖ No Normalization Collisions found.")
        else:
            print(f"   ‚ö†Ô∏è Found {collision_count} collision clusters.")
            print("   Top 10 Clusters by Impact:")
            
            # Show top 10 most impactful duplicates (ignoring year numbers usually)
            sorted_clusters = sorted(clusters, key=lambda x: x[0][1], reverse=True)
            
            for cluster in sorted_clusters[:10]:
                primary_node, primary_deg = cluster[0]
                print(f"\n   üö© Cluster: '{self._normalize_key(primary_node)}'")
                
                for node, degree in cluster:
                    tag = "üèÜ MAIN" if degree == primary_deg else "üóëÔ∏è DUPE"
                    print(f"      {tag}: '{node}' (Connections: {degree})")
                    
                # Check for shared neighbors (Are they talking to the same people?)
                if len(cluster) >= 2:
                    n1 = set(self.G.neighbors(cluster[0][0]))
                    n2 = set(self.G.neighbors(cluster[1][0]))
                    shared = n1.intersection(n2)
                    print(f"      üîó Shared Neighbors: {len(shared)} (e.g., {list(shared)[:3]})")

    def scan_targeted_fuzzy(self, targets):
        """
        Checks specific important entities for spelling variations.
        """
        print(f"\nüîç SCAN 2: Targeted Fuzzy Scan for {targets}")
        print("-" * 50)
        
        # Convert all nodes to string for rapidfuzz
        node_strings = [str(n) for n in self.nodes]
        
        for target in targets:
            # extract top 10 matches
            matches = process.extract(target, node_strings, limit=10, scorer=fuzz.token_sort_ratio)
            
            suspicious = []
            for match_name, score, index in matches:
                # We only care about high scores that aren't exact matches
                if score >= FUZZY_THRESHOLD and match_name != target:
                    degree = self.G.degree(match_name)
                    suspicious.append((match_name, score, degree))
            
            if suspicious:
                print(f"   üéØ Target: '{target}'")
                for cand in suspicious:
                    print(f"      ‚Ä¢ Potential Dupe: '{cand[0]}' (Score: {cand[1]:.1f}, Degree: {cand[2]})")
            else:
                print(f"   ‚úÖ Target '{target}': Clean (No fuzzy dupes > {FUZZY_THRESHOLD}%)")
        
    def scan_redundant_edges(self):
        """
        Scan 3: Finds pairs of nodes with multiple edges that are identical.
        (Fixed to handle mixed int/str node types).
        """
        print("\nüîç SCAN 3: Redundant Edge Analysis")
        print("   (Detects: Duplicate relations between same nodes)")
        print("-" * 60)
        
        if not self.G.is_multigraph():
            print("   ‚ÑπÔ∏è Graph is not a MultiGraph. Duplicate edges are impossible.")
            return

        multi_edge_pairs = 0
        redundant_count = 0
        processed_pairs = set()

        # Iterate over all nodes
        for u in self.G.nodes():
            # Iterate over outgoing neighbors
            for v in self.G[u]:
                
                # Create a unique key for this directed edge pair
                pair_key = (u, v)
                
                # Skip if we've already analyzed this pair
                if pair_key in processed_pairs:
                    continue
                processed_pairs.add(pair_key)

                # Get all edges between u and v
                edge_dict = self.G.get_edge_data(u, v)
                
                # If there is more than 1 edge key (0, 1, 2...)
                if len(edge_dict) > 1:
                    multi_edge_pairs += 1
                    
                    # Extract relations safely
                    relations = []
                    for d in edge_dict.values():
                        rel = d.get('relation', 'related_to')
                        # Ensure relation is a string before lowercasing
                        if isinstance(rel, str):
                            relations.append(rel.lower().strip())
                        else:
                            relations.append(str(rel))
                    
                    # Check if unique relations < total edges
                    if len(set(relations)) < len(relations):
                        redundant_count += 1

        print(f"   üìä Node Pairs with Multiple Edges: {multi_edge_pairs:,}")
        print(f"   ‚ö†Ô∏è  Pairs with EXACT Duplicate Relations: {redundant_count:,}")

# Initialize
mri = GraphMRI(GRAPH_PATH)

üìÇ Loading Graph from ./models/knowledge_graph.pkl...
   üìä Graph Loaded. Total Nodes: 311,236


In [3]:
# ==========================================
# CELL 3: RUN DIAGNOSIS
# ==========================================

# 1. Run the Normalization Scan (Finds formatting errors)
mri.scan_normalization_collisions()

# 2. Run the Fuzzy Scan (Finds typos for key entities)
# Add entities you care about here
key_entities = [
    "Beyonc√©", 
    "Jay-Z", 
    "Tidal", 
    "Parkwood Entertainment", 
    "Destiny's Child",
    "Columbia Records"
]

mri.scan_targeted_fuzzy(key_entities)


üîç SCAN 1: Normalization Collisions
   (Detects: 'Jay-Z' vs 'Jay Z' vs 'TIDAL' vs 'Tidal')
--------------------------------------------------
   ‚ö†Ô∏è Found 15396 collision clusters.
   Top 10 Clusters by Impact:

   üö© Cluster: 'unitedstates'
      üèÜ MAIN: 'United States' (Connections: 1907)
      üóëÔ∏è DUPE: 'United_States' (Connections: 16)
      üîó Shared Neighbors: 2 (e.g., ['2007', 'Myanmar'])

   üö© Cluster: 'newyorkcity'
      üèÜ MAIN: 'New York City' (Connections: 645)
      üóëÔ∏è DUPE: 'New_York_City' (Connections: 1)
      üîó Shared Neighbors: 1 (e.g., ["one of the world's largest natural harbors"])

   üö© Cluster: 'ottomanempire'
      üèÜ MAIN: 'Ottoman Empire' (Connections: 527)
      üóëÔ∏è DUPE: 'Ottoman_Empire' (Connections: 6)
      üóëÔ∏è DUPE: 'Ottoman empire' (Connections: 3)
      üîó Shared Neighbors: 1 (e.g., ['1918'])

   üö© Cluster: 'unitedkingdom'
      üèÜ MAIN: 'United Kingdom' (Connections: 500)
      üóëÔ∏è DUPE: 'United_Kin

In [4]:
# ==========================================
# CELL 4: THE GRAPH SURGEON (PRUNING ENGINE)
# ==========================================

class GraphSurgeon:
    def __init__(self, graph_path):
        self.graph_path = graph_path
        print(f"üìÇ Loading Patient Graph from {graph_path}...")
        with open(graph_path, "rb") as f:
            self.G = pickle.load(f)
        self.initial_count = self.G.number_of_nodes()
        print(f"   üìä Initial Nodes: {self.initial_count:,}")

    def _normalize_key(self, text):
        """Same normalization logic as the MRI scanner."""
        text = str(text)
        text = unicodedata.normalize('NFKC', text)
        return "".join([c.lower() for c in text if c.isalnum()])

    def merge_duplicates(self):
        """
        Scans for normalization collisions and merges them into the 
        highest-degree node (The 'Alpha').
        """
        print("\nüî™ Starting Merge Operation...")
        
        # 1. Map Keys to Nodes
        normalization_map = defaultdict(list)
        for node in self.G.nodes():
            key = self._normalize_key(node)
            if key:
                normalization_map[key].append(node)
        
        merged_count = 0
        nodes_removed = 0
        
        # 2. Iterate through clusters
        for key, candidates in normalization_map.items():
            if len(candidates) > 1:
                # Sort by Degree (Highest first) -> The Alpha is index 0
                candidates.sort(key=lambda x: self.G.degree(x), reverse=True)
                
                alpha_node = candidates[0] # The Winner (e.g., "United States")
                beta_nodes = candidates[1:] # The Losers (e.g., "United_States")
                
                for beta in beta_nodes:
                    self._transplant_edges(source=beta, target=alpha_node)
                    self.G.remove_node(beta)
                    nodes_removed += 1
                
                merged_count += 1

        print("-" * 40)
        print(f"   ‚úÖ Operation Complete.")
        print(f"   üîó Clusters Merged: {merged_count:,}")
        print(f"   üóëÔ∏è Nodes Removed:   {nodes_removed:,}")
        print(f"   üìâ Final Node Count: {self.G.number_of_nodes():,} (Was {self.initial_count:,})")

    def _transplant_edges(self, source, target):
        """
        Moves all edges from Source to Target, then deletes Source.
        """
        # Outgoing edges: source -> neighbor
        for neighbor in list(self.G.successors(source)):
            edge_data = self.G.get_edge_data(source, neighbor)
            if not self.G.has_edge(target, neighbor):
                self.G.add_edge(target, neighbor, **edge_data)
        
        # Incoming edges: neighbor -> source
        for neighbor in list(self.G.predecessors(source)):
            edge_data = self.G.get_edge_data(neighbor, source)
            if not self.G.has_edge(neighbor, target):
                self.G.add_edge(neighbor, target, **edge_data)

    def save(self, output_path):
        print(f"\nüíæ Saving Healthy Graph to {output_path}...")
        with open(output_path, "wb") as f:
            pickle.dump(self.G, f)
        print("   ‚úÖ Save Complete.")

    # def normalize_edge_relations(self):
    #     """
    #     Standardizes edge names.
    #     1. Explicit Mapping (e.g. 'is_located_in' -> 'located_in')
    #     2. Format Standardization (snake_case, remove trailing 's')
    #     3. De-duplication (if renamed edge already exists, delete the old one)
    #     """
    #     print("\nüõ†Ô∏è Starting Edge Normalization...")
        
    #     # 1. DEFINE CANONICAL MAP (Safe Merges Only)
    #     # Based on your Scan Results
    #     mapping = {
    #         "is_located_in": "located_in",
    #         "located in": "located_in",
    #         "located_on": "located_in", # Context dependent, but usually safe for RAG
    #         "includes": "include",
    #         "included": "include",
    #         "contains": "include", # Semantic synonym
    #         "is_part_of": "part_of",
    #         "part of": "part_of",
    #         "released_in": "released",
    #         "released_on": "released",
    #         "is_released_in": "released",
    #         "occurred_in": "occurred",
    #         "occurred_on": "occurred",
    #         "published_in": "published",
    #         "published_on": "published",
    #         "is_published_in": "published",
    #         "was_born_in": "born_in",
    #         "born in": "born_in"
    #     }
        
    #     normalized_count = 0
    #     deleted_redundant = 0
        
    #     # Iterate over a static list of edges because we will modify the graph
    #     # list(G.edges) gives (u, v) tuples
    #     for u, v in list(self.G.edges()):
    #         # Get the existing attributes
    #         attrs = self.G.get_edge_data(u, v)
    #         old_rel = attrs.get('relation', 'related_to')
            
    #         # Skip if not string
    #         if not isinstance(old_rel, str): continue
            
    #         old_rel_lower = old_rel.lower().strip()
    #         new_rel = old_rel_lower
            
    #         # --- APPLY RULES ---
            
    #         # Rule 1: Explicit Map
    #         if old_rel_lower in mapping:
    #             new_rel = mapping[old_rel_lower]
            
    #         # Rule 2: Syntactic Cleanup (snake_case)
    #         new_rel = new_rel.replace(" ", "_")
            
    #         # Rule 3: Naive Stemming (Remove trailing 's' if > 4 chars)
    #         # e.g., "releases" -> "release", but keep "is", "has"
    #         if new_rel.endswith("s") and len(new_rel) > 4 and not new_rel.endswith("ss"):
    #             # specific check to avoid "press" -> "pres"
    #             new_rel = new_rel[:-1]

    #         # --- EXECUTE UPDATE ---
            
    #         if new_rel != old_rel:
    #             # We need to change the relation name.
    #             # BUT, does an edge with 'new_rel' already exist?
                
    #             # Check if we are creating a collision
    #             # (Since G is a DiGraph, we can't have two edges u->v. 
    #             # But we can update the 'relation' attribute of the existing one)
                
    #             # Wait, DiGraph stores edge data in a dict.
    #             # If we just update attributes: G[u][v]['relation'] = new_rel
    #             # But what if G[u][v] ALREADY exists? 
    #             # In a DiGraph, there is only ONE edge between u and v. 
    #             # So we are just renaming that one edge's label.
                
    #             # However, if your graph WAS a MultiGraph (it isn't), this logic would be harder.
    #             # Since it is a DiGraph, we just update the attribute.
                
    #             self.G[u][v]['relation'] = new_rel
    #             normalized_count += 1
                
        # print("-" * 50)
        # print(f"   ‚úÖ Normalization Complete.")
        # print(f"   üè∑Ô∏è Relations Updated: {normalized_count:,}")

# Initialize
surgeon = GraphSurgeon(GRAPH_PATH)

üìÇ Loading Patient Graph from ./models/knowledge_graph.pkl...
   üìä Initial Nodes: 311,236


In [5]:
# ==========================================
# CELL 5: EXECUTE SURGERY & SAVE
# ==========================================

# 1. Perform the Merge
surgeon.merge_duplicates()

# 2. Save the Clean Version
# We save to a NEW file to be safe.
OUTPUT_PATH = "./models/knowledge_graph_clean.pkl"
surgeon.save(OUTPUT_PATH)

print("\n‚ö†Ô∏è ACTION REQUIRED: Update your 'omni_rag_modules.py' to point to:")
print(f"   GRAPH_PATH = '{OUTPUT_PATH}'")


üî™ Starting Merge Operation...
----------------------------------------
   ‚úÖ Operation Complete.
   üîó Clusters Merged: 15,396
   üóëÔ∏è Nodes Removed:   17,103
   üìâ Final Node Count: 294,133 (Was 311,236)

üíæ Saving Healthy Graph to ./models/knowledge_graph_clean.pkl...
   ‚úÖ Save Complete.

‚ö†Ô∏è ACTION REQUIRED: Update your 'omni_rag_modules.py' to point to:
   GRAPH_PATH = './models/knowledge_graph_clean.pkl'


In [6]:
# ==========================================
# GRAPH MRI SCANNER
# ==========================================
# 1. LOAD THE BRAIN
print(f"üìÇ Loading Graph from {OUTPUT_PATH}...")

with open(OUTPUT_PATH, "rb") as f:
    G = pickle.load(f)

print(f"‚úÖ Graph Loaded. Nodes: {G.number_of_nodes():,} | Edges: {G.number_of_edges():,}")
print("-" * 60)

# 2. DEGREE DISTRIBUTION (The Mathematics of Connectivity)
degrees = [d for n, d in G.degree()]
in_degrees = [d for n, d in G.in_degree()]
out_degrees = [d for n, d in G.out_degree()]

print("üìä CONNECTIVITY STATISTICS")
print(f"   ‚Ä¢ Average Connections per Node: {np.mean(degrees):.2f}")
print(f"   ‚Ä¢ Median Connections:           {np.median(degrees):.1f}")
print(f"   ‚Ä¢ Max Degree (The King Node):   {np.max(degrees)}")
print(f"   ‚Ä¢ Isolated Nodes (Degree=0):    {degrees.count(0)}")

# 3. SUPER-NODE DETECTION (The 99% Cutoff)
# We define a "Super Node" as anything in the top 1% or 0.5% of connectivity.
p95 = np.percentile(degrees, 95)
p99 = np.percentile(degrees, 99)
p99_9 = np.percentile(degrees, 99.9)

print("-" * 60)
print("üõë SUPER-NODE THRESHOLDS")
print(f"   ‚Ä¢ Top 5% Cutoff:   > {p95:.1f} connections")
print(f"   ‚Ä¢ Top 1% Cutoff:   > {p99:.1f} connections")
print(f"   ‚Ä¢ Top 0.1% Cutoff: > {p99_9:.1f} connections")
print(f"\n   üëâ RECOMMENDATION: Set SUPER_NODE_THRESHOLD = {int(p99)}")

# 4. IDENTIFYING THE "BLACK HOLES"
# Let's see exactly WHO these super-nodes are.
sorted_nodes = sorted(G.degree(), key=lambda x: x[1], reverse=True)
super_nodes = [n for n in sorted_nodes if n[1] > p99]

print("-" * 60)
print(f"üí£ LIST OF POTENTIAL 'BLACK HOLES' (Top {len(super_nodes)} Nodes)")
print("   (These are nodes we might need to blacklist or filter)")
print(pd.DataFrame(super_nodes[:20], columns=["Entity", "Connections"]))

# 5. RELATIONSHIP AUDIT (Finding the "Junk")
# We check which verbs are most common. If "is" appears 5000 times, we ban it.
edge_attrs = nx.get_edge_attributes(G, "relation")
relation_counts = Counter(edge_attrs.values())
df_rel = pd.DataFrame(relation_counts.most_common(20), columns=["Relation", "Count"])

print("-" * 60)
print("üîó TOP 20 RELATIONSHIPS (Candidates for Stop-List)")
print(df_rel)

# 6. ENTITY TYPE AUDIT (If available)
# Checking what kind of things we have (Person, Place, etc.) if extracted
node_types = nx.get_node_attributes(G, "type")
if node_types:
    type_counts = Counter(node_types.values())
    print("-" * 60)
    print("üè∑Ô∏è TOP ENTITY TYPES")
    print(pd.DataFrame(type_counts.most_common(10), columns=["Type", "Count"]))
else:
    print("\n‚ö†Ô∏è No 'type' attribute found on nodes (Standard for pure extraction).")

print("-" * 60)
print("‚úÖ SCAN COMPLETE")

üìÇ Loading Graph from ./models/knowledge_graph_clean.pkl...
‚úÖ Graph Loaded. Nodes: 294,133 | Edges: 369,783
------------------------------------------------------------
üìä CONNECTIVITY STATISTICS
   ‚Ä¢ Average Connections per Node: 2.51
   ‚Ä¢ Median Connections:           1.0
   ‚Ä¢ Max Degree (The King Node):   1904
   ‚Ä¢ Isolated Nodes (Degree=0):    0
------------------------------------------------------------
üõë SUPER-NODE THRESHOLDS
   ‚Ä¢ Top 5% Cutoff:   > 6.0 connections
   ‚Ä¢ Top 1% Cutoff:   > 18.0 connections
   ‚Ä¢ Top 0.1% Cutoff: > 143.0 connections

   üëâ RECOMMENDATION: Set SUPER_NODE_THRESHOLD = 18
------------------------------------------------------------
üí£ LIST OF POTENTIAL 'BLACK HOLES' (Top 2920 Nodes)
   (These are nodes we might need to blacklist or filter)
                   Entity  Connections
0           United States         1904
1                  France          976
2                  London          874
3                   Egypt        