In [1]:
import spacy
from fastcoref import FCoref
import nltk
from nltk.corpus import wordnet
import os
import torch # For device check
from collections import defaultdict
import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration ---
spacy_model = "en_core_web_lg"
# Try the potentially more accurate LingMess model
fastcoref_model_name = "biu-nlp/f-coref"
# fastcoref_model_name = 'biu-nlp/f-coref' # Alternative faster model

# --- Download NLTK WordNet Data (if needed) ---
try:
    wordnet.synsets('computer')
    print("WordNet data found.")
except LookupError:
    print("Downloading NLTK WordNet data...")
    nltk.download('wordnet')
    print("WordNet download complete.")
except Exception as e:
    print(f"An error occurred during WordNet check/download: {e}")


# --- Helper Functions ---

def get_verb_synset_lemma(verb_token):
    """Gets the first lemma of the first WordNet synset for a verb token."""
    if not verb_token or verb_token.pos_ != "VERB":
        return None
    verb_lemma = verb_token.lemma_
    try:
        synsets = wordnet.synsets(verb_lemma, pos=wordnet.VERB)
        if synsets:
            # Use the first lemma of the most common sense
            normalized = synsets[0].lemmas()[0].name()
            return normalized.replace('_', '').upper() # Use uppercase for convention
        else:
            # Fallback to the original lemma if not found
            return verb_lemma.upper()
    except Exception as e:
        print(f"WARN: Error normalizing verb '{verb_lemma}' with WordNet: {e}")
        return verb_lemma.upper() # Fallback on error

def build_coref_map(text, clusters_indices):
    """
    Builds an initial map from the start character index of a mention
    to the raw text of the representative mention in its cluster.
    """
    char_coref_mapping = {}
    if not clusters_indices:
        return char_coref_mapping

    for cluster in clusters_indices:
        if not cluster: continue
        rep_start_char, rep_end_char = cluster[0]
        # Store the raw text initially
        rep_text = text[rep_start_char:rep_end_char].strip()
        for mention_start_char, mention_end_char in cluster:
            char_coref_mapping[mention_start_char] = rep_text
    return char_coref_mapping

def refine_coref_map_with_entities(initial_char_coref_map, initial_entities):
    """
    Refines the coref map by replacing long/descriptive representative mentions
    with shorter NER entity names where possible.
    initial_entities: dict {entity_text: label} from initial NER pass
    """
    refined_map = initial_char_coref_map.copy()
    # Prefer shorter, known entity names
    entity_names = sorted(list(initial_entities.keys()), key=len) # Sort by length ascending

    long_name_to_short = {}

    # Find potential short names for long resolved names currently in the map values
    current_rep_texts = set(initial_char_coref_map.values())
    for long_name in current_rep_texts:
         # Heuristic: check if name seems too long/descriptive or isn't in initial entities
        if (',' in long_name or len(long_name) > 30) or (long_name not in initial_entities):
            best_match = None
            # Find the shortest entity name that's part of the long name
            for short_name in entity_names:
                # Check containment, ensure it's a meaningful part (optional enhancement)
                if short_name in long_name and len(short_name) >= 3:
                    best_match = short_name # Found shortest NER name within long name
                    break # Take the first (shortest) match
            if best_match:
                long_name_to_short[long_name] = best_match

    # Create the new map using the refined names
    for char_idx, mapped_text in initial_char_coref_map.items():
        # If the current mapped text has a shorter version identified, use it
        refined_map[char_idx] = long_name_to_short.get(mapped_text, mapped_text)

    return refined_map

def get_span_resolved_text(span, refined_char_coref_map):
    """
    Resolves the text of a spaCy Span using the refined coref map,
    checking the span's start character index. Falls back to span text.
    """
    if not span: return None
    return refined_char_coref_map.get(span.start_char, span.text.strip())

def get_token_resolved_text(token, refined_char_coref_map):
     """
     Resolves the text for a single token - less preferred than resolving spans.
     """
     if not token: return None
     return refined_char_coref_map.get(token.idx, token.text.strip())

def get_entity_span_for_token(token, doc):
    """Finds the encompassing NER entity span for a token, if any."""
    for ent in doc.ents:
        if token.i >= ent.start and token.i < ent.end:
            return ent
    return None

def extract_relation_attributes(token):
    """
    Extracts attributes (time, location, manner) linked to a verb/action token
    by checking its children in the dependency tree.
    """
    attributes = {}
    for child in token.children:
        # Time: Look for prepositions like "in", "on", "at" governing DATE entities
        if child.dep_ == 'prep' and child.text.lower() in ['in', 'on', 'at', 'during', 'since', 'until']:
            for grandchild in child.children:
                if grandchild.dep_ == 'pobj':
                     ent_span = get_entity_span_for_token(grandchild, grandchild.doc)
                     if ent_span and ent_span.label_ == 'DATE':
                         attributes['time'] = ent_span.text.strip()
                         break # Found time
                     elif grandchild.like_num: # Handle simple numbers like years
                          attributes['time'] = grandchild.text.strip()
                          break

        # Location: Look for prepositions governing GPE/LOC entities
        elif child.dep_ == 'prep' and child.text.lower() in ['in', 'at', 'on', 'near', 'from', 'to']:
             for grandchild in child.children:
                 if grandchild.dep_ == 'pobj':
                    ent_span = get_entity_span_for_token(grandchild, grandchild.doc)
                    if ent_span and ent_span.label_ in ['GPE', 'LOC', 'FAC']:
                        attributes['location'] = ent_span.text.strip()
                        break # Found location

        # Manner: Look for adverbial modifiers
        elif child.dep_ == 'advmod':
            attributes.setdefault('manner', []).append(child.text.strip())

    # Combine manner adverbs if multiple found
    if 'manner' in attributes:
        attributes['manner'] = " ".join(attributes['manner'])

    return attributes

def extract_enhanced_relationships(doc, refined_char_coref_map):
    """
    Extracts relationships with improved entity span resolution,
    multi-word relation extraction, and attribute extraction.
    """
    relationships = []
    processed_verbs = set() # Avoid processing compound verbs multiple times

    for token in doc:
        if token.i in processed_verbs: continue

        relation_info = None

        # --- Trigger based on Verbs ---
        if token.pos_ == "VERB":
            # Handle auxiliary verbs and passive voice
            subject_token = None
            object_token = None
            passive = False
            verb_phrase_tokens = [token] # Start with current verb

            # Find subject (nsubj or nsubjpass)
            for child in token.children:
                if "subj" in child.dep_:
                    subject_token = child
                    if child.dep_ == "nsubjpass":
                        passive = True
                    break # Assume one subject per verb for simplicity here

            # If no subject attached to this verb, check its head (auxiliary case)
            if not subject_token and token.dep_ in ["aux", "auxpass", "xcomp", "ccomp", "advcl"]:
                 verb_head = token.head
                 if verb_head.pos_ == "VERB":
                     # Mark related verb tokens to avoid re-processing
                     processed_verbs.add(verb_head.i)
                     verb_phrase_tokens.insert(0, verb_head) # Add head verb
                     # Find subject attached to the head verb
                     for child in verb_head.children:
                         if "subj" in child.dep_:
                            subject_token = child
                            if child.dep_ == "nsubjpass": passive = True
                            break

            if not subject_token: continue # Cannot form relationship without subject

            # Determine the main verb for relation name and attributes
            main_verb_token = token if token.dep_ not in ["aux", "auxpass"] else token.head
            if main_verb_token.pos_ != "VERB": continue # Ensure it's still a verb

            # Find object(s) - dobj, pobj (via prep), attr, oprd, agent (passive)
            prep_text = None
            for child in token.children:
                if child.dep_ == "dobj":
                    object_token = child
                    break
                elif child.dep_ == "attr": # Attribute link (e.g., "is CEO")
                    object_token = child
                    break
                elif child.dep_ == "oprd": # Object predicate
                     object_token = child
                     break
                elif child.dep_ == "prep": # Prepositional object
                    prep_text = child.text.lower()
                    for grandchild in child.children:
                        if grandchild.dep_ == "pobj":
                             object_token = grandchild
                             break # Take first pobj
                    if object_token: break
                elif passive and child.dep_ == "agent": # Agent in passive voice ("by X")
                    for grandchild in child.children:
                        if grandchild.dep_ == "pobj":
                            # In passive, the agent becomes the logical subject
                            object_token = subject_token # Original subject is logical object
                            subject_token = grandchild # Agent is logical subject
                            break
                    if subject_token: break


            if not object_token: continue # Require subject and object

            # --- Resolve Entities for the full spans ---
            subj_span = get_entity_span_for_token(subject_token, doc) or subject_token.sent # Fallback to sentence span? No, use token/chunk
            if not subj_span: subj_span = next(subject_token.subtree, subject_token) # Basic chunk idea

            obj_span = get_entity_span_for_token(object_token, doc) or object_token.sent
            if not obj_span: obj_span = next(object_token.subtree, object_token) # Basic chunk idea

            resolved_subj = get_span_resolved_text(subj_span if isinstance(subj_span, spacy.tokens.Span) else None, refined_char_coref_map) or get_token_resolved_text(subject_token, refined_char_coref_map)
            resolved_obj = get_span_resolved_text(obj_span if isinstance(obj_span, spacy.tokens.Span) else None, refined_char_coref_map) or get_token_resolved_text(object_token, refined_char_coref_map)


            # --- Determine Relation Phrase ---
            relation_phrase = get_verb_synset_lemma(main_verb_token) or main_verb_token.lemma_.upper()
            # Enhance with prepositions or particles if present
            if prep_text:
                relation_phrase = f"{relation_phrase}_{prep_text.upper()}"
            else:
                 for child in main_verb_token.children:
                      if child.dep_ == 'prt': # Particle, e.g., "set up"
                           relation_phrase = f"{relation_phrase}_{child.text.upper()}"
                           break


            # --- Extract Attributes ---
            attributes = extract_relation_attributes(main_verb_token)

            # Add the relationship
            if resolved_subj and resolved_obj and resolved_subj.lower() != resolved_obj.lower():
                relation_info = (resolved_subj, relation_phrase, resolved_obj, attributes)
                print(f"  Extracted Rel: ({resolved_subj}) --[{relation_phrase}]--> ({resolved_obj}) | Attrs: {attributes}")
                relationships.append(relation_info)

    # Add rules for specific verbs like 'founded', 'co-founded', 'acquired' if needed

    return relationships


def extract_final_entities(doc, refined_char_coref_map):
    """
    Extracts Named Entities and resolves their names using the *refined* coref map.
    Returns dict: {resolved_entity_name: entity_label}
    """
    entities = {}
    print("\nFinal Entities (NER + Refined Coref):")
    for ent in doc.ents:
        resolved_name = refined_char_coref_map.get(ent.start_char, ent.text.strip())
        label = ent.label_
        print(f"  - Original: '{ent.text}' ({label}), StartChar: {ent.start_char} -> Resolved: '{resolved_name}'")

        if resolved_name not in entities:
            entities[resolved_name] = label
        elif entities[resolved_name] != label:
             print(f"    WARN: Conflicting labels for '{resolved_name}'. Keeping '{entities[resolved_name]}', ignoring new label '{label}'.")

    return entities


# --- Main Pipeline Function ---

def process_text_to_graph_info_v2(text, spacy_nlp, fastcoref_model):
    """
    Revised pipeline implementing improvements.
    """
    print(f"\n--- Processing Text ---")
    print(text.strip()) # Use stripped text

    # 1. Process with spaCy
    print("\n1. Running spaCy NLP pipeline...")
    doc = spacy_nlp(text.strip()) # Process stripped text

    # 2. Initial NER Extraction (for refining coref map)
    print("\n2. Extracting initial NER entities...")
    initial_entities = {ent.text.strip(): ent.label_ for ent in doc.ents}
    print(f"   Initial entities: {initial_entities}")

    # 3. Run Coreference Resolution
    print("\n3. Running FastCoref pipeline...")
    preds = fastcoref_model.predict(texts=[text.strip()])
    result = preds[0]
    clusters_indices = result.get_clusters(as_strings=False)
    clusters_strings = result.get_clusters(as_strings=True) # For logging
    print(f"   Coref Clusters (Indices): {clusters_indices}")
    print(f"   Coref Clusters (Strings): {clusters_strings}")

    # 4. Build Initial Coref Map
    print("\n4. Building initial coreference map...")
    initial_char_coref_map = build_coref_map(text.strip(), clusters_indices)
    print(f"   Initial Coref Map: {initial_char_coref_map}")

    # 5. Refine Coref Map
    print("\n5. Refining coreference map...")
    refined_char_coref_map = refine_coref_map_with_entities(initial_char_coref_map, initial_entities)
    print(f"   Refined Coref Map: {refined_char_coref_map}")

    # 6. Extract Enhanced Relationships
    print("\n6. Extracting enhanced relationships...")
    relationships = extract_enhanced_relationships(doc, refined_char_coref_map)

    # 7. Extract Final Entities using Refined Map
    print("\n7. Extracting final entities...")
    final_entities = extract_final_entities(doc, refined_char_coref_map)

    print("\n--- Processing Complete ---")
    return relationships, final_entities


# --- Main Execution ---

if __name__ == "__main__":
    # --- 1. Load spaCy Model ---
    print(f"Loading spaCy model: {spacy_model}")
    try:
        nlp = spacy.load(spacy_model)
    except OSError:
        print(f"Error loading spaCy model '{spacy_model}'. Download it.")
        exit()

    # --- 2. Initialize FastCoref Model ---
    print("\nInitializing FastCoref model...")
    # Determine device
    if torch.backends.mps.is_available():
        device = 'mps'
        print("Attempting to use MPS (Apple Silicon GPU).")
    elif torch.cuda.is_available():
        device = 'cuda:0'
        print("Attempting to use CUDA GPU.")
    else:
        device = 'cpu'
        print("GPU not available, using CPU.")

    try:
        # Try initializing with the determined device
        fc_model = FCoref(fastcoref_model_name, nlp, device=device)
        print(f"Successfully initialized FastCoref on {device.upper()}.")
    except Exception as e:
        print(f"{device.upper()} initialization failed ({e}). Falling back to CPU.")
        # Fallback to CPU if GPU fails
        device = 'mps'
        fc_model = FCoref(fastcoref_model_name, nlp=nlp, device=device)
        print("Successfully initialized FastCoref on CPU.")


    # --- 3. Define Input Text ---
    text = """
    Apple Inc., founded by Steve Jobs and Steve Wozniak, is based in Cupertino.
    Tim Cook became the CEO of Apple in 2011. He previously worked at IBM.
    Apple produces the popular iPhone. Google, its competitor, makes Android.
    Steve Jobs also co-founded Pixar, which was later acquired by Disney. Bob Iger leads Disney.
    """

    # --- 4. Run the Full Pipeline ---
    extracted_relationships, extracted_entities = process_text_to_graph_info_v2(
        text, nlp, fc_model
    )

    # --- 5. Display Results ---
    print("\n" + "="*30 + " FINAL RESULTS " + "="*30)
    print("\nEntities (Resolved Name: Type):")
    if extracted_entities:
        # Sort for consistent output
        for name in sorted(extracted_entities.keys()):
            print(f"  - {name}: {extracted_entities[name]}")
    else:
        print("  (No entities found)")

    print("\nRelationships (Subject, Verb, Object, Attributes):")
    if extracted_relationships:
        # Sort for consistent output
        extracted_relationships.sort()
        for rel in extracted_relationships:
            subj, verb, obj, attrs = rel
            attr_str = f"| {attrs}" if attrs else ""
            print(f"  - ({subj}) --[{verb}]--> ({obj}) {attr_str}")
    else:
        print("  (No relationships found)")

    print("\nDone.")

WordNet data found.
Loading spaCy model: en_core_web_lg

Initializing FastCoref model...
Attempting to use MPS (Apple Silicon GPU).
MPS initialization failed (FCoref.__init__() got multiple values for argument 'device'). Falling back to CPU.


04/21/2025 16:45:56 - INFO - 	 missing_keys: []
04/21/2025 16:45:56 - INFO - 	 unexpected_keys: []
04/21/2025 16:45:56 - INFO - 	 mismatched_keys: []
04/21/2025 16:45:56 - INFO - 	 error_msgs: []
04/21/2025 16:45:56 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
04/21/2025 16:45:56 - INFO - 	 Tokenize 1 inputs...


Successfully initialized FastCoref on CPU.

--- Processing Text ---
Apple Inc., founded by Steve Jobs and Steve Wozniak, is based in Cupertino.
    Tim Cook became the CEO of Apple in 2011. He previously worked at IBM.
    Apple produces the popular iPhone. Google, its competitor, makes Android.
    Steve Jobs also co-founded Pixar, which was later acquired by Disney. Bob Iger leads Disney.

1. Running spaCy NLP pipeline...

2. Extracting initial NER entities...
   Initial entities: {'Apple Inc.': 'ORG', 'Steve Jobs': 'PERSON', 'Steve Wozniak': 'PERSON', 'Cupertino': 'GPE', 'Tim Cook': 'PERSON', 'Apple': 'ORG', '2011': 'DATE', 'IBM': 'ORG', 'iPhone': 'ORG', 'Google': 'ORG', 'Android': 'ORG', 'Pixar': 'ORG', 'Disney': 'ORG', 'Bob Iger': 'PERSON'}

3. Running FastCoref pipeline...


Map: 100%|██████████| 1/1 [00:00<00:00, 40.70 examples/s]
04/21/2025 16:45:57 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  3.02it/s]

   Coref Clusters (Indices): [[(0, 52), (107, 112), (155, 160), (198, 201)], [(80, 88), (122, 124)], [(23, 33), (233, 243)], [(295, 301), (318, 324)]]
   Coref Clusters (Strings): [['Apple Inc., founded by Steve Jobs and Steve Wozniak,', 'Apple', 'Apple', 'its'], ['Tim Cook', 'He'], ['Steve Jobs', 'Steve Jobs'], ['Disney', 'Disney']]

4. Building initial coreference map...
   Initial Coref Map: {0: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 107: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 155: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 198: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 80: 'Tim Cook', 122: 'Tim Cook', 23: 'Steve Jobs', 233: 'Steve Jobs', 295: 'Disney', 318: 'Disney'}

5. Refining coreference map...
   Refined Coref Map: {0: 'Apple', 107: 'Apple', 155: 'Apple', 198: 'Apple', 80: 'Tim Cook', 122: 'Tim Cook', 23: 'Steve Jobs', 233: 'Steve Jobs', 295: 'Disney', 318: 'Disney'}

6. Extracting enhanced relationships...
  Extract




In [3]:
from neo4j import GraphDatabase
import spacy

# 1. Neo4j Connection Details
uri = "bolt://localhost:7687"  # Default Bolt port
username = "neo4j"  # Or your username
password = "thesith123"  # Replace with the password you set

# 2. Initialize Neo4j Driver
driver = GraphDatabase.driver(uri, auth=(username, password))

# 3. SpaCy Setup
nlp = spacy.load("en_core_web_lg")

In [4]:
def test_connection(uri, username, password):
    driver = GraphDatabase.driver(uri, auth=(username, password))
    result = None
    try:
        def create_test_node(tx):
            query = "CREATE (test:TestNode {message: 'Hello from Python'}) RETURN test.message AS message"
            result = tx.run(query).single()
            return result['message'] if result else None

        with driver.session() as session:
            message = session.execute_write(create_test_node)
            if message == 'Hello from Python':
                print("Successfully created and retrieved a test node from Neo4j!")
            else:
                print("Failed to retrieve the expected message.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.close()

if __name__ == '__main__':
    test_connection(uri, username, password)

Successfully created and retrieved a test node from Neo4j!


In [5]:
def create_graph(uri, username, password, entities, relationships):
    """
    Connects to Neo4j and inserts entities and relationships.
    """
    driver = GraphDatabase.driver(uri, auth=(username, password))
    
    def create_nodes_and_relationships(tx):
        # Create entity nodes
        entity_nodes = {} # Stores mapping from entity name (string) to Neo4j Node object
        print("\n--- create_nodes_and_relationships: Creating Entity Nodes ---")
        print(f"Entities: {entities}")
        for entity_name, entity_type in entities.items():
            print(f"\nProcessing entity: name='{entity_name}', type='{entity_type}'")
            # MERGE finds or creates the node. RETURN n gives us the node back.
            query = "MERGE (n:{type} {{name: $name}}) RETURN n".format(type=entity_type)
            print(f"Executing query: '{query}' with parameters: {{'name': '{entity_name}'}}")
            try:
                result = tx.run(query, name=entity_name)
                
                # Use single() directly to get the record
                record = result.single() 
                
                if record:
                    # Access the node using the alias 'n' from the RETURN clause
                    node_object = record['n'] 
                    # Store the actual Neo4j Node object in the dictionary
                    entity_nodes[entity_name] = node_object 
                    # You can print node details if needed for confirmation
                    print(f"Stored node: ID={node_object.id}, Labels={list(node_object.labels)}, Properties={dict(node_object)}")
                else:
                    # This could happen if the transaction somehow failed before commit,
                    # or if MERGE failed constraints, but it's less likely for this simple query.
                    print(f"WARN: No record returned from MERGE query for entity '{entity_name}'.")
                    
            except Exception as e:
                print(f"ERROR creating/merging node '{entity_name}': {e}")
                raise  # Re-raise the exception to potentially abort the transaction

        print("\n--- create_nodes_and_relationships: Creating Relationships ---")
        print(f"Relationships: {relationships}")
        # The relationship creation part should now work because entity_nodes will be populated correctly
        for subj, verb, obj, attrs in relationships:
            print(f"\nProcessing relationship: subj='{subj}', verb='{verb}', obj='{obj}', attrs='{attrs}'")
            # Ensure both subject and object entities were successfully retrieved/created
            if subj in entity_nodes and obj in entity_nodes:
                # Get the actual Node objects from the dictionary
                subj_node = entity_nodes[subj]
                obj_node = entity_nodes[obj]
                
                # Use internal IDs to match the nodes efficiently
                query = """
                MATCH (a) WHERE id(a) = $subject_id
                MATCH (b) WHERE id(b) = $object_id
                MERGE (a)-[r:{relation}]->(b)
                ON CREATE SET r = $attributes
                ON MATCH SET r += $attributes 
                """.format(relation=verb) # Use MERGE for relationships too if you want to avoid duplicates
                
                print(f"Executing query: MATCH (a) WHERE id(a)={subj_node.id} MATCH (b) WHERE id(b)={obj_node.id} ...")
                try:
                    tx.run(
                        query,
                        subject_id=subj_node.id, # Pass the internal ID
                        object_id=obj_node.id,   # Pass the internal ID
                        attributes=attrs if attrs else {} # Ensure attributes is a dict
                    )
                    print("Relationship merged/created successfully")
                except Exception as e:
                    print(f"ERROR creating relationship ({subj})-[{verb}]->({obj}): {e}")
                    raise # Re-raise to potentially abort transaction
            else:
                # Print which entity was missing
                missing = []
                if subj not in entity_nodes: missing.append(subj)
                if obj not in entity_nodes: missing.append(obj)
                print(f"WARN: Skipping relationship ({subj})--[{verb}]-->({obj}) because entity(ies) not found in entity_nodes dict: {', '.join(missing)}")

    # The rest of your function (session handling, driver close) remains the same
    with driver.session() as session:
        session.execute_write(create_nodes_and_relationships)

    driver.close()
    print("\nNeo4j operation finished.") # Changed message slightly
        
# The rest of your existing code (import statements, helper functions,
# process_text_to_graph_info_v2 function, and the main execution block
# up to the point where extracted_relationships and extracted_entities are defined)
# goes here.

if __name__ == "__main__":
    # --- 1. Load spaCy Model ---
    print(f"Loading spaCy model: {spacy_model}")
    try:
        nlp = spacy.load(spacy_model)
    except OSError:
        print(f"Error loading spaCy model '{spacy_model}'. Download it.")
        exit()

    # --- 2. Initialize FastCoref Model ---
    print("\nInitializing FastCoref model...")
    # Determine device
    if torch.backends.mps.is_available():
        device = 'mps'
        print("Attempting to use MPS (Apple Silicon GPU).")
    elif torch.cuda.is_available():
        device = 'cuda:0'
        print("Attempting to use CUDA GPU.")
    else:
        device = 'cpu'
        print("GPU not available, using CPU.")

    try:
        # Try initializing with the determined device
        fc_model = FCoref(fastcoref_model_name, nlp, device=device)
        print(f"Successfully initialized FastCoref on {device.upper()}.")
    except Exception as e:
        print(f"{device.upper()} initialization failed ({e}). Falling back to CPU.")
        # Fallback to CPU if GPU fails
        device = 'mps'
        fc_model = FCoref(fastcoref_model_name, nlp=nlp, device=device)
        print("Successfully initialized FastCoref on CPU.")


    # --- 3. Define Input Text ---
    text = """
    Apple Inc., founded by Steve Jobs and Steve Wozniak, is based in Cupertino.
    Tim Cook became the CEO of Apple in 2011. He previously worked at IBM.
    Apple produces the popular iPhone. Google, its competitor, makes Android.
    Steve Jobs also co-founded Pixar, which was later acquired by Disney. Bob Iger leads Disney.
    """

    # --- 4. Run the Full Pipeline ---
    extracted_relationships, extracted_entities = process_text_to_graph_info_v2(
        text, nlp, fc_model
    )

    # --- 5. Display Results ---
    print("\n" + "="*30 + " FINAL RESULTS " + "="*30)
    print("\nEntities (Resolved Name: Type):")
    if extracted_entities:
        # Sort for consistent output
        for name in sorted(extracted_entities.keys()):
            print(f"  - {name}: {extracted_entities[name]}")
    else:
        print("  (No entities found)")

    print("\nRelationships (Subject, Verb, Object, Attributes):")
    if extracted_relationships:
        # Sort for consistent output
        extracted_relationships.sort()
        for rel in extracted_relationships:
            subj, verb, obj, attrs = rel
            attr_str = f"| {attrs}" if attrs else ""
            print(f"  - ({subj}) --[{verb}]--> ({obj}) {attr_str}")
    else:
        print("  (No relationships found)")

    # --- 6. Insert into Neo4j ---
    print("EE:")
    print(extracted_entities)
    create_graph(uri, username, password, extracted_entities, extracted_relationships)

    print("\nDone.")


Loading spaCy model: en_core_web_lg

Initializing FastCoref model...
Attempting to use MPS (Apple Silicon GPU).
MPS initialization failed (FCoref.__init__() got multiple values for argument 'device'). Falling back to CPU.


04/21/2025 16:45:59 - INFO - 	 missing_keys: []
04/21/2025 16:45:59 - INFO - 	 unexpected_keys: []
04/21/2025 16:45:59 - INFO - 	 mismatched_keys: []
04/21/2025 16:45:59 - INFO - 	 error_msgs: []
04/21/2025 16:45:59 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
04/21/2025 16:46:00 - INFO - 	 Tokenize 1 inputs...


Successfully initialized FastCoref on CPU.

--- Processing Text ---
Apple Inc., founded by Steve Jobs and Steve Wozniak, is based in Cupertino.
    Tim Cook became the CEO of Apple in 2011. He previously worked at IBM.
    Apple produces the popular iPhone. Google, its competitor, makes Android.
    Steve Jobs also co-founded Pixar, which was later acquired by Disney. Bob Iger leads Disney.

1. Running spaCy NLP pipeline...

2. Extracting initial NER entities...
   Initial entities: {'Apple Inc.': 'ORG', 'Steve Jobs': 'PERSON', 'Steve Wozniak': 'PERSON', 'Cupertino': 'GPE', 'Tim Cook': 'PERSON', 'Apple': 'ORG', '2011': 'DATE', 'IBM': 'ORG', 'iPhone': 'ORG', 'Google': 'ORG', 'Android': 'ORG', 'Pixar': 'ORG', 'Disney': 'ORG', 'Bob Iger': 'PERSON'}

3. Running FastCoref pipeline...


Map: 100%|██████████| 1/1 [00:00<00:00, 98.59 examples/s]
04/21/2025 16:46:01 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 21.98it/s]
  print(f"Stored node: ID={node_object.id}, Labels={list(node_object.labels)}, Properties={dict(node_object)}")
  print(f"Executing query: MATCH (a) WHERE id(a)={subj_node.id} MATCH (b) WHERE id(b)={obj_node.id} ...")
  subject_id=subj_node.id, # Pass the internal ID
  object_id=obj_node.id,   # Pass the internal ID


   Coref Clusters (Indices): [[(0, 52), (107, 112), (155, 160), (198, 201)], [(80, 88), (122, 124)], [(23, 33), (233, 243)], [(295, 301), (318, 324)]]
   Coref Clusters (Strings): [['Apple Inc., founded by Steve Jobs and Steve Wozniak,', 'Apple', 'Apple', 'its'], ['Tim Cook', 'He'], ['Steve Jobs', 'Steve Jobs'], ['Disney', 'Disney']]

4. Building initial coreference map...
   Initial Coref Map: {0: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 107: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 155: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 198: 'Apple Inc., founded by Steve Jobs and Steve Wozniak,', 80: 'Tim Cook', 122: 'Tim Cook', 23: 'Steve Jobs', 233: 'Steve Jobs', 295: 'Disney', 318: 'Disney'}

5. Refining coreference map...
   Refined Coref Map: {0: 'Apple', 107: 'Apple', 155: 'Apple', 198: 'Apple', 80: 'Tim Cook', 122: 'Tim Cook', 23: 'Steve Jobs', 233: 'Steve Jobs', 295: 'Disney', 318: 'Disney'}

6. Extracting enhanced relationships...
  Extract



Relationship merged/created successfully

Processing relationship: subj='Disney', verb='GET', obj='Steve Jobs', attrs='{'manner': 'later'}'
Executing query: MATCH (a) WHERE id(a)=13 MATCH (b) WHERE id(b)=3 ...
Relationship merged/created successfully

Processing relationship: subj='Google', verb='MAKE', obj='Android', attrs='{}'
Executing query: MATCH (a) WHERE id(a)=10 MATCH (b) WHERE id(b)=11 ...
Relationship merged/created successfully

Processing relationship: subj='Tim Cook', verb='WORK_AT', obj='IBM', attrs='{'manner': 'previously'}'
Executing query: MATCH (a) WHERE id(a)=6 MATCH (b) WHERE id(b)=8 ...
Relationship merged/created successfully

Neo4j operation finished.

Done.


In [6]:
def get_entity_relationships(uri, username, password, entity_name):
    """
    Queries Neo4j to find all relationships connected to a specific entity,
    identified by its 'name' property.

    Args:
        uri (str): The connection URI for the Neo4j database.
        username (str): The username for database authentication.
        password (str): The password for database authentication.
        entity_name (str): The value of the 'name' property of the entity
                           to query relationships for.

    Returns:
        list: A list of dictionaries, where each dictionary represents a
              relationship and contains:
              - 'start_node': Properties (dict) of the start node.
              - 'relationship_type': The type (str) of the relationship.
              - 'relationship_props': Properties (dict) of the relationship itself.
              - 'end_node': Properties (dict) of the end node.
              Returns an empty list if the entity is not found or has no
              relationships.
    """
    driver = GraphDatabase.driver(uri, auth=(username, password))
    relationships_list = []

    # Internal function to run within a transaction
    def _find_relationships_tx(tx, name):
        # This Cypher query finds the entity 'e' by its name.
        # Then it matches any relationship '[r]' connected to 'e',
        # regardless of direction (which is why there are no arrows: -[r]-).
        # 'other' represents the node at the other end of the relationship.
        # We RETURN the actual start node, the relationship itself, and the actual end node.
        query = """
        MATCH (e {name: $entity_name})-[r]-(other)
        RETURN startNode(r) as start_node, r as relationship, endNode(r) as end_node
        """
        print(f"\nExecuting query: {query.strip()} with params: {{'entity_name': '{name}'}}")
        result = tx.run(query, entity_name=name)

        # Process each record (relationship) found
        found_relationships = []
        for record in result:
            start_node = record['start_node']
            relationship = record['relationship']
            end_node = record['end_node']

            # Store the details in a dictionary
            found_relationships.append({
                # Convert node/relationship properties to standard dictionaries
                'start_node': dict(start_node.items()),
                'relationship_type': relationship.type,
                'relationship_props': dict(relationship.items()),
                'end_node': dict(end_node.items())
            })
        return found_relationships

    try:
        # Use a read transaction as we are only querying data
        with driver.session() as session:
            relationships_list = session.execute_read(_find_relationships_tx, entity_name)
        print(f"Query finished. Found {len(relationships_list)} relationships for '{entity_name}'.")
    except Exception as e:
        print(f"An error occurred querying relationships for '{entity_name}': {e}")
    finally:
        # Ensure the driver connection is closed
        driver.close()

    return relationships_list


# --- Example Usage ---
if __name__ == "__main__":


    entities_to_query = ["Apple", "Steve Jobs", "Disney", "NonExistentEntity"]

    for entity in entities_to_query:
        print("\n" + "="*40)
        print(f"Querying relationships for: '{entity}'")
        print("="*40)
        found_rels = get_entity_relationships(uri, username, password, entity)

        if found_rels:
            # Pretty print the list of relationship dictionaries
            print(json.dumps(found_rels, indent=2))

            # Or print in a more readable graph-like format:
            print("\n--- Graph-like format ---")
            for rel in found_rels:
                start = rel['start_node'].get('name', 'Unknown') # Use .get for safety
                end = rel['end_node'].get('name', 'Unknown')
                rel_type = rel['relationship_type']
                rel_props = rel['relationship_props']
                prop_str = f" {rel_props}" if rel_props else "" # Show props if they exist

                # Check if the queried entity is the start or end node to show directionality clearly
                if start == entity:
                    print(f"  ({start}) -[{rel_type}{prop_str}]-> ({end})")
                elif end == entity:
                     print(f"  ({start}) <-[{rel_type}{prop_str}]- ({end})")
                else:
                     # Should not happen with the query used, but good for robustness
                     print(f"  ({start}) -[{rel_type}{prop_str}]- ({end})  (entity '{entity}' involved but direction unclear?)")

        else:
            print(f"No relationships found for entity '{entity}' (or entity does not exist).")


Querying relationships for: 'Apple'

Executing query: MATCH (e {name: $entity_name})-[r]-(other)
        RETURN startNode(r) as start_node, r as relationship, endNode(r) as end_node with params: {'entity_name': 'Apple'}
Query finished. Found 2 relationships for 'Apple'.
[
  {
    "start_node": {
      "name": "Apple"
    },
    "relationship_type": "ESTABLISH_IN",
    "relationship_props": {},
    "end_node": {
      "name": "Cupertino"
    }
  },
  {
    "start_node": {
      "name": "Apple"
    },
    "relationship_type": "PRODUCE",
    "relationship_props": {},
    "end_node": {
      "name": "iPhone"
    }
  }
]

--- Graph-like format ---
  (Apple) -[ESTABLISH_IN]-> (Cupertino)
  (Apple) -[PRODUCE]-> (iPhone)

Querying relationships for: 'Steve Jobs'

Executing query: MATCH (e {name: $entity_name})-[r]-(other)
        RETURN startNode(r) as start_node, r as relationship, endNode(r) as end_node with params: {'entity_name': 'Steve Jobs'}
Query finished. Found 1 relationships for 'St

In [7]:
import spacy
import torch
from fastcoref import FCoref
import nltk
from nltk.corpus import wordnet
import logging # Import logging

# --- Configuration ---
spacy_model = "en_core_web_lg"
# Try the potentially more accurate LingMess model
fastcoref_model_name = "biu-nlp/f-coref"
# fastcoref_model_name = 'lingmess-coref' # Alternative potentially more accurate but slower

# --- Setup Logging ---
# Replace print statements with logging for better control
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Download NLTK WordNet Data (if needed) ---
try:
    wordnet.synsets('computer')
    logging.info("WordNet data found.")
except LookupError:
    logging.info("Downloading NLTK WordNet data...")
    nltk.download('wordnet')
    logging.info("WordNet download complete.")
except Exception as e:
    logging.error(f"An error occurred during WordNet check/download: {e}")
    # Decide if you want to exit or continue without WordNet
    # exit()


# --- Helper Functions ---

def get_verb_synset_lemma(verb_token):
    """Gets the first lemma of the first WordNet synset for a verb token."""
    if not verb_token or verb_token.pos_ != "VERB":
        return None
    verb_lemma = verb_token.lemma_
    try:
        synsets = wordnet.synsets(verb_lemma, pos=wordnet.VERB)
        if synsets:
            # Use the first lemma of the most common sense
            normalized = synsets[0].lemmas()[0].name()
            # Use uppercase and replace underscores for Neo4j convention
            return normalized.replace('_', '').upper()
        else:
            # Fallback to the original lemma if not found in WordNet
            return verb_lemma.upper()
    except Exception as e:
        logging.warning(f"Error normalizing verb '{verb_lemma}' with WordNet: {e}. Falling back to lemma.")
        return verb_lemma.upper() # Fallback on error

def build_coref_map(text, clusters_indices):
    """
    Builds an initial map from the start character index of a mention
    to the raw text of the representative mention in its cluster.
    """
    char_coref_mapping = {}
    if not clusters_indices:
        return char_coref_mapping

    for cluster in clusters_indices:
        if not cluster: continue
        # The first mention in the cluster is usually the representative one
        rep_start_char, rep_end_char = cluster[0]
        # Store the raw text initially
        rep_text = text[rep_start_char:rep_end_char].strip()
        for mention_start_char, mention_end_char in cluster:
            char_coref_mapping[mention_start_char] = rep_text
    return char_coref_mapping

def refine_coref_map_with_entities(initial_char_coref_map, initial_entities):
    """
    Refines the coref map by replacing representative mentions
    with known NER entity names where possible, prioritizing exact matches
    and then shorter entity names contained within longer mentions.

    Args:
        initial_char_coref_map (dict): Map {start_char: representative_mention_text}
        initial_entities (dict): Map {entity_text: label} from initial NER pass

    Returns:
        dict: Refined map {start_char: resolved_entity_name_or_original_rep_text}
    """
    refined_map = initial_char_coref_map.copy()
    representative_texts = set(initial_char_coref_map.values())
    entity_texts_set = set(initial_entities.keys())
    # Sort known entities by length (shortest first) to prefer shorter names
    sorted_entity_texts = sorted(list(entity_texts_set), key=len)

    rep_text_to_resolved_name = {}

    for rep_text in representative_texts:
        resolved_name = rep_text # Default to original representative text

        # Priority 1: Check if the representative text itself is a known entity
        if rep_text in entity_texts_set:
            resolved_name = rep_text # Already a good name
        else:
            # Priority 2: Find the shortest known entity contained within the representative text
            best_match = None
            for short_entity_name in sorted_entity_texts:
                 # Check for containment, ensure it's a meaningful substring
                 # (e.g., check word boundaries or length threshold)
                if short_entity_name in rep_text and len(short_entity_name) >= 3: # Basic check
                    # More robust check: Use word boundaries?
                    # import re
                    # if re.search(r'\b' + re.escape(short_entity_name) + r'\b', rep_text):
                    best_match = short_entity_name
                    break # Found the shortest contained entity, use it
            if best_match:
                resolved_name = best_match
            # else: resolved_name remains the original rep_text

        rep_text_to_resolved_name[rep_text] = resolved_name

    # Create the final refined map using the resolved names
    final_refined_map = {}
    for char_idx, original_rep_text in initial_char_coref_map.items():
        final_refined_map[char_idx] = rep_text_to_resolved_name.get(original_rep_text, original_rep_text)

    return final_refined_map


def get_resolved_text_for_span(span, refined_char_coref_map):
    """
    Resolves the text of a spaCy Span using the refined coref map,
    checking the span's start character index. Falls back to span text.
    """
    if not span: return None
    # Use start_char as the key for the coref map
    return refined_char_coref_map.get(span.start_char, span.text.strip())

def get_resolved_text_for_token(token, refined_char_coref_map):
     """
     Resolves the text for a single token using the coref map (less preferred than resolving spans).
     Falls back to token text.
     """
     if not token: return None
     # Use token's character index (idx) as the key
     return refined_char_coref_map.get(token.idx, token.text.strip())


def get_entity_span_for_token(token, doc):
    """Finds the encompassing NER entity span for a token, if any."""
    # Check if the token itself is part of an entity
    if token.ent_type_:
        # Find the span this token belongs to
        for ent in doc.ents:
             # Check token index range inclusion
            if token.i >= ent.start and token.i < ent.end:
                return ent
    return None # Token is not part of any detected entity

def get_best_span_for_token(token, doc):
    """
    Tries to find the best span representation for a token:
    1. The full NER entity span if the token is part of one.
    2. The noun chunk the token belongs to, if any.
    3. The token itself as a fallback.
    """
    ent_span = get_entity_span_for_token(token, doc)
    if ent_span:
        return ent_span

    # Check if the token is part of a noun chunk
    # Note: Noun chunks might span across multiple tokens
    # We want the chunk that *contains* this specific token
    for chunk in doc.noun_chunks:
        if token.i >= chunk.start and token.i < chunk.end:
            return chunk # Return the full noun chunk span

    # Fallback to the token itself (will just be its text)
    # Returning the token makes the caller handle text extraction
    return token


def extract_relation_attributes(verb_token):
    """
    Extracts attributes (time, location, manner) linked to a verb/action token
    by checking its children in the dependency tree.
    """
    attributes = {}
    # Also check ancestors for attributes attached higher up (e.g., sentence-level adverbs)
    # Be careful not to go too far up, maybe limit to the sentence?
    tokens_to_check = list(verb_token.children) #+ list(verb_token.ancestors)

    for child in tokens_to_check: # Check children first
        dep = child.dep_
        text_lower = child.text.lower()

        # Time: Look for temporal modifiers (advmod) or prepositions governing DATE/TIME entities or numbers
        if dep == 'advmod' and child.ent_type_ in ['DATE', 'TIME']:
             attributes['time'] = get_best_span_for_token(child, child.doc).text.strip()
        elif dep == 'prep' and text_lower in ['in', 'on', 'at', 'during', 'since', 'until', 'before', 'after']:
            for grandchild in child.children:
                if grandchild.dep_ == 'pobj':
                    pobj_span = get_best_span_for_token(grandchild, grandchild.doc)
                    # Check if the object is a recognized DATE/TIME entity or resembles a time expression
                    if pobj_span.label_ in ['DATE', 'TIME'] if hasattr(pobj_span, 'label_') else False:
                         attributes['time'] = pobj_span.text.strip()
                         break
                    elif grandchild.like_num and grandchild.text.isdigit() and len(grandchild.text) == 4 : # Simple year check
                          attributes['time'] = grandchild.text.strip()
                          break
                    # Add more checks? e.g., phrases like "next week"

        # Location: Look for prepositions governing GPE/LOC/FAC entities or locative advmod
        elif dep == 'advmod' and child.ent_type_ in ['GPE', 'LOC', 'FAC']:
             attributes['location'] = get_best_span_for_token(child, child.doc).text.strip()
        elif dep == 'prep' and text_lower in ['in', 'at', 'on', 'near', 'from', 'to', 'within', 'near', 'based_in']: # Added 'based_in' etc. if needed
             for grandchild in child.children:
                 if grandchild.dep_ == 'pobj':
                    pobj_span = get_best_span_for_token(grandchild, grandchild.doc)
                    if pobj_span.label_ in ['GPE', 'LOC', 'FAC'] if hasattr(pobj_span, 'label_') else False:
                        attributes['location'] = pobj_span.text.strip()
                        break # Found location

        # Manner: Look for adverbial modifiers (advmod) - typically adjectives modifying verbs
        elif dep == 'advmod' and child.pos_ == 'ADV':
            # Append manner adverbs, handle multiple ones
            attributes.setdefault('manner', []).append(child.text.strip())

    # Combine manner adverbs if multiple found
    if 'manner' in attributes:
        attributes['manner'] = " ".join(attributes['manner'])

    return attributes


def extract_enhanced_relationships(doc, refined_char_coref_map):
    """
    Extracts relationships using dependency parsing, coreference resolution,
    and attribute extraction. Attempts to resolve entities to their best representation.
    """
    relationships = []
    processed_verbs = set() # Avoid processing compound verbs multiple times

    logging.info("Starting relationship extraction...")
    for token in doc:
        if token.i in processed_verbs: continue

        # --- Trigger based on Verbs ---
        if token.pos_ == "VERB":
            subject_token = None
            object_token = None
            passive = False
            prep_text = None # Preposition connecting verb and object (e.g., "worked at")
            main_verb_token = token # Assume current token is main verb initially

            # --- Find Subject ---
            # Check children for nominal subjects (nsubj) or passive nominal subjects (nsubjpass)
            potential_subjects = [child for child in token.children if "subj" in child.dep_]
            if potential_subjects:
                subject_token = potential_subjects[0] # Take the first subject found
                if subject_token.dep_ == "nsubjpass":
                    passive = True
            else:
                 # If no subject found attached to this verb, check if it's an auxiliary or part of a clause
                 # and find the subject attached to the main verb it modifies.
                 if token.dep_ in ["aux", "auxpass", "xcomp", "ccomp", "advcl"] and token.head.pos_ == "VERB":
                     main_verb_token = token.head # The head is the main verb
                     # Mark related verb tokens to avoid re-processing them individually
                     processed_verbs.add(main_verb_token.i)
                     for aux_child in token.head.children:
                         if token.i >= aux_child.left_edge.i and token.i <= aux_child.right_edge.i:
                            processed_verbs.add(aux_child.i) # Mark other verbs in the phrase too

                     # Find subject attached to the actual main verb
                     potential_head_subjects = [child for child in main_verb_token.children if "subj" in child.dep_]
                     if potential_head_subjects:
                         subject_token = potential_head_subjects[0]
                         if subject_token.dep_ == "nsubjpass": passive = True


            if not subject_token: continue # Cannot form relationship without a subject

            # Ensure the main verb is correctly identified (not an auxiliary)
            if main_verb_token.dep_ in ["aux", "auxpass"]: continue # Already processed via its head

            # --- Find Object ---
            # Look for direct object (dobj), prepositional object (pobj), attribute (attr),
            # clausal complement (ccomp), open clausal complement (xcomp), or agent in passive voice.
            potential_objects = []
            for child in main_verb_token.children:
                 # Direct object
                 if child.dep_ == "dobj":
                     potential_objects.append((child, None)) # (token, prep_text)
                     break # Prioritize direct object
                 # Attribute/predicate nominative (e.g., "Tim Cook is CEO")
                 elif child.dep_ in ["attr", "oprd"]:
                      potential_objects.append((child, None))
                      break
                 # Clausal complements (can sometimes be treated as objects)
                 # elif child.dep_ in ["ccomp", "xcomp"]:
                 #     # Find the verb or main noun within the complement? More complex.
                 #     potential_objects.append((child, None)) # Simplification
                 #     break
                 # Prepositional Object
                 elif child.dep_ == "prep":
                     prep_token = child
                     for grandchild in prep_token.children:
                         if grandchild.dep_ == "pobj":
                              potential_objects.append((grandchild, prep_token.text.lower()))
                              # Don't break here, could have multiple prep phrases attached to verb
                 # Agent in passive voice ("by X")
                 elif passive and child.dep_ == "agent":
                     for grandchild in child.children:
                          if grandchild.dep_ == "pobj":
                               # In passive: agent is logical subject, original subject is logical object
                               object_token = subject_token # The original subject becomes the object
                               subject_token = grandchild # The agent ('pobj' of 'by') becomes the subject
                               passive = False # Treat as active now with swapped roles
                               potential_objects = [] # Clear previous finds as roles are swapped
                               break # Found the agent, stop searching object for main verb
                     if object_token: break # Agent found and roles swapped

            # Select the primary object if found
            if not object_token and potential_objects: # If roles weren't swapped by passive agent
                 # Prioritize objects: dobj/attr > pobj
                 # This simple logic takes the first found object based on loop order above.
                 # A more sophisticated approach might score or prioritize based on dependency type.
                 object_token, prep_text = potential_objects[0]


            if not object_token: continue # Require both subject and object


            # --- Resolve Entities using best spans and coref map ---
            subj_span = get_best_span_for_token(subject_token, doc)
            obj_span = get_best_span_for_token(object_token, doc)

            # Use the refined coref map preferentially, fall back to span text
            resolved_subj = get_resolved_text_for_span(subj_span, refined_char_coref_map) if isinstance(subj_span, (spacy.tokens.Span, spacy.tokens.Token)) else subject_token.text.strip()
            resolved_obj = get_resolved_text_for_span(obj_span, refined_char_coref_map) if isinstance(obj_span, (spacy.tokens.Span, spacy.tokens.Token)) else object_token.text.strip()


            # If resolution failed or resulted in empty string, try token text directly
            if not resolved_subj: resolved_subj = subject_token.text.strip()
            if not resolved_obj: resolved_obj = object_token.text.strip()


            # --- Determine Relation Phrase ---
            # Normalize the main verb using WordNet
            relation_phrase = get_verb_synset_lemma(main_verb_token) or main_verb_token.lemma_.upper()

            # Enhance relation with preposition (e.g., WORK_AT) or particle (e.g., SET_UP)
            if prep_text:
                relation_phrase = f"{relation_phrase}_{prep_text.upper()}"
            else:
                 # Check for particle attached to the main verb
                 for child in main_verb_token.children:
                      if child.dep_ == 'prt': # Phrasal verb particle
                           relation_phrase = f"{relation_phrase}_{child.text.upper()}"
                           break


            # --- Extract Attributes ---
            # Get attributes associated with the *main* verb token
            attributes = extract_relation_attributes(main_verb_token)

            # --- Add Relationship ---
            # Ensure subject and object are different and non-empty
            if resolved_subj and resolved_obj and resolved_subj.lower() != resolved_obj.lower():
                rel_tuple = (resolved_subj, relation_phrase, resolved_obj, attributes)
                logging.info(f"  Extracted Rel: {rel_tuple}")
                relationships.append(rel_tuple)

    logging.info(f"Finished relationship extraction. Found {len(relationships)} relationships.")
    return relationships


def extract_final_entities(doc, refined_char_coref_map):
    """
    Extracts Named Entities and resolves their names using the *refined* coref map.
    Returns a dictionary: {resolved_entity_name: entity_label}
    Handles potential label conflicts for the same resolved name.
    """
    final_entities = {}
    logging.info("Extracting final entities using refined coref map...")
    for ent in doc.ents:
        # Resolve using the map, fallback to original text
        resolved_name = refined_char_coref_map.get(ent.start_char, ent.text.strip())
        label = ent.label_
        # logging.debug(f"  - Original: '{ent.text}' ({label}), StartChar: {ent.start_char} -> Resolved: '{resolved_name}'")

        # Check for conflicts: if name exists with a different label
        if resolved_name in final_entities:
            existing_label = final_entities[resolved_name]
            if existing_label != label:
                # Conflict resolution strategy:
                # - Keep the first label encountered?
                # - Keep the label from the longer/shorter original span?
                # - Prioritize certain labels (e.g., ORG over PERSON if ambiguous)?
                # Simple strategy: Keep the first one encountered and log a warning.
                logging.warning(f"Conflicting labels for '{resolved_name}'. Keeping existing '{existing_label}', ignoring new label '{label}' from original text '{ent.text}'.")
            # If labels are the same, no action needed.
        else:
            # Add the new entity and its label
            final_entities[resolved_name] = label

    logging.info(f"Finished entity extraction. Found {len(final_entities)} unique final entities.")
    return final_entities


# --- Main Pipeline Function ---

def process_text_to_graph_info(text, spacy_nlp, fastcoref_model):
    """
    Main pipeline to process text and extract entities and relationships.
    """
    logging.info("--- Starting Text Processing Pipeline ---")
    if not text or not text.strip():
        logging.warning("Input text is empty or whitespace only.")
        return [], {}

    stripped_text = text.strip()
    logging.info(f"Processing text: '{stripped_text[:100]}...'") # Log snippet

    # 1. Process with spaCy (NER, POS, Dependency Parsing)
    logging.info("Running spaCy NLP pipeline...")
    doc = spacy_nlp(stripped_text)
    logging.info("spaCy processing complete.")

    # 2. Initial NER Extraction (used to help refine coref)
    logging.info("Extracting initial NER entities...")
    initial_entities = {ent.text.strip(): ent.label_ for ent in doc.ents}
    logging.info(f"Found {len(initial_entities)} initial unique entity texts.")
    # logging.debug(f"Initial entities: {initial_entities}") # Debug level

    # 3. Run Coreference Resolution
    logging.info("Running FastCoref pipeline...")
    # Ensure text matches exactly what spaCy processed if models expect consistency
    try:
        preds = fastcoref_model.predict(texts=[stripped_text])
        result = preds[0]
        clusters_indices = result.get_clusters(as_strings=False)
        # clusters_strings = result.get_clusters(as_strings=True) # For logging if needed
        logging.info(f"FastCoref processing complete. Found {len(clusters_indices)} clusters.")
        # logging.debug(f"Coref Clusters (Indices): {clusters_indices}")
        # logging.debug(f"Coref Clusters (Strings): {clusters_strings}")
    except Exception as e:
        logging.error(f"FastCoref prediction failed: {e}")
        clusters_indices = [] # Continue without coref if it fails


    # 4. Build Initial Coref Map (Character Index -> Representative Mention Text)
    logging.info("Building initial coreference map...")
    initial_char_coref_map = build_coref_map(stripped_text, clusters_indices)
    # logging.debug(f"Initial Coref Map: {initial_char_coref_map}")

    # 5. Refine Coref Map using NER entities
    logging.info("Refining coreference map using NER entities...")
    refined_char_coref_map = refine_coref_map_with_entities(initial_char_coref_map, initial_entities)
    logging.info("Coreference map refined.")
    # logging.debug(f"Refined Coref Map: {refined_char_coref_map}")

    # 6. Extract Relationships using dependency parse and refined coref map
    logging.info("Extracting enhanced relationships...")
    relationships = extract_enhanced_relationships(doc, refined_char_coref_map)

    # 7. Extract Final Entities using the refined coref map
    # This ensures entity nodes in the graph use the resolved names
    logging.info("Extracting final entities with resolved names...")
    final_entities = extract_final_entities(doc, refined_char_coref_map)

    logging.info("--- Text Processing Pipeline Complete ---")
    return relationships, final_entities


# --- Main Execution ---

if __name__ == "__main__":
    # --- 1. Load spaCy Model ---
    logging.info(f"Loading spaCy model: {spacy_model}")
    try:
        # Consider disabling components not strictly needed if memory/speed is critical
        # nlp = spacy.load(spacy_model, disable=['parser'] if only NER needed elsewhere)
        nlp = spacy.load(spacy_model)
        logging.info("spaCy model loaded successfully.")
    except OSError:
        logging.error(f"spaCy model '{spacy_model}' not found. Please download it: python -m spacy download {spacy_model}")
        exit(1)
    except Exception as e:
        logging.error(f"Error loading spaCy model '{spacy_model}': {e}")
        exit(1)


    # --- 2. Initialize FastCoref Model ---
    logging.info("Initializing FastCoref model...")
    device = 'cpu' # Default to CPU
    if torch.backends.mps.is_available():
        try:
            # Test MPS availability properly
            torch.tensor([1], device='mps')
            device = 'mps'
            logging.info("Attempting to use MPS (Apple Silicon GPU).")
        except Exception as e:
            logging.warning(f"MPS device requested but not available or functional ({e}), using CPU.")
            device = 'cpu'
    elif torch.cuda.is_available():
        device = 'cuda:0'
        logging.info("Attempting to use CUDA GPU.")

    try:
        # Pass the loaded spaCy model to FastCoref if it accepts/requires it
        # Check FastCoref documentation for the best way to integrate
        # Some versions might prefer text only, others benefit from pre-tokenization
        fc_model = FCoref(model_name_or_path=fastcoref_model_name, device=device) # Adjust based on FCoref version
        # Older usage might have been: FCoref(fastcoref_model_name, nlp=nlp, device=device)
        logging.info(f"Successfully initialized FastCoref model '{fastcoref_model_name}' on {device.upper()}.")
    except Exception as e:
        logging.error(f"Failed to initialize FastCoref model on {device.upper()}: {e}")
        # Optionally, try falling back to CPU if GPU failed
        if device != 'cpu':
            logging.warning("Attempting fallback to CPU for FastCoref.")
            try:
                device = 'cpu'
                fc_model = FCoref(model_name_or_path=fastcoref_model_name, device=device)
                logging.info(f"Successfully initialized FastCoref model '{fastcoref_model_name}' on CPU.")
            except Exception as e2:
                 logging.error(f"Failed to initialize FastCoref model on CPU as fallback: {e2}")
                 exit(1) # Exit if model cannot be loaded
        else:
             exit(1) # Exit if CPU init failed


    # --- 3. Define Input Text ---
    text = """
    Apple Inc., founded by Steve Jobs and Steve Wozniak in 1976, is based in Cupertino, California.
    Tim Cook became the CEO of Apple in August 2011 after Jobs resigned. He previously worked at IBM Corp.
    Apple produces the popular iPhone smartphone. Its main competitor, Google LLC, makes the Android operating system.
    Steve Jobs also co-founded Pixar Animation Studios, which was later acquired by The Walt Disney Company in 2006. Bob Iger leads Disney currently.
    """

    # --- 4. Run the Full Pipeline ---
    extracted_relationships, extracted_entities = process_text_to_graph_info(
        text, nlp, fc_model
    )

    # --- 5. Display Results ---
    print("\n" + "="*30 + " FINAL RESULTS " + "="*30)
    print("\nEntities (Resolved Name: Type):")
    if extracted_entities:
        # Sort for consistent output
        for name in sorted(extracted_entities.keys()):
            print(f"  - {name}: {extracted_entities[name]}")
    else:
        print("  (No entities found)")

    print("\nRelationships (Subject, Verb, Object, Attributes):")
    if extracted_relationships:
        # Sort for consistent output
        extracted_relationships.sort()
        for rel in extracted_relationships:
            subj, verb, obj, attrs = rel
            attr_str = f"| Attributes: {attrs}" if attrs else ""
            print(f"  - ({subj}) --[{verb}]--> ({obj}) {attr_str}")
    else:
        print("  (No relationships found)")

    print("\nDone.")

04/21/2025 16:46:01 - INFO - 	 WordNet data found.
04/21/2025 16:46:01 - INFO - 	 Loading spaCy model: en_core_web_lg
04/21/2025 16:46:02 - INFO - 	 spaCy model loaded successfully.
04/21/2025 16:46:02 - INFO - 	 Initializing FastCoref model...
04/21/2025 16:46:02 - INFO - 	 Attempting to use MPS (Apple Silicon GPU).
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


04/21/2025 16:46:04 - INFO - 	 missing_keys: []
04/21/2025 16:46:04 - INFO - 	 unexpected_keys: []
04/21/2025 16:46:04 - INFO - 	 mismatched_keys: []
04/21/2025 16:46:04 - INFO - 	 error_msgs: []
04/21/2025 16:46:04 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
04/21/2025 16:46:04 - INFO - 	 Successfully initialized FastCoref model 'biu-nlp/f-coref' on MPS.
04/21/2025 16:46:04 - INFO - 	 --- Starting Text Processing Pipeline ---
04/21/2025 16:46:04 - INFO - 	 Processing text: 'Apple Inc., founded by Steve Jobs and Steve Wozniak in 1976, is based in Cupertino, California.
    ...'
04/21/2025 16:46:04 - INFO - 	 Running spaCy NLP pipeline...
04/21/2025 16:46:04 - INFO - 	 spaCy processing complete.
04/21/2025 16:46:04 - INFO - 	 Extracting initial NER entities...
04/21/2025 16:46:04 - INFO - 	 Found 19 initial unique entity texts.
04/21/2025 16:46:04 - INFO - 	 Running FastCoref pipeline...
04/21/2025 16:46:04 - INFO - 	 Tokenize 1 inputs...
Map: 100%|█████████



Entities (Resolved Name: Type):
  - 1976: ORG
  - 2006: DATE
  - Android: ORG
  - August 2011: DATE
  - Bob Iger: PERSON
  - California: GPE
  - Cupertino: GPE
  - Google LLC: ORG
  - IBM Corp.
    Apple: ORG
  - Pixar Animation Studios: ORG
  - Steve Jobs: PERSON
  - Steve Wozniak: PERSON
  - The Walt Disney Company: ORG
  - Tim Cook: PERSON
  - iPhone: ORG

Relationships (Subject, Verb, Object, Attributes):
  - (1976) --[ESTABLISH_IN]--> (Cupertino) 
  - (1976) --[MAKE]--> (the Android operating system) 
  - (Bob Iger) --[LEAD]--> (The Walt Disney Company) | Attributes: {'manner': 'currently'}
  - (Steve Jobs) --[CO]--> (Pixar Animation Studios) | Attributes: {'manner': 'also'}
  - (The Walt Disney Company) --[GET]--> (which) | Attributes: {'manner': 'later', 'time': '2006'}
  - (Tim Cook) --[BECOME]--> (the CEO) | Attributes: {'time': 'August 2011'}
  - (Tim Cook) --[WORK_AT]--> (IBM Corp.
    Apple) | Attributes: {'manner': 'previously'}

Done.
