### Watson creds & Model parameters

In [None]:
pip install ibm-watson

In [None]:
import os
from ibm_watsonx_ai import APIClient, Credentials
import getpass

# Read the API key with getpass so the secret is not echoed into the notebook output
credentials = Credentials(
    url="https://us-south.ml.cloud.ibm.com",
    api_key=getpass.getpass("Enter your IBM CLoud API Key:"),
)

In [None]:
# Identifier of the foundation model passed to ModelInference below
model_id = "meta-llama/llama-3-405b-instruct"

In [None]:
# Generation settings handed to ModelInference through its `params` argument
parameters = dict(
    frequency_penalty=0,
    max_tokens=2000,
    presence_penalty=0,
    temperature=0,
    top_p=1,
)

In [None]:
# Both IDs are read from the environment; each is None when its variable is unset
project_id = os.environ.get("PROJECT_ID")
space_id = os.environ.get("SPACE_ID")

In [None]:
from ibm_watsonx_ai.foundation_models import ModelInference

# Build the inference client from the model ID, generation parameters,
# credentials, and project/space IDs defined in the cells above
model = ModelInference(
    model_id=model_id,
    params=parameters,
    credentials=credentials,
    project_id=project_id,
    space_id=space_id,
)

### Web page fetching & content cleanup function (extract text from html page)

In [None]:
import requests
import trafilatura

url = input("Enter the Wikipedia URL:")

# Define the name up front so later cells see None instead of raising NameError
# when the page cannot be fetched or no text can be extracted from it
extracted_text = None

downloaded = trafilatura.fetch_url(url)

if downloaded:
    # Main text only: comments and tables are left out of the extraction
    extracted_text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
    if not extracted_text:
        print("Failed to extract text from the downloaded page.")
else:
    print("Failed to fetch the content.")


### Watson Discovery Creds & Client

In [None]:
import getpass

# The API key is a secret: read it with getpass so it is not echoed into the notebook output
DISCOVERY_API_KEY = getpass.getpass("Enter your IBM Watson Discovery API Key:")
DISCOVERY_SERVICE_URL = input("Enter your IBM Watson Discovery Service URL:")
DISCOVERY_PROJECT_ID = input("Enter your IBM Watson Discovery Project ID:")
DISCOVERY_COLLECTION_ID = input("Enter your IBM Watson Discovery Collection ID:")

# The version of the Discovery API to use (recommended to use a recent stable version)
# Check IBM Watson Discovery documentation for the latest recommended version.
DISCOVERY_API_VERSION = "2023-03-31"

print("Watson Discovery configuration variables set.")

In [None]:
from ibm_watson import DiscoveryV2
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# Build the Discovery client; any failure is reported on stdout instead of being raised
try:
    # Authenticator built from the API key collected in the configuration cell
    authenticator = IAMAuthenticator(DISCOVERY_API_KEY)

    # Create the client for the configured API version, then point it at the service URL
    discovery_client = DiscoveryV2(version=DISCOVERY_API_VERSION, authenticator=authenticator)
    discovery_client.set_service_url(DISCOVERY_SERVICE_URL)
except Exception as e:
    print(f"ERROR: Failed to initialize Watson Discovery client: {e}")
    print("Please check your API Key, Service URL, and ensure the SDK is installed correctly.")
else:
    print("Watson Discovery client initialized successfully.")

### Upload content to Watson Discovery for processing

In [None]:
import io
import datetime

def upload_text_to_discovery(text):
    """Upload `text` to the Discovery collection as a UTF-8 plain-text document.

    The content is sent from memory under a timestamp-based filename
    (``doc_YYYYMMDD_HHMMSS.txt``) using the module-level ``discovery_client``,
    ``DISCOVERY_PROJECT_ID`` and ``DISCOVERY_COLLECTION_ID``.

    Returns the ``document_id`` found in the service response, or ``None``
    when any step raises (the error is printed rather than propagated).
    """
    try:
        # Second-resolution timestamp used to name the uploaded document
        stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # In-memory file object holding the encoded text
        payload = io.BytesIO(text.encode("utf-8"))

        detailed_response = discovery_client.add_document(
            project_id=DISCOVERY_PROJECT_ID,
            collection_id=DISCOVERY_COLLECTION_ID,
            file=payload,
            filename=f"doc_{stamp}.txt",
            file_content_type="text/plain",
        )
        document_id = detailed_response.get_result().get("document_id")
    except Exception as e:
        print(f"Failed to upload document to Discovery: {e}")
        return None

    print(f"Document uploaded successfully. Document ID: {document_id}")
    return document_id


# --- Upload the Wikipedia extracted text ---
# document_id is None when the upload failed (upload_text_to_discovery returns None on any error)
document_id = upload_text_to_discovery(extracted_text)



In [None]:
import time

def check_status_until_available(document_id, discovery_client, project_id, collection_id,
                                 interval_seconds=10, timeout_minutes=10, _start_time=None):
    """
    Polls the document's processing status until it is 'available', it has
    'failed', or the timeout is reached. Prints the status at each check.

    The polling is a plain loop rather than recursion, so short intervals or
    long timeouts cannot exhaust the interpreter's recursion limit.

    Args:
        document_id: ID of the document to check; None returns False at once.
        discovery_client: Client whose get_document(...).get_result() yields
            the document metadata.
        project_id: Discovery project that owns the collection.
        collection_id: Discovery collection that holds the document.
        interval_seconds: Pause between checks; doubled after a failed check.
        timeout_minutes: Maximum total time to keep polling.
        _start_time: time.time() value the timeout is measured from; defaults
            to the moment of the call.

    Returns:
        True when the status is 'available'; False on 'failed', on timeout,
        or when no document ID was given.
    """
    # Without an ID every lookup would fail and be retried until the timeout
    if document_id is None:
        print("❌ No document ID provided; nothing to check.")
        return False

    if _start_time is None:
        _start_time = time.time()

    while True:
        elapsed = time.time() - _start_time

        # Stop once the time budget is used up
        if elapsed > timeout_minutes * 60:
            print(f"⏰ TIMEOUT: Document ID {document_id} still not available after {timeout_minutes} minutes.")
            return False

        try:
            metadata = discovery_client.get_document(
                project_id=project_id,
                collection_id=collection_id,
                document_id=document_id
            ).get_result()
            status = metadata.get("status", "unknown")
        except Exception as e:
            # Errors are treated as transient: wait twice the usual interval, then retry
            print(f"⚠️ Error while checking document status: {e}")
            time.sleep(interval_seconds * 2)
            continue

        print(f"📡 Status Check: {status} | Elapsed: {int(elapsed)}s")

        if status == "available":
            print("✅ Document is ready for querying (status: available).")
            return True
        if status == "failed":
            print("❌ Document processing failed.")
            return False

        # Any other status means processing is still under way
        time.sleep(interval_seconds)

# After uploading the document, wait until Discovery reports it as available
status_ready = check_status_until_available(
    document_id=document_id,
    discovery_client=discovery_client,
    project_id=DISCOVERY_PROJECT_ID,
    collection_id=DISCOVERY_COLLECTION_ID
)

if status_ready:
    print("✅ You may now safely proceed to query the document.")
else:
    print("🚫 Aborting: Document is not available.")
    # Raising a bare string is itself a TypeError; raise a real exception so the abort reason is shown
    raise RuntimeError("Document is not available")



In [None]:
import json

# Defined up front so later cells find the name even when the query fails or matches nothing
first_document_result = None

try:
    # Query the collection for this one document, returning its full text and its enriched entities
    response = discovery_client.query(
        project_id=DISCOVERY_PROJECT_ID,
        collection_ids=[DISCOVERY_COLLECTION_ID],
        filter=f'document_id::"{document_id}"',
        # Request 'model_name' (the entity field the filtering cell reads) rather than 'model'
        return_=['text', 'enriched_text.entities.text', 'enriched_text.entities.type', 'enriched_text.entities.model_name']
    ).get_result()

    results = response.get("results", [])
    if results:
        first_document_result = results[0]
    else:
        # Report the empty result explicitly instead of a bare "list index out of range"
        print(f"ERROR: Query returned no results for document ID {document_id}.")

except Exception as e:
    print(f"ERROR: Failed to query content from the document: {e}")

In [None]:
# This cell assumes first_document_result is already defined from the previous query cell

# Tolerate a None/empty result so the cell reports the problem instead of raising AttributeError
enriched_text_content_list = (first_document_result or {}).get("enriched_text", [])

if isinstance(enriched_text_content_list, list) and enriched_text_content_list:
    # The first element is expected to be the dictionary that holds 'entities'
    first_enrichment = enriched_text_content_list[0]
    if isinstance(first_enrichment, dict):
        all_entities = first_enrichment.get("entities", [])

        # Keep only the entities whose 'model_name' is 'extractor'
        extractor1_entities = [
            entity for entity in all_entities
            if entity.get('model_name') == 'extractor'
        ]

        if extractor1_entities:
            print("\nExtracted Entities (filtered for 'extractor' model):")
            for entity in extractor1_entities:  # Iterate through the FILTERED list
                entity_text = entity.get('text', 'N/A')
                entity_type = entity.get('type', 'N/A')
                print(f"- {entity_text} (Type: {entity_type})")
        else:
            print("No entities from 'extractor' model found in the 'enriched_text' section of the document.")
    else:
        print("The first element of 'enriched_text' was not a dictionary as expected.")
else:
    print("The 'enriched_text' field was not in the expected list format or was empty.")

In [None]:
import json
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, RelationsOptions

import getpass

# --- Configuration ---
# IBM Cloud API key for Natural Language Understanding (found in the service credentials).
# Read with getpass so the secret is not echoed into the notebook output.
API_KEY = getpass.getpass("Enter your IBM Cloud NLU API Key")

# NLU service URL (e.g., 'https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/...')
# You can find this in your IBM Cloud service credentials.
SERVICE_URL = input("Enter your IBM Cloud NLU Service URL:")

# The text you want to analyze for relations
text_to_analyze = extracted_text

# --- Initialize NLU Service ---
try:
    authenticator = IAMAuthenticator(API_KEY)
    natural_language_understanding = NaturalLanguageUnderstandingV1(
        version='2022-04-07',  # Use a recent API version
        authenticator=authenticator
    )
    natural_language_understanding.set_service_url(SERVICE_URL)

    print("Watson Natural Language Understanding service initialized successfully.\n")

    # --- Define Features for Analysis ---
    # We are specifically interested in 'relations'
    features = Features(
        relations=RelationsOptions()
    )

    # --- Analyze the Text ---
    print("Analyzing text for relations...\n")
    response = natural_language_understanding.analyze(
        text=text_to_analyze,
        features=features
    ).get_result()

    # --- Process and Print Results ---
    # A missing or null 'relations' value is treated the same as an empty list
    relations = response.get('relations') or []
    if relations:
        print("Extracted Relations:")
        for relation in relations:
            print(f"  Type: {relation.get('type')}")
            # A missing score would make the ':.2f' format raise, so print N/A instead
            score = relation.get('score')
            print(f"  Score: {score:.2f}" if score is not None else "  Score: N/A")
            print(f"  Sentence: {relation.get('sentence')}")
            print("  Arguments:")
            for arg in relation.get('arguments', []):
                print(f"    - Text: {arg.get('text')}")
                # 'entities' may be present but empty, so fall back to [{}] before indexing
                entities = arg.get('entities') or [{}]
                print(f"      Type: {entities[0].get('type')}")  # Type of the first entity, if any
            print("-" * 30)
    else:
        print("No relations found in the provided text.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please ensure your API_KEY and SERVICE_URL are correct and you have access to the NLU service.")



In [None]:
!pip install neo4j

In [None]:
from neo4j import GraphDatabase

import getpass

NEO4J_URI = input("Enter your Neo4j AuraDB URI:")
NEO4J_USER = input("Enter your Neo4j AuraDB Username:")
# Read the password with getpass so it is not echoed into the notebook output
NEO4J_PASSWORD = getpass.getpass("Enter your Neo4j AuraDB Password:")

# Create the driver once, inside the try, so construction errors are reported the
# same way as connectivity errors and no second, never-closed driver is left behind
driver = None
try:
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    driver.verify_connectivity()
    print("Successfully connected to Neo4j AuraDB!")
except Exception as e:
    print(f"ERROR: Could not connect to Neo4j: {e}")
    # Release the driver if it was created but the connectivity check failed
    if driver is not None:
        driver.close()
    driver = None

In [None]:
import re

def sanitize_relationship(rel):
    """
    Converts a relationship string into a valid Cypher relationship type.

    - Every run of characters other than ASCII letters and digits (spaces,
      punctuation, underscores, non-ASCII text) collapses to one underscore.
    - Leading and trailing underscores are removed.
    - A result that would start with a digit is prefixed with "REL_" so it
      remains a valid unquoted identifier.
    - An empty result falls back to "RELATED_TO".
    """
    # One pass replaces each invalid run with a single underscore, then the edges are trimmed
    rel = re.sub(r"[^A-Za-z0-9]+", "_", rel).strip("_")
    if not rel:
        return "RELATED_TO"  # Fallback if nothing usable is left
    if rel[0].isdigit():
        rel = f"REL_{rel}"
    return rel

def save_relationships_to_neo4j(llm_output, driver):
    """
    Parses relationships from LLM output and saves them to a Neo4j graph.

    Every `{entity1, relation, entity2}` triple found in `llm_output` is merged
    as two `Entity` nodes (matched on `name`) joined by a relationship whose
    type is the sanitized relation text.

    Args:
        llm_output: Text containing the triples; None or empty text saves nothing.
        driver: Neo4j driver used to open a session; None saves nothing.
    """
    if driver is None:
        print("No Neo4j driver available; skipping save.")
        return

    pattern = r"\{(.*?),\s*(.*?),\s*(.*?)\}"
    matches = re.findall(pattern, llm_output or "")
    if not matches:
        print("No relationships found in the LLM output.")
        return

    with driver.session() as session:
        for entity1, relation, entity2 in matches:
            entity1 = entity1.strip()
            entity2 = entity2.strip()
            sanitized_relation = sanitize_relationship(relation)

            print(f"Inserting: ({entity1}) -[:{sanitized_relation}]-> ({entity2})")

            # Entity names travel as query parameters; the relationship type is interpolated
            # into the query text, backtick-quoted so a digit-leading name still parses
            cypher_query = f"""
            MERGE (a:Entity {{name: $entity1}})
            MERGE (b:Entity {{name: $entity2}})
            MERGE (a)-[r:`{sanitized_relation}`]->(b)
            """

            session.run(cypher_query, entity1=entity1, entity2=entity2)


In [None]:
# NOTE(review): `result` is not assigned in any cell visible here — presumably the LLM's
# {entity1, relation, entity2} output text; confirm it is defined before running this cell.
save_relationships_to_neo4j(result, driver)