In [4]:
import getpass
import http.client
import os
import subprocess
import time
import urllib.request
from typing import List, Optional

In [5]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import TokenTextSplitter
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain_core.documents import Document
from langchain_neo4j import Neo4jGraph

In [6]:
# Configuration for Diffbot API Key
# The key is a secret, so it is never written into the notebook: an already
# exported DIFFBOT_API_KEY is reused, otherwise the user is prompted once
# (input is not echoed and does not end up in the saved cell output).
# NOTE: a key that was ever committed in plain text must be treated as leaked and rotated.
if not os.environ.get("DIFFBOT_API_KEY"):
    os.environ["DIFFBOT_API_KEY"] = getpass.getpass("Diffbot API key: ")

In [7]:
# Load and Split Wikipedia Data
def load_and_split_wikipedia_article(
    query: str,
    chunk_size: int = 512,
    chunk_overlap: int = 24,
) -> List[Document]:
    """Load Wikipedia content for ``query`` and split it into token-based chunks.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        chunk_size: ``chunk_size`` passed to ``TokenTextSplitter`` (default 512).
        chunk_overlap: ``chunk_overlap`` passed to ``TokenTextSplitter`` (default 24).

    Returns:
        The chunks produced by ``TokenTextSplitter.split_documents`` from the
        loaded documents.

    NOTE(review): ``WikipediaLoader`` is called with its default limits, so the
    result presumably covers every page matched by the query rather than one
    single article -- confirm against the loader's ``load_max_docs`` setting.
    """

    print("Loading Wikipedia article...")
    raw_documents = WikipediaLoader(query=query).load()

    print("Splitting documents into chunks...")
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(raw_documents)

    return documents

In [8]:
# Example usage: load and chunk the Wikipedia content found for one query
query = "Yann LeCun"
wiki_document = load_and_split_wikipedia_article(query)

# Report how many chunks the splitter produced
chunk_total = len(wiki_document)
print(f"Loaded and split {chunk_total} document chunks from Wikipedia article on {query}.")

Loading Wikipedia article...
Splitting documents into chunks...
Loaded and split 50 document chunks from Wikipedia article on Yann LeCun.


In [9]:
# Extract Graph Data with Diffbot
def extract_graph_with_diffbot(documents: List[Document], api_key: str) -> Optional[List]:
    """Extract graph data from documents using Diffbot.

    Args:
        documents: Document chunks handed to
            ``DiffbotGraphTransformer.convert_to_graph_documents``.
        api_key: Diffbot API key used to construct the transformer.

    Returns:
        The value returned by ``convert_to_graph_documents`` (callers in this
        notebook index it and read ``.nodes`` / ``.relationships`` on each
        element), or ``None`` when the extraction raised an exception.
    """

    print("Extracting graph data with Diffbot...")
    try:
        graph_transformer = DiffbotGraphTransformer(api_key)
        graph = graph_transformer.convert_to_graph_documents(documents)
    except Exception as e:
        # Deliberate best effort: the error is reported and callers test the
        # result for truthiness instead of handling an exception.
        print(f"Error during graph extraction: {e}")
        return None

    print("Graph extraction complete.")
    return graph

In [10]:
# Example usage of graph extraction
wiki_graph = extract_graph_with_diffbot(wiki_document, os.environ["DIFFBOT_API_KEY"])

# A falsy result (None or empty) means there is nothing to summarise
if wiki_graph:
    node_total = sum(len(page.nodes) for page in wiki_graph)
    edge_total = sum(len(page.relationships) for page in wiki_graph)
    print(f"Extracted graph with {node_total} nodes and {edge_total} edges.")

Extracting graph data with Diffbot...
Graph extraction complete.
Extracted graph with 368 nodes and 614 edges.


In [None]:
if wiki_graph:
    print("Sample nodes and relationships:")
    # Preview only the first page: at most three nodes, then three relationships
    page = wiki_graph[0]
    if page:
        preview_limit = 3
        for node in page.nodes[:preview_limit]:
            print(f"Node: {node}")
        for relationship in page.relationships[:preview_limit]:
            print(f"Relationship: {relationship}")

Sample nodes and relationships:
Node: id='http://www.wikidata.org/entity/Q3571662' type='Person' properties={'name': 'Yann LeCun', 'positionHeld': 'computer scientist', 'dateOfBirth': '1960-07-08'}
Node: id='Adaptive Systems Research Department' type='Organization' properties={'name': 'Adaptive Systems Research Department'}
Node: id='http://www.wikidata.org/entity/Q217365' type='Organization' properties={'name': 'Bell Laboratories'}
Relationship: source=Node(id='http://www.wikidata.org/entity/Q3571662', type='Person', properties={}) target=Node(id='Adaptive Systems Research Department', type='Organization', properties={}) type='EMPLOYEE_OR_MEMBER_OF' properties={'evidence': 'In 1988, LeCun joined the Adaptive Systems Research Department at AT&T Bell Laboratories in Holmdel, New Jersey, United States, headed by Lawrence D. Jackel, where he developed a number of new machine learning methods, such as a biologically inspired model of image recognition called convolutional neural networks (

In [None]:
def _wait_for_http(url: str, timeout: float, poll_interval: float = 2.0) -> bool:
    """Return True once ``url`` answers an HTTP request, False after ``timeout`` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=poll_interval):
                return True
        except (OSError, http.client.HTTPException):
            # Refused/reset connections (and HTTP error statuses, which urllib
            # raises as OSError subclasses) mean "not ready yet": retry.
            time.sleep(poll_interval)
    return False


def setup_neo4j_with_plugins(
    container_name: str = "neo4j",
    password: str = "password",
    image: str = "neo4j:5.20",
    startup_timeout: float = 120.0,
) -> None:
    """(Re)create a local Neo4j Docker container with APOC, GDS and Bloom requested.

    Any existing container called ``container_name`` is stopped and removed, a
    new one is started with ``./neo4j_data/{plugins,data,logs}`` mounted, and
    the function then waits until the HTTP port answers.

    Args:
        container_name: Name given to the Docker container.
        password: Password set for the ``neo4j`` user through ``NEO4J_AUTH``.
        image: Docker image to run.
        startup_timeout: Maximum number of seconds to wait for the HTTP endpoint.

    Raises:
        subprocess.CalledProcessError: If ``docker run`` exits with a non-zero status.
        TimeoutError: If http://localhost:7474 does not answer within ``startup_timeout``.
    """
    # Host directories mounted into the container (Docker needs absolute paths)
    paths = {p: os.path.abspath(f"./neo4j_data/{p}")
             for p in ["plugins", "data", "logs"]}
    for host_dir in paths.values():
        os.makedirs(host_dir, exist_ok=True)

    # Nothing is downloaded here: NEO4J_PLUGINS below asks the image to do it
    print("APOC, GDS and Bloom will be auto-installed by Neo4j.")

    # Stop and delete the old container if it exists; the exit status is
    # ignored on purpose because the container may simply not exist yet.
    subprocess.run(["docker", "stop", container_name], stderr=subprocess.DEVNULL)
    subprocess.run(["docker", "rm", container_name], stderr=subprocess.DEVNULL)

    # Launch of Neo4j Community with Docker
    cmd = [
        "docker", "run", "--name", container_name,
        "-p", "7474:7474", "-p", "7687:7687",
        "-e", f"NEO4J_AUTH=neo4j/{password}",                               # Authentication
        "-e", 'NEO4J_PLUGINS=["apoc","graph-data-science","bloom"]',        # Install plugins automatically
        "-e", "NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.*",   # Allow APOC & GDS procedures
        "-e", "NEO4J_dbms_security_procedures_allowlist=apoc.*,gds.*",
        "-e", "NEO4J_server_config_strict__validation_enabled=false",       # Disable strict validation to prevent APOC errors
        "-v", f"{paths['plugins']}:/plugins",                               # Mount volumes
        "-v", f"{paths['data']}:/data",
        "-v", f"{paths['logs']}:/logs",
        "-d", image                                                         # Detached; defaults to Neo4j 5.20
    ]

    subprocess.run(cmd, check=True)

    print("Neo4j Community launched with APOC + GDS + Bloom! Awaiting startup...")
    # Poll the HTTP port instead of sleeping a fixed time: plugin installation
    # makes the startup duration unpredictable.
    if not _wait_for_http("http://localhost:7474", startup_timeout):
        raise TimeoutError(
            f"Neo4j did not answer on http://localhost:7474 within {startup_timeout:.0f}s; "
            f"inspect the container with `docker logs {container_name}`."
        )
    # No default database is configured above, so the server default applies.
    print("Neo4j ready at bolt://localhost:7687 (default DB: neo4j)")


# Run setup: (re)creates the Neo4j Docker container -- an existing container
# with the same name is stopped and removed before the new one is started.
setup_neo4j_with_plugins()

APOC, GDS and Bloom will be auto-installed by Neo4j.
Neo4j Community launched with APOC + GDS + Bloom! Awaiting startup...
Neo4j ready at bolt://localhost:7687 (default DB: db)


In [2]:
# Displays running Docker containers (sanity check that the "neo4j" container is up)
!docker ps

CONTAINER ID   IMAGE        COMMAND                  CREATED        STATUS              PORTS                                                                                      NAMES
86dccdae88fd   neo4j:5.20   "tini -g -- /startup…"   13 hours ago   Up About a minute   0.0.0.0:7474->7474/tcp, [::]:7474->7474/tcp, 0.0.0.0:7687->7687/tcp, [::]:7687->7687/tcp   neo4j


In [3]:
from neo4j import GraphDatabase

# Bolt endpoint and (user, password) pair used for every Neo4j connection below
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "password")

# Open a driver and fail fast when the server cannot be reached
try:
    driver = GraphDatabase.driver(URI, auth=AUTH)
    driver.verify_connectivity()  # raises when no connection can be established
except Exception as exc:
    print("Neo4j connection failed:", exc)
    raise
else:
    # Show the driver object
    display(driver)

<neo4j._sync.driver.BoltDriver at 0x1cd48b37ed0>

In [None]:
# Connect to Neo4j using langchain_neo4j
# NOTE(review): the container started by setup_neo4j_with_plugins() is not
# configured with a database named "shop" -- confirm that it exists on the
# server, otherwise point this at the server's default database.
graph = Neo4jGraph(
    url=URI,
    username=AUTH[0],
    password=AUTH[1],
    database="shop",
    refresh_schema=False)

# Optional reset (destructive) -- uncomment to wipe the database before importing:
# graph.query("MATCH (n) DETACH DELETE n")

# extract_graph_with_diffbot() returns None when extraction fails, so the
# import is guarded the same way as the other cells that consume wiki_graph.
if wiki_graph:
    graph.add_graph_documents(wiki_graph)
    print("Graph documents added to Neo4j.")
else:
    print("No graph documents to add; skipping Neo4j import.")

Graph documents added to Neo4j.
