In [1]:
%%capture

# install PyG if running on Google Colab
import sys
import torch

if 'google.colab' in sys.modules:

  def format_pytorch_version(version):
    return version.split('+')[0]

  TORCH_version = torch.__version__
  TORCH = format_pytorch_version(TORCH_version)

  def format_cuda_version(version):
    return 'cu' + version.replace('.', '')

  CUDA_version = torch.version.cuda
  CUDA = format_cuda_version(CUDA_version)

  !pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
  !pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
  !pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
  !pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
  !pip install torch-geometric

Let's create a toy dataset

In [2]:
from torch_geometric.data import Data

# Assume a toy dataset with 3 papers (nodes), edges, and labels
data = Data(
    x=torch.rand(3, 10),  # let's use random features for simplicity
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long),  # Edges
    y=torch.tensor([0, 1, 2], dtype=torch.long),  # True labels
    text=["Paper A abstract", "Paper B abstract", "Paper C abstract"],  # Text data
)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GCNConv

# 1. Define the Graph Neural Network (GNN)
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# 2. Define the LLM (e.g., BERT for text encoding)
class TextEncoder(torch.nn.Module):
    def __init__(self, model_name="bert-base-uncased", output_dim=128):
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.fc = torch.nn.Linear(self.model.config.hidden_size, output_dim)

    def forward(self, texts):
        # Tokenize and encode text data
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        return self.fc(cls_embedding)

# 3. Training Loop with Pseudo-label Exchange
def train_prediction_alignment(data, gnn, text_encoder, num_iterations=5):
    optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=0.01)
    optimizer_text = torch.optim.Adam(text_encoder.parameters(), lr=0.0001)

    for iteration in range(num_iterations):
        # 3.1 Train GNN
        gnn.train()
        optimizer_gnn.zero_grad()
        gnn_labels = gnn(data.x, data.edge_index)  # GNN generates pseudo-labels
        gnn_loss = torch.nn.CrossEntropyLoss()(gnn_labels, data.y)  # Assume `data.y` is the target
        gnn_loss.backward()
        optimizer_gnn.step()

        # 3.2 Train LLM with GNN-generated pseudo-labels
        text_encoder.train()
        optimizer_text.zero_grad()
        text_features = text_encoder(data.text)  # Text encoding
        llm_loss = torch.nn.CrossEntropyLoss()(text_features, torch.argmax(gnn_labels, dim=1))
        llm_loss.backward()
        optimizer_text.step()

        print(f"Iteration {iteration+1}: GNN Loss = {gnn_loss.item()}, LLM Loss = {llm_loss.item()}")

# Initialize models and train
gnn = GNN(input_dim=10, hidden_dim=16, output_dim=3)
text_encoder = TextEncoder()
train_prediction_alignment(data, gnn, text_encoder)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Iteration 1: GNN Loss = 1.1347641944885254, LLM Loss = 4.79880952835083
Iteration 2: GNN Loss = 1.0975130796432495, LLM Loss = 2.962622880935669
Iteration 3: GNN Loss = 1.0674976110458374, LLM Loss = 1.480251669883728
Iteration 4: GNN Loss = 1.044685959815979, LLM Loss = 0.6315489411354065
Iteration 5: GNN Loss = 1.0310826301574707, LLM Loss = 0.29822805523872375


In [4]:
# Import libraries
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GraphConv
from torch_geometric.data import Data

# 1. Define the GNN
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNN, self).__init__()
        self.conv = GraphConv(input_dim, hidden_dim)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

# 2. Define the Text Encoder (LLM)
class TextEncoder(torch.nn.Module):
    def __init__(self, model_name="bert-base-uncased", output_dim=128):
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.fc = torch.nn.Linear(self.model.config.hidden_size, output_dim)

    def forward(self, texts):
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        return self.fc(cls_embedding)

# 3. Contrastive Learning Objective
def contrastive_loss(graph_emb, text_emb, tau=0.1):
    sim = F.cosine_similarity(graph_emb.unsqueeze(1), text_emb.unsqueeze(0), dim=2)
    labels = torch.arange(sim.size(0)).to(sim.device)
    loss = F.cross_entropy(sim / tau, labels)
    return loss

# 4. Training Loop for Latent Space Alignment
def train_latent_alignment(data, gnn, text_encoder, epochs=10):
    optimizer = torch.optim.Adam(list(gnn.parameters()) + list(text_encoder.parameters()), lr=0.001)
    for epoch in range(epochs):
        optimizer.zero_grad()

        # Encode graph and text
        graph_emb = gnn(data.x, data.edge_index)  # Graph embeddings
        text_emb = text_encoder(data.text)  # Text embeddings

        # Compute contrastive loss
        loss = contrastive_loss(graph_emb, text_emb)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}: Loss = {loss.item()}")

# 5. Example Data
# Toy data with 3 products and their relationships
data = Data(
    x=torch.rand(3, 10),  # Node features
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long),  # Edges
    text=["Product A description", "Product B description", "Product C description"],  # Text data
)

# Initialize models and train
gnn = GNN(input_dim=10, hidden_dim=128)
text_encoder = TextEncoder()
train_latent_alignment(data, gnn, text_encoder)

Epoch 1: Loss = 1.1030889749526978
Epoch 2: Loss = 0.8914819359779358
Epoch 3: Loss = 0.654697060585022
Epoch 4: Loss = 1.1298424005508423
Epoch 5: Loss = 1.1417795419692993
Epoch 6: Loss = 1.0708292722702026
Epoch 7: Loss = 0.987954318523407
Epoch 8: Loss = 1.0997623205184937
Epoch 9: Loss = 1.0972650051116943
Epoch 10: Loss = 1.0914229154586792


# GraphRAG

If using Colab you can simply run the following cells.

Otherwise, if you want to use the local backend, please:
- download neo4j desktop on [docker](https://neo4j.com/docs/graph-data-science/current/installation/installation-docker/)*
- download [lm-studio](https://lmstudio.ai/) and download the minicpm-llama3-v-2_5 and nomic-embed-text model

*run docker as:


```
docker run --rm --env NEO4J_AUTH=neo4j/defaultpass -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc -e NEO4J_apoc_export_file_enabled=true -e NEO4J_apoc_import_file_enabled=true -e NEO4J_apoc_import_file_use__neo4j__config=true -e NEO4J_PLUGINS=\[\"apoc-extended\"\] neo4j
```



In [21]:
import sys

if 'google.colab' in sys.modules:
  LLM_BACKEND = "ollama" # choose ["ollama" | "lm-studio"]
else:
  LLM_BACKEND = "lm-studio"

assert LLM_BACKEND in ["ollama", "lm-studio"]

if LLM_BACKEND == "ollama":
  base_url = "http://localhost:11434/v1"
  api_key = "ollama"
  llm_model = "phi4"
else:
  base_url = "http://localhost:1234/v1"
  api_key = "lm-studio"
  llm_model = "minicpm-llama3-v-2_5"

If Colab you need to download ollama and start the server

In [31]:
if 'google.colab' in sys.modules:
  !curl https://ollama.ai/install.sh | sh
  !nohup ollama serve > ollama.log &
  !sleep 5
  !time curl -i localhost:11434

nohup: redirecting stderr to stdout
HTTP/1.1 200 OK
[1mContent-Type[0m: text/plain; charset=utf-8
[1mDate[0m: Sun, 02 Feb 2025 21:36:50 GMT
[1mContent-Length[0m: 17

Ollama is running
real	0m0.009s
user	0m0.005s
sys	0m0.004s


In [20]:
# Dowload the language model 'minicpm-llama3-v-2_5' and the text embedder
# WARNING: models are > 5GB
if LLM_BACKEND == "ollama":
  !ollama pull minicpm-v
  !ollama pull nomic-embed-text
  !ollama list

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest 
pulling 262843d4806a...   0% ▕▏    0 B/4.4 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 262843d4806a...   0% ▕▏    0 B/4.4 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 262843d4806a...   0% ▕▏    0 B/4.4 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 262843d4806a...   1% ▕▏  30 MB/4.4 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 262843d4806a...   2% ▕▏  84 MB/4.4 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 262843d4806a...   2% ▕▏ 109 MB/4.4 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 262843d4806a...   3% ▕▏

# Neo4j

In [27]:
import os
import sys

if 'google.colab' in sys.modules:
  !pip install -q "neo4j-graphrag[ollama]"
  !pip install -q langchain_community langchain_experimental langchain_neo4j langchain_openai

  # Install the JDK
  !apt-get install openjdk-17-jdk-headless -qq > /dev/null
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
  !update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/jre/bin/java
  !java -version

  # Download Neo4j
  !curl -O https://dist.neo4j.org/neo4j-community-5.26.1-unix.tar.gz
  # Decompress and rename the directory
  !tar -xf neo4j-community-5.26.1-unix.tar.gz
  !mv neo4j-community-5.26.1 nj

  # (Optional) Disable authentication (if desired)
  !sed -i '/#dbms.security.auth_enabled/s/^#//g' nj/conf/neo4j.conf

  # Copy the APOC jar from the labs directory to plugins
  !cp nj/labs/apoc-*.jar nj/plugins/

  # Update configuration to allow APOC procedures
  !echo "dbms.security.procedures.unrestricted=apoc.*" >> nj/conf/neo4j.conf
  !echo "dbms.security.procedures.allowlist=apoc.*" >> nj/conf/neo4j.conf

  # Start Neo4j
  !nj/bin/neo4j start

update-alternatives: error: alternative /usr/lib/jvm/java-17-openjdk-amd64/jre/bin/java for java not registered; not setting
openjdk version "17.0.13" 2024-10-15
OpenJDK Runtime Environment (build 17.0.13+11-Ubuntu-2ubuntu122.04)
OpenJDK 64-Bit Server VM (build 17.0.13+11-Ubuntu-2ubuntu122.04, mixed mode, sharing)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  151M  100  151M    0     0   217M      0 --:--:-- --:--:-- --:--:--  217M
Directories in use:
home:         /content/nj
config:       /content/nj/conf
logs:         /content/nj/logs
plugins:      /content/nj/plugins
import:       /content/nj/import
data:         /content/nj/data
certificates: /content/nj/certificates
licenses:     /content/nj/licenses
run:          /content/nj/run
Starting Neo4j.
Started neo4j (pid:8890). It is available at http://localhost:7474
There may be a short delay until the server is ready.


In [None]:
# colab may kill the ollama process, start again
if 'google.colab' in sys.modules:
  !nohup ollama serve > ollama.log &
  !sleep 5
  !time curl -i localhost:11434

In [43]:
from neo4j import GraphDatabase
from langchain_neo4j import Neo4jGraph

# ---- Step 1: Setup Neo4j Connection ----
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "your_password"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# ---- Step 2: Create knowledge graph from text ----
import os
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0,
                 model_name=llm_model,
                 base_url=base_url,
                 api_key=api_key)

llm_transformer = LLMGraphTransformer(llm=llm)

from langchain_core.documents import Document

text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

# Add graph to neo4j
graph.add_graph_documents(graph_documents)

# ---- Step 3: Perform GraphRAG ----

def escape(s):
  return s.replace("{","").replace("}","")

CYPHER_GENERATION_TEMPLATE = f"""You are a Neo4j expert. Generate a Cypher query to answer the given question.

Database Schema: {escape(graph.schema)}

Rules:
1. Always use explicit `MATCH` for relationships.
2. Never use `WHERE` for relationship matching.
3. Use `RETURN DISTINCT` when appropriate.

Example Queries:
1. Question: "Who won the Nobel Prize?"
   Cypher: MATCH (p:Person)-[:WON_NOBEL_PRIZE]->(:Awarded) RETURN p.id AS winner

Question: {{query}}
Return only the Cypher query without any explanation or additional text.
Cypher:"""

from langchain_neo4j import GraphCypherQAChain
from langchain_core.prompts import PromptTemplate

chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    cypher_prompt=PromptTemplate(
        input_variables=["query"],
        template=CYPHER_GENERATION_TEMPLATE
    ),
    allow_dangerous_requests=True
)

# ---- Step 5: Test Queries ----
print("\nTesting queries...")

question = "Who married a Nobel Prize?"

print(f"\nQuestion: {question}")
response = chain.invoke(question)
print("Response:", response['result'])

# Close the driver
driver.close()

Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='1867', type='Year', properties={}), Node(id='Polish', type='Nationality', properties={}), Node(id='French', type='Nationality', properties={}), Node(id='Physicist', type='Profession', properties={}), Node(id='Chemist', type='Profession', properties={}), Node(id='Radioactivity', type='Research field', properties={}), Node(id='Nobel Prize', type='Award', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Institution', properties={})]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='1867', type='Year', properties={}), type='BORN_IN_YEAR', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Polish', type='Nationality', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id