# LLM and Graphs

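Let's create a toy dataset: a tiny graph of three papers (nodes) connected by two edges, where each paper has random node features, a class label, and a short abstract.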

In [1]:
import sys
import torch

from torch_geometric.data import Data

# Toy dataset: 3 papers (nodes), 2 edges, one class label and one abstract per paper.
# NOTE: torch.rand is not seeded in this cell, so the node features differ on every run.
data = Data(
    x=torch.rand(3, 10),  # Node feature matrix: 3 nodes x 10 random values
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long).t().contiguous(),  # Edge pairs (0, 1) and (1, 2), transposed so that each column is one edge
    y=torch.tensor([0, 1, 2], dtype=torch.long),  # One label per node (3 distinct classes)
    text=["Paper A abstract about machine learning", 
          "Paper B abstract about deep learning", 
          "Paper C abstract about neural networks"],  # One text string per node, in node order
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The number of classes is the number of distinct values found in the label vector.
num_classes = data.y.unique().numel()

# Emit the whole summary with a single call; sep="\n" places every entry on its own line.
print(
    "Dataset info:",
    f"  Number of nodes: {data.x.shape[0]}",
    f"  Node feature dimension: {data.x.shape[1]}",
    f"  Number of edges: {data.edge_index.shape[1]}",
    f"  Number of classes: {num_classes}",
    f"  True labels: {data.y.tolist()}",
    sep="\n",
)

Dataset info:
  Number of nodes: 3
  Node feature dimension: 10
  Number of edges: 2
  Number of classes: 3
  True labels: [0, 1, 2]


In [4]:
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# 1. Define the Graph Neural Network (GNN)
class GNN(torch.nn.Module):
    """Two-layer graph convolutional classifier.

    Args:
        input_dim: Size of each node feature vector.
        hidden_dim: Output size of the first convolution.
        num_classes: Output size of the second convolution (one logit per class).
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, x, edge_index):
        """Return raw class logits for every node (no softmax is applied)."""
        hidden = self.dropout(F.relu(self.conv1(x, edge_index)))
        return self.conv2(hidden, edge_index)

# 2. Define the Text Encoder (BERT-based)
class TextEncoder(torch.nn.Module):
    """Text classifier: a frozen pretrained encoder followed by a trainable linear head.

    The pretrained encoder is used purely as a feature extractor. Its weights are
    marked as non-trainable and it is kept in evaluation mode at all times, so the
    features fed to the classifier head are deterministic even while the head
    itself is being trained.

    Args:
        model_name: Name passed to ``AutoTokenizer`` / ``AutoModel.from_pretrained``.
        num_classes: Number of output logits produced by the classifier head.
    """

    def __init__(self, model_name="bert-base-uncased", num_classes=3):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Freeze the backbone explicitly: forward() already runs it under no_grad,
        # so its weights can never receive gradients anyway.
        self.model.requires_grad_(False)
        self.model.eval()
        # Project from the encoder's hidden size to the number of classes
        self.classifier = torch.nn.Linear(self.model.config.hidden_size, num_classes)
        self.dropout = torch.nn.Dropout(0.1)

    def train(self, mode=True):
        """Switch the head between train/eval mode while keeping the backbone in eval mode.

        Without this override, ``text_encoder.train()`` would also enable the dropout
        layers inside the frozen backbone and make its features noisy.
        """
        super().train(mode)
        self.model.eval()
        return self

    def forward(self, texts):
        """Return class logits (no softmax) for a list of strings.

        Args:
            texts: List of input strings.

        Returns:
            Tensor with one row of ``num_classes`` logits per input string.
        """
        # Tokenize and move the batch to the device the module's weights live on
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = inputs.to(self.classifier.weight.device)

        with torch.no_grad():  # The backbone is frozen; no graph is needed for it
            outputs = self.model(**inputs)

        # Use the embedding at the first token position ([CLS]) as the sentence feature
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        cls_embedding = self.dropout(cls_embedding)
        return self.classifier(cls_embedding)


# 4. Training Loop with Bidirectional Pseudo-label Exchange
def train_prediction_alignment(data, gnn, text_encoder, num_iterations=5):
    """Alternately train a graph model and a text model on each other's predictions.

    In every iteration the graph model takes one optimisation step against the
    text model's latest hard predictions (the true labels ``data.y`` in the first
    iteration), then the text model takes one step against the graph model's hard
    predictions from that same iteration. Losses and predictions are printed.

    Args:
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``text``.
        gnn: Module called as ``gnn(data.x, data.edge_index)``, returning logits.
        text_encoder: Module called as ``text_encoder(data.text)``, returning logits.
        num_iterations: Number of alternating rounds.

    Returns:
        None.
    """
    criterion = torch.nn.CrossEntropyLoss()
    gnn_opt = torch.optim.Adam(gnn.parameters(), lr=0.01)
    text_opt = torch.optim.Adam(text_encoder.parameters(), lr=0.0001)

    # The very first graph update is supervised by the ground-truth labels.
    targets_for_gnn = data.y.clone()

    for round_no in range(1, num_iterations + 1):
        # Graph model: one step towards the text model's previous predictions.
        gnn.train()
        gnn_opt.zero_grad()
        graph_out = gnn(data.x, data.edge_index)
        graph_loss = criterion(graph_out, targets_for_gnn)
        graph_loss.backward()
        gnn_opt.step()

        # Hard labels come from the logits computed before the step above.
        targets_for_text = graph_out.argmax(dim=1)

        # Text model: one step towards the graph model's predictions.
        text_encoder.train()
        text_opt.zero_grad()
        text_out = text_encoder(data.text)
        text_loss = criterion(text_out, targets_for_text)
        text_loss.backward()
        text_opt.step()

        # These become the graph model's targets in the next round.
        targets_for_gnn = text_out.argmax(dim=1)

        print(f"Iteration {round_no}: GNN Loss = {graph_loss.item():.4f}, LLM Loss = {text_loss.item():.4f}")
        print(f"  GNN predictions: {targets_for_text.tolist()}")
        print(f"  LLM predictions: {targets_for_gnn.tolist()}")

# Build both models for the toy graph, then run the pseudo-label exchange loop.
hidden_dim = 64
input_dim = data.x.shape[1]  # Width of the node feature matrix

gnn = GNN(input_dim, hidden_dim, num_classes)
text_encoder = TextEncoder(num_classes=num_classes)

print("Starting training...")
train_prediction_alignment(data, gnn, text_encoder, 5)

Starting training...
Iteration 1: GNN Loss = 1.1190, LLM Loss = 0.9824
  GNN predictions: [1, 1, 1]
  LLM predictions: [1, 1, 1]
Iteration 2: GNN Loss = 0.8503, LLM Loss = 1.0020
  GNN predictions: [1, 1, 1]
  LLM predictions: [2, 2, 1]
Iteration 3: GNN Loss = 0.9946, LLM Loss = 0.9380
  GNN predictions: [1, 1, 1]
  LLM predictions: [1, 2, 1]
Iteration 4: GNN Loss = 0.8984, LLM Loss = 0.9884
  GNN predictions: [1, 1, 1]
  LLM predictions: [0, 1, 1]
Iteration 5: GNN Loss = 0.9916, LLM Loss = 0.7875
  GNN predictions: [1, 1, 1]
  LLM predictions: [1, 1, 1]


In [5]:
# Import libraries
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GraphConv
from torch_geometric.data import Data

# 1. Define the GNN
class GNN(torch.nn.Module):
    """Single graph-convolution layer that maps node features to embeddings.

    Args:
        input_dim: Size of each node feature vector.
        hidden_dim: Size of each output embedding.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.conv = GraphConv(input_dim, hidden_dim)

    def forward(self, x, edge_index):
        """Return the layer output for every node; no activation is applied."""
        node_embeddings = self.conv(x, edge_index)
        return node_embeddings

# 2. Define the Text Encoder (LLM)
class TextEncoder(torch.nn.Module):
    """Pretrained text encoder followed by a linear projection.

    Args:
        model_name: Name passed to ``AutoTokenizer`` / ``AutoModel.from_pretrained``.
        output_dim: Size of the projected embedding returned by ``forward``.
    """

    def __init__(self, model_name="bert-base-uncased", output_dim=128):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        encoder_width = self.model.config.hidden_size
        self.fc = torch.nn.Linear(encoder_width, output_dim)

    def forward(self, texts):
        """Return one projected embedding per input string."""
        batch = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        token_states = self.model(**batch).last_hidden_state
        # The state at position 0 is the [CLS] token embedding.
        return self.fc(token_states[:, 0, :])

# 3. Contrastive Learning Objective
def contrastive_loss(graph_emb, text_emb, tau=0.1):
    """Cross-entropy over temperature-scaled pairwise cosine similarities.

    Row ``i`` of ``graph_emb`` is scored against every row of ``text_emb``; the
    target for row ``i`` is column ``i``, i.e. the text embedding at the same index.

    Args:
        graph_emb: Graph-side embeddings, one row per item.
        text_emb: Text-side embeddings, one row per item, same row order.
        tau: Temperature the similarities are divided by.

    Returns:
        Scalar loss tensor.
    """
    # Broadcasting (N, 1, D) against (1, N, D) yields the full similarity matrix.
    pairwise = F.cosine_similarity(graph_emb.unsqueeze(1), text_emb.unsqueeze(0), dim=2)
    targets = torch.arange(pairwise.size(0), device=pairwise.device)
    return F.cross_entropy(pairwise / tau, targets)

# 4. Training Loop for Latent Space Alignment
def train_latent_alignment(data, gnn, text_encoder, epochs=10):
    """Jointly train both encoders by minimising ``contrastive_loss``.

    A single Adam optimiser (lr=0.001) updates the parameters of both modules.
    Each epoch is one full-batch step, and the loss is printed after every step.

    Args:
        data: Object exposing ``x``, ``edge_index`` and ``text``.
        gnn: Module called as ``gnn(data.x, data.edge_index)``.
        text_encoder: Module called as ``text_encoder(data.text)``.
        epochs: Number of optimisation steps.

    Returns:
        None.
    """
    trainable = [*gnn.parameters(), *text_encoder.parameters()]
    optimizer = torch.optim.Adam(trainable, lr=0.001)

    for epoch_no in range(1, epochs + 1):
        optimizer.zero_grad()

        # Graph embeddings are computed first, then text embeddings.
        loss = contrastive_loss(
            gnn(data.x, data.edge_index),
            text_encoder(data.text),
        )
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch_no}: Loss = {loss.item()}")

# 5. Example data: a toy graph of 3 products with random features and one
#    description string per product.
data = Data(
    x=torch.rand(3, 10),
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long),
    text=[f"Product {letter} description" for letter in "ABC"],
)

# Build both encoders with matching output size (128) and align them.
gnn = GNN(10, 128)
text_encoder = TextEncoder()
train_latent_alignment(data=data, gnn=gnn, text_encoder=text_encoder)

Epoch 1: Loss = 1.1974958181381226
Epoch 2: Loss = 0.9338013529777527
Epoch 3: Loss = 1.0983785390853882
Epoch 4: Loss = 1.0986661911010742
Epoch 5: Loss = 1.0841368436813354
Epoch 6: Loss = 1.0974050760269165
Epoch 7: Loss = 0.621391773223877
Epoch 8: Loss = 0.5264388918876648
Epoch 9: Loss = 1.1055184602737427
Epoch 10: Loss = 1.0985291004180908


# GraphRAG

If using Colab you can simply run the following cells.

Otherwise, if you want to use the local backend, please:
- install and run Neo4j with [Docker](https://neo4j.com/docs/graph-data-science/current/installation/installation-docker/)*
- install [LM Studio](https://lmstudio.ai/) and download the `minicpm-llama3-v-2_5` and `nomic-embed-text` models

*Run Docker as follows:


```
docker run --rm --env NEO4J_AUTH=neo4j/defaultpass -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc -e NEO4J_apoc_export_file_enabled=true -e NEO4J_apoc_import_file_enabled=true -e NEO4J_apoc_import_file_use__neo4j__config=true -e NEO4J_PLUGINS=\[\"apoc-extended\"\] neo4j
```



In [6]:
import os
import sys

# Backend selection: "ollama" or "lm-studio".
LLM_BACKEND = "ollama"
# LLM_BACKEND = "lm-studio"

assert LLM_BACKEND in ("ollama", "lm-studio")

# Each backend is addressed through a /v1 HTTP endpoint, an API key and a model name.
if LLM_BACKEND == "lm-studio":
    base_url = "http://localhost:1234/v1"
    api_key = "lm-studio"
    llm_model = "minicpm-llama3-v-2_5"
else:
    # The Ollama host can be overridden through the OLLAMA_HOST environment variable.
    base_url = "http://{}:11434/v1".format(os.environ.get("OLLAMA_HOST", "localhost"))
    api_key = "ollama"
    llm_model = "phi4"  # alternative: "minicpm-v"

If you are using Colab, you need to install Ollama and start its server before running the next cell.

In [7]:
import ollama

# Pull the model chosen in the configuration cell instead of a hard-coded name,
# so the downloaded model always matches the one the chat client will request.
# Nothing is pulled when the LM Studio backend is selected, because `llm_model`
# then names an LM Studio model rather than an Ollama one.
# Optional embedding model:
# ollama.pull("nomic-embed-text")
# Written as a single expression so the notebook still displays the pull status.
ollama.pull(llm_model) if LLM_BACKEND == "ollama" else None

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [8]:
# Display the models reported by the Ollama server, as a check on the pull in the previous cell.
ollama.list()

ListResponse(models=[Model(model='phi4:latest', modified_at=datetime.datetime(2025, 6, 21, 20, 30, 17, 738327, tzinfo=TzInfo(UTC)), digest='ac896e5b8b34a1f4efa7b14d7520725140d5512484457fab45d2a4ea14c69dba', size=9053116391, details=ModelDetails(parent_model='', format='gguf', family='phi3', families=['phi3'], parameter_size='14.7B', quantization_level='Q4_K_M'))])

# Neo4j

In [None]:
import os

from neo4j import GraphDatabase
from langchain_neo4j import Neo4jGraph

# ---- Step 1: Setup Neo4j Connection ----
# All connection settings can be overridden through environment variables so
# that credentials do not have to be edited into the notebook. The defaults
# match the `NEO4J_AUTH=neo4j/defaultpass` setting and the 7687 port mapping of
# the `docker run` command given in the GraphRAG section.
host = os.environ.get("NEO4J_HOST", "localhost")
NEO4J_URI = f"bolt://{host}:7687"
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "defaultpass")

# Two handles are opened with the same URI and credentials: the neo4j driver and
# the LangChain Neo4jGraph wrapper. Within this cell, `graph` supplies the schema,
# stores the graph documents and is passed to the QA chain; `driver` is only
# closed at the end of the cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# ---- Step 2: Create knowledge graph from text ----
import os
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# Chat client pointed at the selected local backend. `llm_model`, `base_url` and
# `api_key` are not defined in this cell; they are expected to come from the
# backend configuration cell, which must have been run first.
llm = ChatOpenAI(temperature=0,
                 model_name=llm_model,
                 base_url=base_url,
                 api_key=api_key)

# Transformer that uses the chat model to turn documents into graph documents.
llm_transformer = LLMGraphTransformer(llm=llm)

from langchain_core.documents import Document

# Source passage from which the knowledge graph is extracted.
text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""
# Wrap the passage in a single Document and let the LLM-backed transformer
# extract nodes and relationships from it.
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)
# Only one document was passed in, so index 0 holds the whole extraction result.
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

# Add graph to neo4j
graph.add_graph_documents(graph_documents)

# ---- Step 3: Perform GraphRAG ----

def escape(s):
    """Escape curly braces so ``s`` can be embedded literally in a format-style template.

    Each ``{`` and ``}`` is doubled. When the surrounding template is later
    rendered with ``str.format``-style substitution, the doubled braces collapse
    back to single ones, so the text appears unchanged in the final prompt
    instead of being parsed as a template variable or silently losing its braces.

    Args:
        s: Text to embed, e.g. a database schema description.

    Returns:
        ``s`` with every brace doubled.
    """
    return s.replace("{", "{{").replace("}", "}}")

# Prompt used to turn a natural-language question into a Cypher query.
# This is an f-string evaluated once, here: the current database schema is
# inserted immediately (passed through escape() so that its braces are not read
# as template variables later), while the doubled braces around `query` leave a
# literal {query} placeholder to be filled in for each question.
CYPHER_GENERATION_TEMPLATE = f"""You are a Neo4j expert. Generate a Cypher query to answer the given question.

Database Schema: {escape(graph.schema)}

Rules:
1. Always use explicit `MATCH` for relationships.
2. Never use `WHERE` for relationship matching.
3. Use `RETURN DISTINCT` when appropriate.

Example Queries:
1. Question: "Who won the Nobel Prize?"
   Cypher: MATCH (p:Person)-[:WON_NOBEL_PRIZE]->(:Awarded) RETURN p.id AS winner

Question: {{query}}
Return only the Cypher query without any explanation or additional text.
Cypher:"""

from langchain_neo4j import GraphCypherQAChain
from langchain_core.prompts import PromptTemplate

# Question-answering chain over the graph: the custom prompt above is used for
# Cypher generation, with `query` as its only input variable.
# NOTE(review): allow_dangerous_requests=True appears to be an explicit opt-in to
# running LLM-generated Cypher against the database — confirm that the database
# user has suitably restricted permissions.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    cypher_prompt=PromptTemplate(
        input_variables=["query"],
        template=CYPHER_GENERATION_TEMPLATE
    ),
    allow_dangerous_requests=True
)

# ---- Step 4: Test Queries ----
print("\nTesting queries...")

# Natural-language question passed to the chain.
question = "Who married a Nobel Prize?"

print(f"\nQuestion: {question}")
response = chain.invoke(question)
# The chain's response is indexed by key; 'result' holds the answer that is printed.
print("Response:", response['result'])

# Close the driver
# NOTE: this line is not reached if any statement above raises.
driver.close()