# LMM and Graphs

Let's create a toy dataset

In [2]:
import sys
import torch

from torch_geometric.data import Data

# Assume a toy dataset with 3 papers (nodes), edges, and labels
data = Data(
    x=torch.rand(3, 10),  # let's use random features for simplicity
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long),  # Edges
    y=torch.tensor([0, 1, 2], dtype=torch.long),  # True labels
    text=["Paper A abstract", "Paper B abstract", "Paper C abstract"],  # Text data
)

In [6]:
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GCNConv

# 1. Define the Graph Neural Network (GNN)
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# 2. Define the LLM (e.g., BERT for text encoding)
class TextEncoder(torch.nn.Module):
    def __init__(self, model_name="bert-base-uncased", output_dim=128):
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.fc = torch.nn.Linear(self.model.config.hidden_size, output_dim)

    def forward(self, texts):
        # Tokenize and encode text data
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        return self.fc(cls_embedding)

# 3. Training Loop with Pseudo-label Exchange
def train_prediction_alignment(data, gnn, text_encoder, num_iterations=5):
    optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=0.01)
    optimizer_text = torch.optim.Adam(text_encoder.parameters(), lr=0.0001)

    for iteration in range(num_iterations):
        # 3.1 Train GNN
        gnn.train()
        optimizer_gnn.zero_grad()
        gnn_labels = gnn(data.x, data.edge_index)  # GNN generates pseudo-labels
        gnn_loss = torch.nn.CrossEntropyLoss()(gnn_labels, data.y)  # Assume `data.y` is the target
        gnn_loss.backward()
        optimizer_gnn.step()

        # 3.2 Train LLM with GNN-generated pseudo-labels
        text_encoder.train()
        optimizer_text.zero_grad()
        text_features = text_encoder(data.text)  # Text encoding
        llm_loss = torch.nn.CrossEntropyLoss()(text_features, torch.argmax(gnn_labels, dim=1))
        llm_loss.backward()
        optimizer_text.step()

        print(f"Iteration {iteration+1}: GNN Loss = {gnn_loss.item()}, LLM Loss = {llm_loss.item()}")

# Initialize models and train
gnn = GNN(input_dim=10, hidden_dim=16, output_dim=3)
text_encoder = TextEncoder()
train_prediction_alignment(data, gnn, text_encoder)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Iteration 1: GNN Loss = 1.122875690460205, LLM Loss = 5.0661540031433105
Iteration 2: GNN Loss = 1.0656100511550903, LLM Loss = 3.393122911453247
Iteration 3: GNN Loss = 1.0160475969314575, LLM Loss = 2.1740071773529053
Iteration 4: GNN Loss = 0.9782276153564453, LLM Loss = 3.1693878173828125
Iteration 5: GNN Loss = 0.9556815028190613, LLM Loss = 3.8063628673553467


In [7]:
# Import libraries
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GraphConv
from torch_geometric.data import Data

# 1. Define the GNN
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNN, self).__init__()
        self.conv = GraphConv(input_dim, hidden_dim)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

# 2. Define the Text Encoder (LLM)
class TextEncoder(torch.nn.Module):
    def __init__(self, model_name="bert-base-uncased", output_dim=128):
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.fc = torch.nn.Linear(self.model.config.hidden_size, output_dim)

    def forward(self, texts):
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        return self.fc(cls_embedding)

# 3. Contrastive Learning Objective
def contrastive_loss(graph_emb, text_emb, tau=0.1):
    sim = F.cosine_similarity(graph_emb.unsqueeze(1), text_emb.unsqueeze(0), dim=2)
    labels = torch.arange(sim.size(0)).to(sim.device)
    loss = F.cross_entropy(sim / tau, labels)
    return loss

# 4. Training Loop for Latent Space Alignment
def train_latent_alignment(data, gnn, text_encoder, epochs=10):
    optimizer = torch.optim.Adam(list(gnn.parameters()) + list(text_encoder.parameters()), lr=0.001)
    for epoch in range(epochs):
        optimizer.zero_grad()

        # Encode graph and text
        graph_emb = gnn(data.x, data.edge_index)  # Graph embeddings
        text_emb = text_encoder(data.text)  # Text embeddings

        # Compute contrastive loss
        loss = contrastive_loss(graph_emb, text_emb)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}: Loss = {loss.item()}")

# 5. Example Data
# Toy data with 3 products and their relationships
data = Data(
    x=torch.rand(3, 10),  # Node features
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long),  # Edges
    text=["Product A description", "Product B description", "Product C description"],  # Text data
)

# Initialize models and train
gnn = GNN(input_dim=10, hidden_dim=128)
text_encoder = TextEncoder()
train_latent_alignment(data, gnn, text_encoder)

Epoch 1: Loss = 1.059374213218689
Epoch 2: Loss = 1.0176868438720703
Epoch 3: Loss = 1.0710476636886597
Epoch 4: Loss = 1.0982847213745117
Epoch 5: Loss = 1.098282814025879
Epoch 6: Loss = 1.0987071990966797
Epoch 7: Loss = 1.0966845750808716
Epoch 8: Loss = 1.0122456550598145
Epoch 9: Loss = 1.0971144437789917
Epoch 10: Loss = 1.0901302099227905


# GraphRAG

If using Colab you can simply run the following cells.

Otherwise, if you want to use the local backend, please:
- download neo4j desktop on [docker](https://neo4j.com/docs/graph-data-science/current/installation/installation-docker/)*
- download [lm-studio](https://lmstudio.ai/) and download the minicpm-llama3-v-2_5 and nomic-embed-text model

*run docker as:


```
docker run --rm --env NEO4J_AUTH=neo4j/defaultpass -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc -e NEO4J_apoc_export_file_enabled=true -e NEO4J_apoc_import_file_enabled=true -e NEO4J_apoc_import_file_use__neo4j__config=true -e NEO4J_PLUGINS=\[\"apoc-extended\"\] neo4j
```



In [38]:
import os
import sys

LLM_BACKEND = "ollama" # choose ["ollama" | "lm-studio"]
# LLM_BACKEND = "lm-studio"

assert LLM_BACKEND in ["ollama", "lm-studio"]

if LLM_BACKEND == "ollama":
  base_url = f"http://{os.environ.get('OLLAMA_HOST', 'localhost')}:11434/v1"
  api_key = "ollama"
  # llm_model = "minicpm-v"
  llm_model = "phi4"
else:
  base_url = "http://localhost:1234/v1"
  api_key = "lm-studio"
  llm_model = "minicpm-llama3-v-2_5"

If Colab you need to download ollama and start the server

In [39]:
import ollama

# ollama.pull(llm_model)
# ollama.pull("nomic-embed-text")
ollama.pull("phi4")

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [40]:
ollama.list()

ListResponse(models=[Model(model='phi4:latest', modified_at=datetime.datetime(2025, 4, 5, 21, 8, 26, 809068, tzinfo=TzInfo(UTC)), digest='ac896e5b8b34a1f4efa7b14d7520725140d5512484457fab45d2a4ea14c69dba', size=9053116391, details=ModelDetails(parent_model='', format='gguf', family='phi3', families=['phi3'], parameter_size='14.7B', quantization_level='Q4_K_M')), Model(model='minicpm-v:latest', modified_at=datetime.datetime(2025, 4, 5, 21, 4, 17, 111987, tzinfo=TzInfo(UTC)), digest='c92bfad0120556eda311984f1ac2f0d0a589b8d68c4053c13486b526276aa205', size=5473838466, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2', 'clip'], parameter_size='7.6B', quantization_level='Q4_0'))])

# Neo4j

In [41]:
import os

from neo4j import GraphDatabase
from langchain_neo4j import Neo4jGraph

host = os.environ.get("NEO4J_HOST", "localhost")

# ---- Step 1: Setup Neo4j Connection ----
NEO4J_URI = f"bolt://{host}:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo5j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# ---- Step 2: Create knowledge graph from text ----
import os
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0,
                 model_name=llm_model,
                 base_url=base_url,
                 api_key=api_key)

llm_transformer = LLMGraphTransformer(llm=llm)

from langchain_core.documents import Document

text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

# Add graph to neo4j
graph.add_graph_documents(graph_documents)

# ---- Step 3: Perform GraphRAG ----

def escape(s):
  return s.replace("{","").replace("}","")

CYPHER_GENERATION_TEMPLATE = f"""You are a Neo4j expert. Generate a Cypher query to answer the given question.

Database Schema: {escape(graph.schema)}

Rules:
1. Always use explicit `MATCH` for relationships.
2. Never use `WHERE` for relationship matching.
3. Use `RETURN DISTINCT` when appropriate.

Example Queries:
1. Question: "Who won the Nobel Prize?"
   Cypher: MATCH (p:Person)-[:WON_NOBEL_PRIZE]->(:Awarded) RETURN p.id AS winner

Question: {{query}}
Return only the Cypher query without any explanation or additional text.
Cypher:"""

from langchain_neo4j import GraphCypherQAChain
from langchain_core.prompts import PromptTemplate

chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    cypher_prompt=PromptTemplate(
        input_variables=["query"],
        template=CYPHER_GENERATION_TEMPLATE
    ),
    allow_dangerous_requests=True
)

# ---- Step 5: Test Queries ----
print("\nTesting queries...")

question = "Who married a Nobel Prize?"

print(f"\nQuestion: {question}")
response = chain.invoke(question)
print("Response:", response['result'])

# Close the driver
driver.close()

Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='1867', type='Date', properties={}), Node(id='Polish And Naturalised-French', type='Nationality', properties={}), Node(id='Physicist', type='Profession', properties={}), Node(id='Chemist', type='Profession', properties={}), Node(id='Radioactivity', type='Concept', properties={}), Node(id='Nobel Prize', type='Award', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='1867', type='Date', properties={}), type='BORN_IN_YEAR', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Polish And Naturalised-French', type='Nationality', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Physicist', 



Generated Cypher:
[32;1m[1;3mcypher
MATCH (person:Person)-[:RECIPIENT_OF]->(award:Award {name: "Nobel Prize"})
MATCH (spouse:Person)-[:MARRIED_TO]->(person)
RETURN DISTINCT spouse.id AS marriedToNobelPrizeWinner
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
Response: I don't know the answer.
