In [None]:
import os 
from dotenv import load_dotenv
load_dotenv(override=True)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

import pandas as pd

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.cache import DiskCacheBackend
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.transforms import default_transforms, apply_transforms
from ragas.testset.graph import Node, NodeType

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain.schema import Document

from internal.database_setup.preprocessing import clean_text

from pathlib import Path

# if we want to see the cache in action, set the logging level to debug
#import logging
#from ragas.utils import set_logging_level
#set_logging_level("ragas.cache", logging.DEBUG)

CACHE_DIR = ".cache/ragas" # or data/.cache/ragas
cacher = DiskCacheBackend(cache_dir=CACHE_DIR) # ".cache/ragas"
print("Cache entries:", len(cacher.cache))

llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", 
                                         model_kwargs={"response_format": {"type": "json_object"}}),
                                           cache=cacher)
    
embedder = LangchainEmbeddingsWrapper(OpenAIEmbeddings(), cache=cacher)

In [None]:
# after loading the old KG:
kg = KnowledgeGraph.load("../output/archive/knowledge_graph_first_run.json")

# retain only the *unfinished* nodes
unfinished = [
    node for node in kg.nodes
    if not all(k in node.properties for k in ("headlines", "summary", "summary_embedding"))
]
kg.nodes = unfinished

print(len(unfinished))
print(len(kg.nodes))

3426
3426


In [None]:
# now run the transforms again *only* on those
apply_transforms(kg, default_transforms(...))
# finally, merge back the newly-finished nodes into your old KG


In [None]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph.load("../output/archive/full_knowledge_graph_second_test copy.json")
# 2298 in first

total = len(kg.nodes)
print(f"Total nodes in graph: {total}")


Total nodes in graph: 3616


In [5]:
# pick the property that always gets added by your filter, e.g. "score"
missing = [
    node for node in kg.nodes
    if node.get_property("score") is None
]
print(f"Nodes without a score: {len(missing)}")


Nodes without a score: 3616


In [9]:
#print(kg.nodes[0].properties)  
# or
print(kg.nodes[0].properties.keys())


dict_keys(['page_content', 'document_metadata', 'headlines', 'summary', 'summary_embedding'])


In [10]:
# Count how many chunks got themes/entities
themes_count = sum(1 for n in kg.nodes if "themes" in n.properties)
entities_count = sum(1 for n in kg.nodes if "entities" in n.properties)
print(f"Themes on {themes_count} chunks, Entities on {entities_count} chunks")

# Inspect a sample chunk
sample = next(n for n in kg.nodes if "themes" in n.properties)
print(sample.id, sample.properties["themes"], sample.properties["entities"])

Themes on 2948 chunks, Entities on 2919 chunks
1f8e1a02-5618-46c2-a8d3-47031a256905 ['Heating patterns', 'Temperature change', 'Measurement patterns', 'Missing data', 'Linear mixed-effects model', 'Lognormal mixed-effects model', "Newton's Law of Cooling/Heating", 'Heat diffusion equation', 'Lumped capacitance model', 'Bayesian implementation'] ['Newton’s Law of Cooling/Heating', 'Stan', 'pizza stone', 'gas-fired oven', 'heat diffusion equation', 'heat transfer coefficient', 'temperature', 'time', 'specific heat capacity', 'thermal conductivity']


In [15]:
# 1) Inspect node types present
print("Node types in this graph:", {n.type for n in kg.nodes})

# 2) Count embeddings correctly
docs_for_embed = sum(
    1
    for n in kg.nodes
    if n.type == NodeType.DOCUMENT
       and n.properties.get("summary_embedding") is not None
)
print(f"Documents with embeddings: {docs_for_embed}")

# 3) Count chunks for comparison
chunk_count = sum(1 for n in kg.nodes if n.type == NodeType.CHUNK)

# 2) How many chunks got themes & entities?
chunks_with_themes   = sum(1 for n in kg.nodes if "themes"    in n.properties)
chunks_with_entities = sum(1 for n in kg.nodes if "entities"  in n.properties)

print(f"Surviving chunks     : {chunk_count}")
print(f"Embedding tasks   : {docs_for_embed}")
print(f"Themes tasks      : {chunks_with_themes}")
print(f"NER tasks         : {chunks_with_entities}")
print(f"Total reported    : {docs_for_embed + chunks_with_themes + chunks_with_entities}")

Node types in this graph: {<NodeType.DOCUMENT: 'document'>, <NodeType.CHUNK: 'chunk'>}
Documents with embeddings: 272
Surviving chunks     : 3161
Embedding tasks   : 272
Themes tasks      : 2948
NER tasks         : 2919
Total reported    : 6139


In [None]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph.load(r"../output/archive/full_knowledge_graph_third_test.json")
# 2298 in first

total = len(kg.nodes)
print(f"Total nodes in graph: {total}")

# 1) Inspect node types present
print("Node types in this graph:", {n.type for n in kg.nodes})

# 2) Count embeddings correctly
docs_for_embed = sum(
    1
    for n in kg.nodes
    if n.type == NodeType.DOCUMENT
       and n.properties.get("summary_embedding") is not None
)
print(f"Documents with embeddings: {docs_for_embed}")

# 3) Count chunks for comparison
chunk_count = sum(1 for n in kg.nodes if n.type == NodeType.CHUNK)

# 2) How many chunks got themes & entities?
chunks_with_themes   = sum(1 for n in kg.nodes if "themes"    in n.properties)
chunks_with_entities = sum(1 for n in kg.nodes if "entities"  in n.properties)

print(f"Surviving chunks     : {chunk_count}")
print(f"Embedding tasks   : {docs_for_embed}")
print(f"Themes tasks      : {chunks_with_themes}")
print(f"NER tasks         : {chunks_with_entities}")
print(f"Total reported    : {docs_for_embed + chunks_with_themes + chunks_with_entities}")
# third

Total nodes in graph: 5240
Node types in this graph: {<NodeType.DOCUMENT: 'document'>, <NodeType.CHUNK: 'chunk'>}
Documents with embeddings: 396
Surviving chunks     : 4785
Embedding tasks   : 396
Themes tasks      : 2975
NER tasks         : 2764
Total reported    : 6135


In [None]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph.load("../output/archive/full_knowledge_graph_fourth_test.json")
# 2298 in first

total = len(kg.nodes)
print(f"Total nodes in graph: {total}")

# 1) Inspect node types present
print("Node types in this graph:", {n.type for n in kg.nodes})

# 2) Count embeddings correctly
docs_for_embed = sum(
    1
    for n in kg.nodes
    if n.type == NodeType.DOCUMENT
       and n.properties.get("summary_embedding") is not None
)
print(f"Documents with embeddings: {docs_for_embed}")

# 3) Count chunks for comparison
chunk_count = sum(1 for n in kg.nodes if n.type == NodeType.CHUNK)

# 2) How many chunks got themes & entities?
chunks_with_themes   = sum(1 for n in kg.nodes if "themes"    in n.properties)
chunks_with_entities = sum(1 for n in kg.nodes if "entities"  in n.properties)

print(f"Surviving chunks     : {chunk_count}")
print(f"Embedding tasks   : {docs_for_embed}")
print(f"Themes tasks      : {chunks_with_themes}")
print(f"NER tasks         : {chunks_with_entities}")
print(f"Total reported    : {docs_for_embed + chunks_with_themes + chunks_with_entities}")
# fourth

Total nodes in graph: 4844
Node types in this graph: {<NodeType.DOCUMENT: 'document'>, <NodeType.CHUNK: 'chunk'>}
Documents with embeddings: 437
Surviving chunks     : 4389
Embedding tasks   : 437
Themes tasks      : 4205
NER tasks         : 4186
Total reported    : 8828


In [None]:

kg = KnowledgeGraph.load("../output/archive/full_knowledge_graph_fourth_test.json")
kg


KnowledgeGraph(nodes: 4844, relationships: 8052)

In [34]:
from ragas.testset.synthesizers import (
    SingleHopSpecificQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
)
spec = MultiHopSpecificQuerySynthesizer(generator_llm)
clusters = spec.get_node_clusters(kg)
print(f"#clusters found: {len(clusters)}")

assert len(MultiHopSpecificQuerySynthesizer(generator_llm).get_node_clusters(loaded_kg)) > 0, \
       "No multi-hop clusters found; generation will be single-hop only"


#clusters found: 0


AssertionError: No multi-hop clusters found; generation will be single-hop only

In [None]:
from ragas.testset.synthesizers import MultiHopSpecificQuerySynthesizer
kg = KnowledgeGraph.load("../output/archive/full_knowledge_graph_fourth_test.json")
print(kg)

synth = MultiHopSpecificQuerySynthesizer(generator_llm)
#print("edges by type:", {r.relation_type for r in kg.relationships})

clusters = synth.get_node_clusters(kg)
print("#valid clusters:", len(clusters))

missing_embed = sum(1 for n in kg.nodes if "summary_embedding" not in n.properties)
print("#nodes without embedding:", missing_embed)


KnowledgeGraph(nodes: 4844, relationships: 8052)
#valid clusters: 0
#nodes without embedding: 4407


In [None]:
kg = KnowledgeGraph.load("../output/archive/knowledge_graph_pilot_test.json")
print(kg)

synth = MultiHopSpecificQuerySynthesizer(generator_llm)

clusters = synth.get_node_clusters(kg)
print("#valid clusters:", len(clusters))


broken = [n for n in kg.nodes if not isinstance(n.properties.get("summary"), str)]
no_emb = [n for n in kg.nodes if not isinstance(n.properties.get("summary_embedding"), list)]

print(f"❗ Nodes with missing or invalid summaries: {len(broken)}")

print(f"❗ Nodes missing embeddings: {len(no_emb)}")


print(f"✅ Total nodes: {len(kg.nodes)}")

content_nodes = [n for n in kg.nodes if "page_content" in n.properties]

broken = [n for n in content_nodes if not isinstance(n.properties.get("summary"), str)]
no_emb = [n for n in content_nodes if not isinstance(n.properties.get("summary_embedding"), list)]

print(f"❗ Content nodes with missing/invalid summary: {len(broken)}")
print(f"❗ Content nodes missing embeddings: {len(no_emb)}")
print(f"✅ Total content nodes: {len(content_nodes)}")



KnowledgeGraph(nodes: 255, relationships: 944)
#valid clusters: 538
❗ Nodes with missing or invalid summaries: 223
❗ Nodes missing embeddings: 223
✅ Total nodes: 255
❗ Content nodes with missing/invalid summary: 223
❗ Content nodes missing embeddings: 223
✅ Total content nodes: 255


In [None]:
kg = KnowledgeGraph.load("../output/archive/knowledge_graph_split_01_100.json")
print(kg)

for synth in [SingleHopSpecificQuerySynthesizer(generator_llm),
              MultiHopSpecificQuerySynthesizer(generator_llm),
              MultiHopAbstractQuerySynthesizer(generator_llm)]: # could this be failing because i run out of ram? size is 166600 for the succeeding graph
    try:
        count = len(synth.get_node_clusters(kg))
        print(f"✅ {synth.name} → {count} cluster(s)")
    except Exception as e:
        print(f"❌ {synth.name} → error: {e}")



KnowledgeGraph(nodes: 1606, relationships: 20974)
✅ LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)) → 1500 cluster(s)
✅ LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)) → 12700 cluster(s)


KeyboardInterrupt: 

In [52]:
query_distribution = []
synths = [
    SingleHopSpecificQuerySynthesizer(generator_llm),
    MultiHopSpecificQuerySynthesizer(generator_llm),
    MultiHopAbstractQuerySynthesizer(generator_llm)
]

for s in synths:
    try:
        clusters = s.get_node_clusters(kg)
        if clusters:
            query_distribution.append((s, 1.0))  # weight can be adjusted
        else:
            print(f"⚠️ No clusters found for {s.name}; skipping.")
    except Exception as e:
        print(f"❌ Synthesizer {s.name} failed: {e}")

# re-normalise weights
total = sum(w for _, w in query_distribution)
query_distribution = [(s, w/total) for s, w in query_distribution]


⚠️ No clusters found for LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)); skipping.
