In [1]:
import os 
from dotenv import load_dotenv
load_dotenv(override=True)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY:", os.getenv("OPENAI_API_KEY"))

import pandas as pd

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.cache import DiskCacheBackend
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.transforms import default_transforms, apply_transforms
from ragas.testset.graph import Node, NodeType

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain.schema import Document

from embedding_process.preprocessing import clean_text

from pathlib import Path

# if we want to see the cache in action, set the logging level to debug
#import logging
#from ragas.utils import set_logging_level
#set_logging_level("ragas.cache", logging.DEBUG)


# Default paths and parameters
DATA_DIR = "processed_syllabi/"
GLOB = "**/scraped_data/*.json"
KG_PATH = Path("knowledge_graph_first_run.json")
JSON_OUT = Path("testset_fr.json")
CSV_OUT = Path("testset_fr.csv")
TESTSET_SIZE = 5


OPENAI_API_KEY: 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


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# after loading the old KG:
kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\knowledge_graph_first_run.json")

# retain only the *unfinished* nodes
unfinished = [
    node for node in kg.nodes
    if not all(k in node.properties for k in ("headlines", "summary", "summary_embedding"))
]
kg.nodes = unfinished

print(len(unfinished))
print(len(kg.nodes))

3426
3426


In [None]:
# now run the transforms again *only* on those
apply_transforms(kg, default_transforms(...))
# finally, merge back the newly-finished nodes into your old KG


In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path

# Ragas imports
from ragas.cache import DiskCacheBackend
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.transforms import default_transforms, apply_transforms

# LangChain imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain.schema import Document

# Text cleaning
from embedding_process.preprocessing import clean_text

# Ensure your OpenAI key is set
load_dotenv(override=True)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")


# Configuration
DATA_DIR = "processed_syllabi/"
GLOB = "**/scraped_data/*.json"
CLEANED_DOCS_PATH = Path("cleaned_docs.pkl")
KG_PATH = Path("output/full_knowledge_graph.json")
JSON_OUT = Path("output/full_testset.json")
CSV_OUT = Path("output/full_testset.csv")
TESTSET_SIZE = 100
CACHE_DIR = ".cache/ragas"

# test if the cached cleaned docs load correctly later
def load_and_clean_documents(
    data_dir: str = DATA_DIR,
    glob_pattern: str = GLOB
) -> list[Document]:
    """
    Reads JSON syllabus files, extracts text, and returns cleaned Document objects.
    """
    # If we have a cached cleaned docs file, load it
    if CLEANED_DOCS_PATH.exists():
        print(f"Loading cleaned documents from cache: {CLEANED_DOCS_PATH}")
        return pd.read_pickle(CLEANED_DOCS_PATH)

    loader = DirectoryLoader(
        data_dir,
        glob=glob_pattern,
        loader_cls=JSONLoader,
        loader_kwargs={
            "jq_schema": ".",
            "content_key": "text",
            "is_content_key_jq_parsable": False,
            "json_lines": False,
            "metadata_func": lambda obj, meta: {**meta, **{k: v for k, v in obj.items() if k != "text"}}
        }
    )

    docs = loader.load()
    print(len(docs))

    cleaned_documents = []
    for doc in docs:                    # `documents` is a list of langchain.schema.Document
        raw = doc.page_content               
        cleaned = clean_text(raw)            
        # rewrap into a Document, preserving metadata:
        cleaned_doc = Document(
            page_content=cleaned,
            metadata=doc.metadata
        )
        cleaned_documents.append(cleaned_doc)

    pd.to_pickle(cleaned_documents, CLEANED_DOCS_PATH)
    print(f"Cached cleaned docs to {CLEANED_DOCS_PATH}")

    return cleaned_documents


def build_or_load_kg(docs, generator_llm, generator_embeddings, cache):
    if KG_PATH.exists():
        print(f"Loading existing KG from {KG_PATH}")
        kg = KnowledgeGraph.load(str(KG_PATH))
    else:
        print("Creating new KG and applying transforms...")
        kg = KnowledgeGraph()
        for doc in docs:
            kg.nodes.append(Node(
                type=NodeType.DOCUMENT,
                properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
            ))

        transforms = default_transforms(documents=docs, 
                                    llm=generator_llm, 
                                    embedding_model=generator_embeddings)

        apply_transforms(kg, transforms)
        kg.save(str(KG_PATH))
    return kg


def generate_test_data(kg, generator_llm, generator_embeddings, test_size=TESTSET_SIZE):

    # instantiate testsetgenerator with the finished kg
    generator = TestsetGenerator(
        llm=generator_llm,
        embedding_model=generator_embeddings,
        knowledge_graph=kg
    )

    # we are not using the generate_with_langchain_docs function from documentation
    # because it will create the kg all over. we create it separately, so that we can save it
    print("Generating testset from existing KG...")
    dataset = generator.generate(
        testset_size=test_size,
        query_distribution=None,
        run_config=None,
        callbacks=None,
        with_debugging_logs=True
    )

    # Persist any new KG nodes 
    # (should be none if KG was complete, but in case of re-runs, save the updated version)
    generator.knowledge_graph.save(str(KG_PATH))

    # save the samples in json and csv format
    df = dataset.to_pandas()
    df.to_json(JSON_OUT, orient="records", indent=2)
    df.to_csv(CSV_OUT, index=False)
    return df


def main():
    # Initialize persistent disk cache
    cacher = DiskCacheBackend(cache_dir=CACHE_DIR) # ".cache/ragas"
    print("Cache entries:", len(cacher.cache))

    # Prepare LLM + embedding wrappers with the shared cache
    llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"), cache=cacher)
    embedder = LangchainEmbeddingsWrapper(OpenAIEmbeddings(), cache=cacher)

    # Load & clean docs
    docs = load_and_clean_documents()

    # Build or load KG
    kg = build_or_load_kg(docs, llm, embedder, cache)

    # Generate testset
    df = generate_test_data(kg, llm, embedder)

    print(f"Pipeline complete. KG saved to {KG_PATH}; testset saved to {JSON_OUT} and {CSV_OUT}.")
    return

if __name__ == "__main__":
    main()

In [None]:
# archive, running version right now

import os 
from dotenv import load_dotenv
load_dotenv(override=True)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY:", os.getenv("OPENAI_API_KEY"))

import pandas as pd

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.cache import DiskCacheBackend
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.transforms import default_transforms, apply_transforms
from ragas.testset.graph import Node, NodeType

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain.schema import Document

from embedding_process.preprocessing import clean_text

from pathlib import Path

# if we want to see the cache in action, set the logging level to debug
import logging
from ragas.utils import set_logging_level
set_logging_level("ragas.cache", logging.DEBUG)


# Default paths and parameters
DATA_DIR = "processed_syllabi/"
GLOB = "**/scraped_data/*.json"
KG_PATH = Path("full_knowledge_graph.json")
JSON_OUT = Path("full_testset.json")
CSV_OUT = Path("full_testset.csv")
TESTSET_SIZE = 100


def load_and_clean_documents(
    data_dir: str = DATA_DIR,
    glob_pattern: str = GLOB
) -> list[Document]:
    """
    Reads JSON syllabus files, extracts text, and returns cleaned Document objects.
    """
    loader = DirectoryLoader(
        data_dir,
        glob=glob_pattern,
        loader_cls=JSONLoader,
        loader_kwargs={
            "jq_schema": ".",
            "content_key": "text",
            "is_content_key_jq_parsable": False,
            "json_lines": False,
            "metadata_func": lambda obj, meta: {**meta, **{k: v for k, v in obj.items() if k != "text"}}
        }
    )

    docs = loader.load()
    print(len(docs))

    cleaned_documents = []
    for doc in docs:                    # `documents` is a list of langchain.schema.Document
        raw = doc.page_content               
        cleaned = clean_text(raw)            
        # rewrap into a Document, preserving metadata:
        cleaned_doc = Document(
            page_content=cleaned,
            metadata=doc.metadata
        )
        cleaned_documents.append(cleaned_doc)

    return cleaned_documents



def create_kg(
    docs: list[Document],
    generator_llm,
    generator_embeddings,
    kg_path: Path = KG_PATH
) -> KnowledgeGraph:
    """
    Builds a knowledge graph from cleaned docs, applies transforms, and saves to disk.
    """

    kg = KnowledgeGraph()
    for doc in docs:
        kg.nodes.append(
            Node(
                type=NodeType.DOCUMENT,
                properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
            )
        )
    transforms = default_transforms(documents=docs, 
                                    llm=generator_llm, 
                                    embedding_model=generator_embeddings)
    apply_transforms(kg, transforms)
    kg.save(str(kg_path))

    return kg


def generate_test_data(
    docs: list[Document],
    kg: KnowledgeGraph,
    generator_llm,
    generator_embeddings,
    testset_size: int = TESTSET_SIZE,
    json_out: Path = JSON_OUT,
    csv_out: Path = CSV_OUT
) -> pd.DataFrame:
    """
    Uses Ragas TestsetGenerator to produce a pandas DataFrame and saves JSON/CSV.
    """
    print('starting generate_test_data function.')
    generator = TestsetGenerator(
        llm=generator_llm,
        embedding_model=generator_embeddings,
        knowledge_graph=kg
    )
    print('finished TestsetGenerator function')

    # change this code to use 

    #dataset = generator.generate_with_langchain_docs(docs, 
     #                                                testset_size=testset_size,
     #                                               with_debugging_logs=True)
    dataset = generator.generate(
        testset_size=TESTSET_SIZE,
        query_distribution=None,
        run_config=None,
        callbacks=None,
        with_debugging_logs=True
        )
    #print('finished generator.generate_with_langchain function')
    
    print('finished generator.generate')
    generator.knowledge_graph.save("updated_full_knowledge_graph.json")
    
    df = dataset.to_pandas()
    
    df.to_json(json_out, orient="records", indent=2)
    df.to_csv(csv_out, index=False)
    return df



def main():

    cacher = DiskCacheBackend(cache_dir=".cache/ragas")
    print('cache length:')
    print(len(cacher.cache))

    # Load and clean
    docs = load_and_clean_documents()

    # Generate test data
    #docs = docs[0:10] # TEST

    # Initialize raw models
    generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"), cache=cacher)
    generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(), cache=cacher)

    # Build or load KnowledgeGraph
    if KG_PATH.exists():
        print(f"Loading existing KnowledgeGraph from {KG_PATH}")
        kg = KnowledgeGraph.load(str(KG_PATH))

    else:
        print(f"Creating new KnowledgeGraph and saving to {KG_PATH}")
        kg = create_kg(docs, generator_llm, generator_embeddings)



    TESTSET_SIZE = 100

    df = generate_test_data(docs, kg, generator_llm, 
                            generator_embeddings, testset_size=TESTSET_SIZE)

    print(f"Pipeline complete. KG saved to {KG_PATH}; testset saved to {JSON_OUT} and {CSV_OUT}.")
    return

if __name__ == "__main__":
    main()



In [7]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph_second_test copy.json")
# 2298 in first
#kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph.json")

total = len(kg.nodes)
print(f"Total nodes in graph: {total}")


Total nodes in graph: 3616


In [5]:
# pick the property that always gets added by your filter, e.g. "score"
missing = [
    node for node in kg.nodes
    if node.get_property("score") is None
]
print(f"Nodes without a score: {len(missing)}")


Nodes without a score: 3616


In [9]:
#print(kg.nodes[0].properties)  
# or
print(kg.nodes[0].properties.keys())


dict_keys(['page_content', 'document_metadata', 'headlines', 'summary', 'summary_embedding'])


In [10]:
# Count how many chunks got themes/entities
themes_count = sum(1 for n in kg.nodes if "themes" in n.properties)
entities_count = sum(1 for n in kg.nodes if "entities" in n.properties)
print(f"Themes on {themes_count} chunks, Entities on {entities_count} chunks")

# Inspect a sample chunk
sample = next(n for n in kg.nodes if "themes" in n.properties)
print(sample.id, sample.properties["themes"], sample.properties["entities"])

Themes on 2948 chunks, Entities on 2919 chunks
1f8e1a02-5618-46c2-a8d3-47031a256905 ['Heating patterns', 'Temperature change', 'Measurement patterns', 'Missing data', 'Linear mixed-effects model', 'Lognormal mixed-effects model', "Newton's Law of Cooling/Heating", 'Heat diffusion equation', 'Lumped capacitance model', 'Bayesian implementation'] ['Newton’s Law of Cooling/Heating', 'Stan', 'pizza stone', 'gas-fired oven', 'heat diffusion equation', 'heat transfer coefficient', 'temperature', 'time', 'specific heat capacity', 'thermal conductivity']


In [15]:
# 1) Inspect node types present
print("Node types in this graph:", {n.type for n in kg.nodes})

# 2) Count embeddings correctly
docs_for_embed = sum(
    1
    for n in kg.nodes
    if n.type == NodeType.DOCUMENT
       and n.properties.get("summary_embedding") is not None
)
print(f"Documents with embeddings: {docs_for_embed}")

# 3) Count chunks for comparison
chunk_count = sum(1 for n in kg.nodes if n.type == NodeType.CHUNK)

# 2) How many chunks got themes & entities?
chunks_with_themes   = sum(1 for n in kg.nodes if "themes"    in n.properties)
chunks_with_entities = sum(1 for n in kg.nodes if "entities"  in n.properties)

print(f"Surviving chunks     : {chunk_count}")
print(f"Embedding tasks   : {docs_for_embed}")
print(f"Themes tasks      : {chunks_with_themes}")
print(f"NER tasks         : {chunks_with_entities}")
print(f"Total reported    : {docs_for_embed + chunks_with_themes + chunks_with_entities}")

Node types in this graph: {<NodeType.DOCUMENT: 'document'>, <NodeType.CHUNK: 'chunk'>}
Documents with embeddings: 272
Surviving chunks     : 3161
Embedding tasks   : 272
Themes tasks      : 2948
NER tasks         : 2919
Total reported    : 6139


In [None]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph_third_test.json")
# 2298 in first

total = len(kg.nodes)
print(f"Total nodes in graph: {total}")

# 1) Inspect node types present
print("Node types in this graph:", {n.type for n in kg.nodes})

# 2) Count embeddings correctly
docs_for_embed = sum(
    1
    for n in kg.nodes
    if n.type == NodeType.DOCUMENT
       and n.properties.get("summary_embedding") is not None
)
print(f"Documents with embeddings: {docs_for_embed}")

# 3) Count chunks for comparison
chunk_count = sum(1 for n in kg.nodes if n.type == NodeType.CHUNK)

# 2) How many chunks got themes & entities?
chunks_with_themes   = sum(1 for n in kg.nodes if "themes"    in n.properties)
chunks_with_entities = sum(1 for n in kg.nodes if "entities"  in n.properties)

print(f"Surviving chunks     : {chunk_count}")
print(f"Embedding tasks   : {docs_for_embed}")
print(f"Themes tasks      : {chunks_with_themes}")
print(f"NER tasks         : {chunks_with_entities}")
print(f"Total reported    : {docs_for_embed + chunks_with_themes + chunks_with_entities}")
# third

Total nodes in graph: 5240
Node types in this graph: {<NodeType.DOCUMENT: 'document'>, <NodeType.CHUNK: 'chunk'>}
Documents with embeddings: 396
Surviving chunks     : 4785
Embedding tasks   : 396
Themes tasks      : 2975
NER tasks         : 2764
Total reported    : 6135


In [None]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph_fourth_test.json")
# 2298 in first

total = len(kg.nodes)
print(f"Total nodes in graph: {total}")

# 1) Inspect node types present
print("Node types in this graph:", {n.type for n in kg.nodes})

# 2) Count embeddings correctly
docs_for_embed = sum(
    1
    for n in kg.nodes
    if n.type == NodeType.DOCUMENT
       and n.properties.get("summary_embedding") is not None
)
print(f"Documents with embeddings: {docs_for_embed}")

# 3) Count chunks for comparison
chunk_count = sum(1 for n in kg.nodes if n.type == NodeType.CHUNK)

# 2) How many chunks got themes & entities?
chunks_with_themes   = sum(1 for n in kg.nodes if "themes"    in n.properties)
chunks_with_entities = sum(1 for n in kg.nodes if "entities"  in n.properties)

print(f"Surviving chunks     : {chunk_count}")
print(f"Embedding tasks   : {docs_for_embed}")
print(f"Themes tasks      : {chunks_with_themes}")
print(f"NER tasks         : {chunks_with_entities}")
print(f"Total reported    : {docs_for_embed + chunks_with_themes + chunks_with_entities}")
# fourth

Total nodes in graph: 4844
Node types in this graph: {<NodeType.DOCUMENT: 'document'>, <NodeType.CHUNK: 'chunk'>}
Documents with embeddings: 437
Surviving chunks     : 4389
Embedding tasks   : 437
Themes tasks      : 4205
NER tasks         : 4186
Total reported    : 8828


In [None]:

kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph_fourth_test.json")
kg


KnowledgeGraph(nodes: 4844, relationships: 8052)

In [25]:
import os
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path

# Ragas imports
from ragas.cache import DiskCacheBackend
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.transforms import default_transforms, apply_transforms

# LangChain imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain.schema import Document

# Text cleaning
from embedding_process.preprocessing import clean_text

# Configuration
data_dir = "C:/Users/au644610/OneDrive - Aarhus universitet/Desktop/Thesis-Trustworthy-RAG/processed_syllabi/"
glob_pattern = "**/scraped_data/*.json"
CLEANED_DOCS_PATH = Path("cleaned_docs.pkl")
KG_PATH = Path("output/full_knowledge_graph_fourth_test.json")
JSON_OUT = Path("output/full_testset_fourth_test.json")
CSV_OUT = Path("output/full_testset_fourth_test.csv")
TESTSET_SIZE = 200
CACHE_DIR = ".cache/ragas"

loader = DirectoryLoader(
    data_dir,
    glob=glob_pattern,
    loader_cls=JSONLoader,
    loader_kwargs={
        "jq_schema": ".",
        "content_key": "text",
        "is_content_key_jq_parsable": False,
        "json_lines": False,
        "metadata_func": lambda obj, meta: {**meta, **{k: v for k, v in obj.items() if k != "text"}}
    }
)

docs = loader.load()
print(len(docs))

cleaned_documents = []
for doc in docs:                    # `documents` is a list of langchain.schema.Document
    raw = doc.page_content               
    cleaned = clean_text(raw)            
    # rewrap into a Document, preserving metadata:
    cleaned_doc = Document(
        page_content=cleaned,
        metadata=doc.metadata
    )
    cleaned_documents.append(cleaned_doc)


print(len(cleaned_documents))


docs = pd.read_pickle(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\cleaned_docs.pkl")
print(len(docs))



422
422
422


In [None]:
from ragas.testset.synthesizers import default_query_distribution
from ragas.testset import TestsetGenerator
loaded_kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph_fourth_test.json")
print(loaded_kg)

# Initialize persistent disk cache
cacher = DiskCacheBackend(cache_dir=r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\.cache\ragas") # ".cache/ragas"
print("Cache entries:", len(cacher.cache))

# Prepare LLM + embedding wrappers with the shared cache
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"), cache=cacher)
embedding_model = LangchainEmbeddingsWrapper(OpenAIEmbeddings(), cache=cacher)

generator = TestsetGenerator(llm=generator_llm, embedding_model=embedding_model, knowledge_graph=loaded_kg)

query_distribution = default_query_distribution(generator_llm)
print(query_distribution)

testset = generator.generate(testset_size=8, query_distribution=query_distribution)
testset.to_pandas()

testset.to_csv('multihoptest.csv', index = False)

KnowledgeGraph(nodes: 4844, relationships: 8052)
Cache entries: 14910
[(SingleHopSpecificQuerySynthesizer(name='single_hop_specifc_query_synthesizer', llm=LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)), generate_query_reference_prompt=QueryAnswerGenerationPrompt(instruction=Generate a single-hop query and answer based on the specified conditions (persona, term, style, length) and the provided context. Ensure the answer is entirely faithful to the context, using only the information directly from the provided context.### Instructions:
1. **Generate a Query**: Based on the context, persona, term, style, and length, create a question that aligns with the persona's perspective and incorporates the term.
2. **Generate an Answer**: Using only the content from the provided context, construct a detailed answer to the query. Do not add any information not included in or inferable from the context.
, examples=[(QueryCondition(persona=Persona(name='Software Engineer', role_description='Focuse

Generating personas: 100%|██████████| 3/3 [00:00<00:00, 235.50it/s]
Generating Scenarios:  33%|███▎      | 1/3 [00:00<00:00, 60.97it/s]


ValueError: No clusters found in the knowledge graph. Try changing the relationship condition.

In [34]:
from ragas.testset.synthesizers import (
    SingleHopSpecificQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
)
spec = MultiHopSpecificQuerySynthesizer(generator_llm)
clusters = spec.get_node_clusters(kg)
print(f"#clusters found: {len(clusters)}")

assert len(MultiHopSpecificQuerySynthesizer(generator_llm).get_node_clusters(loaded_kg)) > 0, \
       "No multi-hop clusters found; generation will be single-hop only"


#clusters found: 0


AssertionError: No multi-hop clusters found; generation will be single-hop only

In [36]:
from ragas.testset.synthesizers import MultiHopSpecificQuerySynthesizer
kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\full_knowledge_graph_fourth_test.json")
print(kg)

synth = MultiHopSpecificQuerySynthesizer(generator_llm)
#print("edges by type:", {r.relation_type for r in kg.relationships})

clusters = synth.get_node_clusters(kg)
print("#valid clusters:", len(clusters))

missing_embed = sum(1 for n in kg.nodes if "summary_embedding" not in n.properties)
print("#nodes without embedding:", missing_embed)


KnowledgeGraph(nodes: 4844, relationships: 8052)
#valid clusters: 0
#nodes without embedding: 4407


In [50]:
kg = KnowledgeGraph.load(r"C:\Users\au644610\OneDrive - Aarhus universitet\Desktop\Thesis-Trustworthy-RAG\output\knowledge_graph_pilot_test.json")
print(kg)

synth = MultiHopSpecificQuerySynthesizer(generator_llm)

clusters = synth.get_node_clusters(kg)
print("#valid clusters:", len(clusters))


broken = [n for n in kg.nodes if not isinstance(n.properties.get("summary"), str)]
no_emb = [n for n in kg.nodes if not isinstance(n.properties.get("summary_embedding"), list)]

print(f"❗ Nodes with missing or invalid summaries: {len(broken)}")

print(f"❗ Nodes missing embeddings: {len(no_emb)}")


print(f"✅ Total nodes: {len(kg.nodes)}")

content_nodes = [n for n in kg.nodes if "page_content" in n.properties]

broken = [n for n in content_nodes if not isinstance(n.properties.get("summary"), str)]
no_emb = [n for n in content_nodes if not isinstance(n.properties.get("summary_embedding"), list)]

print(f"❗ Content nodes with missing/invalid summary: {len(broken)}")
print(f"❗ Content nodes missing embeddings: {len(no_emb)}")
print(f"✅ Total content nodes: {len(content_nodes)}")



KnowledgeGraph(nodes: 255, relationships: 944)
#valid clusters: 538
❗ Nodes with missing or invalid summaries: 223
❗ Nodes missing embeddings: 223
✅ Total nodes: 255
❗ Content nodes with missing/invalid summary: 223
❗ Content nodes missing embeddings: 223
✅ Total content nodes: 255


In [None]:
kg = KnowledgeGraph.load(r"..\output\knowledge_graph_split_01_100.json")
print(kg)

for synth in [SingleHopSpecificQuerySynthesizer(generator_llm),
              MultiHopSpecificQuerySynthesizer(generator_llm),
              MultiHopAbstractQuerySynthesizer(generator_llm)]: # could this be failing because i run out of ram? size is 166600 for the succeeding graph
    try:
        count = len(synth.get_node_clusters(kg))
        print(f"✅ {synth.name} → {count} cluster(s)")
    except Exception as e:
        print(f"❌ {synth.name} → error: {e}")



KnowledgeGraph(nodes: 1606, relationships: 20974)
✅ LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)) → 1500 cluster(s)
✅ LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)) → 12700 cluster(s)


KeyboardInterrupt: 

In [48]:
broken = [n for n in kg.nodes if not isinstance(n.property("summary"), str)]
missing_embed = [n for n in kg.nodes if not isinstance(n.property("summary_embedding"), list)]

print(f"❗ Nodes with invalid/missing summaries: {len(broken)}")
print(f"❗ Nodes with missing embeddings: {len(missing_embed)}")
print(f"✅ Total nodes: {len(kg.nodes)}")


AttributeError: 'Node' object has no attribute 'property'

In [52]:
query_distribution = []
synths = [
    SingleHopSpecificQuerySynthesizer(generator_llm),
    MultiHopSpecificQuerySynthesizer(generator_llm),
    MultiHopAbstractQuerySynthesizer(generator_llm)
]

for s in synths:
    try:
        clusters = s.get_node_clusters(kg)
        if clusters:
            query_distribution.append((s, 1.0))  # weight can be adjusted
        else:
            print(f"⚠️ No clusters found for {s.name}; skipping.")
    except Exception as e:
        print(f"❌ Synthesizer {s.name} failed: {e}")

# re-normalise weights
total = sum(w for _, w in query_distribution)
query_distribution = [(s, w/total) for s, w in query_distribution]


⚠️ No clusters found for LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)); skipping.


In [None]:
    query_distribution = [
        (
            SingleHopSpecificQuerySynthesizer(llm=generator_llm, property_name="headlines"),
            0.5,
        ),
        (
            SingleHopSpecificQuerySynthesizer(
                llm=generator_llm, property_name="keyphrases"
            ),
            0.5,
        ),
    ]


In [None]:
# merge the split version, contains some abstract
df = pd.read_csv("../output/raw_test_data/full_testset_fourth_test.csv") # the 200 single hop
df1 = pd.read_csv("../output/testset_multi_all.csv")  # the 50 multihop from all data
df2 = pd.read_csv("../output/testset_all_150.csv")  # the 150 multihop from all data

In [85]:
# merge the single and multi hop files for the gpu run:
df = pd.read_csv("../output/raw_test_data/testset_pilot_100.csv") # 50 without abstract'
df1 = pd.read_csv("../output/raw_test_data/testset_split_00_100.csv")  # 50 with abstract
df2 = pd.read_csv("../output/raw_test_data/testset_split_01_100.csv")  # 50 with abstract
df3 = pd.read_csv("../output/raw_test_data/testset_split_02_100.csv")  # 50 with abstract
df4 = pd.read_csv("../output/raw_test_data/testset_split_03_22.csv")  # 22 with abstract

testset_merged = pd.concat([df,df1,df2,df3,df4])
testset_merged.to_csv('split_testset_merged.csv', index = False)
testset_merged

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What role does an electroencephalogram play in...,['What is computational modeling of behavioral...,An electroencephalogram (EEG) is used in laten...,single_hop_specifc_query_synthesizer
1,What are some key considerations for designing...,['Design a good experiment! Computational mode...,When designing a good experiment in computatio...,single_hop_specifc_query_synthesizer
2,How do Courville and Daw contribute to the und...,['good models Just as bad experiments can limi...,Courville and Daw are referenced in the contex...,single_hop_specifc_query_synthesizer
3,What does Daw et al. contribute to the underst...,"['Simulate, simulate, simulate! Once you have ...",Daw et al. (2011) emphasize the importance of ...,single_hop_specifc_query_synthesizer
4,How can Matlab's fmincon function assist in th...,['Fit the parameters A key component of comput...,Matlab's fmincon function can assist in the ma...,single_hop_specifc_query_synthesizer
...,...,...,...,...
45,What does the refinement paradox reveal about ...,['<1-hop>\n\nRESEARCH ARTICLE The refinement p...,The refinement paradox reveals that while refi...,multi_hop_abstract_query_synthesizer
46,How do the dynamics of scientific discovery an...,['<1-hop>\n\nRESEARCH ARTICLE Scientific disco...,The dynamics of scientific discovery and cultu...,multi_hop_abstract_query_synthesizer
47,How do social roles and leadership effectivene...,['<1-hop>\n\nSOCIAL ROLES AND THE EVOLUTION OF...,The evolution of networks in isolated environm...,multi_hop_abstract_query_synthesizer
48,How does the deluge of papers published annual...,['<1-hop>\n\nSlowed canonical progress in larg...,The deluge of papers published annually can im...,multi_hop_abstract_query_synthesizer


In [76]:
df = pd.read_csv("../output/annotated_merged_testset_3.csv", sep = ';') 
df

Unnamed: 0,annotation,user_input,reference_contexts,reference,synthesizer_name
0,1,How does the temperature of a pizza stone affe...,['This chapter introduces core modeling concep...,Understanding how pizza stones heat up is cruc...,single_hop_specifc_query_synthesizer
1,1,What are the key factors that influence the ti...,['needed to reach a target temperature: To und...,The time needed to reach the target temperatur...,single_hop_specifc_query_synthesizer
2,1,How does computational modeling contribute to ...,['What is computational modeling of behavioral...,Computational modeling in behavioral science a...,single_hop_specifc_query_synthesizer
3,1,What role does Anne Collins play in the contex...,['Design a good experiment! Computational mode...,Anne Collins is associated with providing illu...,single_hop_specifc_query_synthesizer
4,1,How do Courville and Daw contribute to the und...,['good models Just as bad experiments can limi...,Courville and Daw are referenced in the contex...,single_hop_specifc_query_synthesizer
...,...,...,...,...,...
395,1,What is the significance of Levene's test in a...,['<1-hop>\n\n5.7. Testing for homogeneity of v...,Levene's test is significant in assessing homo...,multi_hop_specific_query_synthesizer
396,1,What role does Bremner's research play in unde...,"[""<1-hop>\n\nA direction. This is a level of c...",Bremner's research is pivotal in understanding...,multi_hop_specific_query_synthesizer
397,1,What role does the hippocampus play in memory ...,['<1-hop>\n\nThe nature of child-directed spee...,The hippocampus plays a crucial role in the st...,multi_hop_specific_query_synthesizer
398,0,What does Figure 1.1 show about the relationsh...,"[""<1-hop>\n\n= 'lm') + ggtitle('Log RT ~ raw f...",Figure 1.1 illustrates that wage increases wit...,multi_hop_specific_query_synthesizer


In [79]:
df.to_csv("../output/annotated_merged_testset.csv", index = False)

In [87]:
df = pd.read_csv("../output/raw_test_data/anno_split_testset_merged.csv", sep = ';') 
df_filtered = df.loc[df['annotation'] == 1]
df_filtered.to_csv("../output/filt_anno_split_testset_merged.csv", index = False)

In [80]:
df = pd.read_csv("../output/filt_anno_merged_testset.csv") 
df

Unnamed: 0,annotation,user_input,reference_contexts,reference,synthesizer_name
0,1,How does the temperature of a pizza stone affe...,['This chapter introduces core modeling concep...,Understanding how pizza stones heat up is cruc...,single_hop_specifc_query_synthesizer
1,1,What are the key factors that influence the ti...,['needed to reach a target temperature: To und...,The time needed to reach the target temperatur...,single_hop_specifc_query_synthesizer
2,1,How does computational modeling contribute to ...,['What is computational modeling of behavioral...,Computational modeling in behavioral science a...,single_hop_specifc_query_synthesizer
3,1,What role does Anne Collins play in the contex...,['Design a good experiment! Computational mode...,Anne Collins is associated with providing illu...,single_hop_specifc_query_synthesizer
4,1,How do Courville and Daw contribute to the und...,['good models Just as bad experiments can limi...,Courville and Daw are referenced in the contex...,single_hop_specifc_query_synthesizer
...,...,...,...,...,...
281,1,What insights do Berk et al. and Bell et al. p...,['<1-hop>\n\nthe proposed iterative workflow i...,Berk et al. discuss the importance of understa...,multi_hop_specific_query_synthesizer
282,1,What is the significance of Levene's test in a...,['<1-hop>\n\n5.7. Testing for homogeneity of v...,Levene's test is significant in assessing homo...,multi_hop_specific_query_synthesizer
283,1,What role does Bremner's research play in unde...,"[""<1-hop>\n\nA direction. This is a level of c...",Bremner's research is pivotal in understanding...,multi_hop_specific_query_synthesizer
284,1,What role does the hippocampus play in memory ...,['<1-hop>\n\nThe nature of child-directed spee...,The hippocampus plays a crucial role in the st...,multi_hop_specific_query_synthesizer
