# Generating Synthetic Test Sets Based on SSE Lecture Slides

## General Setup

In [17]:
import os
import dotenv
dotenv.load_dotenv()
from typing import List, Tuple, Any
import typing as t
import asyncio

# For PDF extraction
from langchain_community.document_loaders import PyPDFLoader

# Ragas components
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.transforms import (
    apply_transforms,
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
    OverlapScoreBuilder
)
from ragas.testset.persona import Persona
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
from ragas.testset import TestsetGenerator

from ragas.testset.synthesizers.multi_hop.base import (
    MultiHopQuerySynthesizer,
    MultiHopScenario,
)
from ragas.testset.synthesizers.prompts import (
    ThemesPersonasInput,
    ThemesPersonasMatchingPrompt,
)

# LLM and embeddings
from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from dataclasses import dataclass
from pprint import pprint

import pandas as pd

In [8]:
def load_pdf_documents(pdf_dir: str) -> List[Any]:
    """
    Load all PDF files from a directory and convert them to documents.

    Files are visited in sorted filename order so repeated runs yield the
    documents in the same order (``os.listdir`` makes no ordering guarantee).
    The ``.pdf`` extension is matched case-insensitively.

    Args:
        pdf_dir: Directory that is scanned (non-recursively) for PDF files.

    Returns:
        A flat list with the documents of every PDF that could be loaded.
        A file that raises while loading is reported on stdout and skipped.
    """
    documents = []
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(pdf_dir, filename)
        print(f"Loading {filename}...")
        try:
            loader = PyPDFLoader(file_path)
            # Load and split the PDF so large files are easier to manage
            docs = loader.load_and_split()
            documents.extend(docs)
            print(f"  Added {len(docs)} pages from {filename}")
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"  Error loading {filename}: {e}")

    return documents


def create_sustainable_se_personas() -> List[Persona]:
    """
    Create personas relevant to sustainable software engineering.

    The role descriptions are passed on to the LLM when queries are
    synthesized, so they are kept free of spelling mistakes.

    Returns:
        Three personas, in this order: a well-prepared student, a student
        who is only just starting to study, and a professional developer.
    """
    persona_prepared_student = Persona(
        name="Prepared Computer Science Student",
        role_description="A university student learning about sustainable software engineering. Has a good general overview of topics, but interested in the finer technical details."
    )

    persona_lazy_student = Persona(
        name="Lazy Computer Science Student",
        role_description="A computer science student that barely knows the content, hasn't attended lectures, but is making a start with his studies. In addition to less technical questions, is also interested in organisational matters."
    )

    persona_curious_developer = Persona(
        name="Professional Developer",
        role_description="A software developer in the industry looking to learn about green software practices to apply in their projects and convince management of their importance."
    )

    return [persona_prepared_student, persona_lazy_student, persona_curious_developer]


def prepare_knowledge_graph(
    pdf_dir: str,
    *,
    llm_model: str = "deepseek-r1-distill-qwen-32b",
    temperature: float = 0.2,
    max_tokens: int = 4096,
    embedding_model: str = "text-embedding-3-small",
) -> Tuple[KnowledgeGraph, LangchainLLMWrapper, LangchainEmbeddingsWrapper]:
    """
    Prepare the knowledge graph and models from PDF documents.

    Args:
        pdf_dir: Directory containing PDF files to process
        llm_model: Name of the Groq chat model used as generator LLM
        temperature: Sampling temperature passed to the chat model
        max_tokens: Maximum number of tokens passed to the chat model
        embedding_model: Name of the OpenAI embedding model

    Returns:
        Tuple containing:
        - The prepared knowledge graph (one DOCUMENT node per loaded chunk)
        - The LLM wrapper
        - The embeddings wrapper
    """
    # Step 1: Load PDF documents
    print("Loading PDF documents...")
    documents = load_pdf_documents(pdf_dir)
    print(f"Loaded {len(documents)} document chunks.")

    # Step 2: Set up Groq LLM and embedding models
    print("Setting up LLM and embedding models...")
    generator_llm = LangchainLLMWrapper(
        ChatGroq(model=llm_model, temperature=temperature, max_tokens=max_tokens)
    )
    # Still using OpenAI for embeddings as Groq doesn't have embedding models
    generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model=embedding_model))

    # Step 3: Create knowledge graph
    print("Creating knowledge graph...")
    kg = KnowledgeGraph()
    kg.nodes.extend(
        Node(
            type=NodeType.DOCUMENT,
            properties={"page_content": doc.page_content, "document_metadata": doc.metadata},
        )
        for doc in documents
    )

    print(f"Created initial knowledge graph with {len(kg.nodes)} nodes.")
    return kg, generator_llm, generator_embeddings

## Setup for single-hop test-set generation

In [26]:
# Directory that holds the lecture-slide PDFs.
pdf_dir = "data/sse_lectures"

"""
Generate a testset for sustainable software engineering using PDF lecture slides.
"""
# Steps 1 - 3: load the PDFs, set up the models and build the initial graph
kg, generator_llm, generator_embeddings = prepare_knowledge_graph(pdf_dir)

# Step 4: enrich the graph with headlines, headline-based splits and keyphrases
print("Applying transforms to enhance knowledge graph...")
transforms = [
    HeadlinesExtractor(llm=generator_llm, max_num=10),
    HeadlineSplitter(max_tokens=1000),
    KeyphrasesExtractor(llm=generator_llm, property_name="keyphrases", max_num=10),
]
apply_transforms(kg, transforms=transforms)
print(f"Knowledge graph now has {len(kg.nodes)} nodes after transforms.")

# Step 5: personas used when synthesizing queries
personas = create_sustainable_se_personas()

# Step 6: one single-hop synthesizer per node property, each weighted 0.5
query_distribution = [
    (
        SingleHopSpecificQuerySynthesizer(llm=generator_llm, property_name=node_property),
        0.5,
    )
    for node_property in ("headlines", "keyphrases")
]



Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 192 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)
Ignoring wrong pointing object 320 0 (offset 0)
Ignoring wrong pointing object 90 0 (offset 0)


Loading PDF documents...
Loading 01_intro.pdf...
  Added 42 pages from 01_intro.pdf
Loading 02_lab_measuring_energy.pdf...


Ignoring wrong pointing object 206 0 (offset 0)
Ignoring wrong pointing object 207 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 66 0 (offset 0)
Ignoring wrong pointing object 123 0 (offset 0)


  Added 27 pages from 02_lab_measuring_energy.pdf
Loading 03_scientific_guide.pdf...


Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 142 0 (offset 0)
Ignoring wrong pointing object 242 0 (offset 0)
Ignoring wrong pointing object 243 0 (offset 0)


  Added 39 pages from 03_scientific_guide.pdf
Loading 05_metrics.pdf...


Ignoring wrong pointing object 174 0 (offset 0)
Ignoring wrong pointing object 234 0 (offset 0)
Ignoring wrong pointing object 243 0 (offset 0)
Ignoring wrong pointing object 355 0 (offset 0)
Ignoring wrong pointing object 386 0 (offset 0)
Ignoring wrong pointing object 388 0 (offset 0)
Ignoring wrong pointing object 398 0 (offset 0)
Ignoring wrong pointing object 400 0 (offset 0)
Ignoring wrong pointing object 407 0 (offset 0)
Ignoring wrong pointing object 409 0 (offset 0)
Ignoring wrong pointing object 415 0 (offset 0)
Ignoring wrong pointing object 417 0 (offset 0)
Ignoring wrong pointing object 423 0 (offset 0)
Ignoring wrong pointing object 425 0 (offset 0)
Ignoring wrong pointing object 431 0 (offset 0)
Ignoring wrong pointing object 433 0 (offset 0)
Ignoring wrong pointing object 489 0 (offset 0)
Ignoring wrong pointing object 491 0 (offset 0)
Ignoring wrong pointing object 493 0 (offset 0)
Ignoring wrong pointing object 495 0 (offset 0)
Ignoring wrong pointing object 497 0 (of

  Added 38 pages from 05_metrics.pdf
Loading 07_green_software_research.pdf...


Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 221 0 (offset 0)
Ignoring wrong pointing object 241 0 (offset 0)
Ignoring wrong pointing object 247 0 (offset 0)
Ignoring wrong pointing object 248 0 (offset 0)
Ignoring wrong pointing object 249 0 (offset 0)
Ignoring wrong pointing object 250 0 (offset 0)
Ignoring wrong pointing object 251 0 (offset 0)
Ignoring wrong pointing object 252 0 (offset 0)
Ignoring wrong pointing object 253 0 (offset 0)
Ignoring wrong pointing object 254 0 (offset 0)
Ignoring wrong pointing object 255 0 (offset 0)
Ignoring wrong pointing object 256 0 (offset 0)
Ignoring wrong pointing object 257 0 (offset 0)
Ignoring wrong pointing object 258 0 (offset 0)
Ignoring wrong pointing object 335 0 (offset 0)
Ignoring wrong pointing object 336 0 (offset 0)


  Added 52 pages from 07_green_software_research.pdf
Loading 08_green_ai.pdf...


Ignoring wrong pointing object 62 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 108 0 (offset 0)
Ignoring wrong pointing object 110 0 (offset 0)
Ignoring wrong pointing object 112 0 (offset 0)
Ignoring wrong pointing object 115 0 (offset 0)
Ignoring wrong pointing object 116 0 (offset 0)
Ignoring wrong pointing object 118 0 (offset 0)
Ignoring wrong pointing object 122 0 (offset 0)
Ignoring wrong pointing object 126 0 (offset 0)
Ignoring wrong pointing object 128 0 (offset 0)
Ignoring wrong pointing object 130 0 (offset 0)
Ignoring wrong pointing object 132 0 (offset 0)
Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 136 0 (offset 0)
Ignoring wrong pointing object 138 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 142 0 (offset 0)
Ignoring wrong pointing object 144 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 151 0 (off

  Added 47 pages from 08_green_ai.pdf
Loading 10_project2.pdf...
  Added 30 pages from 10_project2.pdf
Loading 11_SusSE Lecture - Neuromorphic computing 2025.pdf...
  Added 65 pages from 11_SusSE Lecture - Neuromorphic computing 2025.pdf
Loaded 340 document chunks.
Setting up LLM and embedding models...
Creating knowledge graph...
Created initial knowledge graph with 340 nodes.
Applying transforms to enhance knowledge graph...


Applying HeadlinesExtractor:   9%|▊         | 29/340 [00:11<01:26,  3.61it/s]unable to apply transformation: Invalid json output: of the electricity consumed worldwide 
by 2040 will stem from ICT
14%
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Applying HeadlinesExtractor:  11%|█         | 38/340 [00:13<01:33,  3.24it/s]unable to apply transformation: Invalid json output: Software for Sustainability 
• We are not covering it in this course.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Applying HeadlinesExtractor:  33%|███▎      | 112/340 [00:30<00:45,  5.05it/s]unable to apply transformation: Invalid json output: Problems in the Computer Language Benchmarks Game
34
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Applying HeadlinesExtractor:  37%|███▋      | 125/340 [00:34<00:54,  3.92it/s]unable to a

Knowledge graph now has 673 nodes after transforms.




## Generate the single-hop dataset based on SSE Course Slides

In [27]:
# Number of samples to generate and the CSV file they are written to.
testset_size = 50
output_file = "sse_single_hop_testset.csv"

# Step 7: build the generator on top of the prepared graph and personas
print("Generating testset...")
generator = TestsetGenerator(
    persona_list=personas,
    knowledge_graph=kg,
    embedding_model=generator_embeddings,
    llm=generator_llm,
)

testset = generator.generate(
    testset_size=testset_size,
    query_distribution=query_distribution,
)

# Persist the samples as CSV (without the DataFrame index column)
print(f"Saving {len(testset)} generated test samples to {output_file}...")
testset.to_pandas().to_csv(output_file, index=False)
print("Done!")

Generating testset...


Generating Scenarios: 100%|██████████| 2/2 [02:30<00:00, 75.38s/it] 
Generating Samples: 100%|██████████| 50/50 [00:21<00:00,  2.37it/s]


Saving 50 generated test samples to sse_single_hop_testset.csv...
Done!


In [None]:
# Convert the generated test set to a DataFrame; as the cell's last expression it is shown as the cell output.
testset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is the focus of the '1. Intro to Sustaina...,[1. Intro to Sustainable SE \n2. Intro to the ...,The '1. Intro to Sustainable SE' section serve...,single_hop_specifc_query_synthesizer
1,What is environmental-responsible?,"[Buzz words\n• Eco-friendly\n• Climate change,...",Environmental-responsible is a buzzword.,single_hop_specifc_query_synthesizer
2,What is Sustainable Software Engineering?,[https://www.menti.com/uns9d89kzn\nWhat is Sus...,Sustainable Software Engineering refers to the...,single_hop_specifc_query_synthesizer
3,What is Sustainable Software Engineering?,[Sustainable Software \nEngineering is…\n7\n…t...,Sustainable Software Engineering is the discip...,single_hop_specifc_query_synthesizer
4,What role does the social aspect play in softw...,[8\nEconomical\nSocial\nTechnical\nIndividual\...,The social aspect is one of the key dimensions...,single_hop_specifc_query_synthesizer
5,What economical factors are important in softw...,"[Economical\n• Focused on assets, capital and ...","Economical factors include assets, capital, ad...",single_hop_specifc_query_synthesizer
6,What is the relevance of Bus factor to softwar...,"[Technical\n• Longevity of information, system...",Bus-factor is a relevant consideration in soft...,single_hop_specifc_query_synthesizer
7,What is Democrazy?,[Social\n• concerned with societal communities...,Democracy is a societal factor that contribute...,single_hop_specifc_query_synthesizer
8,What is the relationship between Environmental...,[Environmental Sustainability\n• the branch of...,Environmental Sustainability in software engin...,single_hop_specifc_query_synthesizer
9,Why is Green Software important for computer s...,[Green Software\n• Sustainability and energy e...,Green Software is important because it focuses...,single_hop_specifc_query_synthesizer


## Setup for multi-hop testset generation

In [10]:
# Number of multi-hop scenarios to request from the synthesizer.
scenario_num = 5
# Placeholder size; reassigned before the multi-hop dataset is generated.
testset_size = 1


In [11]:
 # Steps 1 - 3: Prepare knowledge graph and models
kg, generator_llm, generator_embeddings = prepare_knowledge_graph(pdf_dir)

# Step 4: Apply transforms
# Relationship builder that scores the overlap between the nodes' keyphrases.
relation_builder = OverlapScoreBuilder(
    property_name="keyphrases",
    new_property_name="overlap_score",
    threshold=0.01,
    distance_threshold=0.9,
)

# Same extractors as for the single-hop setup; the relation builder runs last
# because it works on the "keyphrases" property.
transforms = [
    HeadlinesExtractor(llm=generator_llm, max_num=10),
    HeadlineSplitter(max_tokens=1000),
    KeyphrasesExtractor(llm=generator_llm, property_name="keyphrases", max_num=10),
]
transforms.append(relation_builder)

apply_transforms(kg, transforms=transforms)
print(f"Knowledge graph now has {len(kg.nodes)} nodes after transforms.")

# Step 5: Configure personas
personas = create_sustainable_se_personas()

@dataclass
class MyMultiHopQuery(MultiHopQuerySynthesizer):
    """
    Multi-hop query synthesizer based on keyphrase overlap between two nodes.

    Scenarios are built from node pairs that are connected by a
    "keyphrases_overlap" relationship in the knowledge graph.
    """

    # Prompt that matches the overlapping keyphrases (themes) with personas.
    theme_persona_matching_prompt = ThemesPersonasMatchingPrompt()

    async def _generate_scenarios(
        self,
        n: int,
        knowledge_graph,
        persona_list,
        callbacks,
    ) -> t.List[MultiHopScenario]:
        """
        Build up to ``n`` multi-hop scenarios from the given knowledge graph.

        Args:
            n: Upper bound for the number of scenarios to collect.
            knowledge_graph: Graph that is searched for node pairs joined by
                a "keyphrases_overlap" relationship.
            persona_list: Personas that are matched against the overlapping
                keyphrases.
            callbacks: Callbacks forwarded to the persona matching prompt.

        Returns:
            The collected scenarios; an empty list when the graph contains
            no matching relationship.
        """
        # query and get (node_a, rel, node_b) to create multi-hop queries.
        # Use the graph passed as argument rather than the notebook-global `kg`.
        results = knowledge_graph.find_two_nodes_single_rel(
            relationship_condition=lambda rel: rel.type == "keyphrases_overlap"
        )

        # No matching relationship: nothing to build (and avoids dividing by zero).
        if not results:
            return []

        num_sample_per_triplet = max(1, n // len(results))

        scenarios = []
        for triplet in results:
            # Stop as soon as enough scenarios were collected.
            if len(scenarios) >= n:
                break

            node_a, node_b = triplet[0], triplet[-1]
            overlapped_keywords = triplet[1].properties["overlapped_items"]
            if not overlapped_keywords:
                continue

            # match the keyword with a persona for query creation
            themes = list(dict(overlapped_keywords))
            prompt_input = ThemesPersonasInput(
                themes=themes, personas=persona_list
            )
            persona_concepts = (
                await self.theme_persona_matching_prompt.generate(
                    data=prompt_input, llm=self.llm, callbacks=callbacks
                )
            )

            overlapped_keywords = [list(item) for item in overlapped_keywords]

            # prepare and sample possible combinations
            base_scenarios = self.prepare_combinations(
                [node_a, node_b],
                overlapped_keywords,
                personas=persona_list,
                persona_item_mapping=persona_concepts.mapping,
                property_name="keyphrases",
            )

            # get number of required samples from this triplet
            base_scenarios = self.sample_diverse_combinations(
                base_scenarios, num_sample_per_triplet
            )

            scenarios.extend(base_scenarios)

        return scenarios
    
# Instantiate the custom multi-hop synthesizer with the generator LLM.
query = MyMultiHopQuery(llm=generator_llm)

Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 192 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)
Ignoring wrong pointing object 320 0 (offset 0)


Loading PDF documents...
Loading 01_intro.pdf...


Ignoring wrong pointing object 90 0 (offset 0)
Ignoring wrong pointing object 206 0 (offset 0)
Ignoring wrong pointing object 207 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 66 0 (offset 0)
Ignoring wrong pointing object 123 0 (offset 0)


  Added 42 pages from 01_intro.pdf
Loading 02_lab_measuring_energy.pdf...
  Added 27 pages from 02_lab_measuring_energy.pdf
Loading 03_scientific_guide.pdf...


Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 142 0 (offset 0)
Ignoring wrong pointing object 242 0 (offset 0)
Ignoring wrong pointing object 243 0 (offset 0)


  Added 39 pages from 03_scientific_guide.pdf
Loading 05_metrics.pdf...


Ignoring wrong pointing object 174 0 (offset 0)
Ignoring wrong pointing object 234 0 (offset 0)
Ignoring wrong pointing object 243 0 (offset 0)
Ignoring wrong pointing object 355 0 (offset 0)
Ignoring wrong pointing object 386 0 (offset 0)
Ignoring wrong pointing object 388 0 (offset 0)
Ignoring wrong pointing object 398 0 (offset 0)
Ignoring wrong pointing object 400 0 (offset 0)
Ignoring wrong pointing object 407 0 (offset 0)
Ignoring wrong pointing object 409 0 (offset 0)
Ignoring wrong pointing object 415 0 (offset 0)
Ignoring wrong pointing object 417 0 (offset 0)
Ignoring wrong pointing object 423 0 (offset 0)
Ignoring wrong pointing object 425 0 (offset 0)
Ignoring wrong pointing object 431 0 (offset 0)
Ignoring wrong pointing object 433 0 (offset 0)
Ignoring wrong pointing object 489 0 (offset 0)
Ignoring wrong pointing object 491 0 (offset 0)
Ignoring wrong pointing object 493 0 (offset 0)
Ignoring wrong pointing object 495 0 (offset 0)
Ignoring wrong pointing object 497 0 (of

  Added 38 pages from 05_metrics.pdf
Loading 07_green_software_research.pdf...


Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 221 0 (offset 0)
Ignoring wrong pointing object 241 0 (offset 0)
Ignoring wrong pointing object 247 0 (offset 0)
Ignoring wrong pointing object 248 0 (offset 0)
Ignoring wrong pointing object 249 0 (offset 0)
Ignoring wrong pointing object 250 0 (offset 0)
Ignoring wrong pointing object 251 0 (offset 0)
Ignoring wrong pointing object 252 0 (offset 0)
Ignoring wrong pointing object 253 0 (offset 0)
Ignoring wrong pointing object 254 0 (offset 0)
Ignoring wrong pointing object 255 0 (offset 0)
Ignoring wrong pointing object 256 0 (offset 0)
Ignoring wrong pointing object 257 0 (offset 0)
Ignoring wrong pointing object 258 0 (offset 0)
Ignoring wrong pointing object 335 0 (offset 0)
Ignoring wrong pointing object 336 0 (offset 0)


  Added 52 pages from 07_green_software_research.pdf
Loading 08_green_ai.pdf...


Ignoring wrong pointing object 62 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 108 0 (offset 0)
Ignoring wrong pointing object 110 0 (offset 0)
Ignoring wrong pointing object 112 0 (offset 0)
Ignoring wrong pointing object 115 0 (offset 0)
Ignoring wrong pointing object 116 0 (offset 0)
Ignoring wrong pointing object 118 0 (offset 0)
Ignoring wrong pointing object 122 0 (offset 0)
Ignoring wrong pointing object 126 0 (offset 0)
Ignoring wrong pointing object 128 0 (offset 0)
Ignoring wrong pointing object 130 0 (offset 0)
Ignoring wrong pointing object 132 0 (offset 0)
Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 136 0 (offset 0)
Ignoring wrong pointing object 138 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 142 0 (offset 0)
Ignoring wrong pointing object 144 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 151 0 (off

  Added 47 pages from 08_green_ai.pdf
Loading 10_project2.pdf...
  Added 30 pages from 10_project2.pdf
Loading 11_SusSE Lecture - Neuromorphic computing 2025.pdf...
  Added 65 pages from 11_SusSE Lecture - Neuromorphic computing 2025.pdf
Loaded 340 document chunks.
Setting up LLM and embedding models...
Creating knowledge graph...
Created initial knowledge graph with 340 nodes.


Applying HeadlinesExtractor:   5%|▌         | 18/340 [00:07<01:20,  3.98it/s]unable to apply transformation: Failed to parse Headlines from completion 4. Got: 1 validation error for Headlines
  Input should be a valid dictionary or instance of Headlines [type=model_type, input_value=4, input_type=int]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Applying HeadlinesExtractor:  12%|█▏        | 42/340 [00:16<01:51,  2.66it/s]unable to apply transformation: Failed to parse Headlines from completion 32. Got: 1 validation error for Headlines
  Input should be a valid dictionary or instance of Headlines [type=model_type, input_value=32, input_type=int]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Applying Headl

Knowledge graph now has 675 nodes after transforms.




In [13]:
scenarios = await query.generate_scenarios(
    n=scenario_num,
    knowledge_graph=kg,
    persona_list=personas,
    callbacks=None
)

pprint(scenarios)

[MultiHopScenario(
nodes=2
combinations=['spiking neural networks', 'spiking neuron networks']
style=Poor grammar
length=short
persona=name='Prepared Computer Science Student' role_description='A university student learning about sustainable software engineering. Has a good general overview of topics, but intrested in the finer technical details.'),
 MultiHopScenario(
nodes=2
combinations=['Government-developed software', 'government-developed OS software']
style=Misspelled queries
length=short
persona=name='Professional Developer' role_description='A software developer in the industry looking to learn about green software practices to apply in their projects and convince management of their importance.'),
 MultiHopScenario(
nodes=2
combinations=['data science', 'data centers']
style=Misspelled queries
length=long
persona=name='Prepared Computer Science Student' role_description='A university student learning about sustainable software engineering. Has a good general overview of topics

In [16]:
results = await query.generate_sample(
    scenario=scenarios[-1]
)

pprint(results)


SingleTurnSample(user_input='What is the biological basis of neuromorphic computing, and how does it differ from conventional neural networks?', retrieved_contexts=None, reference_contexts=['<1-hop>\n\nNeuromorphic computing\nNeuromorphic computing is an approach to computing that is inspired by the structure and function of the human brain.\nA neuromorphic computer/chip is any device that uses physical artificial neurons to do computations.\nhttps://en.wikipedia.org/wiki/Neuromorphic_engineering, https://www.informationweek.com/software-services/what-you-need-to-know-about-neuromorphic-computing', '<2-hop>\n\nReading materials\nMain reading:\n- Section 1 and Section 3.1 of "Computing with spiking neuron networks." by Paugam-Moisy H, Bohte SM, in Handbook of natural computing (2012). \nhttps://homepages.cwi.nl/~sbohte/publication/paugam_moisy_bohte_SNNChapter.pdf\n- Maass W. Networks of spiking neurons: the third generation of neural network models. Neural networks. 1997. 10(9):1659-71

## Generate the multi-hop dataset based on SSE Course Slides

You can modify the `testset_size` variable to change the number of samples in the test set.

In [22]:
testset_size = 50
testset = []
testset_counter = 0
while testset_counter < testset_size:
    for i in range(len(scenarios)):
        if testset_counter >= testset_size:
            break
        sample = await query.generate_sample(scenario=scenarios[i])
        testset_counter += 1
        for context in (sample.reference_contexts or [""]):  # Handle None case
            testset.append({
                "user_input": sample.user_input,
                "reference": sample.reference,
                "reference_contexts": context
            })

# Create DataFrame and save to CSV
df = pd.DataFrame(testset)
df.to_csv("sse_multihop_testset.csv", index=False)
        
    

