In [5]:
import os
from dotenv import load_dotenv
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import (
    DocumentCleaner,
    DocumentSplitter)
from pathlib import Path
from scripts.knowledge_graph_component import KnowledgeGraphGenerator
from scripts.synthetic_test_components import SyntheticTestGenerator,\
                                                TestDatasetSaver,\
                                                    DocumentToLangChainConverter

# Load API keys and other settings from the local .env file before any
# LLM-backed component is constructed.
load_dotenv("./.env")

# End-to-end synthetic test generation:
#   PDF -> clean -> split -> LangChain docs -> knowledge graph -> test set -> CSV
data_path = "data_for_indexing"
test_output_path = "data_for_eval/synthetic_tests_10.csv"

if os.path.exists(data_path):
    print("Creating synthetic test generation pipeline...")

    # Discover every PDF in the input directory; sorted so that repeated runs
    # feed the documents to the pipeline in a stable order.
    pdf_files = sorted(Path(data_path).glob("*.pdf"))

    if pdf_files:
        print(f"Found {len(pdf_files)} PDF files to process")

        # --- Pipeline components -------------------------------------------
        pdf_converter = PyPDFToDocument()
        doc_cleaner = DocumentCleaner(
            remove_empty_lines=True,
            remove_extra_whitespaces=True,
        )
        # 50-sentence chunks with a 5-sentence overlap between neighbours.
        doc_splitter = DocumentSplitter(
            split_by="sentence",
            split_length=50,
            split_overlap=5,
        )
        doc_converter = DocumentToLangChainConverter()
        kg_generator = KnowledgeGraphGenerator(apply_transforms=True)

        # testset_size is the *requested* number of test cases. NOTE: the run
        # recorded in this notebook returned 11 samples for a request of 10,
        # so treat it as a target rather than an exact count.
        # If API timeouts occur, the generator is said to accept an optional
        # cap, e.g. max_testset_size=5 (see scripts.synthetic_test_components
        # to confirm).
        test_generator = SyntheticTestGenerator(
            testset_size=10,
            llm_model="gpt-4o-mini",
            # (query type, weight) pairs; the weights sum to 1.0.
            query_distribution=[
                ("single_hop", 0.25),
                ("multi_hop_specific", 0.25),
                ("multi_hop_abstract", 0.5),
            ],
        )

        # Create the output folder up front so a multi-minute generation run
        # cannot fail at the very last step because the directory is missing
        # (TestDatasetSaver is defined outside this notebook; it may or may
        # not create the folder itself).
        Path(test_output_path).parent.mkdir(parents=True, exist_ok=True)
        test_saver = TestDatasetSaver(test_output_path)

        # --- Assemble the pipeline -----------------------------------------
        pipeline = Pipeline()
        pipeline.add_component("pdf_converter", pdf_converter)
        pipeline.add_component("doc_cleaner", doc_cleaner)
        pipeline.add_component("doc_splitter", doc_splitter)
        pipeline.add_component("doc_converter", doc_converter)
        pipeline.add_component("kg_generator", kg_generator)
        pipeline.add_component("test_generator", test_generator)
        pipeline.add_component("test_saver", test_saver)

        # Wire the components in sequence. The converted LangChain documents
        # fan out to both the knowledge-graph builder and the test generator.
        pipeline.connect("pdf_converter.documents", "doc_cleaner.documents")
        pipeline.connect("doc_cleaner.documents", "doc_splitter.documents")
        pipeline.connect("doc_splitter.documents", "doc_converter.documents")
        pipeline.connect("doc_converter.langchain_documents", "kg_generator.documents")
        pipeline.connect("kg_generator.knowledge_graph", "test_generator.knowledge_graph")
        pipeline.connect("doc_converter.langchain_documents", "test_generator.documents")
        pipeline.connect("test_generator.testset", "test_saver.testset")

        # Feed the PDFs that were actually discovered above (previously a
        # single hard-coded file name was used, ignoring the glob result).
        # The Path objects are passed to the converter as-is.
        pdf_sources = pdf_files

        result = pipeline.run({
            "pdf_converter": {"sources": pdf_sources}
        })

        # Summarise what each stage reported.
        print("\n📊 Pipeline Results:")
        print(f"  📄 Documents Processed: {result['doc_converter']['document_count']}")
        print(f"  🧠 Knowledge Graph Nodes: {result['kg_generator']['node_count']}")
        print(f"  🧪 Test Cases Generated: {result['test_generator']['testset_size']}")
        print(f"  🔧 Generation Method: {result['test_generator']['generation_method']}")

    else:
        print(f"❌ No PDF files found in {data_path} directory")
else:
    print(f"❌ Data path '{data_path}' not found")

Creating synthetic test generation pipeline...
Found 1 PDF files to process


Applying HeadlinesExtractor: 100%|██████████| 17/17 [00:08<00:00,  1.91it/s]
Applying HeadlineSplitter: 100%|██████████| 17/17 [00:00<00:00, 457.22it/s]
Applying SummaryExtractor: 100%|██████████| 17/17 [00:10<00:00,  1.70it/s]
Applying CustomNodeFilter: 100%|██████████| 49/49 [00:22<00:00,  2.14it/s]
Applying EmbeddingExtractor: 100%|██████████| 17/17 [00:04<00:00,  3.91it/s]
Applying ThemesExtractor: 100%|██████████| 44/44 [00:22<00:00,  1.91it/s]
Applying NERExtractor: 100%|██████████| 44/44 [00:21<00:00,  2.07it/s]
Applying CosineSimilarityBuilder: 100%|██████████| 1/1 [00:00<00:00, 234.41it/s]
Applying OverlapScoreBuilder: 100%|██████████| 1/1 [00:00<00:00, 68.50it/s]
Generating personas: 100%|██████████| 3/3 [00:02<00:00,  1.33it/s]
Generating Scenarios: 100%|██████████| 3/3 [00:17<00:00,  5.97s/it]
Generating Samples: 100%|██████████| 11/11 [00:07<00:00,  1.44it/s]



📊 Pipeline Results:
  📄 Documents Processed: 17
  🧠 Knowledge Graph Nodes: 17
  🧪 Test Cases Generated: 11
  🔧 Generation Method: knowledge_graph


In [9]:
import pandas as pd

# Preview the synthetic test set that the generation pipeline wrote to disk.
test_file_path = "data_for_eval/synthetic_tests_10.csv"

if not os.path.exists(test_file_path):
    # Nothing to show: the pipeline has not produced the CSV (yet).
    print("❌ Synthetic test file not found")
else:
    synthetic_tests_df = pd.read_csv(test_file_path)
    print("\n🧪 Synthetic Tests Sample:")
    # Rich display of the first few rows only, never the whole frame.
    display(synthetic_tests_df.head())


🧪 Synthetic Tests Sample:


Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,How do peple use OpenAI in their daily lives?,['NBER WORKING PAPER SERIES\nHOW PEOPLE USE CH...,The study documents the growth of ChatGPT’s co...,single_hop_specific_query_synthesizer
1,Who is Thomas Cunningham in relation to the st...,['ABSTRACT Despite the rapid adoption of LLM c...,Thomas Cunningham is associated with OpenAI an...,single_hop_specific_query_synthesizer
2,What does Roth (2025) report about ChatGPT usa...,['to classify messages without any human seein...,Roth (2025) reports that 28% of US adults used...,single_hop_specific_query_synthesizer
3,What trends in user behavior and message types...,"['<1-hop>\n\nHowever, in the first half of 202...","By June 2025, it was observed that the share o...",multi_hop_specific_query_synthesizer
4,What are the key privacy protections implement...,"['<1-hop>\n\nOverall, the majority of ChatGPT ...",The key privacy protections implemented in the...,multi_hop_specific_query_synthesizer
