In [4]:
# Example usage and pipeline creation
if __name__ == "__main__":
    """
    Example usage of the synthetic test generation components.
    """
    import os
    from dotenv import load_dotenv
    from haystack import Pipeline
    from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
    from scripts.knowledge_graph_component import KnowledgeGraphGenerator
    from scripts.synthetic_test_components import SyntheticTestGenerator, TestDatasetSaver
    
    # Load environment variables
    load_dotenv("./.env")
    
    # Example: Create a complete pipeline for synthetic test generation
    data_path = "data_for_indexing"
    
    if os.path.exists(data_path):
        print("Creating synthetic test generation pipeline...")
        
        # Load documents
        loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyMuPDFLoader)
        docs = loader.load()
        
        if docs:
            # Create pipeline components with corrected parameters
            kg_generator = KnowledgeGraphGenerator(apply_transforms=True)
            
            # Updated: Removed artificial size limits - now generates requested 10 tests
            test_generator = SyntheticTestGenerator(
                testset_size=10,  # This will now generate 10 tests instead of 3
                llm_model="gpt-4o-mini",
                query_distribution=[
                    ("single_hop", 0.25), 
                    ("multi_hop_specific", 0.25),
                    ("multi_hop_abstract", 0.5)
                ],
                # Optional: Add max_testset_size=5 if you want to limit due to API constraints
                # max_testset_size=5  # Uncomment this line if you experience API timeouts
            )
            test_saver = TestDatasetSaver("data_for_eval/synthetic_tests_10.csv")
            
            # Create pipeline
            pipeline = Pipeline()
            pipeline.add_component("kg_generator", kg_generator)
            pipeline.add_component("test_generator", test_generator)
            pipeline.add_component("test_saver", test_saver)
            
            # Connect components
            pipeline.connect("kg_generator.knowledge_graph", "test_generator.knowledge_graph")
            pipeline.connect("test_generator.testset", "test_saver.testset")
            
            # Check environment first
            print("Checking environment setup...")
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("❌ Error: OPENAI_API_KEY not found in environment variables.")
                print("Please set your OpenAI API key in the .env file.")
                exit(1)
            else:
                print("✅ OPENAI_API_KEY found")
            
            # Run pipeline
            try:
                print("Running pipeline...")
                result = pipeline.run({
                    "kg_generator": {"documents": docs},
                    "test_generator": {"documents": docs}
                })
                
                print("\n📊 Pipeline Results:")
                print(f"  🧠 Knowledge Graph Nodes: {result['kg_generator']['node_count']}")
                print(f"  🧪 Test Cases Generated: {result['test_generator']['testset_size']}")
                print(f"  🔧 Generation Method: {result['test_generator']['generation_method']}")
                
                if result['test_generator']['success']:
                    print(f"  💾 Saved to: {result['test_saver']['saved_path']}")
                    print(f"  ✅ Save Success: {result['test_saver']['success']}")
                    print(f"  📊 Rows Saved: {result['test_saver']['row_count']}")
                    
                    # Safely access testset - check if it exists in the result
                    if 'testset' in result['test_generator']:
                        testset_df = result['test_generator']['testset']
                        print(f"\n📋 Sample Questions (showing 3 of {len(testset_df)}):")
                        for i, row in testset_df.head(3).iterrows():
                            print(f"  Q{i+1}: {row.get('question', 'N/A')}")
                            print(f"     A: {row.get('ground_truth', row.get('answer', 'N/A'))[:100]}...")
                            print()
                    else:
                        print(f"\n📋 Test cases successfully generated and saved!")
                        print(f"     Check the saved file: {result['test_saver']['saved_path']}")
                        # Try to read the saved file to display sample questions
                        try:
                            import pandas as pd
                            saved_df = pd.read_csv(result['test_saver']['saved_path'])
                            print(f"\n📋 Sample Questions from saved file (showing 3 of {len(saved_df)}):")
                            for i, row in saved_df.head(3).iterrows():
                                print(f"  Q{i+1}: {row.get('question', 'N/A')}")
                                print(f"     A: {row.get('ground_truth', row.get('answer', 'N/A'))[:100]}...")
                                print()
                        except Exception as read_error:
                            print(f"     Could not read saved file: {read_error}")
                else:
                    print(f"  ❌ Test generation failed: {result['test_generator']['generation_method']}")
                    if 'connection' in result['test_generator']['generation_method']:
                        print("  💡 This appears to be a connection issue. Please check:")
                        print("     - Internet connection")
                        print("     - OpenAI API key validity")
                        print("     - OpenAI API quota/billing")
                        print("  💡 Try adding max_testset_size=3 to the SyntheticTestGenerator if API timeouts occur")
                
            except Exception as e:
                print(f"❌ Pipeline execution failed: {e}")
                print(f"Error type: {type(e).__name__}")
                import traceback
                traceback.print_exc()
        else:
            print(f"No documents found in {data_path}")
    else:
        print(f"Data path {data_path} not found")

Creating synthetic test generation pipeline...
Checking environment setup...
✅ OPENAI_API_KEY found
Running pipeline...


Applying HeadlinesExtractor: 100%|██████████| 21/21 [00:10<00:00,  2.02it/s]
Applying HeadlineSplitter: 100%|██████████| 64/64 [00:00<00:00, 3546.70it/s]
Applying SummaryExtractor:  57%|█████▋    | 20/35 [00:12<00:06,  2.34it/s]Property 'summary' already exists in node 'b813d7'. Skipping!
Applying SummaryExtractor:  63%|██████▎   | 22/35 [00:12<00:05,  2.55it/s]Property 'summary' already exists in node '372af7'. Skipping!
Applying SummaryExtractor:  66%|██████▌   | 23/35 [00:13<00:04,  2.50it/s]Property 'summary' already exists in node '1d93b5'. Skipping!
Property 'summary' already exists in node '02a20b'. Skipping!
Applying SummaryExtractor:  69%|██████▊   | 24/35 [00:14<00:05,  2.11it/s]Property 'summary' already exists in node 'ae99e4'. Skipping!
Property 'summary' already exists in node '695b25'. Skipping!
Applying SummaryExtractor:  74%|███████▍  | 26/35 [00:15<00:04,  2.06it/s]Property 'summary' already exists in node '693753'. Skipping!
Property 'summary' already exists in node 


📊 Pipeline Results:
  🧠 Knowledge Graph Nodes: 64
  🧪 Test Cases Generated: 10
  🔧 Generation Method: knowledge_graph
  💾 Saved to: data_for_eval/synthetic_tests_10.csv
  ✅ Save Success: True
  📊 Rows Saved: 10

📋 Test cases successfully generated and saved!
     Check the saved file: data_for_eval/synthetic_tests_10.csv

📋 Sample Questions from saved file (showing 3 of 10):
  Q1: N/A
     A: N/A...

  Q2: N/A
     A: N/A...

  Q3: N/A
     A: N/A...



In [None]:
# Alternative: Test the components individually to understand the data flow
print("🧪 Testing individual components...")
print("=" * 50)

import os
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from scripts.knowledge_graph_component import KnowledgeGraphGenerator
from scripts.synthetic_test_components import SyntheticTestGenerator, TestDatasetSaver

load_dotenv("./.env")

# Test with a small sample
data_path = "data_for_indexing"
if os.path.exists(data_path):
    # Load just one document for testing
    loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyMuPDFLoader)
    docs = loader.load()
    
    if docs:
        print(f"✅ Loaded {len(docs)} documents")
        
        # Test individual components
        print("\n1. Testing KnowledgeGraphGenerator...")
        kg_generator = KnowledgeGraphGenerator(apply_transforms=False)  # Skip transforms for speed
        kg_result = kg_generator.run(documents=docs[:1])  # Use just first document
        print(f"   KG Nodes: {kg_result['node_count']}")
        
        print("\n2. Testing SyntheticTestGenerator...")
        test_generator = SyntheticTestGenerator(
            testset_size=3,  # Small number for testing
            llm_model="gpt-4o-mini",
            max_testset_size=2  # Even smaller limit for testing
        )
        
        try:
            test_result = test_generator.run(
                documents=docs[:1], 
                knowledge_graph=kg_result['knowledge_graph']
            )
            
            print(f"   Generation success: {test_result['success']}")
            print(f"   Test cases: {test_result['testset_size']}")
            print(f"   Method: {test_result['generation_method']}")
            
            # Check what's actually in the result
            print(f"   Result keys: {list(test_result.keys())}")
            
            if test_result['success'] and 'testset' in test_result:
                testset_df = test_result['testset']
                print(f"   Testset type: {type(testset_df)}")
                print(f"   Testset shape: {testset_df.shape if hasattr(testset_df, 'shape') else 'No shape'}")
                if hasattr(testset_df, 'columns'):
                    print(f"   Testset columns: {list(testset_df.columns)}")
                    
                    # Show first question
                    if len(testset_df) > 0:
                        first_row = testset_df.iloc[0]
                        print(f"\n   📝 Sample Question:")
                        print(f"      Q: {first_row.get('question', 'N/A')}")
                        print(f"      A: {first_row.get('ground_truth', first_row.get('answer', 'N/A'))[:100]}...")
            
        except Exception as e:
            print(f"   ❌ Test generation failed: {e}")
            print(f"   Error type: {type(e).__name__}")
    else:
        print("❌ No documents found")
else:
    print("❌ Data path not found")

print("\n" + "=" * 50)
print("💡 This test helps identify where the issue occurs in the pipeline.")