In [1]:
import os
import asyncio
import nest_asyncio
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably the project modules imported in later cells
# (config, pdf_summarizer, web_data_extractor_from_links) live there — confirm.
sys.path.append("../")

In [2]:
from config import config

In [3]:
from pdf_summarizer import PDFSummarizer
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from web_data_extractor_from_links import KnowledgeBaseUpdater

In [4]:
# webdata_updater = KnowledgeBaseUpdater(text_files_directory="../Data/TextFiles", pdf_files_directory="../Data/PDFFiles")
# webdata_updater.update(urls = ["https://academic.iiti.ac.in/"])

In [None]:
def _list_pdf_files(directory):
    """Return sorted full paths of the regular ``.pdf`` files directly inside ``directory``."""
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        # Case-insensitive extension match; isfile() excludes sub-directories
        # that merely happen to be named "*.pdf".
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(directory, name))
    ]


async def test_multiple_pdfs_summarizer_async():
    """
    Tests the asynchronous PDFSummarizer class with multiple PDFs concurrently.

    Steps:
      1. Build a ``ChatOllama`` model from ``config.MODEL``.
      2. Create a ``PDFSummarizer`` that writes into ``config.DATA.PDF_DATA_DIR``.
      3. Collect every ``.pdf`` file in that directory and schedule one
         ``summarize_pdf(..., save_as=True)`` call per file.
      4. Await all calls with ``asyncio.gather(..., return_exceptions=True)``
         and print a per-file report (error, summary preview, or "no content").

    Returns:
        None. All results are reported through ``print``; the function returns
        early if the model cannot be initialized or no PDF files are found.
    """
    import traceback  # local import: only needed for error reporting below

    print("--- Starting Multi-PDF Summarizer Test ---")

    try:
        llm_model = ChatOllama(model=config.MODEL.MODEL_CHOICE, temperature=config.MODEL.GENERATION_ARGS['temperature'])
        print(f"LLM Model initialized: {config.MODEL.MODEL_CHOICE}")
    except Exception as e:
        print(f"Error initializing ChatOllama: {e}")
        print("Please ensure Ollama server is running and the model is pulled ('ollama pull mistral').")
        return

    # Ensure the output directory exists for saving summaries
    os.makedirs(config.DATA.PDF_DATA_DIR, exist_ok=True)
    summarizer_instance = PDFSummarizer(model=llm_model, output_dir=config.DATA.PDF_DATA_DIR)
    print(f"PDFSummarizer initialized. Output directory: {config.DATA.PDF_DATA_DIR}")

    # The input directory doubles as the output directory, so it also holds the
    # generated ".txt" summaries. Only real PDF files may be queued, otherwise a
    # re-run would try to summarize earlier summaries.
    pdf_paths_to_summarize = _list_pdf_files(config.DATA.PDF_DATA_DIR)

    if not pdf_paths_to_summarize:
        print("\n[ERROR] No valid PDF files found to summarize. Please check paths.")
        return

    print(f"\nAttempting to summarize {len(pdf_paths_to_summarize)} PDF(s)...")

    summarization_tasks = []
    for pdf_path in pdf_paths_to_summarize:
        print(f"Adding task for: {os.path.basename(pdf_path)}")
        summarization_tasks.append(summarizer_instance.summarize_pdf(pdf_path, save_as=True))

    try:
        # asyncio.gather returns a list of results in the order the tasks were given
        results = await asyncio.gather(*summarization_tasks, return_exceptions=True)

        print("\n--- All Summarization Tasks Completed ---")
        for pdf_path, result in zip(pdf_paths_to_summarize, results):
            pdf_filename = os.path.basename(pdf_path)
            print(f"\n--- Result for {pdf_filename} ---")
            # BaseException (not Exception): with return_exceptions=True a
            # cancelled task yields asyncio.CancelledError, which is not an
            # Exception subclass and would otherwise be sliced like a string.
            if isinstance(result, BaseException):
                print(f"[ERROR] An error occurred: {result}")
                traceback.print_exception(type(result), result, result.__traceback__)  # Print full traceback for specific task error
            elif result:
                print(f"Summary generated (first 500 chars):\n{result[:500]}...")
                # The summary is expected next to the PDF as "<pdf stem>.txt".
                expected_output_path = os.path.join(
                    config.DATA.PDF_DATA_DIR,
                    f"{os.path.splitext(pdf_filename)[0]}.txt"
                )
                if os.path.exists(expected_output_path):
                    print(f"✅ Summary file saved to: {expected_output_path}")
                else:
                    print("❌ Summary file was NOT created.")
            else:
                print("Summarization returned no content.")

    except Exception as e:
        print(f"\n[CRITICAL ERROR] An unexpected error occurred during concurrent PDF summarization: {e}")
        traceback.print_exc()

    print("\n--- Multi-PDF Summarizer Test Finished ---")

# Execute the asynchronous test function.
# NOTE(review): nest_asyncio is imported in the first cell, but no
# nest_asyncio.apply() call is visible in this notebook. The top-level `await`
# below presumably relies on IPython/Jupyter's built-in autoawait support
# rather than on nest_asyncio — confirm.
await test_multiple_pdfs_summarizer_async()

--- Starting PDFSummarizer Test ---
LLM Model initialized: mistral
PDFSummarizer initialized. Output directory: ../Data/PDFFiles

Attempting to summarize PDF: ../Data/PDFFiles/academic_iiti_ac_in_Advt-GATB_2025.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


[ERROR] Failed to extract PDF: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Summarization returned no content (possibly due to error or empty PDF).

--- PDFSummarizer Test Finished ---
