# In [3]:
import os
import time
import pdfplumber
from PyPDF2 import PdfReader
import pandas as pd

# In [4]:
def extract_text_from_pdfs_with_progress(folder_path):
    """
    Extract text from all PDF files in the given folder with a timer and progress tracking.

    Files are selected by a case-insensitive ``.pdf`` extension and processed
    in sorted name order. The text of each file is the concatenation of the
    text of its pages; a page for which ``extract_text()`` yields nothing
    contributes an empty string. Progress is printed after every file.

    Parameters:
        folder_path (str): Path to the folder containing PDF files.

    Returns:
        dict: A dictionary where keys are file names and values are extracted text.
            Empty when the folder contains no PDF files.
    """
    pdf_texts = {}
    # Match the extension case-insensitively so "REPORT.PDF" is not skipped;
    # sort so the processing order does not depend on os.listdir ordering.
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')
    )
    total_files = len(pdf_files)

    if not pdf_files:
        print("No PDF files found in the specified folder.")
        return pdf_texts

    start_time = time.time()
    print(f"Processing {total_files} PDF files...")

    for idx, file_name in enumerate(pdf_files, start=1):
        file_path = os.path.join(folder_path, file_name)
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for a page without extractable
            # text (e.g. a scanned image); substitute '' so join() cannot fail.
            text = ''.join(page.extract_text() or '' for page in pdf.pages)
        pdf_texts[file_name] = text

        # Calculate and display progress
        progress = (idx / total_files) * 100
        elapsed_time = time.time() - start_time
        print(f"[{progress:.2f}%] Completed {idx}/{total_files} files. Elapsed time: {elapsed_time:.2f}s")

    print("All PDF files processed successfully.")
    return pdf_texts

# In [None]:
def create_dataframe(pdf_data):
    """
    Build a DataFrame of PDF texts with an added ``text_length`` column.

    Parameters:
        pdf_data: Either a dictionary mapping file names to extracted text
            (one string per file), or any input accepted by ``pd.DataFrame``
            that yields a ``text`` column (e.g. a list of records or a dict
            of columns).

    Returns:
        pandas.DataFrame: For a file-name-to-text dictionary, one row per
            file with columns ``file_name``, ``text`` and ``text_length``.
            Otherwise the frame built from ``pdf_data`` plus ``text_length``.

    Raises:
        KeyError: If the resulting frame has no ``text`` column.
    """
    is_name_to_text = isinstance(pdf_data, dict) and all(
        isinstance(value, str) for value in pdf_data.values()
    )
    if is_name_to_text:
        # A dict of scalar strings cannot be passed to pd.DataFrame directly
        # (it raises ValueError), so expand it into one row per file.
        df = pd.DataFrame(list(pdf_data.items()), columns=['file_name', 'text'])
    else:
        df = pd.DataFrame(pdf_data)
    df['text_length'] = df['text'].apply(len)
    return df