In [None]:
# Goal: go through the list of files in the NewDocs folder and identify which files are parsable

In [1]:
import sys
from pathlib import Path

# Add parent directory to path to import app modules
sys.path.append(str(Path.cwd().parent))

from pypdf import PdfReader
import io
from typing import Dict, List
import pandas as pd

In [2]:
def analyze_pdf_file(file_path: Path) -> Dict:
    """
    Analyze a PDF file to determine if it's parsable and extract metadata.

    Failures (missing or unreadable file, PDF that cannot be opened, error
    during text extraction) are reported through the returned dictionary
    instead of being raised, so one bad file does not abort a batch run.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns a dictionary with:
    - file_name: Name of the file
    - file_size_kb: File size in KB, rounded to 2 decimals (0.0 if the size
      could not be read)
    - parsable: True only if text could be extracted from at least one page
    - total_pages: Number of pages in the PDF
    - pages_with_text: Number of pages that contain extractable text
    - text_extraction_rate: Percentage of pages with text
    - total_chars: Total characters extracted (pages with only whitespace
      are not counted)
    - error: Message of the exception that stopped the analysis, or None if
      no exception occurred. Note that it stays None for a PDF that opened
      correctly but had no extractable text (parsable is False in that case).
    """
    result = {
        "file_name": file_path.name,
        "file_size_kb": 0.0,
        "parsable": False,
        "total_pages": 0,
        "pages_with_text": 0,
        "text_extraction_rate": 0.0,
        "total_chars": 0,
        "error": None
    }

    try:
        # stat() is inside the try so that a missing/inaccessible file is
        # reported via result["error"] rather than raising to the caller.
        result["file_size_kb"] = round(file_path.stat().st_size / 1024, 2)

        # Read the whole file up front so the file handle is closed before parsing
        with open(file_path, "rb") as f:
            file_bytes = f.read()

        reader = PdfReader(io.BytesIO(file_bytes))
        result["total_pages"] = len(reader.pages)

        # Check each page for text content
        total_chars = 0
        pages_with_text = 0

        for page in reader.pages:
            # extract_text() may yield nothing for a page; normalize to ""
            page_text = page.extract_text() or ""
            if page_text.strip():
                pages_with_text += 1
                total_chars += len(page_text)

        result["pages_with_text"] = pages_with_text
        result["total_chars"] = total_chars

        if result["total_pages"] > 0:
            result["text_extraction_rate"] = round(
                (pages_with_text / result["total_pages"]) * 100, 2
            )

        # Consider a file parsable if we can extract text from at least some pages
        result["parsable"] = pages_with_text > 0

    except Exception as e:
        # Some exceptions carry an empty message; fall back to the exception
        # type name so a failure never looks like "no error".
        result["error"] = str(e) or type(e).__name__
        result["parsable"] = False

    return result

In [None]:
# Directory holding the PDFs to analyze. Note: the directory is named "sources",
# while the messages below refer to it as the NewDocs folder.
newdocs_path = Path.cwd().parent / "sources"

# Get all PDF files. Sorted because the order produced by glob() depends on
# the filesystem; a stable order keeps the analysis output reproducible.
pdf_files = sorted(newdocs_path.glob("*.pdf"))

print(f"Found {len(pdf_files)} PDF files in NewDocs folder")
print("Analyzing files...\n")

Found 16 PDF files in NewDocs folder
Analyzing files...



In [14]:
# Analyze all PDF files.
# The list is rebuilt from scratch so re-running this cell never duplicates rows.
results: List[Dict] = []

for pdf_file in pdf_files:
    print(f"Analyzing: {pdf_file.name}...")
    result = analyze_pdf_file(pdf_file)
    results.append(result)

    # Print quick status
    if result["parsable"]:
        print(f"  ✓ Parsable - {result['pages_with_text']}/{result['total_pages']} pages with text")
    else:
        # "error" is None when the PDF opened fine but no page yielded text,
        # so give a meaningful reason instead of printing "None".
        reason = result["error"] or "no extractable text found"
        print(f"  ✗ Not parsable - {reason}")
    print()

print("\nAnalysis complete!")

Analyzing: Banking_Act_Determination_No_1_of_2025.pdf...
  ✓ Parsable - 2/2 pages with text

Analyzing: Banking_Act_Determination_No_4_of_2024.pdf...
  ✓ Parsable - 7/7 pages with text

Analyzing: Banking_Act_Directions_No_1_of_2025.pdf...
  ✓ Parsable - 1/1 pages with text

Analyzing: Banking_Act_Directions_No_2_of_2025.pdf...
  ✓ Parsable - 5/5 pages with text

Analyzing: Banking_Act_Directions_No_3_of_2025.pdf...
  ✓ Parsable - 1/1 pages with text

Analyzing: Banking_Act_Directions_No_4_of_2025.pdf...
  ✓ Parsable - 1/1 pages with text

Analyzing: Banking_Act_Directions_No_5_of_2024 (1).pdf...
  ✓ Parsable - 46/47 pages with text

Analyzing: Banking_Act_Directions_No_5_of_2024.pdf...
  ✓ Parsable - 46/47 pages with text

Analyzing: Banking_Act_Directions_No_6_of_2024.pdf...
  ✓ Parsable - 1/1 pages with text

Analyzing: Banking_special_provisions_Act_Directions_No_1_of_2025.pdf...
  ✓ Parsable - 2/2 pages with text

Analyzing: bsd_circular_no_1_of_2025_e.pdf...
  ✓ Parsable - 3/3 pa

In [15]:
# Create a summary DataFrame (one row per analyzed file)
df = pd.DataFrame(results)

# Sort by parsable status and text extraction rate (best files first).
# Skipped when nothing was analyzed: an empty frame has no columns to sort on.
if not df.empty:
    df = df.sort_values(by=["parsable", "text_extraction_rate"], ascending=[False, False])

# Display the results
print("\n" + "="*80)
print("SUMMARY OF PARSABLE DOCUMENTS")
print("="*80 + "\n")

display(df)

# Summary statistics; guarded so an empty result set reports 0/0 (0.0%)
# instead of failing on a missing column or a division by zero.
total_count = len(df)
parsable_count = int(df["parsable"].sum()) if total_count else 0
parsable_pct = round(parsable_count / total_count * 100, 1) if total_count else 0.0

print(f"\n{'='*80}")
print(f"Parsable files: {parsable_count}/{total_count} ({parsable_pct}%)")
print(f"{'='*80}")


SUMMARY OF PARSABLE DOCUMENTS



Unnamed: 0,file_name,file_size_kb,parsable,total_pages,pages_with_text,text_extraction_rate,total_chars,error
0,Banking_Act_Determination_No_1_of_2025.pdf,342.11,True,2,2,100.0,2778,
1,Banking_Act_Determination_No_4_of_2024.pdf,626.82,True,7,7,100.0,10978,
2,Banking_Act_Directions_No_1_of_2025.pdf,595.43,True,1,1,100.0,1263,
3,Banking_Act_Directions_No_2_of_2025.pdf,1547.49,True,5,5,100.0,10786,
4,Banking_Act_Directions_No_3_of_2025.pdf,120.21,True,1,1,100.0,883,
5,Banking_Act_Directions_No_4_of_2025.pdf,422.96,True,1,1,100.0,578,
8,Banking_Act_Directions_No_6_of_2024.pdf,573.25,True,1,1,100.0,1036,
9,Banking_special_provisions_Act_Directions_No_1...,1472.11,True,2,2,100.0,3085,
10,bsd_circular_no_1_of_2025_e.pdf,325.53,True,3,3,100.0,3511,
11,bsd_circular_no_3_of_2024_e.pdf,2169.44,True,4,4,100.0,5670,



Parsable files: 16/16 (100.0%)


In [16]:
# Show detailed breakdown, split by whether text could be extracted
is_parsable = df["parsable"].astype(bool)

print("\nPARSABLE FILES:")
print("-" * 80)
parsable_df = df[is_parsable]
for _, row in parsable_df.iterrows():
    print(f"✓ {row['file_name']}")
    print(f"  Pages: {row['total_pages']} | Text extraction: {row['text_extraction_rate']}% | Size: {row['file_size_kb']} KB")
    print()

if (~is_parsable).any():
    print("\nNON-PARSABLE FILES:")
    print("-" * 80)
    non_parsable_df = df[~is_parsable]
    for _, row in non_parsable_df.iterrows():
        print(f"✗ {row['file_name']}")
        # "error" is empty for files that opened but had no extractable text;
        # show a meaningful reason instead of "None".
        reason = row["error"] if pd.notna(row["error"]) else "no extractable text found"
        print(f"  Error: {reason}")
        print()


PARSABLE FILES:
--------------------------------------------------------------------------------
✓ Banking_Act_Determination_No_1_of_2025.pdf
  Pages: 2 | Text extraction: 100.0% | Size: 342.11 KB

✓ Banking_Act_Determination_No_4_of_2024.pdf
  Pages: 7 | Text extraction: 100.0% | Size: 626.82 KB

✓ Banking_Act_Directions_No_1_of_2025.pdf
  Pages: 1 | Text extraction: 100.0% | Size: 595.43 KB

✓ Banking_Act_Directions_No_2_of_2025.pdf
  Pages: 5 | Text extraction: 100.0% | Size: 1547.49 KB

✓ Banking_Act_Directions_No_3_of_2025.pdf
  Pages: 1 | Text extraction: 100.0% | Size: 120.21 KB

✓ Banking_Act_Directions_No_4_of_2025.pdf
  Pages: 1 | Text extraction: 100.0% | Size: 422.96 KB

✓ Banking_Act_Directions_No_6_of_2024.pdf
  Pages: 1 | Text extraction: 100.0% | Size: 573.25 KB

✓ Banking_special_provisions_Act_Directions_No_1_of_2025.pdf
  Pages: 2 | Text extraction: 100.0% | Size: 1472.11 KB

✓ bsd_circular_no_1_of_2025_e.pdf
  Pages: 3 | Text extraction: 100.0% | Size: 325.53 KB

✓

In [17]:
# Optional: write the summary table to a CSV file in the parent of the
# current working directory so it can be consulted later (row index omitted)
output_file = Path.cwd().parent.joinpath("parsable_docs_analysis.csv")
df.to_csv(path_or_buf=output_file, index=False)
print(f"\nResults saved to: {output_file}")


Results saved to: d:\IronOne\Demo_Projects\rag-chatbot\parsable_docs_analysis.csv
