In [1]:
import os
import fitz  # PyMuPDF
from typing import Union
import threading
import pandas as pd
import random
import numpy as np
import cv2

from typing import Dict, List, Tuple
from docsumo_image_util.parse.ocr.google import read_data, read_everything
from docsumo_image_util.parse.pdf2img import PdfImages
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()
# Create a lock for token updates
token_lock = threading.Lock()
# Machine-specific default location of the Google service-account key file.
# setdefault() only applies it when the variable is not already set, so a value
# supplied by the shell or by the .env file loaded above is no longer clobbered.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/media/veracrypt1/GAC.json")
# from app.config import config_by_name




In [5]:
# Root directory where all folders are located
base_dir = os.getcwd()

# Folders to process
folders = ['Invoice', 'Form 1040', 'Form 1040 A', 'Form 1040 B', 'Form 1040 C', 'Form 1040 D', 'Form 1040 E', 'W9', 'Acord 25']
page_counts = {}  # "<folder>/<filename>" -> number of pages

# Record the page count of every PDF found directly inside each folder.
for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        print(f"[WARNING] Folder not found: {folder_path}")
        continue

    # os.listdir() returns entries in arbitrary order; sorting keeps the log
    # and the summary below stable from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(folder_path, filename)
        key = f"{folder}/{filename}"
        try:
            # The context manager closes the document even if reading it fails.
            with fitz.open(file_path) as doc:
                num_pages = len(doc)
        except Exception as e:
            print(f"[ERROR] Failed to open {file_path}: {e}")
            page_counts[key] = 1  # fallback: unreadable files are counted as one page
            continue

        page_counts[key] = num_pages
        print(f"[INFO] {key}: {num_pages} pages")

# Optional: print the summary
print("\n=== Summary ===")
for key, count in page_counts.items():
    print(f"{key}: {count} pages")


[INFO] Invoice/page_19_invoice_10_10.pdf: 1 pages
[INFO] Invoice/page_20_invoice_7_7.pdf: 1 pages
[INFO] Invoice/page_19_invoice_3_3.pdf: 1 pages
[INFO] Invoice/page_20_invoice_17_17.pdf: 1 pages
[INFO] Invoice/page_19_invoice_17_17.pdf: 1 pages
[INFO] Form 1040/document (25)_2_3.pdf: 2 pages
[INFO] Form 1040/2022 UNFILED tax return_2_3.pdf: 2 pages
[INFO] Form 1040/form_1040.pdf: 2 pages
[INFO] Form 1040/sample_form_1040.pdf: 2 pages
[INFO] Form 1040/foxtheresa_22i_FC(1)_10_11.pdf: 2 pages
[INFO] Form 1040 A/Heisey, Matthew D_7_7.pdf: 1 pages
[INFO] Form 1040 A/document (3)_9_9.pdf: 1 pages
[INFO] Form 1040 A/Kennedy_15_15.pdf: 1 pages
[INFO] Form 1040 A/2020 Tax Return_9_9.pdf: 1 pages
[INFO] Form 1040 A/Reyes 1040 2021_3_3.pdf: 1 pages
[INFO] Form 1040 B/sample_form_1040_schedule_b.pdf: 1 pages
[INFO] Form 1040 B/2022 Tax Return Documents (Warner Theodore L and J - Client Copy) (3)_8_8.pdf: 1 pages
[INFO] Form 1040 B/RJ_Roberts_1040_2021 taxes_13_13.pdf: 1 pages
[INFO] Form 1040 B/D

In [6]:
# === CONFIGURATION ===
base_dir = os.getcwd()
min_pages = 150
max_pages = 200
num_merged_files = 20
output_dir = os.path.join(base_dir, "test_real")  # Save all outputs here

# Create output folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# === STEP 1: Build a pool of all PDFs with page counts ===
pdf_pool = []  # List of tuples: (doc_type, filepath, page_count)

for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        print(f"[WARNING] Folder not found: {folder_path}")
        continue

    # Sorted so the pool has a stable order before it is shuffled below.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            # The context manager closes the document even if reading it fails.
            with fitz.open(file_path) as doc:
                page_count = len(doc)
        except Exception as e:
            print(f"[ERROR] Failed to open {file_path}: {e}")
            continue

        # The folder name is stored as the document-type label of the file.
        pdf_pool.append((folder, file_path, page_count))

print(f"[INFO] Total available PDFs: {len(pdf_pool)}")

# === STEP 2: Generate merged PDFs ===
# NOTE(review): no random seed is set in this cell, so every run produces a
# different selection of documents.
for file_index in range(1, num_merged_files + 1):
    print(f"\n=== [MERGE {file_index}] Creating merged PDF... ===")
    selected_docs = []
    total_pages = 0

    # Pick random target page count between min and max
    target_pages = random.randint(min_pages, max_pages)
    print(f"[INFO] Targeting {target_pages} pages for merged file #{file_index}")

    shuffled_pool = pdf_pool[:]
    random.shuffle(shuffled_pool)

    # Sweep the shuffled pool repeatedly, taking every document that still fits
    # under the target. A document can be picked once per sweep, so the same
    # file may appear several times in one merged PDF.
    for _ in range(len(shuffled_pool) * 2):
        if total_pages >= target_pages:
            break

        picked_before = len(selected_docs)
        for item in shuffled_pool:
            doc_type, path, pages = item
            if total_pages + pages > target_pages:
                continue
            selected_docs.append(item)
            total_pages += pages
            if total_pages >= target_pages:
                break

        # The pool order is fixed, so a sweep that added nothing means no later
        # sweep can add anything either.
        if len(selected_docs) == picked_before:
            break

    if not selected_docs:
        print(f"[ERROR] Could not create merged file #{file_index}. Skipping.")
        continue

    expanded_lines = []  # one document-type label per merged page
    merge_log = []       # human-readable record of which file landed on which pages
    current_page = 0     # number of pages labelled so far

    merged_pdf_name = os.path.join(output_dir, f"merged_output_{file_index}.pdf")
    merged_txt_name = os.path.join(output_dir, f"merged_output_{file_index}.txt")
    merged_log_name = os.path.join(output_dir, f"merged_output_{file_index}_log.txt")

    # Both the output and every input document are closed by their context
    # managers, including when an insert or the save fails.
    with fitz.open() as output_pdf:
        for i, (doc_type, file_path, pages) in enumerate(selected_docs):
            try:
                with fitz.open(file_path) as input_pdf:
                    output_pdf.insert_pdf(input_pdf)
            except Exception as e:
                print(f"[ERROR] Failed to process {file_path}: {e}")
                continue

            expanded_lines.extend([doc_type] * pages)

            start_page = current_page + 1
            end_page = current_page + pages
            page_range = f"{start_page}-{end_page}" if pages > 1 else f"{start_page}"
            merge_log.append(f"Entry {i+1}: {os.path.basename(file_path)} ({doc_type}) → PDF pages {page_range}")
            current_page += pages

        # Report what was really merged rather than what was planned: inputs
        # that failed above contribute to total_pages but not to the output.
        actual_pages = len(output_pdf)
        if actual_pages == 0:
            # Nothing could be inserted, so there is no PDF worth saving.
            print(f"[ERROR] No pages could be merged for file #{file_index}. Skipping.")
            continue
        if actual_pages != current_page:
            print(f"[WARNING] Merged file #{file_index} has {actual_pages} pages but {current_page} labels were recorded")

        # === STEP 3: Save output files ===
        output_pdf.save(merged_pdf_name)

    # Explicit UTF-8: the log contains "→", which the platform default encoding
    # cannot represent on some systems.
    with open(merged_txt_name, "w", encoding="utf-8") as f:
        f.write("\n".join(expanded_lines))

    with open(merged_log_name, "w", encoding="utf-8") as log:
        log.write("\n".join(merge_log))
        log.write(f"\n\nTotal pages in merged PDF: {actual_pages}")

    print(f"[INFO] Saved: {merged_pdf_name} ({actual_pages} pages)")
    print(f"[INFO] Saved: {merged_txt_name}")
    print(f"[INFO] Saved: {merged_log_name}")


[INFO] Total available PDFs: 44

=== [MERGE 1] Creating merged PDF... ===
[INFO] Targeting 168 pages for merged file #1
[INFO] Saved: /media/veracrypt1/batching_and_comparison/ai_split_optimization/test_real/merged_output_1.pdf (168 pages)
[INFO] Saved: /media/veracrypt1/batching_and_comparison/ai_split_optimization/test_real/merged_output_1.txt
[INFO] Saved: /media/veracrypt1/batching_and_comparison/ai_split_optimization/test_real/merged_output_1_log.txt

=== [MERGE 2] Creating merged PDF... ===
[INFO] Targeting 177 pages for merged file #2
[INFO] Saved: /media/veracrypt1/batching_and_comparison/ai_split_optimization/test_real/merged_output_2.pdf (177 pages)
[INFO] Saved: /media/veracrypt1/batching_and_comparison/ai_split_optimization/test_real/merged_output_2.txt
[INFO] Saved: /media/veracrypt1/batching_and_comparison/ai_split_optimization/test_real/merged_output_2_log.txt

=== [MERGE 3] Creating merged PDF... ===
[INFO] Targeting 187 pages for merged file #3
[INFO] Saved: /media/ver

### The cells below are only needed for OCR parsing; you can skip them otherwise

In [None]:
def get_google_ocr_raw_data(file_path: str) -> Tuple[List[np.ndarray], List[pd.DataFrame]]:
    """Run OCR on every page of a PDF, or on a single image file.

    Args:
        file_path: Path to the input. A path ending in ".pdf" (any letter case)
            is expanded into pages with ``PdfImages``; anything else is loaded
            as one image with ``cv2.imread``.

    Returns:
        ``(image_list, df_list)``: two parallel lists with one entry per page,
        holding the image and the first DataFrame returned by
        ``read_everything`` for that page.

    Raises:
        ValueError: If a non-PDF path cannot be read as an image.
    """
    if file_path.lower().endswith(".pdf"):
        images = PdfImages(file_path)
    else:
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None rather
            # than raising; fail here with a clear message instead of handing
            # None to the OCR call.
            raise ValueError(f"Could not read image file: {file_path}")
        images = [image]

    image_list = []
    df_list = []
    for image in images:
        # read_everything also returns a second DataFrame and an angle, which
        # this function does not use.
        (df, _), (ocr_image, _) = read_everything(image)
        df_list.append(df)
        image_list.append(ocr_image)

    return image_list, df_list

# Run OCR on one document; replace the placeholder with a real PDF or image path.
ocr_result = get_google_ocr_raw_data("path_to_one_of_your_pdf_file")
image_list, df_list = ocr_result

[32m2025-05-01 12:07:04.402[0m | [1mINFO    [0m | [36mdocsumo_image_util.parse.ocr.google[0m:[36mread_everything[0m:[36m345[0m - [1mReading data using Google OCR[0m
[32m2025-05-01 12:07:04.403[0m | [1mINFO    [0m | [36mdocsumo_image_util.parse.ocr.google[0m:[36mread_raw[0m:[36m116[0m - [1mUsing Google OCR provider[0m
[32m2025-05-01 12:07:06.665[0m | [1mINFO    [0m | [36mdocsumo_image_util.parse.ocr.google[0m:[36mread_everything[0m:[36m348[0m - [1mParsing responses with four points[0m
[32m2025-05-01 12:07:06.847[0m | [1mINFO    [0m | [36mdocsumo_image_util.parse.ocr.google[0m:[36mread_everything[0m:[36m356[0m - [1m0.0[0m
[32m2025-05-01 12:07:06.848[0m | [1mINFO    [0m | [36mdocsumo_image_util.parse.ocr.google[0m:[36mread_everything[0m:[36m362[0m - [1mUsing 4-point rotation transformation.[0m
[32m2025-05-01 12:07:06.914[0m | [1mINFO    [0m | [36mdocsumo_image_util.parse.ocr.google[0m:[36mread_everything[0m:[36m345[0m -

In [40]:
# Stamp each per-page frame with its zero-based position in df_list.
# This writes the 'page' column in place on the frames held by df_list.
for page_number, page_df in enumerate(df_list):
    page_df['page'] = page_number  # use page_number + 1 for one-based numbering


In [44]:
# Stack all per-page frames into one table with a fresh 0..n-1 index.
final_df = pd.concat(df_list, ignore_index=True)


In [45]:
# Show final_df as the cell output.
final_df

Unnamed: 0,index_sort,page,block,line,x0,y0,x2,y2,Text,space_type,confidence
0,0,0,0,0,64,71,238,105,ACORD,5,0.989440
1,1,0,1,0,237,71,251,83,Ⓡ,5,0.431050
2,2,0,2,0,453,84,709,114,CERTIFICATE,1,0.993090
3,3,0,2,0,722,84,774,114,OF,1,0.979886
4,4,0,2,0,787,84,971,114,LIABILITY,1,0.990620
...,...,...,...,...,...,...,...,...,...,...,...
106663,1122,174,45,5,883,2073,930,2090,later.,5,0.973015
106664,1123,174,46,0,1371,2116,1415,2138,Form,1,0.977495
106665,1126,174,46,0,1421,2116,1474,2138,W-9,1,0.965322
106666,1128,174,46,0,1480,2117,1526,2138,(Rev.,1,0.973602
