In [1]:
import glob
import os
import pathlib
from typing import List
import pandas as pd
from tqdm.auto import tqdm
from utils.pdf_to_json_row import pdf_to_dataframe
from utils.landscape_word_doc import row_to_landscape_doc
import warnings

# Batch-convert every PDF in PDF_DIR: each file is turned into a table by
# pdf_to_dataframe, written out as a landscape Word summary, and finally all
# tables are stacked into one master Excel/CSV pair inside OUT_DIR.

# NOTE(review): this silences every warning category from every module for
# the rest of the session, which can hide real problems (e.g. pandas
# deprecations). Consider narrowing the filter once the noisy source is known.
warnings.filterwarnings("ignore")   # hides every category, every module

PDF_DIR = "pdfs"
OUT_DIR = "processed_documents"

out_dir = pathlib.Path(OUT_DIR)
# Create the output folder up front; otherwise the Excel/CSV writes below
# fail on a fresh checkout where the directory does not exist yet.
out_dir.mkdir(parents=True, exist_ok=True)

# pd.concat below only accepts pandas objects, so these are the per-PDF
# results of pdf_to_dataframe (presumably DataFrames — confirm in utils).
records: List[pd.DataFrame] = []

# Sort so the processing order (and thus the row order of the master
# table) is reproducible; glob order is filesystem-dependent.
pdfs = sorted(str(p) for p in pathlib.Path(PDF_DIR).glob("*.pdf"))

for pdf_path in tqdm(pdfs, total=len(pdfs), desc="Processing PDFs"):
    print(f"➜  processing {pdf_path}")
    df = pdf_to_dataframe(pdf_path)

    records.append(df)

    # One Word summary per input file: <pdf stem>_summary.docx in OUT_DIR.
    doc_name = out_dir / f"{pathlib.Path(pdf_path).stem}_summary.docx"

    row_to_landscape_doc(df, doc_name)
    print(f"    ↳ Word: {doc_name}")

if records:
    master = pd.concat(records, ignore_index=True)

    master.to_excel(out_dir / "all_summaries.xlsx", index=False)
    master.to_csv(out_dir / "all_summaries.csv", index=False)

    print("\n✅  Done — individual .docx files and a master Excel/CSV saved.")
else:
    # pd.concat raises ValueError on an empty list; report the real cause
    # instead of crashing with "No objects to concatenate".
    print(f"\n⚠️  No PDF files found in {PDF_DIR!r}; nothing was written.")

Processing PDFs:   0%|          | 0/2 [00:00<?, ?it/s]

➜  processing pdfs/pku.pdf
    ↳ Word: processed_documents/pku_summary.docx
➜  processing pdfs/aec.pdf
    ↳ Word: processed_documents/aec_summary.docx

✅  Done — individual .docx files and a master Excel/CSV saved.
