# In [None]:
import os
import base64
from pathlib import Path
from dotenv import load_dotenv
from mistralai import Mistral

# Optional rich rendering: Markdown/display exist only when IPython is
# installed; HAS_IPYTHON records whether that import succeeded.
try:
    from IPython.display import Markdown, display
except ImportError:
    HAS_IPYTHON = False
else:
    HAS_IPYTHON = True

# ---------------- Config ----------------
# load_dotenv() runs before the lookup so a key supplied through python-dotenv
# is visible in the environment; a missing or empty key aborts immediately.
load_dotenv()
api_key = os.environ.get("MISTRAL_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Missing MISTRAL_API_KEY")
# Single shared client used by the OCR helper below.
mistral_client = Mistral(api_key=api_key)

# ---------------- OCR Markdown ----------------
def get_ocr_markdown(pdf_path: Path) -> str:
    """Run Mistral OCR on a PDF and return its pages as one markdown string.

    The file is read in full, base64-encoded, and submitted inline as a
    ``data:application/pdf`` URL to the ``mistral-ocr-latest`` model through
    the module-level ``mistral_client``.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The ``markdown`` attribute of every page in the response, joined with
        a blank line between pages. An empty string when the response has no
        ``pages`` attribute or the list is empty.
    """
    with open(pdf_path, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()

    b64_payload = base64.b64encode(raw_bytes).decode()
    ocr_document = {
        "type": "document_url",
        "document_url": f"data:application/pdf;base64,{b64_payload}",
    }

    response = mistral_client.ocr.process(
        model="mistral-ocr-latest",
        document=ocr_document,
        include_image_base64=False,
    )

    # A response without a ``pages`` attribute is treated as zero pages.
    page_texts = []
    for ocr_page in getattr(response, "pages", []):
        page_texts.append(ocr_page.markdown)
    return "\n\n".join(page_texts)

# ---------------- Save markdown to text file ----------------
def save_markdown_to_txt(markdown_text: str, pdf_path: Path) -> Path:
    """Write OCR markdown to a ``.txt`` file next to the source PDF.

    Args:
        markdown_text: Text to store; written as UTF-8.
        pdf_path: Location of the source PDF. Its suffix is replaced with
            ``.txt`` to form the output path. A plain string or other
            path-like value is accepted as well as a ``Path``.

    Returns:
        The path of the text file that was written.
    """
    # Normalise first so string paths work with with_suffix().
    txt_file_path = Path(pdf_path).with_suffix(".txt")
    txt_file_path.write_text(markdown_text, encoding="utf-8")
    print(f"\n✅ Markdown saved to: {txt_file_path}")
    return txt_file_path

##########################################################################################################
# # ---------------- Main ----------------
# if __name__ == "__main__":
#     #pdf_path_str = input("Enter the PDF file path: ").strip().strip('"').strip("'")
#     pdf_path_str = r"D:\Zzz.PROJECTS\MOL\New Req- 2 - Bank statement\PDfs-Invoice-Frm Pallavi\RE_ MOL - BTS\1\anand san rent oct.pdf"

#     pdf_file = Path(os.path.normpath(pdf_path_str))

#     if not pdf_file.exists():
#         print(f"❌ File not found: {pdf_file}")
#         exit(1)

#     if pdf_file.suffix.lower() != ".pdf":
#         print(f"❌ Not a PDF file: {pdf_file}")
#         exit(1)

#     markdown = get_ocr_markdown(pdf_file)

#     print("\n========== OCR MARKDOWN OUTPUT ==========\n")

#     # ✅ Show nicely depending on environment
#     if HAS_IPYTHON:
#         display(Markdown(markdown))
#     else:
#         print(markdown)  # plain terminal mode

#     save_markdown_to_txt(markdown, pdf_file)

##########################################################################################################



#************************************************************************************
# import os
# from pathlib import Path

# if __name__ == "__main__":
#     # folder containing PDFs
#     #folder_path_str = r"D:\Zzz.PROJECTS\MOL\New Req- 2 - Bank statement\PDfs-Invoice-Frm Pallavi\RE_ MOL - BTS\1"
#     #folder_path_str = r"D:\Zzz.PROJECTS\MOL\New Req- 2 - Bank statement\PDfs-Invoice-Frm Pallavi\RE_ MOL - BTS\8"
#     #folder_path_str = r"D:\Zzz.PROJECTS\AGOS\P2 - AirWay Bill - DU"
#     folder_path_str = r"D:\Python Projects\6-RAG_on_Docs\Data\Z.Test-nw"
#     folder_path = Path(os.path.normpath(folder_path_str))

#     if not folder_path.exists():
#         print(f"❌ Folder not found: {folder_path}")
#         exit(1)

#     if not folder_path.is_dir():
#         print(f"❌ Not a folder: {folder_path}")
#         exit(1)

#     # find all PDFs in folder (non-recursive). Use rglob("*.pdf") for recursive search.
#     pdf_files = sorted(folder_path.glob("*.pdf"))

#     if not pdf_files:
#         print(f"⚠️ No PDF files found in {folder_path}")
#         exit(0)

#     print(f"📂 Found {len(pdf_files)} PDF files in: {folder_path}\n")

#     for pdf_file in pdf_files:
#         print(f"📄 Processing: {pdf_file.name}")

#         if pdf_file.suffix.lower() != ".pdf":
#             print(f"❌ Skipping non-PDF file: {pdf_file}")
#             continue

#         try:
#             markdown = get_ocr_markdown(pdf_file)

#             print("\n========== OCR MARKDOWN OUTPUT ==========\n")

#             # if HAS_IPYTHON:
#             #     display(Markdown(markdown))
#             # else:
#             #     print(markdown)

#             save_markdown_to_txt(markdown, pdf_file)
#             print(f"✅ Saved Markdown for: {pdf_file.name}\n")

#         except Exception as e:
#             print(f"❌ Error processing {pdf_file.name}: {e}\n")


#************************************************************************************

import os
from pathlib import Path

if __name__ == "__main__":
    # Batch driver: OCR every PDF found under one folder (subfolders
    # included) and save each result as a .txt file beside its source PDF.
    folder_path_str = r"D:\Zzz.PROJECTS\AGOS\P2 - AirWay Bill - DU\Files"
    folder_path = Path(os.path.normpath(folder_path_str))

    # SystemExit is raised directly instead of calling exit(): exit() is a
    # helper installed by the `site` module for interactive sessions and is
    # not guaranteed to exist (e.g. under `python -S`). The exit status is
    # unchanged.
    if not folder_path.exists():
        print(f"❌ Folder not found: {folder_path}")
        raise SystemExit(1)

    if not folder_path.is_dir():
        print(f"❌ Not a folder: {folder_path}")
        raise SystemExit(1)

    # find all PDFs in folder (recursive); sorted for a stable processing order
    pdf_files = sorted(folder_path.rglob("*.pdf"))

    if not pdf_files:
        print(f"⚠️ No PDF files found in {folder_path}")
        raise SystemExit(0)

    print(f"📂 Found {len(pdf_files)} PDF files (including subfolders) in: {folder_path}\n")

    for pdf_file in pdf_files:
        print(f"📄 Processing: {pdf_file}")

        # Defensive re-check of the extension on each match.
        if pdf_file.suffix.lower() != ".pdf":
            print(f"❌ Skipping non-PDF file: {pdf_file}")
            continue

        # One failing document must not abort the whole batch, so any error
        # is reported and the loop moves on to the next file.
        try:
            markdown = get_ocr_markdown(pdf_file)

            # NOTE(review): only the banner is printed; the markdown itself
            # is written to disk, not echoed to the console.
            print("\n========== OCR MARKDOWN OUTPUT ==========\n")

            save_markdown_to_txt(markdown, pdf_file)
            print(f"✅ Saved Markdown for: {pdf_file.name}\n")

        except Exception as e:
            print(f"❌ Error processing {pdf_file.name}: {e}\n")

