In [1]:
import os

# Path to the folder containing .bin files
bin_folder_path = r"D:\Files_Extraction_py\P_Test_Files1\BIN_FILES"
html_output_folder = r"D:\Files_Extraction_py\P_Test_Files1\BIN_TO_HTML_output_files"
pdf_output_folder = r"D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files"

# Create output directories if they do not exist
os.makedirs(html_output_folder, exist_ok=True)
os.makedirs(pdf_output_folder, exist_ok=True)

# Loop through each .bin file in the specified folder
for file_name in os.listdir(bin_folder_path):
    if file_name.endswith(".bin"):
        bin_file_path = os.path.join(bin_folder_path, file_name)
        html_output_path = os.path.join(html_output_folder, f"{os.path.splitext(file_name)[0]}.html")
        pdf_output_path = os.path.join(pdf_output_folder, f"{os.path.splitext(file_name)[0]}.pdf")

        # Open the binary file and read its content
        try:
            with open(bin_file_path, "rb") as bin_file:
                # Read the content of the bin file
                bin_data = bin_file.read()

            # Attempt to decode the binary data
            try:
                decoded_text = bin_data.decode("utf-8")  # Try decoding as HTML first
                # If decoding succeeds, save as an HTML file
                with open(html_output_path, "w", encoding="utf-8") as html_file:
                    html_file.write(decoded_text)
                print(f"HTML file successfully created at {html_output_path}")

            except UnicodeDecodeError:
                print(f"Error: Could not decode binary data using utf-8 for {file_name}. Trying PDF.")

                # If decoding as HTML fails, assume it's a PDF and save it as is
                with open(pdf_output_path, "wb") as pdf_file:
                    pdf_file.write(bin_data)
                print(f"PDF file successfully created at {pdf_output_path}")

        except FileNotFoundError:
            print(f"Error: The file at path {bin_file_path} was not found.")
        except Exception as e:
            print(f"An error occurred while processing {file_name}: {e}")


Error: Could not decode binary data using utf-8 for lob10013616679892409327.bin. Trying PDF.
PDF file successfully created at D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10013616679892409327.pdf
Error: Could not decode binary data using utf-8 for lob10330880227198515982.bin. Trying PDF.
PDF file successfully created at D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10330880227198515982.pdf
Error: Could not decode binary data using utf-8 for lob10352851475717600885.bin. Trying PDF.
PDF file successfully created at D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10352851475717600885.pdf
Error: Could not decode binary data using utf-8 for lob10512230008315759877.bin. Trying PDF.
PDF file successfully created at D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10512230008315759877.pdf
Error: Could not decode binary data using utf-8 for lob10514136485234828392.bin. Trying PDF.
PDF file successfully created at D:\Files_Extra

In [20]:
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import os

# Specify the Tesseract-OCR path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Path to the folder containing PDF files
pdf_folder_path = r"D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files"
poppler_path = r"C:\Poppler\Release-24.08.0-0\poppler-24.08.0\Library\bin"

# Path to store the extracted text from PDFs
output_folder_path = r"D:\Files_Extraction_py\P_Test_Files1\Pdf_Text_Output"

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Function to extract text from image-based PDFs using OCR
def extract_text_from_image_pdf(pdf_path, poppler_path, output_folder):
    try:
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        extracted_text = ""
        
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            extracted_text += text
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(extracted_text)
        
        print(f"OCR text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} with OCR: {e}")

# Function to extract text from text-based PDFs
def extract_text_from_text_pdf(pdf_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)
        extracted_text = ""

        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            extracted_text += page.get_text("text")
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(extracted_text)

        print(f"Text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} as a text PDF: {e}")

# Function to decide whether to use OCR or direct text extraction
def process_pdf(pdf_path, poppler_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)

        # Check if the PDF has any text on the first page
        first_page_text = pdf_document[0].get_text("text")
        
        if first_page_text.strip():  # If there's text, treat as text-based PDF
            extract_text_from_text_pdf(pdf_path, output_folder)
        else:  # If no text is found, treat as image-based PDF
            extract_text_from_image_pdf(pdf_path, poppler_path, output_folder)

    except Exception as e:
        print(f"An error occurred while deciding how to process {pdf_path}: {e}")

# Loop through all PDF files in the folder and process them
for pdf_file in os.listdir(pdf_folder_path):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        process_pdf(pdf_path, poppler_path, output_folder_path)

Text extracted from D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10013616679892409327.pdf and saved to D:\Files_Extraction_py\P_Test_Files1\Pdf_Text_Output\lob10013616679892409327.txt
Text extracted from D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10330880227198515982.pdf and saved to D:\Files_Extraction_py\P_Test_Files1\Pdf_Text_Output\lob10330880227198515982.txt
Text extracted from D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10352851475717600885.pdf and saved to D:\Files_Extraction_py\P_Test_Files1\Pdf_Text_Output\lob10352851475717600885.txt
Text extracted from D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10512230008315759877.pdf and saved to D:\Files_Extraction_py\P_Test_Files1\Pdf_Text_Output\lob10512230008315759877.txt
Text extracted from D:\Files_Extraction_py\P_Test_Files1\BIN_TO_PDF_output_files\lob10514136485234828392.pdf and saved to D:\Files_Extraction_py\P_Test_Files1\Pdf_Text_Output\lob1051413648