In [3]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import os
 
# Directory containing the files
directory_path = "D:/HTML_Files_Ext"
 
# List to store data from each HTML file
data_list = []
 
# Loop through all files in the directory
for filename in os.listdir(directory_path):
    # Check if the file has an .html extension
    if filename.endswith(".html"):
        file_path = os.path.join(directory_path, filename)
        # Load and parse the HTML file
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")
        # Extract required fields
        patient = soup.find("div", id="patName").get_text(strip=True).replace("Patient: ", "")
        location = soup.find("span", id="locNameVal").get_text(strip=True)
        dob = soup.find("span", id="dobVal").get_text(strip=True)
        attending_physician = soup.find("span", id="attendingVal").get_text(strip=True)
        mrn = soup.find("span", id="mrnVal").get_text(strip=True)
        date = soup.find("span", id="visitDateVal").get_text(strip=True)
        reasons_for_visit = soup.find("span", id="visitReasonVal").get_text(strip=True)
 
        # Define sections with start and end labels
        sections = {
            "Allergies as of Visit Date": ("Allergies as of visit date", "Special Considerations"),
            "Special Considerations": ("Special Considerations", "GYN Dx"),
            "GYN Dx": ("GYN Dx", "Hem/Onc Diagnosis"),
            "Hem/Onc Diagnosis": ("Hem/Onc Diagnosis", "H&P"),
            "H&P": ("H&P", "Signs"),
            "Signs": ("Signs", "Laboratory"),
            "Laboratory": ("Laboratory", "Outside Rx"),
            "Outside Rx": ("Outside Rx", "Medications"),
            "Medications": ("Medications", "Therapy note"),
            "Therapy note": ("Therapy note", "Diagnostic Imaging"),
            "Diagnostic Imaging": ("Diagnostic Imaging", "Plan"),
            "Plan": ("Plan", "Text Note"),
            "Text Note": ("Text Note", None)  # No end tag after 'Text Note'
        }
 
        # Dictionary to hold extracted section texts
        section_data = {}
 
        # Loop over the defined sections
        for section_name, (start_label, end_label) in sections.items():
            start_section = soup.find("span", string=re.compile(re.escape(start_label)))
            end_section = soup.find("span", string=re.compile(re.escape(end_label))) if end_label else None
            # Extract text between the start and end sections
            section_text = ""
            if start_section:
                current_element = start_section.find_next("div")
                while current_element and current_element != end_section:
                    section_text += current_element.get_text(" ", strip=True)
                    current_element = current_element.find_next_sibling()
            section_data[section_name] = section_text if section_text else "Not Found"
        # Append the data to the list
        data_list.append({
            "Patient": patient,
            "Location": location,
            "DOB": dob,
            "Attending Physician": attending_physician,
            "MRN": mrn,
            "Date": date,
            "Reasons for Visit": reasons_for_visit,
            **section_data  # Expand the section data into columns
        })
 
# Convert the list of data to a DataFrame
df = pd.DataFrame(data_list)
 
# Preview the DataFrame to verify the results
print("DataFrame preview:")
print(df)
 
# Save the DataFrame to an Excel file
#output_path = "D:/HTML_Files_Ext/patient_data_combined_with_all_sections2.xlsx"
#df.to_excel(output_path, index=False)
#print(f"Data has been saved to {output_path}")

DataFrame preview:
           Patient                                   Location         DOB  \
0  YVETTE A. LOPEZ  GYN-Medical Center San Antonio (Inactive)  09/16/1963   
1  YVETTE A. LOPEZ  GYN-Medical Center San Antonio (Inactive)  09/16/1963   

          Attending Physician       MRN        Date Reasons for Visit  \
0  Antonio Santillan-Gomez MD  02008593  06/28/2016  FLWP ( 4 MO FU )   
1  Antonio Santillan-Gomez MD  02008593  10/06/2016        Data entry   

                          Allergies as of Visit Date  \
0  Vicodin, po solid (hydrocodone/acetaminophen),...   
1  Vicodin, po solid (hydrocodone/acetaminophen),...   

                              Special Considerations  \
0  Varicella status: Chicken pox: yes, Varicella ...   
1  Varicella status: Chicken pox: yes, Varicella ...   

                                              GYN Dx  \
0  Primary gyn diagnosis for this visit: Endometr...   
1                                          Not Found   

                      

In [15]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import os
import json

# Directory containing the files
directory_path = "D:/HTML_Files_Ext"

# List to store data from each HTML file
data_list = []

# Loop through all files in the directory
for filename in os.listdir(directory_path):
    # Check if the file has an .html extension
    if filename.endswith(".html"):
        file_path = os.path.join(directory_path, filename)
        # Load and parse the HTML file
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")
        # Extract required fields
        patient = soup.find("div", id="patName").get_text(strip=True).replace("Patient: ", "")
        location = soup.find("span", id="locNameVal").get_text(strip=True)
        dob = soup.find("span", id="dobVal").get_text(strip=True)
        attending_physician = soup.find("span", id="attendingVal").get_text(strip=True)
        mrn = soup.find("span", id="mrnVal").get_text(strip=True)
        date = soup.find("span", id="visitDateVal").get_text(strip=True)
        reasons_for_visit = soup.find("span", id="visitReasonVal").get_text(strip=True)
        
        # Dictionary to hold data before JSON conversion
        data_dict = {
            "Patient": patient,
            "Location": location,
            "DOB": dob,
            "Attending Physician": attending_physician,
            "MRN": mrn,
            "Date": date,
            "Reasons for Visit": reasons_for_visit
        }
        
        # Define sections starting from 'Allergies as of Visit Date'
        start_label = "Allergies as of Visit Date"
        sections = {
            "Allergies as of Visit Date": ("Allergies as of visit date", "Special Considerations"),
            "Special Considerations": ("Special Considerations", "GYN Dx"),
            "GYN Dx": ("GYN Dx", "Hem/Onc Diagnosis"),
            "Hem/Onc Diagnosis": ("Hem/Onc Diagnosis", "H&P"),
            "H&P": ("H&P", "Signs"),
            "Signs": ("Signs", "Laboratory"),
            "Laboratory": ("Laboratory", "Outside Rx"),
            "Outside Rx": ("Outside Rx", "Medications"),
            "Medications": ("Medications", "Therapy note"),
            "Therapy note": ("Therapy note", "Diagnostic Imaging"),
            "Diagnostic Imaging": ("Diagnostic Imaging", "Plan"),
            "Plan": ("Plan", "Text Note"),
            "Text Note": ("Text Note", None)  # No end tag after 'Text Note'
        }
        
        # Dictionary to hold extracted section texts
        section_data = {}
        capture_data = False
        
        # Loop over the defined sections
        for section_name, (start_label, end_label) in sections.items():
            if start_label == "Allergies as of visit date":
                capture_data = True  # Start capturing from "Allergies as of Visit Date"

            if capture_data:
                start_section = soup.find("span", string=re.compile(re.escape(start_label)))
                end_section = soup.find("span", string=re.compile(re.escape(end_label))) if end_label else None
                # Extract text between the start and end sections
                section_text = ""
                if start_section:
                    current_element = start_section.find_next("div")
                    while current_element and current_element != end_section:
                        section_text += current_element.get_text(" ", strip=True)
                        current_element = current_element.find_next_sibling()
                section_data[section_name] = section_text if section_text else "Not Found"

        # Convert section data to JSON and add to the dictionary
        data_dict["Allergies as of visit date(Json)"] = json.dumps(section_data)

        # Append the data to the list
        data_list.append(data_dict)

# Convert the list of data to a DataFrame
df = pd.DataFrame(data_list)

# Preview the DataFrame to verify the results
print("DataFrame preview:")
print(df)

# Save the DataFrame to an Excel file (uncomment when saving)
output_path = "D:/HTML_Files_Ext/patient_data_combined_with_json_sections5.xlsx"
df.to_excel(output_path, index=False)
print(f"Data has been saved to {output_path}")


DataFrame preview:
           Patient                                   Location         DOB  \
0  YVETTE A. LOPEZ  GYN-Medical Center San Antonio (Inactive)  09/16/1963   
1  YVETTE A. LOPEZ  GYN-Medical Center San Antonio (Inactive)  09/16/1963   

          Attending Physician       MRN        Date Reasons for Visit  \
0  Antonio Santillan-Gomez MD  02008593  06/28/2016  FLWP ( 4 MO FU )   
1  Antonio Santillan-Gomez MD  02008593  10/06/2016        Data entry   

                          Allergies as of visit date  
0  {"Allergies as of Visit Date": "Vicodin, po so...  
1  {"Allergies as of Visit Date": "Vicodin, po so...  
Data has been saved to D:/HTML_Files_Ext/patient_data_combined_with_json_sections5.xlsx


In [1]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
from datetime import datetime

# Directory containing the files
directory_path = "D:/HTML_Files_Ext"

# List to store data from each HTML file
data_list = []

# Get the current execution date
execution_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Loop through all files in the directory
for filename in os.listdir(directory_path):
    # Check if the file has an .html extension
    if filename.endswith(".html"):
        file_path = os.path.join(directory_path, filename)
        # Load and parse the HTML file
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract required fields
        patient = soup.find("div", id="patName").get_text(strip=True).replace("Patient: ", "")
        location = soup.find("span", id="locNameVal").get_text(strip=True)
        dob = soup.find("span", id="dobVal").get_text(strip=True)
        attending_physician = soup.find("span", id="attendingVal").get_text(strip=True)
        mrn = soup.find("span", id="mrnVal").get_text(strip=True)
        date = soup.find("span", id="visitDateVal").get_text(strip=True)
        reasons_for_visit = soup.find("span", id="visitReasonVal").get_text(strip=True)
        
        # Dictionary to hold data before JSON conversion
        data_dict = {
            "Patient": patient,
            "Location": location,
            "DOB": dob,
            "Attending Physician": attending_physician,
            "MRN": mrn,
            "Date": date,
            "Reasons for Visit": reasons_for_visit,
            "File_Path": file_path,  # Add the file path
            "File_Name": filename,    # Add the file name
            "Execution_Date": execution_date  # Add execution date
        }
        
        # Define sections starting from 'Allergies as of Visit Date'
        start_label = "Allergies as of Visit Date"
        sections = {
            "Allergies as of Visit Date": ("Allergies as of visit date", "Special Considerations"),
            "Special Considerations": ("Special Considerations", "GYN Dx"),
            "GYN Dx": ("GYN Dx", "Hem/Onc Diagnosis"),
            "Hem/Onc Diagnosis": ("Hem/Onc Diagnosis", "H&P"),
            "H&P": ("H&P", "Signs"),
            "Signs": ("Signs", "Laboratory"),
            "Laboratory": ("Laboratory", "Outside Rx"),
            "Outside Rx": ("Outside Rx", "Medications"),
            "Medications": ("Medications", "Therapy note"),
            "Therapy note": ("Therapy note", "Diagnostic Imaging"),
            "Diagnostic Imaging": ("Diagnostic Imaging", "Plan"),
            "Plan": ("Plan", "Text Note"),
            "Text Note": ("Text Note", None)  # No end tag after 'Text Note'
        }
        
        # Dictionary to hold extracted section texts
        section_data = {}
        capture_data = False
        
        # Loop over the defined sections
        for section_name, (start_label, end_label) in sections.items():
            if start_label == "Allergies as of visit date":
                capture_data = True  # Start capturing from "Allergies as of Visit Date"

            if capture_data:
                start_section = soup.find("span", string=re.compile(re.escape(start_label)))
                end_section = soup.find("span", string=re.compile(re.escape(end_label))) if end_label else None
                # Extract text between the start and end sections
                section_text = ""
                if start_section:
                    current_element = start_section.find_next("div")
                    while current_element and current_element != end_section:
                        section_text += current_element.get_text(" ", strip=True)
                        current_element = current_element.find_next_sibling()
                section_data[section_name] = section_text if section_text else "Not Found"

        # Convert section data to JSON and add to the dictionary
        data_dict["Allergies as of visit date(Json)"] = json.dumps(section_data)

        # Append the data to the list
        data_list.append(data_dict)

# Convert the list of data to a DataFrame
df = pd.DataFrame(data_list)

# Preview the DataFrame to verify the results
print("DataFrame preview:")
print(df)

# Save the DataFrame to an Excel file (uncomment when saving)
output_path = "D:/HTML_Files_Ext/patient_data_combined_with_json_sections6.xlsx"
df.to_excel(output_path, index=False)
print(f"Data has been saved to {output_path}")




DataFrame preview:
           Patient                                   Location         DOB  \
0  YVETTE A. LOPEZ  GYN-Medical Center San Antonio (Inactive)  09/16/1963   
1  YVETTE A. LOPEZ  GYN-Medical Center San Antonio (Inactive)  09/16/1963   

          Attending Physician       MRN        Date Reasons for Visit  \
0  Antonio Santillan-Gomez MD  02008593  06/28/2016  FLWP ( 4 MO FU )   
1  Antonio Santillan-Gomez MD  02008593  10/06/2016        Data entry   

                                           File_Path  \
0  D:/HTML_Files_Ext\0_20160628-131141_GYNMedical...   
1  D:/HTML_Files_Ext\0_20161006-090033_GYNMedical...   

                                           File_Name       Execution_Date  \
0  0_20160628-131141_GYNMedicalCenterSanAntonioIn...  2024-10-28 03:10:04   
1  0_20161006-090033_GYNMedicalCenterSanAntonioIn...  2024-10-28 03:10:04   

                    Allergies as of visit date(Json)  
0  {"Allergies as of Visit Date": "Vicodin, po so...  
1  {"Allergies as 

In [3]:
# Path to the .bin file
bin_file_path = r"D:\HTML_Files_Ext\lob1730502600014505286.bin"
html_output_path = r"D:\HTML_Files_Ext\Bin_HTML_output_files\lob1730502600014505286.html"

# Open the binary file and read its content
try:
    with open(bin_file_path, "rb") as bin_file:
        # Read the content of the bin file
        bin_data = bin_file.read()

    # Attempt to decode the binary data to text (using utf-8 or other encoding if necessary)
    try:
        decoded_text = bin_data.decode("utf-8")  # Adjust the encoding if needed
    except UnicodeDecodeError:
        print("Error: Could not decode binary data using utf-8.")
        decoded_text = bin_data.decode("latin-1")  # You may try a different encoding here

    # Write the decoded text into an HTML file
    with open(html_output_path, "w", encoding="utf-8") as html_file:
        html_file.write(decoded_text)

    print(f"HTML file successfully created at {html_output_path}")
except FileNotFoundError:
    print(f"Error: The file at path {bin_file_path} was not found.")
except Exception as e:
    print(f"An error occurred: {e}")


HTML file successfully created at D:\HTML_Files_Ext\Bin_HTML_output_files\lob1730502600014505286.html


In [2]:
import os

# Path to the folder containing .bin files
bin_folder_path = r"D:\Files_Extraction_py\BIN"
html_output_folder = r"D:\Files_Extraction_py\BIN_TO_HTML_output_files"
pdf_output_folder = r"D:\Files_Extraction_py\BIN_TO_PDF_output_files"

# Ensure output folders exist
os.makedirs(html_output_folder, exist_ok=True)
os.makedirs(pdf_output_folder, exist_ok=True)

# Function to detect if the file is HTML or PDF
def detect_file_type(bin_data):
    # Check for HTML content (common HTML tags)
    if b'<!DOCTYPE html>' in bin_data or b'<html' in bin_data:
        return 'html'
    # Check for PDF content (PDF files typically start with "%PDF")
    elif bin_data[:4] == b'%PDF':
        return 'pdf'
    else:
        return 'unknown'

# Loop through all .bin files in the folder
for filename in os.listdir(bin_folder_path):
    if filename.endswith(".bin"):
        bin_file_path = os.path.join(bin_folder_path, filename)
        
        # Read the binary file
        try:
            with open(bin_file_path, "rb") as bin_file:
                bin_data = bin_file.read()

            # Detect file type
            file_type = detect_file_type(bin_data)

            if file_type == 'html':
                # Decode binary data as HTML and save
                html_output_path = os.path.join(html_output_folder, f"{filename}.html")
                try:
                    decoded_text = bin_data.decode("utf-8")
                except UnicodeDecodeError:
                    decoded_text = bin_data.decode("latin-1")  # Fallback decoding

                # Write to HTML file
                with open(html_output_path, "w", encoding="utf-8") as html_file:
                    html_file.write(decoded_text)
                print(f"HTML file successfully created: {html_output_path}")

            elif file_type == 'pdf':
                # Write the binary data to a PDF file
                pdf_output_path = os.path.join(pdf_output_folder, f"{filename}.pdf")
                with open(pdf_output_path, "wb") as pdf_file:
                    pdf_file.write(bin_data)
                print(f"PDF file successfully created: {pdf_output_path}")

            else:
                print(f"Unknown file format for {filename}. Skipping...")

        except FileNotFoundError:
            print(f"Error: The file {filename} was not found.")
        except Exception as e:
            print(f"An error occurred while processing {filename}: {e}")


Unknown file format for lob1034411577975205166.bin. Skipping...
Unknown file format for lob10501467738322404766.bin. Skipping...
Unknown file format for lob10626829518212944619.bin. Skipping...
Unknown file format for lob107031334653987491.bin. Skipping...
Unknown file format for lob1077708249107881611.bin. Skipping...
Unknown file format for lob1094213002255894382.bin. Skipping...
Unknown file format for lob11049748732235748120.bin. Skipping...
Unknown file format for lob1107450906559733108.bin. Skipping...
Unknown file format for lob11221047292668554672.bin. Skipping...
Unknown file format for lob123045195900684474.bin. Skipping...
Unknown file format for lob143826308596261948.bin. Skipping...
Unknown file format for lob163976911685831059.bin. Skipping...
Unknown file format for lob267180463724377215.bin. Skipping...
Unknown file format for lob290305702767849286.bin. Skipping...
Unknown file format for lob295016282863131491.bin. Skipping...
Unknown file format for lob4235833341103374

In [5]:
import os

# Path to the folder containing .bin files
bin_folder_path = r"D:\Files_Extraction_py\BIN"
html_output_folder = r"D:\Files_Extraction_py\BIN_TO_HTML_output_files"
pdf_output_folder = r"D:\Files_Extraction_py\BIN_TO_PDF_output_files"

# Create output directories if they do not exist
os.makedirs(html_output_folder, exist_ok=True)
os.makedirs(pdf_output_folder, exist_ok=True)

# Loop through each .bin file in the specified folder
for file_name in os.listdir(bin_folder_path):
    if file_name.endswith(".bin"):
        bin_file_path = os.path.join(bin_folder_path, file_name)
        html_output_path = os.path.join(html_output_folder, f"{os.path.splitext(file_name)[0]}.html")
        pdf_output_path = os.path.join(pdf_output_folder, f"{os.path.splitext(file_name)[0]}.pdf")

        # Open the binary file and read its content
        try:
            with open(bin_file_path, "rb") as bin_file:
                # Read the content of the bin file
                bin_data = bin_file.read()

            # Attempt to decode the binary data
            try:
                decoded_text = bin_data.decode("utf-8")  # Try decoding as HTML first
                # If decoding succeeds, save as an HTML file
                with open(html_output_path, "w", encoding="utf-8") as html_file:
                    html_file.write(decoded_text)
                print(f"HTML file successfully created at {html_output_path}")

            except UnicodeDecodeError:
                print(f"Error: Could not decode binary data using utf-8 for {file_name}. Trying PDF.")

                # If decoding as HTML fails, assume it's a PDF and save it as is
                with open(pdf_output_path, "wb") as pdf_file:
                    pdf_file.write(bin_data)
                print(f"PDF file successfully created at {pdf_output_path}")

        except FileNotFoundError:
            print(f"Error: The file at path {bin_file_path} was not found.")
        except Exception as e:
            print(f"An error occurred while processing {file_name}: {e}")


HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob1034411577975205166.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob10501467738322404766.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob10626829518212944619.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob107031334653987491.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob1077708249107881611.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob1094213002255894382.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob11049748732235748120.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob1107450906559733108.html
HTML file successfully created at D:\Files_Extraction_py\BIN_TO_HTML_output_files\lob11221047292668554672.html
HTML fi

In [6]:
pip install pytesseract pdf2image pillow pandas openpyxl PyMuPDF

Collecting pytesseractNote: you may need to restart the kernel to use updated packages.

  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pillow
  Downloading pillow-11.0.0-cp310-cp310-win_amd64.whl.metadata (9.3 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.12-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pillow-11.0.0-cp310-cp310-win_amd64.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 2.6/2.6 MB 37.2 MB/s eta 0:00:00
Downloading PyMuPDF-1.24.12-cp39-abi3-win_amd64.whl (16.0 MB)
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
   ------------------ --------------------- 7.3/16.0 MB 37.8 MB/s eta 0:00:01
   ---------------------------------------- 16.0/

In [12]:
pip install pytesseract




In [13]:

pip install PyMuPDF





In [14]:
pip install pdf2image

Note: you may need to restart the kernel to use updated packages.


In [17]:
import os
import pytesseract
import fitz  # PyMuPDF
from pdf2image import convert_from_path

# Set the path to the Tesseract executable if not added to the system PATH
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Path to the folder containing the PDF files
pdf_folder_path = r"D:\Files_Extraction_py\PDF"
output_folder_path = r"D:\Files_Extraction_py\PDF_Text_Output"

# Create output folder if it doesn't exist
os.makedirs(output_folder_path, exist_ok=True)

# Loop through each PDF file in the folder
for file_name in os.listdir(pdf_folder_path):
    if file_name.endswith(".pdf"):
        pdf_file_path = os.path.join(pdf_folder_path, file_name)
        output_text_file_path = os.path.join(output_folder_path, f"{os.path.splitext(file_name)[0]}.txt")

        try:
            # Open the PDF
            pdf_document = fitz.open(pdf_file_path)

            # Initialize an empty string to store the extracted text
            full_text = ""

            # Loop through each page in the PDF
            for page_num in range(len(pdf_document)):
                # Convert PDF page to an image using pdf2image
                images = convert_from_path(pdf_file_path, first_page=page_num + 1, last_page=page_num + 1)
                for image in images:
                    # Perform OCR on the image
                    text = pytesseract.image_to_string(image)
                    full_text += f"\n\nPage {page_num + 1}:\n{text}"

            # Save the extracted text to a file
            with open(output_text_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(full_text)

            print(f"Text extracted and saved to {output_text_file_path}")
        except Exception as e:
            print(f"An error occurred while processing {file_name}: {e}")


An error occurred while processing 0006086B.pdf: Unable to get page count. Is poppler installed and in PATH?


In [3]:
import pytesseract
from pdf2image import convert_from_path
import os

# Specify the Tesseract-OCR path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Path to the folder containing PDF files
pdf_folder_path = r"D:\Files_Extraction_py\PDF"
poppler_path = r"C:\Poppler\Release-24.08.0-0\poppler-24.08.0\Library\bin"

# Path to store the extracted text from PDFs
output_folder_path = r"D:\Files_Extraction_py\PDF\Text_Output"

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Function to extract text from PDF images
def extract_text_from_pdf(pdf_path, poppler_path, output_folder):
    try:
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        extracted_text = ""
        
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            extracted_text += text
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(extracted_text)
        
        print(f"Text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path}: {e}")

# Loop through all PDF files in the folder and process them
for pdf_file in os.listdir(pdf_folder_path):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        extract_text_from_pdf(pdf_path, poppler_path, output_folder_path)


Text extracted from D:\Files_Extraction_py\PDF\0006086B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\0006086B.txt


In [4]:
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import os

# Specify the Tesseract-OCR path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Path to the folder containing PDF files
pdf_folder_path = r"D:\Files_Extraction_py\PDF\PDF files"
poppler_path = r"C:\Poppler\Release-24.08.0-0\poppler-24.08.0\Library\bin"

# Path to store the extracted text from PDFs
output_folder_path = r"D:\Files_Extraction_py\PDF\Text_Output"

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Function to extract text from image-based PDFs using OCR
def extract_text_from_image_pdf(pdf_path, poppler_path, output_folder):
    try:
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        extracted_text = ""
        
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            extracted_text += text
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(extracted_text)
        
        print(f"OCR text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} with OCR: {e}")

# Function to extract text from text-based PDFs
def extract_text_from_text_pdf(pdf_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)
        extracted_text = ""

        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            extracted_text += page.get_text("text")
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(extracted_text)

        print(f"Text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} as a text PDF: {e}")

# Function to decide whether to use OCR or direct text extraction
def process_pdf(pdf_path, poppler_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)

        # Check if the PDF has any text on the first page
        first_page_text = pdf_document[0].get_text("text")
        
        if first_page_text.strip():  # If there's text, treat as text-based PDF
            extract_text_from_text_pdf(pdf_path, output_folder)
        else:  # If no text is found, treat as image-based PDF
            extract_text_from_image_pdf(pdf_path, poppler_path, output_folder)

    except Exception as e:
        print(f"An error occurred while deciding how to process {pdf_path}: {e}")

# Loop through all PDF files in the folder and process them
for pdf_file in os.listdir(pdf_folder_path):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        process_pdf(pdf_path, poppler_path, output_folder_path)


OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055501.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\00055501.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055881.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\00055881.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055882.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\00055882.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0005619B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\0005619B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0006086B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\0006086B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00061493.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\00061493.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0008127E.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output\0008127E.txt
OCR text extracted from D:\

In [9]:
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import os
import re

# Specify the Tesseract-OCR path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Path to the folder containing PDF files
pdf_folder_path = r"D:\Files_Extraction_py\PDF\PDF files"
poppler_path = r"C:\Poppler\Release-24.08.0-0\poppler-24.08.0\Library\bin"

# Path to store the extracted text from PDFs
output_folder_path = r"D:\Files_Extraction_py\PDF\Text_Output1"

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Function to format text after extraction to keep colon-aligned text in the same line and aligned vertically
def format_text_colon_alignment(extracted_text):
    # Use regex to ensure that text following a colon is on the same line
    formatted_text = re.sub(r':\s*\n\s*', ': ', extracted_text)
    
    # Split the text into lines
    lines = formatted_text.splitlines()

    # Align colons vertically by determining the maximum length before the colon
    max_length = 0
    for line in lines:
        if ':' in line:
            key_part = line.split(':')[0]
            max_length = max(max_length, len(key_part))

    # Adjust lines to ensure all colons are vertically aligned
    aligned_text = ""
    for line in lines:
        if ':' in line:
            key_part, value_part = line.split(':', 1)
            # Add enough spaces to align the colons
            aligned_line = f"{key_part.ljust(max_length)} : {value_part.strip()}"
            aligned_text += aligned_line + "\n"
        else:
            aligned_text += line + "\n"
    
    return aligned_text

# Function to extract text from image-based PDFs using OCR
def extract_text_from_image_pdf(pdf_path, poppler_path, output_folder):
    try:
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        extracted_text = ""
        
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            extracted_text += text
        
        # Apply formatting for colon alignment
        formatted_text = format_text_colon_alignment(extracted_text)
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(formatted_text)
        
        print(f"OCR text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} with OCR: {e}")

# Function to extract text from text-based PDFs
def extract_text_from_text_pdf(pdf_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)
        extracted_text = ""

        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            extracted_text += page.get_text("text")
        
        # Apply formatting for colon alignment
        formatted_text = format_text_colon_alignment(extracted_text)
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(formatted_text)

        print(f"Text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} as a text PDF: {e}")

# Function to decide whether to use OCR or direct text extraction
def process_pdf(pdf_path, poppler_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)

        # Check if the PDF has any text on the first page
        first_page_text = pdf_document[0].get_text("text")
        
        if first_page_text.strip():  # If there's text, treat as text-based PDF
            extract_text_from_text_pdf(pdf_path, output_folder)
        else:  # If no text is found, treat as image-based PDF
            extract_text_from_image_pdf(pdf_path, poppler_path, output_folder)

    except Exception as e:
        print(f"An error occurred while deciding how to process {pdf_path}: {e}")

# Loop through all PDF files in the folder and process them
for pdf_file in os.listdir(pdf_folder_path):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        process_pdf(pdf_path, poppler_path, output_folder_path)


OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055501.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00055501.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055881.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00055881.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055882.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00055882.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0005619B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\0005619B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0006086B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\0006086B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00061493.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00061493.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0008127E.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\0008127E.txt
OCR text extracted f

In [12]:
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import os
import re

# Specify the Tesseract-OCR path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Path to the folder containing PDF files
pdf_folder_path = r"D:\Files_Extraction_py\PDF\PDF files"
poppler_path = r"C:\Poppler\Release-24.08.0-0\poppler-24.08.0\Library\bin"

# Path to store the extracted text from PDFs
output_folder_path = r"D:\Files_Extraction_py\PDF\Text_Output1"

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Function to format text after extraction to keep colon-aligned text in the same line and aligned vertically
# Excludes timestamps in formats like hh:mm:ss or hh:mm from colon alignment
def format_text_colon_alignment(extracted_text):
    # Use regex to ensure that text following a colon is on the same line,
    # but avoid altering timestamps like 12:30:00 or 12:30.
    formatted_text = re.sub(r'(?<!\d):\s*\n\s*', ': ', extracted_text)
    
    # Split the text into lines
    lines = formatted_text.splitlines()

    # Align colons vertically by determining the maximum length before the colon,
    # but skip lines with timestamps (e.g., hh:mm:ss or hh:mm)
    max_length = 0
    for line in lines:
        if ':' in line and not re.search(r'\b\d{1,2}:\d{2}(:\d{2})?\b', line):
            key_part = line.split(':')[0]
            max_length = max(max_length, len(key_part))

    # Adjust lines to ensure all colons are vertically aligned
    aligned_text = ""
    for line in lines:
        if ':' in line and not re.search(r'\b\d{1,2}:\d{2}(:\d{2})?\b', line):
            key_part, value_part = line.split(':', 1)
            # Add enough spaces to align the colons
            aligned_line = f"{key_part.ljust(max_length)} : {value_part.strip()}"
            aligned_text += aligned_line + "\n"
        else:
            aligned_text += line + "\n"
    
    return aligned_text

# Function to extract text from image-based PDFs using OCR
def extract_text_from_image_pdf(pdf_path, poppler_path, output_folder):
    try:
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        extracted_text = ""
        
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            extracted_text += text
        
        # Apply formatting for colon alignment
        formatted_text = format_text_colon_alignment(extracted_text)
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(formatted_text)
        
        print(f"OCR text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} with OCR: {e}")

# Function to extract text from text-based PDFs
def extract_text_from_text_pdf(pdf_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)
        extracted_text = ""

        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            extracted_text += page.get_text("text")
        
        # Apply formatting for colon alignment
        formatted_text = format_text_colon_alignment(extracted_text)
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(formatted_text)

        print(f"Text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} as a text PDF: {e}")

# Function to decide whether to use OCR or direct text extraction
def process_pdf(pdf_path, poppler_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)

        # Check if the PDF has any text on the first page
        first_page_text = pdf_document[0].get_text("text")
        
        if first_page_text.strip():  # If there's text, treat as text-based PDF
            extract_text_from_text_pdf(pdf_path, output_folder)
        else:  # If no text is found, treat as image-based PDF
            extract_text_from_image_pdf(pdf_path, poppler_path, output_folder)

    except Exception as e:
        print(f"An error occurred while deciding how to process {pdf_path}: {e}")

# Loop through all PDF files in the folder and process them
for pdf_file in os.listdir(pdf_folder_path):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        process_pdf(pdf_path, poppler_path, output_folder_path)


OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055501.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00055501.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055881.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00055881.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055882.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00055882.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0005619B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\0005619B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0006086B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\0006086B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00061493.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\00061493.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0008127E.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output1\0008127E.txt
OCR text extracted f

In [13]:
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import os
import re

# Specify the Tesseract-OCR path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Path to the folder containing PDF files
pdf_folder_path = r"D:\Files_Extraction_py\PDF\PDF files"
poppler_path = r"C:\Poppler\Release-24.08.0-0\poppler-24.08.0\Library\bin"

# Path to store the extracted text from PDFs
output_folder_path = r"D:\Files_Extraction_py\PDF\Text_Output2"

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Function to format text after extraction to keep colon-aligned text in the same line and aligned vertically
# Excludes timestamps in formats like hh:mm:ss or hh:mm from colon alignment and handles multi-line values
def format_text_colon_alignment(extracted_text):
    # Use regex to ensure that text following a colon is on the same line,
    # but avoid altering timestamps like 12:30:00 or 12:30.
    formatted_text = re.sub(r'(?<!\d):\s*\n\s*', ': ', extracted_text)
    
    # Split the text into lines
    lines = formatted_text.splitlines()

    # Align colons vertically by determining the maximum length before the colon,
    # but skip lines with timestamps (e.g., hh:mm:ss or hh:mm)
    max_length = 0
    for line in lines:
        if ':' in line and not re.search(r'\b\d{1,2}:\d{2}(:\d{2})?\b', line):
            key_part = line.split(':')[0]
            max_length = max(max_length, len(key_part))

    # Adjust lines to ensure all colons are vertically aligned
    aligned_text = ""
    grouped_lines = []
    current_group = []

    # Detect groups of related lines (those without colons)
    for line in lines:
        if ':' in line and not re.search(r'\b\d{1,2}:\d{2}(:\d{2})?\b', line):
            # If current group has entries, append them to the grouped_lines before adding new key-value pair
            if current_group:
                grouped_lines.append(' '.join(current_group))
                current_group = []

            key_part, value_part = line.split(':', 1)
            # Add enough spaces to align the colons
            aligned_line = f"{key_part.ljust(max_length)} : {value_part.strip()}"
            grouped_lines.append(aligned_line)
        else:
            # Accumulate multi-line values
            current_group.append(line.strip())

    # Append any remaining grouped lines
    if current_group:
        grouped_lines.append(' '.join(current_group))

    aligned_text = "\n".join(grouped_lines)
    
    return aligned_text

# Function to extract text from image-based PDFs using OCR
def extract_text_from_image_pdf(pdf_path, poppler_path, output_folder):
    try:
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        extracted_text = ""
        
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            extracted_text += text
        
        # Apply formatting for colon alignment
        formatted_text = format_text_colon_alignment(extracted_text)
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(formatted_text)
        
        print(f"OCR text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} with OCR: {e}")

# Function to extract text from text-based PDFs
def extract_text_from_text_pdf(pdf_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)
        extracted_text = ""

        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            extracted_text += page.get_text("text")
        
        # Apply formatting for colon alignment
        formatted_text = format_text_colon_alignment(extracted_text)
        
        pdf_name = os.path.basename(pdf_path).replace('.pdf', '')
        output_text_file = os.path.join(output_folder, f"{pdf_name}.txt")
        with open(output_text_file, 'w', encoding='utf-8') as text_file:
            text_file.write(formatted_text)

        print(f"Text extracted from {pdf_path} and saved to {output_text_file}")
    
    except Exception as e:
        print(f"An error occurred while processing {pdf_path} as a text PDF: {e}")

# Function to decide whether to use OCR or direct text extraction
def process_pdf(pdf_path, poppler_path, output_folder):
    try:
        pdf_document = fitz.open(pdf_path)

        # Check if the PDF has any text on the first page
        first_page_text = pdf_document[0].get_text("text")
        
        if first_page_text.strip():  # If there's text, treat as text-based PDF
            extract_text_from_text_pdf(pdf_path, output_folder)
        else:  # If no text is found, treat as image-based PDF
            extract_text_from_image_pdf(pdf_path, poppler_path, output_folder)

    except Exception as e:
        print(f"An error occurred while deciding how to process {pdf_path}: {e}")

# Loop through all PDF files in the folder and process them
for pdf_file in os.listdir(pdf_folder_path):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        process_pdf(pdf_path, poppler_path, output_folder_path)


OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055501.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\00055501.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055881.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\00055881.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00055882.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\00055882.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0005619B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\0005619B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0006086B.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\0006086B.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\00061493.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\00061493.txt
OCR text extracted from D:\Files_Extraction_py\PDF\PDF files\0008127E.pdf and saved to D:\Files_Extraction_py\PDF\Text_Output2\0008127E.txt
OCR text extracted f