In [6]:
import re
import PyPDF2
import tabula
import pandas as pd
import numpy as np
import os


def extract_tables_tabula(doc):
    """Read all tables tabula detects in ``doc`` and stack them into one frame.

    tabula is asked to scan every page in stream mode with area guessing
    enabled, returning one DataFrame per detected table.

    Args:
        doc: Path of the PDF, forwarded unchanged to ``tabula.read_pdf``.

    Returns:
        The detected tables concatenated row-wise with ``pd.concat``; the
        per-table row labels are kept as-is (the index is not reset here).
        When nothing is detected, a message is printed and an empty
        ``pd.DataFrame`` is returned instead.
    """
    tables = tabula.read_pdf(
        doc,
        pages='all',
        multiple_tables=True,
        stream=True,
        guess=True,
    )

    if len(tables) > 0:
        return pd.concat(tables)

    # Nothing was detected: tell the user and hand back an empty frame so
    # callers can test ``.empty`` instead of handling an exception.
    print(f"No tables found in {doc}")
    return pd.DataFrame()


def clear_cells_starting_with_letter(s):
    """Blank out a cell whose text does not begin like a number.

    Args:
        s: A cell value of any type.

    Returns:
        ``''`` when ``s`` is a non-empty string that either starts with an
        alphabetic character, or starts with a single quote followed by a
        non-digit character. Every other value (including non-strings and
        the empty string) is returned unchanged.
    """
    # Non-strings (e.g. NaN, numbers) and empty strings pass through untouched.
    if not isinstance(s, str) or not s:
        return s

    starts_with_letter = s[0].isalpha()
    # A lone "'" is kept: there is no second character to inspect.
    quote_then_non_digit = len(s) > 1 and s[0] == "'" and not s[1].isdigit()

    return '' if starts_with_letter or quote_then_non_digit else s


def extract_number_and_remaining_text(s):
    """Split the first three-decimal number out of ``s``.

    The number is the first substring matching "digits, a dot, exactly three
    digits" (for example ``36.000``).

    Args:
        s: The text to search. Must be a string.

    Returns:
        A ``(number, remaining_text)`` tuple. ``number`` is the matched text
        (still a string) and ``remaining_text`` is ``s`` with only that one
        occurrence removed. When nothing matches, ``(np.nan, s)`` is returned.
    """
    match = re.search(r"(\d+\.\d{3})", s)
    if match is None:
        return np.nan, s

    # Cut out just the span that was extracted. Substituting the pattern
    # globally would also delete any later number of the same shape, silently
    # dropping data from the remaining text.
    remaining_text = s[:match.start()] + s[match.end():]
    return match.group(0), remaining_text


def clear_row_with_width_x(row):
    """Blank out a row whose ``"B"`` cell is exactly ``'Width   x'``.

    Args:
        row: A row as a ``pd.Series`` that has a ``"B"`` label.

    Returns:
        A new Series with the same labels and every value set to ``''`` when
        the ``"B"`` cell equals ``'Width   x'`` (three spaces); otherwise the
        original ``row`` object, unchanged.
    """
    is_width_row = row["B"] == 'Width   x'
    if not is_width_row:
        return row
    return pd.Series(dict.fromkeys(row.index, ''))

def clear_row_with_width_blank(row):
    """Blank out a row whose ``"A"`` cell is missing.

    Args:
        row: A row as a ``pd.Series`` that has an ``"A"`` label.

    Returns:
        A new Series with the same labels and every value set to ``''`` when
        ``pd.isnull`` reports the ``"A"`` cell as missing; otherwise the
        original ``row`` object, unchanged.
    """
    first_cell_missing = pd.isnull(row["A"])
    return pd.Series(dict.fromkeys(row.index, '')) if first_cell_missing else row


def convert_pdf_to_excel(pdf_file, output_path="output.xlsx"):
    """Extract the tables of one PDF, clean them up and save them as Excel.

    Processing steps, in order:

    1. Read all tables with ``extract_tables_tabula``.
    2. Prepend one all-NaN row and relabel the six columns ``A`` .. ``F``.
    3. Blank cells in ``D`` that start with a letter (or a quote followed by
       a non-digit), then split the first three-decimal number of ``D`` into
       ``C`` and keep the rest of the text in ``D``. Any previous content of
       ``C`` is overwritten.
    4. Blank entire rows whose ``B`` cell is ``'Width   x'`` or whose ``A``
       cell is missing (this includes the row prepended in step 2).
    5. Write the result to ``output_path``.

    Args:
        pdf_file: Path of the PDF to convert.
        output_path: Destination of the Excel workbook. Defaults to
            ``"output.xlsx"``, which matches the previously hard-coded
            location. Pass ``None`` to skip writing, e.g. when the caller
            saves the returned frame itself.

    Returns:
        The cleaned DataFrame, or an empty DataFrame when the PDF contains
        no detectable tables (in which case nothing is written).

    Raises:
        ValueError: If the combined tables do not have exactly six columns.
    """
    # Read the PDF file using tabula
    print('using tabula')
    df = extract_tables_tabula(pdf_file)

    if df.empty:
        print(f"No tables found in {pdf_file}. Skipping...")
        return df

    column_names = ["A", "B", "C", "D", "E", "F"]
    if df.shape[1] != len(column_names):
        # Fail with an explicit message instead of pandas' generic
        # "Length mismatch" error raised by the column assignment below.
        raise ValueError(
            f"Expected {len(column_names)} columns in {pdf_file}, "
            f"found {df.shape[1]}"
        )

    # Prepend a single all-NaN row with the same columns, then renumber rows
    new_row = pd.DataFrame(columns=df.columns, data=[np.full(df.shape[1], np.nan)])
    df = pd.concat([new_row, df], ignore_index=True)

    # Rename columns
    df.columns = column_names

    # Clear cells in column D that start with a letter or a non-digit after the single quote
    df["D"] = df["D"].apply(clear_cells_starting_with_letter)

    # Extract the numbers and remaining text from column D; astype(str) turns
    # missing cells into the literal 'nan', which is cleaned up right after
    df[["C", "D"]] = df["D"].astype(str).apply(extract_number_and_remaining_text).apply(pd.Series)

    # Replace 'nan' string in column D with empty cells
    df["D"] = df["D"].replace('nan', '')

    # Blank rows with 'Width   x' in column B, then rows with nothing in column A
    df = df.apply(clear_row_with_width_x, axis=1)
    df = df.apply(clear_row_with_width_blank, axis=1)

    if output_path is not None:
        df.to_excel(output_path, index=False)
    return df


def process_all_pdfs_in_folder(folder_path):
    """Convert every PDF directly inside ``folder_path`` to an Excel file.

    Entries are taken from ``os.listdir`` (no recursion) and matched on a
    case-insensitive ``.pdf`` suffix. Each match is passed to
    ``convert_pdf_to_excel``; a non-empty result is saved next to the PDF
    under the same base name with an ``.xlsx`` extension.

    Args:
        folder_path: Directory to scan.

    Returns:
        None.
    """
    for entry in os.listdir(folder_path):
        # Skip anything that is not named like a PDF
        if not entry.lower().endswith(".pdf"):
            continue

        print(f"Processing {entry}")
        result = convert_pdf_to_excel(os.path.join(folder_path, entry))

        # Nothing was extracted, so there is nothing to save
        if result.empty:
            continue

        base_name = os.path.splitext(entry)[0]
        target = os.path.join(folder_path, f"{base_name}.xlsx")
        result.to_excel(target, index=False)


# Batch run: convert every PDF in the test-list folder; each non-empty result
# is saved next to its PDF as <name>.xlsx.
# NOTE(review): these statements run as soon as the cell/module executes and
# rely on machine-specific absolute paths.
folder_path = "/home/justin/PycharmProjects/Sheet/Testlists"
process_all_pdfs_in_folder(folder_path)

# Single-file run on one specific part list.
doc = "/home/justin/PycharmProjects/PLE/AM000884 .pdf"
# Alternative input, kept for switching by hand:
# doc = "/home/justin/PycharmProjects/PLE/AM001179 .pdf"

convert_pdf_to_excel(doc)


Processing 74180 - Part List.pdf
using tabula
Processing AM001433 - Part List.pdf
using tabula
Processing AM001561 - Part List.pdf
using tabula
Processing AM000960 - Part List.pdf
using tabula
Processing AM001251 - Part List.pdf
using tabula
Processing AM001542 - Part List.pdf
using tabula
Processing AM001572 - Part List.pdf
using tabula
Processing AM001453 - Part List.pdf
using tabula
Processing AM001411 - Part List.pdf
using tabula
Processing AM001241 - Part List.pdf
using tabula
Processing AM001450 - Part List.pdf
using tabula
Processing AM001434 - Part List.pdf
using tabula
Processing AM001672 - Part List.pdf
using tabula
Processing AM001317 - Part List.pdf
using tabula
Processing AM001485 - Part List.pdf
using tabula
Processing AM001576 - Part List.pdf
using tabula
Processing AM001437 - Part List.pdf
using tabula
Processing AM001440G - Part List.pdf
using tabula
No tables found in /home/justin/PycharmProjects/Sheet/Testlists/AM001440G - Part List.pdf
No tables found in /home/justi

Unnamed: 0,A,B,C,D,E,F
0,,,,,,
1,,,,,,
2,PSI ID: 2 - Peninsula Base Panel,,,,,
3,1 End (Finishing) Panel - Peninsula,72.000,36.000,MDFMDF16.26MDF,*AM000884PART0007.DXF_/OAR*,1
4,,,,,,
...,...,...,...,...,...,...
367,PSI ID: 43 - Sample Door & Panels (2),,,,,
368,1 Door- (Sample - Internal use only),12.000,15.000,BIRCHPC19.05BIRCH,*AM000884DORB0002.DXF_/OAR*,11
369,,,,,,
370,2 Sample Board - 11 x 14,11.000,14.000,L175PC16.26BIRCH,*AM000884PART0003.DXF_/OAR*,37
