In [3]:
import re
import PyPDF2
import tabula
import pandas as pd
import numpy as np


# A point is 1/72 inch and an inch is exactly 25.4 mm, so the factor is
# derived instead of being hard-coded as a rounded decimal.
_POINTS_PER_MM = 72.0 / 25.4


def mm_to_points(mm):
    """Convert a length in millimeters to points (1/72 inch).

    Args:
        mm: Length in millimeters.

    Returns:
        The same length expressed in points.
    """
    return mm * _POINTS_PER_MM


def extract_tables_tabula(doc, area_mm, column_positions_mm):
    """Read every table tabula finds in a PDF and stack them into one DataFrame.

    Args:
        doc: PDF input, passed unchanged to ``tabula.read_pdf``.
        area_mm: Extraction area in millimeters; each value is converted to
            points before being passed as ``area``.
        column_positions_mm: Column split positions in millimeters; each value
            is converted to points before being passed as ``columns``.

    Returns:
        A single DataFrame holding the rows of all extracted tables, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If tabula returns no tables for ``doc``.
    """
    area_points = [mm_to_points(x) for x in area_mm]
    column_positions_points = [mm_to_points(x) for x in column_positions_mm]
    df_list = tabula.read_pdf(doc, pages='all', multiple_tables=True, stream=True, guess=True,
                              area=area_points, columns=column_positions_points)
    if not df_list:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say which document was empty.
        raise ValueError(f"No tables were extracted from {doc!r}")
    # Each table carries its own 0-based index; renumber so row labels are
    # unique in the combined frame.
    df = pd.concat(df_list, ignore_index=True)
    print("Extracting")
    return df

def _contains_text(column, needle):
    """Return a boolean array marking the cells of *column* containing *needle*.

    The column is cast to ``str`` first so the ``.str`` accessor also works on
    numeric or all-NaN columns. The match is literal (no regex), and missing
    cells are reported as non-matching.
    """
    matches = column.astype(str).str.contains(needle, regex=False, na=False)
    return matches.to_numpy(dtype=bool)


def clean_dataframe(df):
    """Normalize a raw nine-column part-list table.

    The frame is modified in place and the same object is returned.

    Steps:
        1. Rename the nine columns to 'A'..'I'.
        2. On rows whose A contains "PSI I", append B to A and blank B.
        3. Blank every cell of rows that have "QTY" in A and "Material" in G,
           and of rows that have "Requiring Bottom" in B.
        4. Copy F, G, H, I into E, F, G, H and set I to NaN.
        5. Insert the empty columns 'QTY', 'New_Column2' and 'New_Column3'.

    Args:
        df: DataFrame with exactly nine columns.

    Returns:
        The same DataFrame with columns
        A, QTY, B, C, D, E, New_Column2, F, New_Column3, G, H, I.

    Raises:
        ValueError: If ``df`` does not have exactly nine columns (raised by
            pandas when the column labels are assigned).
    """
    # Rename columns
    df.columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']

    # Combine columns A and B if "PSI I" is found in column A.
    # A missing B contributes nothing, so the result is "PSI I", not "PSI I nan".
    col_a = df['A'].astype(str)
    col_b = df['B']
    suffix = (' ' + col_b.astype(str)).where(col_b.notna().to_numpy(), '')
    merged = (col_a + suffix).to_numpy(dtype=object)
    psi_rows = _contains_text(df['A'], "PSI I")
    if psi_rows.any():
        # B may be a float column when it is mostly empty; make it able to
        # hold '' without depending on implicit upcasting.
        df['B'] = df['B'].astype(object)
        df.loc[psi_rows, 'A'] = merged[psi_rows]
        df.loc[psi_rows, 'B'] = ''

    # Rows to wipe: repeated header rows ("QTY" in A and "Material" in G) and
    # rows with "Requiring Bottom" in B.
    header_rows = _contains_text(df['A'], "QTY") & _contains_text(df['G'], "Material")
    bottom_rows = _contains_text(df['B'], "Requiring Bottom")
    blank_rows = header_rows | bottom_rows
    if blank_rows.any():
        # Numeric columns cannot store ''; cast explicitly before blanking.
        for name in df.columns:
            df[name] = df[name].astype(object)
        df.loc[blank_rows, :] = ''

    # Shift columns F, G, H, and I one column to the left. Each source column
    # is read before it is overwritten, so the sequential copy is safe.
    for target, source in zip('EFGH', 'FGHI'):
        df[target] = df[source]
    df['I'] = np.nan  # Set column I to NaN

    # ... other cleaning steps

    # Insert a new column between A and B
    df.insert(1, 'QTY', np.nan)

    # Insert a new column between E and F
    df.insert(6, 'New_Column2', np.nan)

    # Insert a new column between F and G
    df.insert(8, 'New_Column3', np.nan)

    return df




import os


def convert_pdf_to_excel(pdf_file, area_mm, column_positions_mm, output_file, modified_pdf_name):
    """Extract the tables of one PDF, clean them and save them as an Excel file.

    Args:
        pdf_file: PDF to read.
        area_mm: Extraction area in millimeters, forwarded to the extractor.
        column_positions_mm: Column split positions in millimeters, forwarded
            to the extractor.
        output_file: Destination passed to ``DataFrame.to_excel``.
        modified_pdf_name: Accepted for the caller's benefit; not used here.

    Returns:
        The cleaned DataFrame that was written to ``output_file``.
    """
    cleaned = clean_dataframe(
        extract_tables_tabula(pdf_file, area_mm, column_positions_mm)
    )
    # The row index is not written to the spreadsheet.
    cleaned.to_excel(output_file, index=False)
    print("Converting")
    return cleaned


def process_pdf(input_pdf, area_mm, column_positions_mm, output_excel):
    """Convert a single PDF part list to an Excel file.

    Args:
        input_pdf: Path of the PDF to convert.
        area_mm: Extraction area in millimeters.
        column_positions_mm: Column split positions in millimeters.
        output_excel: Path of the Excel file to write.
    """
    # splitext removes whatever extension is present, instead of assuming it
    # is exactly four characters long.
    pdf_name = os.path.splitext(os.path.basename(input_pdf))[0]
    modified_pdf_name = pdf_name.replace(" - Part List", "")
    convert_pdf_to_excel(input_pdf, area_mm, column_positions_mm, output_excel, modified_pdf_name)
    print("Processing")


def process_pdf_directory(input_directory, area_mm, column_positions_mm):
    """Convert every PDF directly inside a directory.

    For each PDF, the output path handed to ``process_pdf`` is the PDF's own
    path with its extension replaced by ``.xlsx``.

    Args:
        input_directory: Directory to scan (not recursive).
        area_mm: Extraction area in millimeters.
        column_positions_mm: Column split positions in millimeters.
    """
    # os.listdir returns entries in arbitrary order; sort for a stable run order.
    for filename in sorted(os.listdir(input_directory)):
        # Compare case-insensitively so "X.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        input_pdf = os.path.join(input_directory, filename)
        output_excel = os.path.splitext(input_pdf)[0] + ".xlsx"
        process_pdf(input_pdf, area_mm, column_positions_mm, output_excel)


# Extraction area as [top, left, bottom, right], in millimeters.
area_mm = [0, 0, 196.85, 279.4]

# Approximate column positions, in millimeters.
column_positions_mm = [15, 76, 90, 96, 98, 113, 169, 260]

# Convert every PDF found in this folder.
input_directory = r"C:\Users\Justi\PycharmProjects\Quality2\Test Part Lists"
process_pdf_directory(input_directory, area_mm, column_positions_mm)


Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting
Converting
Processing
Extracting