In [1]:
# %%
import pdfplumber
import pandas as pd

council = "BCP"
config = {
    "Bedford": {
        "gridlines": {
            (52, 121): "plate",
            (121, 220): "vrm",
            (220, 312): "make",
            (312, 418): "model",
            (418, 482): "licence_start",
            (482, 538): "licence_end"
        },
        "unique_identifier": "plate"
    },
    "BCP": {
        "gridlines": {
            (42, 76): "reference_number",
            (76, 212): "name",
            (212, 268): "vrm",
            (268, 311): "licence_number",
            (311, 364): "licence_start",
            (364, 412): "licence_end",
            (412, 528): "make",
            (528, 590): "licence_type",
        },
        "unique_identifier": "reference_number"
    }
}

gridlines = config[council]["gridlines"]
unique_identifier = config[council]["unique_identifier"]

# Path to the PDF file
pdf_path = f'pdfs/{council}.pdf'
def extract_pdf_text(pdf_path):
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        # Initialize an empty list to store the extracted data
        data = []
        # Iterate through each page
        for page_num, page in enumerate(pdf.pages):
            # Extract the text with bounding boxes
            for element in page.extract_words():
                text = element['text']
                x0, y0, x1, y1 = element['x0'], element['top'], element['x1'], element['bottom']
                data.append([page_num + 1, text, x0, y0, x1, y1])

        # Convert the list to a DataFrame
        df = pd.DataFrame(data, columns=['page', 'text', 'x0', 'y0', 'x1', 'y1'])
        return df
    
df = extract_pdf_text(pdf_path)

# Save the DataFrame to a CSV file
df.to_csv('extracted_text.csv', index=False)



def find_interval(number, intervals):
    intervals = sorted(intervals)
    for interval in intervals:
        if number >= interval[0] and number < interval[1]:
            return interval
    return None


df["interval"] = df['x0'].apply(lambda x: find_interval(x, list(gridlines.keys())))
df["value"] = df["interval"].apply(lambda x: gridlines.get(x, None) if x else None)


def process_consecutive_values(df, target_value):
    processed_rows = []
    current_row = None

    for _, row in df.iterrows():
        if row['value'] == target_value:
            if current_row is None:
                current_row = row.copy()
            else:
                # Concatenate the text field
                current_row['text'] += row['text']
                # Update the bounding box
                current_row['x1'] = max(current_row['x1'], row['x1'])
                current_row['y1'] = max(current_row['y1'], row['y1'])
        else:
            if current_row is not None:
                processed_rows.append(current_row)
                current_row = None
            processed_rows.append(row)

    if current_row is not None:
        processed_rows.append(current_row)

    return pd.DataFrame(processed_rows)

df = process_consecutive_values(df, target_value=unique_identifier)


In [2]:
df

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
0,1,BCP,200.78000,32.7900,230.04584,51.1500,"(76, 212)",name
1,1,Council,234.25028,32.7900,289.20176,51.1500,"(212, 268)",vrm
2,1,-,293.35112,32.7900,298.96928,51.1500,"(268, 311)",licence_number
3,1,Taxi,303.15536,32.7900,333.08216,51.1500,"(268, 311)",licence_number
4,1,Vehicle,337.23152,32.7900,391.76072,51.1500,"(311, 364)",licence_start
...,...,...,...,...,...,...,...,...
11867,16,Zone,550.87588,409.2704,571.96156,418.5104,"(528, 590)",licence_type
11868,16,P,574.53028,409.2704,580.69336,418.5104,"(528, 590)",licence_type
11869,16,(Medium,412.03000,414.9304,447.89968,424.1704,"(412, 528)",make
11870,16,Long),450.52384,414.9304,474.22444,424.1704,"(412, 528)",make


In [3]:
def update_identifier_ranges(df, unique_identifier):
    df_identifier = df[df["value"] == unique_identifier].sort_values(by=["page","y1"]).reset_index(drop=True)
    for index in df_identifier.index:
        if index == 0:
            df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index, 'y0'] 
            df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']
        elif index == df_identifier.index.max():
            df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index-1, 'y1'] 
            df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']
        else:
            df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index-1, 'y1']
            df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']

    # df_identifier = df_identifier[["page", "text", "identifier_range_y0", "identifier_range_y1"]]
    return df_identifier

df_identifier = update_identifier_ranges(df, unique_identifier)

In [4]:
df_identifier

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value,identifier_range_y0,identifier_range_y1
0,1,Ref,46.20,60.4100,58.63704,69.6500,"(42, 76)",reference_number,60.4100,69.6500
1,1,168783,42.36,77.6804,73.29764,86.9204,"(42, 76)",reference_number,69.6500,86.9204
2,1,144672,42.36,89.3204,73.29764,98.5604,"(42, 76)",reference_number,86.9204,98.5604
3,1,151221,42.36,100.9604,73.29764,110.2004,"(42, 76)",reference_number,98.5604,110.2004
4,1,159161,42.36,112.6004,73.29764,121.8404,"(42, 76)",reference_number,110.2004,121.8404
...,...,...,...,...,...,...,...,...,...,...
908,16,176289,42.36,356.9504,73.29764,366.1904,"(42, 76)",reference_number,354.5504,366.1904
909,16,171996,42.36,368.5904,73.29764,377.8304,"(42, 76)",reference_number,366.1904,377.8304
910,16,172299,42.36,380.2304,73.29764,389.4704,"(42, 76)",reference_number,377.8304,389.4704
911,16,175153,42.36,391.8704,73.29764,401.1104,"(42, 76)",reference_number,389.4704,401.1104


In [5]:


# %%
df_reduced = df[["text", "value"]].reset_index(drop=True)

# %%
df_reduced.to_csv("df_reduced.csv", index=False)

# %%
def concatenate_values(df):
    new_value_column = []
    new_text_column = []

    # Iterate over the dataframe to concatenate values
    current_value = None
    current_text = ""

    for value, text in zip(df['value'], df['text']):
        if value == current_value:
            current_text += text + " "
        else:
            if current_value is not None:
                new_value_column.append(current_value)
                new_text_column.append(current_text.strip())
            current_value = value
            current_text = text + " "  # Added space at the end of each text

    # Append the last accumulated values
    if current_value is not None:
        new_value_column.append(current_value)
        new_text_column.append(current_text.strip())

    # Create a new DataFrame with the concatenated values
    new_df = pd.DataFrame({
        'value': new_value_column,
        'text': new_text_column
    })
    return new_df

new_df = concatenate_values(df_reduced)


# %%
def transform_df(new_df, unique_identifier):
    new_df[unique_identifier] = new_df.apply(lambda x: x.text if x.value == unique_identifier else None, axis=1).ffill()
    new_df = new_df.pivot_table(index=unique_identifier, columns='value', values='text', aggfunc='first')
    return new_df

new_df = transform_df(new_df, unique_identifier)


# %%
new_df.to_csv("inspection.csv")

# %%

In [6]:
def find_interval(number, intervals):
    intervals = sorted(intervals)
    for interval in intervals:
        if number >= interval[0] and number < interval[1]:
            return interval
    return None

def apply_intervals(df, council):
    df["interval"] = df['x0'].apply(lambda x: find_interval(x, list(gridlines[council].keys())))
    df["value"] = df["interval"].apply(lambda x: gridlines[council].get(x, None) if x else None)
    return df



In [7]:
def process_consecutive_values(df, target_value):
    processed_rows = []
    current_row = None

    for _, row in df.iterrows():
        if row['value'] == target_value:
            if current_row is None:
                current_row = row.copy()
            else:
                # Concatenate the text field
                current_row['text'] += row['text']
                # Update the bounding box
                current_row['x1'] = max(current_row['x1'], row['x1'])
                current_row['y1'] = max(current_row['y1'], row['y1'])
        else:
            if current_row is not None:
                processed_rows.append(current_row)
                current_row = None
            processed_rows.append(row)

    if current_row is not None:
        processed_rows.append(current_row)

    return pd.DataFrame(processed_rows)

In [8]:
def update_identifier_ranges(df, unique_identifier):
    df_identifier = df[df["value"] == unique_identifier].sort_values(by=["page","y1"]).reset_index(drop=True)
    for index in df_identifier.index:
        if index == 0:
            df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index, 'y0'] 
            df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']
        elif index == df_identifier.index.max():
            df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index-1, 'y1'] 
            df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']
        else:
            df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index-1, 'y1']
            df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']

    df_identifier = df_identifier[["page", "text", "identifier_range_y0", "identifier_range_y1"]]
    return df_identifier

In [9]:
def concatenate_values(df):
    new_value_column = []
    new_text_column = []

    # Iterate over the dataframe to concatenate values
    current_value = None
    current_text = ""

    for value, text in zip(df['value'], df['text']):
        if value == current_value:
            current_text += text + " "
        else:
            if current_value is not None:
                new_value_column.append(current_value)
                new_text_column.append(current_text.strip())
            current_value = value
            current_text = text + " "  # Added space at the end of each text

    # Append the last accumulated values
    if current_value is not None:
        new_value_column.append(current_value)
        new_text_column.append(current_text.strip())

    # Create a new DataFrame with the concatenated values
    new_df = pd.DataFrame({
        'value': new_value_column,
        'text': new_text_column
    })
    return new_df

In [10]:
def transform_df(new_df, unique_identifier):
    new_df[unique_identifier] = new_df.apply(lambda x: x.text if x.value == unique_identifier else None, axis=1).ffill()
    new_df = new_df.pivot_table(index=unique_identifier, columns='value', values='text', aggfunc='first')
    return new_df

In [11]:
def main(council, unique_identifier):
    pdf_path = f'pdfs/{council}.pdf'
    df = extract_pdf_text(pdf_path)
    df = apply_intervals(df, council)
    df = process_consecutive_values(df, target_value=unique_identifier)
    df_identifier = update_identifier_ranges(df, unique_identifier)
    new_df = concatenate_values(df[["text", "value"]].reset_index(drop=True))
    new_df = transform_df(new_df, unique_identifier)
    return new_df

new_df = main(council, unique_identifier)

KeyError: 'BCP'

In [None]:
new_df.to_csv("inspection.csv")