In [51]:
import pdfplumber
import pandas as pd

# Path to the PDF file
pdf_path = 'pdfs/Bedford.pdf'

# Open the PDF file
with pdfplumber.open(pdf_path) as pdf:
    # Initialize an empty list to store the extracted data
    data = []
    # Iterate through each page
    for page_num, page in enumerate(pdf.pages):
        # Extract the text with bounding boxes
        for element in page.extract_words():
            text = element['text']
            x0, y0, x1, y1 = element['x0'], element['top'], element['x1'], element['bottom']
            data.append([page_num + 1, text, x0, y0, x1, y1])

    # Convert the list to a DataFrame
    df = pd.DataFrame(data, columns=['page', 'text', 'x0', 'y0', 'x1', 'y1'])
    # Save the DataFrame to a CSV file
    df.to_csv('extracted_text.csv', index=False)


In [52]:
def find_interval(number, intervals):
    intervals = sorted(intervals)
    for interval in intervals:
        if number >= interval[0] and number < interval[1]:
            return interval
    return None


gridlines = {"Bedford":{
    (52, 121): "plate",
    (121, 220): "vrm",
    (220, 312): "make",
    (312, 419): "model",
    (419, 482): "license_start",
    (482, 538): "license_end"
}
}

council = "Bedford"
first_value = "PHV3429"
unique_identifier = "plate"

df["interval"] = df['x0'].apply(lambda x: find_interval(x, list(gridlines[council].keys())))
df["value"] = df["interval"].apply(lambda x: gridlines[council].get(x, None) if x else None)

In [53]:
df

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
0,1,Public,52.919998,56.689079,86.102200,67.729529,"(52, 121)",plate
1,1,Register,89.103348,56.689079,133.287315,67.729529,"(52, 121)",plate
2,1,of,136.490814,56.689079,146.791554,67.729529,"(121, 220)",vrm
3,1,Vehicles,149.936228,56.689079,194.737856,67.729529,"(121, 220)",vrm
4,1,Licensed,197.806929,56.689079,245.622859,67.729529,"(121, 220)",vrm
...,...,...,...,...,...,...,...,...
4434,12,ATV,150.793156,774.169055,172.355069,785.209505,"(121, 220)",vrm
4435,12,Peugeot,220.535593,774.169055,261.607663,785.209505,"(220, 312)",make
4436,12,Horizon,312.316449,774.169055,350.188902,785.209505,"(312, 419)",model
4437,12,10/02/2024,419.015067,774.169055,474.269544,785.209505,"(419, 482)",license_start


In [54]:
df = df[24:]

Concatenate the Identifier value first (in this case plate)

In [55]:
def process_consecutive_values(df, target_value):
    processed_rows = []
    current_row = None

    for _, row in df.iterrows():
        if row['value'] == target_value:
            if current_row is None:
                current_row = row.copy()
            else:
                # Concatenate the text field
                current_row['text'] += row['text']
                # Update the bounding box
                current_row['x1'] = max(current_row['x1'], row['x1'])
                current_row['y1'] = max(current_row['y1'], row['y1'])
        else:
            if current_row is not None:
                processed_rows.append(current_row)
                current_row = None
            processed_rows.append(row)

    if current_row is not None:
        processed_rows.append(current_row)

    return pd.DataFrame(processed_rows)

df = process_consecutive_values(df, target_value='plate')


In [56]:
df.tail(50)

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
4388,12,20/12/2024,482.651186,677.569064,537.836228,688.609514,"(482, 538)",license_end
4389,12,TXV40,52.919998,691.369082,86.771528,702.409532,"(52, 121)",plate
4390,12,LD09,121.482702,691.369082,147.738747,702.409532,"(121, 220)",vrm
4391,12,YLR,150.874235,691.369082,172.204384,702.409532,"(121, 220)",vrm
4392,12,Mercedes,220.539474,691.369082,268.811686,702.409532,"(220, 312)",make
4393,12,Vito,312.311059,691.369082,331.333668,702.409532,"(312, 419)",model
4394,12,22/12/2023,419.039002,691.369082,474.222535,702.409532,"(419, 482)",license_start
4395,12,21/12/2024,482.657439,691.369082,537.842482,702.409532,"(482, 538)",license_end
4396,12,TXV2,52.919998,705.169101,80.631356,716.209551,"(52, 121)",plate
4397,12,B166,121.414778,705.169101,147.129582,716.209551,"(121, 220)",vrm


In [57]:
df.to_csv("test.csv", index=False)

In [58]:
unique_identifier = "plate"

In [59]:
df_identifier = df[df["value"] == unique_identifier].sort_values(by=["page","y1"]).reset_index(drop=True)

In [60]:
for index in df_identifier.index:
    if index == 0:
        df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index, 'y0'] 
        df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']
    elif index == df_identifier.index.max():
        df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index-1, 'y1'] 
        df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']
    else:
        df_identifier.loc[index, "identifier_range_y0"] = df_identifier.loc[index-1, 'y1']
        df_identifier.loc[index, "identifier_range_y1"] = df_identifier.loc[index, 'y1']


In [61]:
df_identifier = df_identifier[["page", "text", "identifier_range_y0", "identifier_range_y1"]]

In [62]:
df_identifier.to_csv("df_identifier.csv", index=False)

In [63]:
df_identifier

Unnamed: 0,page,text,identifier_range_y0,identifier_range_y1
0,1,PHV3429,114.649078,125.689528
1,1,PHV3420,125.689528,139.489535
2,1,PHV3541,139.489535,153.289530
3,1,PHV3722,153.289530,167.089526
4,1,PHV3720,167.089526,180.889532
...,...,...,...,...
621,12,TXV47,716.209551,730.009523
622,12,TXV60,730.009523,743.809542
623,12,TXV5,743.809542,757.609514
624,12,TXV34,757.609514,771.409487


In [None]:
df.head(50)

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
24,1,PHV3429,52.919998,114.649078,99.957534,125.689528,"(52, 121)",plate
25,1,SF64,121.420169,114.649078,147.729475,125.689528,"(121, 220)",vrm
26,1,FVE,150.798547,114.649078,172.182131,125.689528,"(121, 220)",vrm
27,1,Peugeot,220.583464,114.649078,261.646261,125.689528,"(220, 312)",make
28,1,Independence,312.355048,114.649078,382.206854,125.689528,"(312, 419)",model
29,1,20/02/2023,419.083639,114.649078,474.270191,125.689528,"(419, 482)",license_start
30,1,19/02/2024,482.59469,114.649078,537.849166,125.689528,"(482, 538)",license_end
31,1,PHV3420,52.919998,128.449085,99.957534,139.489535,"(52, 121)",plate
32,1,VU15,121.420169,128.449085,148.890663,139.489535,"(121, 220)",vrm
33,1,JJL,152.094075,128.449085,169.273015,139.489535,"(121, 220)",vrm


In [66]:
df['plate'] = df.apply(lambda row: df_identifier[(df_identifier['identifier_range_y0'] <= row['y0']) & (df_identifier['identifier_range_y1'] >= row['y1']) & (df_identifier['page'] == row['page'])]['text'].iloc[0] if not df_identifier[(df_identifier['identifier_range_y0'] <= row['y0']) & (df_identifier['identifier_range_y1'] >= row['y1']) & (df_identifier['page'] == row['page'])].empty else None, axis=1)


In [71]:
df[-200:-150]

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value,plate
4238,12,EYK,150.793156,387.769092,172.796687,398.809542,"(121, 220)",vrm,TXV32
4239,12,Peugeot,220.535593,387.769092,261.607663,398.809542,"(220, 312)",make,TXV32
4240,12,Partner,312.316449,387.769092,348.530549,398.809542,"(312, 419)",model,TXV32
4241,12,13/10/2023,419.045903,387.769092,474.230945,398.809542,"(419, 482)",license_start,TXV32
4242,12,12/10/2024,482.665849,387.769092,537.852401,398.809542,"(482, 538)",license_end,TXV32
4243,12,TXV31,52.919998,401.569064,86.771528,412.609514,"(52, 121)",plate,TXV31
4244,12,SF15,121.482702,401.569064,147.724084,412.609514,"(121, 220)",vrm,TXV31
4245,12,FOM,150.793156,401.569064,175.321414,412.609514,"(121, 220)",vrm,TXV31
4246,12,Peugeot,220.554137,401.569064,261.616935,412.609514,"(220, 312)",make,TXV31
4247,12,Premier,312.325722,401.569064,350.991146,412.609514,"(312, 419)",model,TXV31


In [None]:
df.to_csv("test.csv")

In [None]:
df_reduced = df[24:][["text", "value"]].reset_index(drop=True)

In [None]:
df_reduced.to_csv("df_reduced.csv", index=False)

In [None]:
df_reduced

Unnamed: 0,text,value
0,Skoda,make
1,Fabia,model
2,21/02/2023,license_start
3,20/02/2024,license_end
4,PHV3720,plate
...,...,...
4381,ATV,vrm
4382,Peugeot,make
4383,Horizon,model
4384,10/02/2024,license_start


In [None]:
new_value_column = []
new_text_column = []

# Iterate over the dataframe to concatenate values
current_value = None
current_text = ""

for value, text in zip(df_reduced['value'], df_reduced['text']):
    if value == current_value:
        current_text += text + " "
    else:
        if current_value is not None:
            new_value_column.append(current_value)
            new_text_column.append(current_text.strip())
        current_value = value
        current_text = text

# Append the last accumulated values
if current_value is not None:
    new_value_column.append(current_value)
    new_text_column.append(current_text.strip())

# Create a new DataFrame with the concatenated values
new_df = pd.DataFrame({
    'value': new_value_column,
    'text': new_text_column
})

In [None]:
new_df.to_csv("inspection.csv")