In [1]:
import pdfplumber
import pandas as pd

# Location of the PDF to parse
pdf_path = 'pdfs/BCP.pdf'

# Collect one record per extracted word:
# [1-based page number, word text, x0, top, x1, bottom]
with pdfplumber.open(pdf_path) as pdf:
    data = [
        [page_number, word['text'], word['x0'], word['top'], word['x1'], word['bottom']]
        for page_number, page in enumerate(pdf.pages, start=1)
        for word in page.extract_words()
    ]

# Build the word table; `top`/`bottom` are stored under the names y0/y1
df = pd.DataFrame(data, columns=['page', 'text', 'x0', 'y0', 'x1', 'y1'])
# Keep a copy of the raw extraction on disk
df.to_csv('extracted_text.csv', index=False)


In [2]:
def find_interval(number, intervals):
    """Return the interval that contains ``number``, or ``None`` if there is none.

    Each interval is a ``(start, end)`` pair treated as half-open: ``start`` is
    included and ``end`` is excluded. Intervals are examined in sorted order and
    the first match is returned.
    """
    return next(
        (
            candidate
            for candidate in sorted(intervals)
            if candidate[0] <= number < candidate[1]
        ),
        None,
    )


# Column layout per council. Each key is an (x_start, x_end) range and each value is
# the column label given to words whose x0 falls inside that range
# (x_start inclusive, x_end exclusive, as implemented by find_interval).
gridlines = {
    "Bedford": {
        (52, 121): "plate",
        (121, 220): "vrm",
        (220, 312): "make",
        (312, 419): "model",
        (419, 482): "licence_start",
        (482, 538): "licence_end",
    },
    "BCP": {
        (42, 76): "reference_number",
        (76, 212): "name",
        (212, 268): "vrm",
        (268, 311): "licence_number",
        (311, 364): "licence_start",
        (364, 412): "licence_end",
        (412, 528): "make",
        (528, 590): "licence_type",
    },
}

# Run configuration
has_header = True                       # not referenced anywhere else in the visible notebook
council = "BCP"                         # which layout in `gridlines` to apply
first_value = "Ref"                     # text of the word where the table starts
unique_identifier = "reference_number"  # column label that identifies one record

# Label every word: first the x-range its left edge falls in, then that range's column name.
# Words outside every range get None for both.
column_labels = gridlines[council]
column_ranges = list(column_labels)

df["interval"] = df["x0"].apply(lambda left_edge: find_interval(left_edge, column_ranges))
df["value"] = df["interval"].apply(
    lambda x_range: column_labels.get(x_range, None) if x_range else None
)

In [3]:
# Index label of the first word whose text equals `first_value`
# (raises IndexError when no such word exists).
header_matches = df.index[df["text"] == first_value]
starting_index = header_matches[0]

In [4]:
# Drop every word that precedes the header word. `starting_index` is an index *label*,
# so slice by label with .loc: a plain df[n:] slice is positional, which only lines up
# with the label on a fresh 0..n-1 index and would drop further rows on every re-run
# of this cell. With .loc the cell can be re-run without changing the result.
df = df.loc[starting_index:]

In [5]:
# Spot-check a window of 30 rows (positional slice) to see which column label each word received.
df[2140:2170]

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
2148,3,Zone,550.87588,589.8904,571.96156,599.1304,"(528, 590)",licence_type
2149,3,B,574.53028,589.8904,580.69336,599.1304,"(528, 590)",licence_type
2150,3,120233,42.36,601.5604,73.29764,610.8004,"(42, 76)",reference_number
2151,3,Mr,76.92,601.5604,87.55524,610.8004,"(76, 212)",name
2152,3,Abate,90.11472,601.5604,114.2034,610.8004,"(76, 212)",name
2153,3,HF53,212.66,601.5604,235.28876,610.8004,"(212, 268)",vrm
2154,3,RUO,237.839,601.5604,258.4442,610.8004,"(212, 268)",vrm
2155,3,120233,271.13,601.5604,302.06764,610.8004,"(268, 311)",licence_number
2156,3,05-May-09,311.21,601.5604,355.18316,610.8004,"(311, 364)",licence_start
2157,3,31-May-24,364.39,601.5604,408.36316,610.8004,"(364, 412)",licence_end


In [6]:
# Show every word with this exact text; in the output below the same number appears once
# in the reference_number column and once in the licence_number column.
df[df['text'] == "120233"]

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
2150,3,120233,42.36,601.5604,73.29764,610.8004,"(42, 76)",reference_number
2155,3,120233,271.13,601.5604,302.06764,610.8004,"(268, 311)",licence_number


Concatenate the identifier value first (in this case `reference_number`, as set by `unique_identifier`; `plate` applies to the Bedford layout)

In [7]:
def process_consecutive_values(df, target_value):
    """Merge each run of consecutive rows labelled ``target_value`` into one row.

    Rows are visited in their existing order. A run is a sequence of adjacent
    rows whose ``value`` equals ``target_value`` and which lie on the same page
    (the page check is skipped when ``df`` has no ``page`` column). The merged
    row is a copy of the run's first row with:

    * ``text``: the run's texts concatenated with no separator;
    * ``x1`` / ``y1``: the maximum over the run.

    All other fields, and the index label, come from the run's first row.
    Rows with any other ``value`` pass through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``value``, ``text``, ``x1`` and ``y1``.
    target_value : object
        Label in ``value`` whose consecutive rows are merged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``, also when it has no rows.
    """
    processed_rows = []
    current_row = None  # merged run being accumulated, if any
    has_page = 'page' in df.columns

    for _, row in df.iterrows():
        is_target = row['value'] == target_value

        if is_target and current_row is not None:
            # A run never continues across a page break: identifier words on
            # different pages belong to different records.
            same_page = (not has_page) or row['page'] == current_row['page']
            if same_page:
                # Concatenate the text and grow the bounding box
                current_row['text'] += row['text']
                current_row['x1'] = max(current_row['x1'], row['x1'])
                current_row['y1'] = max(current_row['y1'], row['y1'])
                continue

        # The current run (if any) ends here
        if current_row is not None:
            processed_rows.append(current_row)
            current_row = None

        if is_target:
            current_row = row.copy()
        else:
            processed_rows.append(row)

    if current_row is not None:
        processed_rows.append(current_row)

    # Passing the columns keeps the schema intact even when there are no rows
    return pd.DataFrame(processed_rows, columns=df.columns)

# Collapse consecutive words labelled as the unique identifier into a single row each.
df = process_consecutive_values(df, target_value=unique_identifier)


In [8]:
# Inspect the last 50 rows after the identifier words were merged.
df.tail(50)

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
11822,16,HCV83,268.61,368.5904,298.41824,377.8304,"(268, 311)",licence_number
11823,16,10-Feb-23,312.41,368.5904,355.17272,377.8304,"(311, 364)",licence_start
11824,16,31-Mar-25,365.71,368.5904,408.3526,377.8304,"(364, 412)",licence_end
11825,16,Toyota,412.03,368.5904,440.11036,377.8304,"(412, 528)",make
11826,16,Corolla,442.62364,368.5904,471.97912,377.8304,"(412, 528)",make
11827,16,Estate,474.54784,368.5904,500.67856,377.8304,"(412, 528)",make
11828,16,HCV,528.82,368.5904,548.39956,377.8304,"(528, 590)",licence_type
11829,16,Zone,550.87588,368.5904,571.96156,377.8304,"(528, 590)",licence_type
11830,16,B,574.53028,368.5904,580.69336,377.8304,"(528, 590)",licence_type
11831,16,172299,42.36,380.2304,73.29764,389.4704,"(42, 76)",reference_number


In [9]:
# Dump the labelled words for manual inspection (index not written).
# Note: test.csv is written again further down, after the `identifier` column is added.
df.to_csv("test.csv", index=False)

In [10]:
# One row per identifier word, ordered top-to-bottom within each page and renumbered from 0.
identifier_words = df.loc[df["value"] == unique_identifier]
df_identifier = identifier_words.sort_values(by=["page", "y1"]).reset_index(drop=True)

In [11]:
# Give every identifier a vertical band [identifier_range_y0, identifier_range_y1] on its page:
# the band starts where the previous identifier on the SAME page ends (its y1) and stops at
# this identifier's own bottom edge (y1). The first identifier on a page has no predecessor,
# so its band starts at its own top edge (y0); taking the bound from the previous page's last
# identifier would give an inverted band that no word can fall into.
# Relies on df_identifier being sorted by page, then y1.
previous_y1_on_page = df_identifier.groupby("page")["y1"].shift()
df_identifier["identifier_range_y0"] = previous_y1_on_page.fillna(df_identifier["y0"])
df_identifier["identifier_range_y1"] = df_identifier["y1"]


In [12]:
# Keep only the page, the identifier text and its vertical band; the word's own box is dropped.
df_identifier = df_identifier[["page", "text", "identifier_range_y0", "identifier_range_y1"]]

In [13]:
# Save the identifier bands for inspection (index not written).
df_identifier.to_csv("df_identifier.csv", index=False)

In [14]:
# Display the identifier bands (one row per identifier word).
df_identifier

Unnamed: 0,page,text,identifier_range_y0,identifier_range_y1
0,1,Ref,60.4100,69.6500
1,1,168783,69.6500,86.9204
2,1,144672,86.9204,98.5604
3,1,151221,98.5604,110.2004
4,1,159161,110.2004,121.8404
...,...,...,...,...
908,16,176289,354.5504,366.1904
909,16,171996,366.1904,377.8304
910,16,172299,377.8304,389.4704
911,16,175153,389.4704,401.1104


In [15]:
# Inspect the first 50 rows; per the output below they begin with the table header words.
df.head(50)

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value
8,1,Ref,46.2,60.41,58.63704,69.65,"(42, 76)",reference_number
9,1,Name,131.9,60.41,154.36244,69.65,"(76, 212)",name
10,1,Vehicle,217.25,60.41,244.739,69.65,"(212, 268)",vrm
11,1,Reg,246.74408,60.41,260.71496,69.65,"(212, 268)",vrm
12,1,Lic,272.81,60.41,282.71528,69.65,"(268, 311)",licence_number
13,1,No.,284.80352,60.41,298.02596,69.65,"(268, 311)",licence_number
14,1,Licence,306.17,60.41,334.03784,69.65,"(268, 311)",licence_number
15,1,Issue,336.04292,60.41,354.92948,69.65,"(311, 364)",licence_start
16,1,Make,442.42,60.41,463.5796,69.65,"(412, 528)",make
17,1,/,465.57544,60.41,469.14208,69.65,"(412, 528)",make


In [16]:
def _identifier_for_word(word_row):
    """Return the text of the first identifier whose band on the word's page
    fully contains the word vertically, or None when no band does."""
    containing_bands = df_identifier[
        (df_identifier['identifier_range_y0'] <= word_row['y0'])
        & (df_identifier['identifier_range_y1'] >= word_row['y1'])
        & (df_identifier['page'] == word_row['page'])
    ]
    if containing_bands.empty:
        return None
    return containing_bands['text'].iloc[0]


# Attach to every word the identifier of the band it sits in.
df["identifier"] = df.apply(_identifier_for_word, axis=1)


In [17]:
# Spot-check 50 rows near the end of the frame to review the assigned `identifier` values.
df[-200:-150]

Unnamed: 0,page,text,x0,y0,x1,y1,interval,value,identifier
11672,16,Class,463.82944,217.3604,487.03108,226.6004,"(412, 528)",make,176069.0
11673,16,BCP,528.82,217.3604,547.8082,226.6004,"(528, 590)",licence_type,176069.0
11674,16,PHV,550.28452,217.3604,569.27272,226.6004,"(528, 590)",licence_type,176069.0
11675,16,Volkswagen,412.03,229.0004,461.87056,238.2404,"(412, 528)",make,175967.0
11676,16,Caddy,464.43004,229.0004,491.20756,238.2404,"(412, 528)",make,175967.0
11677,16,Maxi,493.573,229.0004,512.78296,238.2404,"(412, 528)",make,175967.0
11678,16,175967,42.36,234.7904,73.29764,244.0304,"(42, 76)",reference_number,175967.0
11679,16,Mr,76.92,234.7904,87.55524,244.0304,"(76, 212)",name,175967.0
11680,16,Inocencio,90.11472,234.7904,129.78204,244.0304,"(76, 212)",name,175967.0
11681,16,HG72,212.66,234.7904,236.85956,244.0304,"(212, 268)",vrm,175967.0


In [18]:
# Overwrites the test.csv written earlier in the notebook, now including the `identifier`
# column. Unlike the earlier export, this call also writes the index as the first column.
df.to_csv("test.csv")

In [19]:
# Keep only each word's text and column label, renumbered from 0. The coordinates, the page
# and the `identifier` column are not carried over, so later steps rely on row order alone.
df_reduced = df[["text", "value"]].reset_index(drop=True)

In [20]:
# Save the reduced (text, value) table for inspection (index not written).
df_reduced.to_csv("df_reduced.csv", index=False)

In [21]:
# Display the reduced (text, value) table.
df_reduced

Unnamed: 0,text,value
0,Ref,reference_number
1,Name,name
2,Vehicle,vrm
3,Reg,vrm
4,Lic,licence_number
...,...,...
11859,Zone,licence_type
11860,P,licence_type
11861,(Medium,make
11862,Long),make


In [22]:
# Collapse every run of consecutive words that share a column label into one
# space-separated string. Runs whose label is None are discarded.
new_value_column = []
new_text_column = []

run_label = None  # label of the run currently being collected
run_words = []    # words collected for that run, in order

for label, word in zip(df_reduced['value'], df_reduced['text']):
    if label == run_label:
        run_words.append(word)
        continue

    # The label changed: emit the finished run, then start a new one
    if run_label is not None:
        new_value_column.append(run_label)
        new_text_column.append(" ".join(run_words).strip())
    run_label = label
    run_words = [word]

# Emit the run that was still open when the input ended
if run_label is not None:
    new_value_column.append(run_label)
    new_text_column.append(" ".join(run_words).strip())

# One row per run: its column label and the joined text
new_df = pd.DataFrame({
    'value': new_value_column,
    'text': new_text_column
})

In [23]:
# Tag every chunk with the identifier of its record: identifier chunks carry their own
# text, every other chunk inherits the most recent identifier above it (forward fill).
# NOTE(review): this is purely order-based. A chunk that comes before its record's
# identifier in new_df (e.g. the upper line of a wrapped cell) is attributed to the
# previous record. The geometric `identifier` column computed on df is not used here.
identifier_or_none = [
    chunk_text if chunk_label == unique_identifier else None
    for chunk_label, chunk_text in zip(new_df["value"], new_df["text"])
]
new_df[unique_identifier] = pd.Series(identifier_or_none, index=new_df.index).ffill()


In [24]:
# Inspect the last 50 chunks together with the identifier each one was assigned.
new_df.tail(50)

Unnamed: 0,value,text,reference_number
7362,licence_type,BCP PHV,159221
7363,reference_number,176044,176044
7364,name,Mr Harvey,176044
7365,vrm,BP19 VCG,176044
7366,licence_number,176044,176044
7367,licence_start,31-Mar-21,176044
7368,licence_end,31-Mar-25,176044
7369,make,Volvo V90,176044
7370,licence_type,BCP PHV,176044
7371,reference_number,176289,176289


In [25]:
# Reshape to one row per reference_number with one column per field label.
# aggfunc='first' keeps only the first chunk for each (reference_number, field) pair;
# any later chunk with the same pair is discarded. The result is displayed, not assigned.
new_df.pivot_table(index='reference_number', columns='value', values='text', aggfunc='first')

value,licence_end,licence_number,licence_start,licence_type,make,name,reference_number,vrm
reference_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
120233,31-May-24,120233,05-May-09,BCP PHV,Lincoln Town Car,Mr Abate,120233,HF53 RUO
120744,31-Jul-24,HCV234,09-Jul-09,HCV Zone B,Citroen Dispatch,Mr Almoshabuk,120744,BX09 MSV
121429,31-Oct-24,HCV1,16-Oct-09,HCV Zone B,Peugeot 407,Ms Finet,121429,HJ09 HPY
124746,30-Nov-24,124746,01-Nov-10,BCP PHV,Skoda Octavia,Mr Fowler,124746,SG60 YVP
130601,31-Aug-24,HCV235,24-Aug-12,HCV Zone B,Peugeot Partner,Mr Khayal,130601,SA58 YZT
...,...,...,...,...,...,...,...,...
176228,31-May-24,176228,28-Feb-24,BCP PHV,Toyota Prius Plus,Mr Szymanski,176228,NA20 ECW
176289,31-Mar-25,H65,24-Mar-22,HCV Zone P,Hyundai Ioniq,Mr Ferencz,176289,OV69 NHL
176307,30-Apr-24,64,07-Apr-20,HCV Zone C,Ford Tourneo Custom,Mr Barnicoat,176307,ML19 KCX
176324,31-May-24,HCV225,01-May-19,HCV Zone B,Life,Mr Elmi-Aliabadi,176324,YX68 YNF


In [26]:
# Save the long-format chunks (value, text, identifier) for inspection; the index is written too.
new_df.to_csv("inspection.csv")