In [364]:
import pdfplumber
import pandas as pd


def extract_pdf_text(pdf_path):
    """Read every word of a PDF into a flat table of text plus bounding box.

    Args:
        pdf_path: Path of the PDF handed to ``pdfplumber.open``.

    Returns:
        A DataFrame with one row per word returned by ``page.extract_words()``
        and the columns ``page`` (1-based page number), ``text``, ``x0``,
        ``y0``, ``x1`` and ``y1``. ``y0``/``y1`` hold the word's ``top`` and
        ``bottom`` values respectively.
    """
    column_names = ['page', 'text', 'x0', 'y0', 'x1', 'y1']

    with pdfplumber.open(pdf_path) as pdf:
        # Pages are numbered from 1 so the table matches what a PDF viewer shows.
        records = [
            [page_number, word['text'], word['x0'], word['top'], word['x1'], word['bottom']]
            for page_number, page in enumerate(pdf.pages, start=1)
            for word in page.extract_words()
        ]

    return pd.DataFrame(records, columns=column_names)


def find_interval(number, intervals):
    """Return the first interval (in sorted order) that contains ``number``.

    Each interval is indexable, with its lower bound at position 0 and its
    upper bound at position 1. The lower bound is inclusive and the upper
    bound exclusive.

    Args:
        number: Value to locate.
        intervals: Iterable of intervals; it is sorted into a new list, so the
            caller's collection is left untouched.

    Returns:
        The matching interval object, or ``None`` when nothing contains
        ``number``.
    """
    candidates = (
        bounds for bounds in sorted(intervals)
        if bounds[0] <= number < bounds[1]
    )
    return next(candidates, None)


def assign_intervals_and_values(df, gridlines):
    """Tag each row with the gridline interval its ``x0`` falls in and that interval's label.

    Args:
        df: DataFrame with an ``x0`` column. It is modified in place: the
            columns ``interval`` and ``value`` are added (or overwritten).
        gridlines: Mapping from interval to label. The keys are passed to
            ``find_interval`` as the candidate intervals.

    Returns:
        The same ``df`` object. ``interval`` is the matching key (or ``None``)
        and ``value`` is the label stored under that key (or ``None``).
    """
    boundaries = list(gridlines.keys())

    def locate(x_start):
        return find_interval(x_start, boundaries)

    def label_for(interval):
        # A falsy interval means the lookup found no column.
        if not interval:
            return None
        return gridlines.get(interval, None)

    df["interval"] = df['x0'].apply(locate)
    df["value"] = df["interval"].apply(label_for)
    return df


def process_consecutive_values(df, target_value):
    """Merge runs of adjacent rows whose ``value`` equals ``target_value``.

    Rows are visited in their existing order. Every unbroken run of rows
    labelled ``target_value`` collapses into a single row:

    * ``text`` is the run's texts joined with no separator;
    * ``x0``/``y0``/``x1``/``y1`` become the union of the run's boxes;
    * every other column keeps the value from the first row of the run.

    All other rows pass through unchanged.

    Args:
        df: DataFrame with at least ``value``, ``text``, ``x0``, ``y0``,
            ``x1`` and ``y1`` columns.
        target_value: Label whose consecutive rows should be merged.

    Returns:
        A new DataFrame. For empty input an empty copy with the same columns
        is returned, so later column selection still works.
    """
    # pd.DataFrame([]) would lose the column names, so short-circuit here.
    if len(df) == 0:
        return df.copy()

    processed_rows = []
    merged = None  # the run currently being accumulated, if any

    for _, row in df.iterrows():
        if row['value'] == target_value:
            if merged is None:
                merged = row.copy()
            else:
                # Concatenate the text field
                merged['text'] += row['text']
                # Grow the box in all four directions so it encloses every word.
                merged['x0'] = min(merged['x0'], row['x0'])
                merged['y0'] = min(merged['y0'], row['y0'])
                merged['x1'] = max(merged['x1'], row['x1'])
                merged['y1'] = max(merged['y1'], row['y1'])
        else:
            # A non-target row ends any open run.
            if merged is not None:
                processed_rows.append(merged)
                merged = None
            processed_rows.append(row)

    # Flush a run that extends to the last row.
    if merged is not None:
        processed_rows.append(merged)

    return pd.DataFrame(processed_rows)


In [365]:
def update_identifier_ranges(df, unique_identifier):
    """Compute the vertical band that belongs to each identifier row.

    Rows labelled ``unique_identifier`` are ordered by ``page`` then ``y1``.
    For each of them:

    * ``identifier_range_y1`` is the row's own ``y1``;
    * ``identifier_range_y0`` is the ``y1`` of the previous identifier on the
      same page, or the row's own ``y0`` when it is the first identifier on
      its page.

    The start is reset on every page because ``y`` values from two different
    pages are not comparable.

    Args:
        df: DataFrame with ``value``, ``page``, ``text``, ``y0`` and ``y1``
            columns. It is not modified.
        unique_identifier: Label of the identifier column.

    Returns:
        A DataFrame with columns ``page``, ``text``, ``identifier_range_y0``
        and ``identifier_range_y1``. It is empty, but still has those columns,
        when no row carries ``unique_identifier``.
    """
    identifiers = (
        df[df["value"] == unique_identifier]
        .sort_values(by=["page", "y1"])
        .reset_index(drop=True)
    )

    # After sorting, the first occurrence of a page number is that page's top identifier.
    starts_page = ~identifiers["page"].duplicated()
    previous_bottom = identifiers["y1"].shift()

    identifiers = identifiers.assign(
        identifier_range_y0=previous_bottom.where(~starts_page, identifiers["y0"]).astype("float64"),
        identifier_range_y1=identifiers["y1"].astype("float64"),
    )
    return identifiers[["page", "text", "identifier_range_y0", "identifier_range_y1"]]

In [366]:
def concatenate_values(df):
    """Collapse runs of consecutive rows that share a ``value`` into one row each.

    Within a run the ``text`` entries are joined with single spaces and the
    result is stripped of surrounding whitespace. Runs whose ``value`` is
    ``None`` are discarded.

    Args:
        df: DataFrame with ``value`` and ``text`` columns, read in row order.

    Returns:
        A new DataFrame with columns ``value`` and ``text``, one row per
        retained run.
    """
    runs = []        # (label, [text pieces]) for every finished run
    active = None    # label of the run being built
    pieces = []      # text pieces of the run being built

    for label, word in zip(df['value'], df['text']):
        if label == active:
            pieces.append(word + " ")
            continue
        # Label changed: keep the finished run unless it was unlabelled.
        if active is not None:
            runs.append((active, pieces))
        active = label
        pieces = [word + " "]

    # The final run is still open when the loop ends.
    if active is not None:
        runs.append((active, pieces))

    return pd.DataFrame({
        'value': [label for label, _ in runs],
        'text': ["".join(parts).strip() for _, parts in runs],
    })

def transform_df(new_df, unique_identifier):
    """Pivot a (value, text) sequence into one row per identifier.

    Every row is attributed to the most recent row above it whose ``value``
    equals ``unique_identifier``; that row's ``text`` becomes the record key.
    Rows that appear before the first identifier have no key and are left out
    by the pivot. When a key/label pair occurs more than once, the first text
    is kept.

    Args:
        new_df: DataFrame with ``value`` and ``text`` columns. It is not
            modified; the key column is added to a copy.
        unique_identifier: Label that marks the start of a record. It is also
            used as the name of the resulting index.

    Returns:
        A DataFrame indexed by record key with one column per distinct
        ``value`` label.
    """
    is_identifier = new_df["value"] == unique_identifier
    # Keep the text only on identifier rows, then carry it down over the rows that follow.
    record_key = new_df["text"].where(is_identifier).ffill()
    keyed = new_df.assign(**{unique_identifier: record_key})
    return keyed.pivot_table(index=unique_identifier, columns='value', values='text', aggfunc='first')

In [367]:
# Council whose PDF is processed by the cells below.
council = "Stroud"

# Per-council table layout.
#   "gridlines": maps an (x_start, x_end) interval to a column label. A word is
#       assigned to the interval that contains its x0 (start inclusive, end
#       exclusive); words whose x0 falls in no interval get no label.
#   "unique_identifier": the column label that marks the start of a record.
#       It must be one of that council's gridline labels.
config = {
    "Bedford": {
        "gridlines": {
            (52, 121): "plate",
            (121, 220): "vrm",
            (220, 312): "make",
            (312, 418): "model",
            (418, 482): "licence_start",
            (482, 538): "licence_end"
        },
        "unique_identifier": "plate"
    },
    "BCP": {
        "gridlines": {
            (42, 76): "reference_number",
            (76, 212): "name",
            (212, 268): "vrm",
            (268, 311): "licence_number",
            (311, 364): "licence_start",
            (364, 412): "licence_end",
            (412, 528): "make",
            (528, 590): "licence_type",
        },
        "unique_identifier": "reference_number"
    },
    "Cheshire West": {
        "gridlines": {
            (52, 160): "make",
            (160, 478): "model",
            (478, 540): "vrm"
        },
        "unique_identifier": "vrm"
    },
    "East Northamptonshire": {
        "gridlines": {
            (55, 130): "vrm",
            (160, 240): "make",
            (240, 540): "model"
        },
        "unique_identifier": "vrm"
    },
    "Fylde": {
        "gridlines": {
            (18, 87): "vrm",
            (87, 216): "make",
            (216, 340): "model",
            (340, 460): "licence_start",
            (460, 540): "licence_end"
        },
        "unique_identifier": "vrm"
    },
    "Great Yarmouth": {
        "gridlines": {
            (52, 160): "vrm",
            (160, 238): "make",
            (238, 540): "model",
        },
        "unique_identifier": "vrm"
    },
    "Horsham": {
        "gridlines": {
            (52, 103): "reference_number",
            (103, 152): "licence_type",
            (152, 218): "name",
            (218, 266): "vrm",
            (266, 320): "make",
            (320, 386): "model",
            (386, 446): "licence_start",
            (446, 484): "licence_end",
            (484, 540): "plate",
        },
        "unique_identifier": "vrm"
    },
    "Kettering": {
        "gridlines": {
            (55, 102): "vrm",
            (148, 214): "make",
            (214, 540): "model"
        },
        "unique_identifier": "vrm"
    },
    "Maldon": {
        "gridlines": {
            (35, 142): "plate",
            (142, 258): "vrm",
            (258, 312): "make",
            (312, 445): "model",
            (445, 576): "colour",
            (576, 648): "seats",
            (648, 736): "licence_start",
            (736, 800): "licence_end",
        },
        "unique_identifier": "vrm"
    },
    "Milton Keynes": {
        "gridlines": {
            (256, 288): "plate",
            (288, 336): "vrm",
            (336, 406): "make",
            (406, 488): "model",
            (488, 742): "name",
            (742, 800): "licence_end"
        },
        "unique_identifier": "plate"
    },
    "North Lanarkshire": {
        "gridlines": {
            (52, 140): "vrm",
            (140, 210): "make",
            (210, 360): "model",
        },
        "unique_identifier": "vrm"
    },
    "Powys": {
        "gridlines": {
            (3, 38): "plate",
            (98, 222): "vrm",
            (222, 312): "make",
            (312, 404): "model",
            (404, 482): "licence_start",
        },
        "unique_identifier": "plate"
    },
    "Swansea": {
        "gridlines": {
            (52, 130): "vrm",
            (130, 240): "make"
        },
        "unique_identifier": "vrm"
    },
    "Wealden": {
        "gridlines": {
            (56, 98): "plate",
            (98, 136): "vrm",
            (136, 206): "make",
            (206, 272): "model",
            (272, 315): "colour",
            (315, 404): "name",
            (450, 488): "licence_start",
            (496, 540): "licence_end",
        },
        "unique_identifier": "vrm"
    },
    "West Lothian": {
        "gridlines": {
            (52, 184): "licence_type",
            (184, 295): "vrm",
            (295, 372): "make",
            (372, 440): "model",
        },
        "unique_identifier": "vrm"
    },
    "West Morland Barrow": {
        "gridlines": {
            (55, 118): "vrm",
            (118, 200): "make",
            (200, 300): "model",
        },
        "unique_identifier": "vrm"
    },
    "Argyll and Bute": {
        "gridlines": {
            (52, 158): "vrm",
            (158, 245): "make",
            (245, 400): "model",
        },
        "unique_identifier": "vrm"
    },
    "East Lothian": {
        "gridlines": {
            (77, 170): "vrm",
            (220, 312): "make",
            (312, 418): "model",
        },
        "unique_identifier": "vrm"
    },
    "Renfrewshire": {
        "gridlines": {
            (52, 138): "reference_number",
            (138, 248): "vrm",
            (248, 440): "make",
        },
        "unique_identifier": "vrm"
    },
    "Salford": {
        "gridlines": {
            (28, 78): "reference_number",
            (78, 136): "licence_type",
            (136, 198): "licence_start",
            (198, 250): "licence_end",
            (286, 392): "make",
            (392, 450): "vrm",
        },
        "unique_identifier": "reference_number"
    },
    "Corby": {
        "gridlines": {
            (248, 312): "plate",
            (312, 390): "licence_start",
            (390, 460): "vrm",
            (460, 538): "make",
            (538, 600): "model",
        },
        "unique_identifier": "plate"
    },
    "Cornwall": {
        "gridlines": {
            (28, 121): "reference_number",
            (208, 318): "vrm",
            (318, 416): "make",
            (416, 570): "model",
            (618, 700): "licence_end",
        },
        "unique_identifier": "reference_number"
    },
    "Derby": {
        "gridlines": {
            (32, 88): "plate",
            (88, 320): "name",
            (320, 416): "vrm",
            (416, 540): "make",
        },
        "unique_identifier": "plate"
    },
    "East Dunbarton": {
        "gridlines": {
            (76, 150): "vrm",
            (150, 240): "make",
            (240, 400): "model",
        },
        "unique_identifier": "vrm"
    },
    "Eastleigh": {
        "gridlines": {
            (56, 196): "vrm",
            (196, 276): "make",
            (276, 400): "model",
        },
        "unique_identifier": "vrm"
    },
    "Anglesey": {
        "gridlines": {
            (52, 121): "licence_type",
            (121, 220): "vrm",
            (220, 312): "make",
            (312, 418): "model",
        },
        "unique_identifier": "vrm"
    },
    "Perth Kinross": {
        "gridlines": {
            (76, 196): "vrm",
            (196, 286): "make",
            (286, 400): "model",
        },
        "unique_identifier": "vrm"
    },
    "Rother": {
        "gridlines": {
            (50, 96): "plate",
            (96, 148): "vrm",
            (148, 212): "make",
            (212, 268): "model",
            (456, 499): "licence_start",
            (499, 560): "licence_end",
        },
        "unique_identifier": "plate"
    },
    "Stockton": {
        "gridlines": {
            (40, 230): "licence_type",
            (276, 386): "vrm",
            (386, 510): "make",
            (510, 578): "licence_start",
            (578, 644): "licence_end",
        },
        "unique_identifier": "vrm"
    },
    "Stroud": {
        "gridlines": {
            (52, 121): "vrm",
            (121, 220): "make",
            (220, 312): "model",
            # FIXME(review): this key used to appear twice, first as
            # "licence_start" and then as "licence_end". A dict keeps only the
            # last value for a repeated key, so "licence_end" was the mapping
            # actually in effect and is kept here. One of the two date columns
            # is therefore unmapped - measure its x-range in the Stroud PDF
            # and add it as a separate interval.
            (312, 418): "licence_end",
        },
        # Was "plate", which no Stroud gridline produces, so no row could ever
        # be recognised as an identifier. "vrm" is the leftmost mapped column.
        # TODO confirm against the Stroud PDF.
        "unique_identifier": "vrm"
    }
}

# Layout for the selected council.
gridlines = config[council]["gridlines"]
unique_identifier = config[council]["unique_identifier"]

# Fail fast on an inconsistent config entry: if no gridline produces the
# identifier label, no row can ever start a record and the later cells would
# either crash or silently yield an empty table.
if unique_identifier not in gridlines.values():
    raise ValueError(
        f"config[{council!r}]: unique_identifier {unique_identifier!r} is not one of "
        f"the gridline labels {sorted(set(gridlines.values()))}"
    )

# Word-level extraction: one row per word with its page and bounding box.
pdf_path = f'pdfs/tabular/{council}.pdf'
df = extract_pdf_text(pdf_path)


In [368]:
# Preview the first 50 extracted words (document header and table start).
df.iloc[:50]

Unnamed: 0,page,text,x0,y0,x1,y1
0,1,This,23.04,18.93,39.29472,28.89
1,1,document,41.59548,18.93,83.07888,28.89
2,1,was,85.36968,18.93,101.11644,28.89
3,1,classified,103.45704,18.93,140.3688,28.89
4,1,as:,142.6596,18.93,154.10364,28.89
5,1,OFFICIAL,156.3546,18.93,192.33012,28.89
6,1,Register,36.0,45.03,88.97124,60.99
7,1,of,92.4984,45.03,105.77712,60.99
8,1,licensed,109.38408,45.03,162.61068,60.99
9,1,hackney,166.21764,45.03,219.89112,60.99


In [369]:
# Peek at rows 100-149 to read off the x0 positions of the table columns.
df.iloc[100:150]

Unnamed: 0,page,text,x0,y0,x1,y1
100,1,Vehicle,121.78224,191.09,154.40544,202.13
101,1,(Saloon),156.8232,191.09,193.752,202.13
102,1,HCV,230.18,191.09,249.1688,202.13
103,1,1190,251.66384,191.09,274.04192,202.13
104,1,LD62,311.69,191.09,334.31096,202.13
105,1,UVP,336.73976,191.09,355.7948,202.13
106,1,Volkswagen,388.73,191.09,442.274,202.13
107,1,Passat,444.63656,191.09,473.2964,202.13
108,1,01/03/2024,511.27,191.09,564.44968,202.13
109,1,28/09/2024,579.34,191.09,632.51968,202.13


In [370]:
# NOTE: `df` is rebound in this cell, so the previews in earlier cells show the
# raw extraction, not the frame produced here.

# Label every word with the gridline interval that contains its x0.
df = assign_intervals_and_values(df, gridlines)
# Merge adjacent words that sit in the identifier column into a single row.
df = process_consecutive_values(df, target_value=unique_identifier)
# Vertical band of each identifier row; not referenced again in the cells shown here.
df_identifier = update_identifier_ranges(df, unique_identifier)
# Only the text and its column label are needed to rebuild the table.
df_reduced = df[["text", "value"]].reset_index(drop=True)
# Join consecutive words that share a label, then pivot to one row per identifier.
new_df = concatenate_values(df_reduced)
new_df = transform_df(new_df, unique_identifier)

In [371]:
# Display the pivoted table (one row per identifier).
# NOTE(review): the output saved with this cell is indexed by "vrm" and has a
# "licence_type" column. That matches the "Stockton" config entry, not the
# "Stroud" entry selected above - it looks stale; re-run from a fresh kernel
# to confirm.
new_df

value,licence_end,licence_start,licence_type,make,vrm
vrm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A19TKR,30/04/2024,17/04/2023,Hackney Carriage Vehicle (Saloon),Suzuki Swace,A19TKR
AD12GYW,18/05/2024,01/12/2023,Hackney Carriage Vehicle (Saloon),Skoda Octavia,AD12GYW
AD19DTX,31/01/2025,02/02/2024,Private Hire Vehicle (Saloon),Toyota Corolla,AD19DTX
AE63NLZ,31/10/2024,01/11/2023,Hackney Carriage Vehicle (Saloon),Volkswagen Passat,AE63NLZ
AE63ONG,30/11/2024,01/12/2023,Private Hire Vehicle (Saloon),Skoda Octavia,AE63ONG
...,...,...,...,...,...
YS73HTP,30/11/2024,12/12/2023,Hackney Carriage Vehicle (Saloon),Toyota Corolla,YS73HTP
YT19NBD,31/12/2024,01/01/2024,Private Hire Vehicle (Saloon),Skoda Octavia,YT19NBD
YT19NBE,31/03/2024,15/03/2023,Private Hire Vehicle (Saloon),Skoda Octavia,YT19NBE
YY63DDU,31/01/2025,01/02/2024,Hackney Carriage Vehicle (Saloon),Skoda Octavia,YY63DDU


In [372]:
# Write the pivoted table to disk for manual inspection.
new_df.to_csv(path_or_buf='inspection.csv')