In [11]:
import pdfplumber
import pandas as pd
import numpy as np

def extract_table_from_pdf(pdf_path, has_header):
    """Extract every ruled table in a PDF into a single DataFrame.

    Each page is scanned with pdfplumber using the "lines" strategy on both
    axes. Rows of all tables found are stacked in page order, and the 1-based
    page number is appended to each row as a final "page" column.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        has_header: If True, the first row of the first non-empty table
            supplies the column names and is not emitted as data. If False,
            columns are named ``column_0``, ``column_1``, ... based on the
            width of the first non-empty table.

    Returns:
        A DataFrame with one row per extracted table row. Empty-string cells
        are converted to NaN. If no table rows are found the frame is empty.

    Note:
        Only the first non-empty table is treated as carrying a header; a
        header repeated in later tables is kept as an ordinary data row.
    """
    rows = []
    header = None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Detect tables from the ruling lines drawn on the page.
            tables = page.extract_tables({
                "vertical_strategy": "lines",
                "horizontal_strategy": "lines",
            })

            for table in tables:
                # A detected table with no rows has no first row to inspect;
                # indexing table[0] would raise IndexError.
                if not table:
                    continue

                start_row_index = 0
                if header is None:
                    default_names = [f"column_{i}" for i in range(len(table[0]))]
                    if has_header:
                        # Blank/None header cells would give blank or
                        # duplicate column names, so fall back to the
                        # positional name for those cells.
                        names = [
                            default if cell in (None, "") else cell
                            for cell, default in zip(table[0], default_names)
                        ]
                        start_row_index = 1  # header row is not data
                    else:
                        names = default_names
                    header = names + ["page"]

                rows.extend(row + [page_num] for row in table[start_row_index:])

    df_extracted = pd.DataFrame(rows, columns=header)
    # Use NaN rather than None as the replacement: replace(..., None) has
    # meant "pad-fill" in some pandas versions, whereas NaN is unambiguous.
    return df_extracted.replace("", np.nan)


# Council whose sample PDF is processed; must be a key of council_mappings.
council = "East Lothian"

# council name -> (sample PDF number, whether the first table row is a header)
council_mappings = {
    "Salford": (1, True),
    "Renfrewshire": (2, False),
    "East Lothian": (3, False),
}

(number, has_header) = council_mappings[council]
pdf_path = f"sample_pdf{number}.pdf"




In [12]:
# Per-council rename map: extracted column name -> output column name.
# Councils extracted without a header row use the positional "column_N" names.
column_mappings = {
    "Salford": {
        "REG": "vrm",
        "VEHICLE TYPE": "make",
    },
    "Renfrewshire": {
        "column_1": "vrm",
        "column_2": "make",
    },
    "East Lothian": {
        "column_0": "vrm",
        "column_1": "vrm.1",
        "column_3": "make",
        "column_4": "make.1",
        "column_6": "model",
        "column_7": "model.1",
    },
}

TODO: Combine the VRM columns, fill in any missing VRM with the VRM from the row above, then process the DataFrame through the same function as before.

In [13]:
# Extract the tables from the selected council's PDF into one DataFrame.
df = extract_table_from_pdf(
    pdf_path,
    has_header=has_header,
)

In [14]:
def update_headers(column_mappings, df, council):
    """Return ``df`` with its columns renamed for the given council.

    Args:
        column_mappings: Dict keyed by council name; each value is a dict
            mapping existing column names to the desired column names.
        df: DataFrame whose columns are to be renamed. It is not modified.
        council: Key into ``column_mappings`` selecting the rename map.

    Returns:
        A new DataFrame with the renamed columns. Columns that do not appear
        in the rename map keep their names.

    Raises:
        KeyError: If ``council`` has no entry in ``column_mappings``.
    """
    column_map = column_mappings[council]
    # Return a renamed copy instead of renaming in place, so the caller's
    # frame is left untouched and re-running a cell stays predictable.
    return df.rename(columns=column_map)

In [15]:
# Apply the selected council's column names, then save the renamed frame
# (without the row index) to a per-council CSV.
df = update_headers(column_mappings=column_mappings, df=df, council=council)
df.to_csv(
    f"output_{council}.csv",
    index=False,
)

In [16]:

# When the rename produced a secondary "vrm.1" column, keep "vrm" where it is
# present and fall back to "vrm.1" otherwise, then drop the spare column.
if "vrm.1" in df.columns:
    df["vrm"] = df["vrm"].combine_first(df["vrm.1"])
    df = df.drop(columns=["vrm.1"])

# Rows still lacking a VRM take the VRM of the nearest row above.
# NOTE(review): the original forward-fill line here was mis-indented (two
# spaces), which is an IndentationError; it is treated as unconditional and
# merged with the duplicate fill that sat inside the `if` — confirm intent.
df["vrm"] = df["vrm"].ffill()


In [17]:
# Fold each secondary ".1" column into its primary column: the primary value
# wins where present, the secondary fills the gaps, then the secondary is dropped.
for column in ("make", "model"):
    secondary = f"{column}.1"
    if secondary not in df.columns:
        continue
    merged = df[column].combine_first(df[secondary])
    df[column] = merged
    df.drop(columns=[secondary], inplace=True)


In [18]:
# Show the frame after the column merges as this cell's output.
df

Unnamed: 0,vrm,column_2,make,column_5,model,column_8,page
0,GF16 HJE,,Vauxhall,,Octavia S TDi,,1.0
1,DL15 XXT,,Vauxhall,,Astra,,1.0
2,CP17 YXJ,,FORD,,TRANSIT,,1.0
3,MM70 JBV,,Vauxhall,,Corsa,,1.0
4,GD66 OMC,,Skoda,,Octavia Estate,,1.0
...,...,...,...,...,...,...,...
234,SD17 OUF,,Peugeot,,Premier RS Blue HDI S/S,,6.0
235,SK62 UPV,,Renault,,Trafic,,6.0
236,MF68 UYR,,Ford,,Tourneo,,6.0
237,LM68 LXF,,Vauxhall,,Vivaro,,6.0


In [19]:
# Dump the current frame to "inspect.csv" for manual inspection. No
# index=False is passed here, so pandas' default index handling applies.
df.to_csv("inspect.csv")