In [18]:
import streamlit as st
import pandas as pd
import fitz  # PyMuPDF
import io
import pdfplumber
import json


def display_first_two_pdf_pages(pdf_bytes):
    # Load the PDF file from bytes
    pdf = fitz.open("pdf", pdf_bytes)
    num_pages = len(pdf)

    # Create a two-column layout if there are at least two pages, otherwise one column
    if num_pages >= 2:
        cols = st.columns(2)
    else:
        cols = st.columns(1)

    # Display the first page
    page = pdf.load_page(0)
    pix = page.get_pixmap()
    img_data = pix.tobytes("ppm")
    image = io.BytesIO(img_data)
    with cols[0]:
        st.image(image, caption="First page of the PDF", use_column_width=True)

    # Display the second page if it exists
    if num_pages >= 2:
        page = pdf.load_page(1)
        pix = page.get_pixmap()
        img_data = pix.tobytes("ppm")
        image = io.BytesIO(img_data)
        with cols[1]:
            st.image(image, caption="Second page of the PDF", use_column_width=True)

    # Close the PDF file
    pdf.close()


def extract_pdf_text(pdf_path):
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        # Initialize an empty list to store the extracted data
        data = []
        # Iterate through each page
        for page_num, page in enumerate(pdf.pages):
            # Extract the text with bounding boxes
            for element in page.extract_words():
                text = element["text"]
                x0, y0, x1, y1 = (
                    element["x0"],
                    element["top"],
                    element["x1"],
                    element["bottom"],
                )
                data.append([page_num + 1, text, x0, y0, x1, y1])

        # Convert the list to a DataFrame
        df = pd.DataFrame(data, columns=["page", "text", "x0", "y0", "x1", "y1"])
        return df


def find_interval(number, intervals):
    intervals = sorted(intervals)
    for interval in intervals:
        if number >= interval[0] and number < interval[1]:
            return interval
    return None


def assign_intervals_and_values(df, gridlines):
    # Create a list of intervals from the gridlines
    intervals = [item["interval"] for item in gridlines]
    df["interval"] = df["x0"].apply(lambda x: find_interval(x, intervals))
    df["value"] = df["interval"].apply(
        lambda x: (
            next((item["label"] for item in gridlines if item["interval"] == x), None)
            if x
            else None
        )
    )
    return df


def process_consecutive_values(df, target_value):
    processed_rows = []
    current_row = None

    for _, row in df.iterrows():
        if row["value"] == target_value:
            if current_row is None:
                current_row = row.copy()
            else:
                # Concatenate the text field with a space
                current_row["text"] += " " + row["text"]
                # Update the bounding box
                current_row["x1"] = max(current_row["x1"], row["x1"])
                current_row["y1"] = max(current_row["y1"], row["y1"])
        else:
            if current_row is not None:
                processed_rows.append(current_row)
                current_row = None
            processed_rows.append(row)

    if current_row is not None:
        processed_rows.append(current_row)

    df = pd.DataFrame(processed_rows)
    df.dropna(subset=["value"], inplace=True)
    return df


# def concatenate_values(df):
#     new_value_column = []
#     new_text_column = []

#     # Iterate over the dataframe to concatenate values
#     current_value = None
#     current_text = ""

#     for value, text in zip(df["value"], df["text"]):
#         if value == current_value:
#             current_text += text + " "
#         else:
#             if current_value is not None:
#                 new_value_column.append(current_value)
#                 new_text_column.append(current_text.strip())
#             current_value = value
#             current_text = text + " "  # Added space at the end of each text

#     # Append the last accumulated values
#     if current_value is not None:
#         new_value_column.append(current_value)
#         new_text_column.append(current_text.strip())

#     # Create a new DataFrame with the concatenated values
#     new_df = pd.DataFrame({"value": new_value_column, "text": new_text_column})
#     return new_df


def transform_df(new_df, unique_identifier, date_format):
    # new_df[unique_identifier] = new_df.apply(
    #     lambda x: x.text if x.value == unique_identifier else None, axis=1
    # ).ffill()
    # new_df = new_df.pivot_table(
    #     index=unique_identifier, columns="value", values="text", aggfunc="first"
    # )
    # new_df.reset_index(drop=True, inplace=True)
    new_df["reg"] = new_df["reg"].str.replace(" ", "")

    # Filter rows where "reg" column matches the specified pattern
    new_df = new_df[
        new_df["reg"].str.contains(r"^(?:[A-Z]+[0-9]|[0-9]+[A-Z])[A-Z0-9]*$", na=False)
    ]
    if date_format is not None:
        if "date_from" in new_df.columns:
            new_df["date_from"] = pd.to_datetime(
                new_df["date_from"], format=date_format, errors="coerce"
            ).dt.strftime("%d/%m/%Y")
        if "date_to" in new_df.columns:
            new_df["date_to"] = pd.to_datetime(
                new_df["date_to"], format=date_format, errors="coerce"
            ).dt.strftime("%d/%m/%Y")

    # Remove rows where date format does not match
    if "date_from" in new_df.columns:
        new_df = new_df[new_df["date_from"].notna()]
    if "date_to" in new_df.columns:
        new_df = new_df[new_df["date_to"].notna()]

    # Select the required columns
    columns_to_select = ["reg", "make", "model", "date_from", "date_to"]
    existing_columns = [col for col in columns_to_select if col in new_df.columns]
    new_df = new_df[existing_columns]
    new_df = new_df.sort_values(by="reg", ascending=True)

    return new_df





In [19]:

council = "Cheshire West"
pdf_config = json.load(open("data_processor/data/pdf_config.json"))

gridlines = pdf_config[council]["gridlines"]
unique_identifier = [item["label"] for item in gridlines][0]
date_format = pdf_config[council]["date_format"]
pdf_path = f"pdf_files/tabular/{council}.pdf"
labels = [item["label"] for item in gridlines]

df = extract_pdf_text(pdf_path)
df = assign_intervals_and_values(df, gridlines)
df = process_consecutive_values(df, target_value=unique_identifier)
df_reduced = df[["text", "value"]].reset_index(drop=True)




In [20]:
pdf_config_df = pd.DataFrame(pdf_config)
pdf_config_df.to_csv("data_processor/data/pdf_config.csv")
pdf_config = pd.read_csv("data_processor/data/pdf_config.csv", index_col=0).to_dict()

In [29]:
pdf_config

{'Anglesey': {'gridlines': "[{'interval': [112, 368], 'label': 'reg'}, {'interval': [368, 514], 'label': 'make'}, {'interval': [514, 750], 'label': 'model'}]",
  'date_format': nan},
 'Argyll and Bute': {'gridlines': "[{'interval': [52, 158], 'label': 'reg'}, {'interval': [158, 245], 'label': 'make'}, {'interval': [245, 400], 'label': 'model'}]",
  'date_format': nan},
 'BCP': {'gridlines': "[{'interval': [42, 76], 'label': 'reference_number'}, {'interval': [212, 268], 'label': 'reg'}, {'interval': [311, 364], 'label': 'date_from'}, {'interval': [364, 412], 'label': 'date_to'}, {'interval': [412, 528], 'label': 'make'}]",
  'date_format': '%d-%b-%y'},
 'Bedford': {'gridlines': "[{'interval': [52, 121], 'label': 'plate'}, {'interval': [121, 220], 'label': 'reg'}, {'interval': [220, 312], 'label': 'make'}, {'interval': [312, 418], 'label': 'model'}, {'interval': [418, 482], 'label': 'date_from'}, {'interval': [482, 538], 'label': 'date_to'}]",
  'date_format': '%d/%m/%Y'},
 'Calderdale':

In [22]:
labels

['make', 'model', 'reg']

In [23]:
import pandas as pd

def split_dataframe(df_reduced, unique_identifier):
    """
    labels Eg. ["make", "model", "reg"] is a list of column headers that the dataframe should be split by.
    the unique_identifier is one of those labels.
    The function must split the dataframe based on the labels and unique_identifier.
    Once a unique identifier repeats, a new dataframe is created.
    Unique identifier can be any one of the labels, in any position, but each dataframe must follow the pattern of the labels.
    """
    # Initialize an empty list to store the dataframes
    dataframes_list = []

    # Find the indices where "reference_number" appears in the value column
    reference_indices = df_reduced[df_reduced["value"] == unique_identifier].index

    # Iterate through the indices and create dataframes
    for i in range(len(reference_indices)):
        start_idx = reference_indices[i]
        end_idx = reference_indices[i + 1] if i + 1 < len(reference_indices) else len(df_reduced)
        chunk_df = df_reduced[start_idx:end_idx].reset_index(drop=True)
        dataframes_list.append(chunk_df)

    return dataframes_list

dataframes_list = split_dataframe(df_reduced, unique_identifier)


In [24]:
dataframes_list

[           text  value
 0          Make   make
 1         Model  model
 2             3  model
 3  Registration    reg,
      text  value
 0  TOYOTA   make
 1  Prius+  model
 2    RA19    reg
 3     DJJ    reg,
     text  value
 0  DACIA   make
 1  LOGAN  model
 2    MCV  model
 3   SH65    reg
 4    LHC    reg,
             text  value
 0  MERCEDES-BENZ   make
 1           Vito  model
 2             K3    reg
 3            YKW    reg,
       text  value
 0  PEUGEOT   make
 1       E7  model
 2       XS  model
 3     SD63    reg
 4      DRO    reg,
        text  value
 0   HYUNDAI   make
 1     Ioniq  model
 2     First  model
 3  Addition  model
 4      CE69    reg
 5       HKB    reg,
          text  value
 0  VOLKSWAGEN   make
 1      Passat  model
 2        GJ17    reg
 3         MXG    reg,
       text  value
 0  HYUNDAI   make
 1    IONIQ  model
 2     FP68    reg
 3      WCJ    reg,
      text  value
 0  TOYOTA   make
 1   Prius  model
 2  Hybrid  model
 3    YK68    reg
 4    

In [25]:
dataframes_list[0]

Unnamed: 0,text,value
0,Make,make
1,Model,model
2,3,model
3,Registration,reg


In [26]:
def process_dataframes(dataframes_list, unique_identifier):
    new_dataframes_list = []

    for df in dataframes_list:
        df = df.groupby('value', as_index=False).agg({'text': ' '.join})
        df[unique_identifier] = df[df["value"] == unique_identifier]["text"].iloc[0]
        df = df.pivot_table(
                index=unique_identifier, columns="value", values="text", aggfunc="first"
            )
        df.reset_index(inplace=True, drop=True)
        new_dataframes_list.append(df)

    new_df = pd.concat(new_dataframes_list)
    return new_df

new_df = process_dataframes(dataframes_list, unique_identifier)


In [27]:
df_final = transform_df(new_df, unique_identifier, date_format)

In [28]:
df_final

value,reg,make,model
0,20RS,ROLLS ROYCE,GHOST
0,20RS,ROLLS ROYCE,GHOST BLACK BADGE
0,A19KKL,LAND ROVER,DISCOVERY
0,A19KKL,LAND ROVER,Land Rover Velar
0,A20WCW,MERCEDES-BENZ,S 400 L AMG LN PRM+EXEC D 4M
...,...,...,...
0,YY18OFB,VOLKSWAGEN,TOURAN
0,YY18UVJ,HYUNDAI,IONIQ
0,YY64FHW,VOLKSWAGEN,TOURAN SE BLUE TECH TDI
0,YY65JKE,FORD,Focus
