In [1]:
import fitz  # PyMuPDF

def extract_text_with_structure(pdf_path):
    """Collect every text span of a PDF together with its layout metadata.

    The document is opened with PyMuPDF (``fitz``) and each page is read via
    ``page.get_text("dict")``. Blocks that have no ``"lines"`` entry are
    skipped; every span of every line in the remaining blocks yields one
    record.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of dicts in document order, one per span, with the keys
        ``page`` (page number counted from 1), ``text``, ``font``, ``size``,
        ``origin_x``, ``origin_y`` and ``bbox``.
    """
    records = []
    with fitz.open(pdf_path) as document:
        for page_no, pdf_page in enumerate(document, start=1):
            for block in pdf_page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    # Only blocks carrying "lines" have spans to read.
                    continue
                for text_line in block["lines"]:
                    records.extend(
                        {
                            "page": page_no,
                            "text": piece["text"],
                            "font": piece["font"],
                            "size": piece["size"],
                            # The span origin is split into separate x / y columns.
                            "origin_x": piece["origin"][0],
                            "origin_y": piece["origin"][1],
                            "bbox": piece["bbox"],
                        }
                        for piece in text_line["spans"]
                    )
    return records

# Number of the sample being processed; later cells reuse it to name their output files.
number = 2
# Specify the path to your PDF file (relative to the working directory)
pdf_file_path = f'sample_pdf{number}.pdf'

# Call the function and print the result, one span record per line.
# NOTE(review): this prints every span of the document, which floods the cell output
# for large PDFs — consider printing only a slice.
structured_text = extract_text_with_structure(pdf_file_path)
for item in structured_text:
    print(item)



{'page': 1, 'text': 'Licence_RefNo', 'font': 'Calibri', 'size': 10.979999542236328, 'origin_x': 52.79999923706055, 'origin_y': 64.97967529296875, 'bbox': (52.79999923706055, 56.74467468261719, 119.0461196899414, 67.72467803955078)}
{'page': 1, 'text': 'Vehicle_Registration', 'font': 'Calibri', 'size': 10.979999542236328, 'origin_x': 139.5628662109375, 'origin_y': 64.97967529296875, 'bbox': (139.5628662109375, 56.74467468261719, 231.55755615234375, 67.72467803955078)}
{'page': 1, 'text': 'Vehicle_Make&Model', 'font': 'Calibri', 'size': 10.979999542236328, 'origin_x': 249.10699462890625, 'origin_y': 64.97967529296875, 'bbox': (249.10699462890625, 56.74467468261719, 348.70111083984375, 67.72467803955078)}
{'page': 1, 'text': 'PH0500', 'font': 'Calibri', 'size': 10.979999542236328, 'origin_x': 52.79999923706055, 'origin_y': 79.319580078125, 'bbox': (52.79999923706055, 71.08457946777344, 87.58284759521484, 82.06458282470703)}
{'page': 1, 'text': 'R13\xa0NDR', 'font': 'Calibri', 'size': 10.9

In [2]:
# Show the extracted span records using the notebook's rich repr of the list.
structured_text

[{'page': 1,
  'text': 'Licence_RefNo',
  'font': 'Calibri',
  'size': 10.979999542236328,
  'origin_x': 52.79999923706055,
  'origin_y': 64.97967529296875,
  'bbox': (52.79999923706055,
   56.74467468261719,
   119.0461196899414,
   67.72467803955078)},
 {'page': 1,
  'text': 'Vehicle_Registration',
  'font': 'Calibri',
  'size': 10.979999542236328,
  'origin_x': 139.5628662109375,
  'origin_y': 64.97967529296875,
  'bbox': (139.5628662109375,
   56.74467468261719,
   231.55755615234375,
   67.72467803955078)},
 {'page': 1,
  'text': 'Vehicle_Make&Model',
  'font': 'Calibri',
  'size': 10.979999542236328,
  'origin_x': 249.10699462890625,
  'origin_y': 64.97967529296875,
  'bbox': (249.10699462890625,
   56.74467468261719,
   348.70111083984375,
   67.72467803955078)},
 {'page': 1,
  'text': 'PH0500',
  'font': 'Calibri',
  'size': 10.979999542236328,
  'origin_x': 52.79999923706055,
  'origin_y': 79.319580078125,
  'bbox': (52.79999923706055,
   71.08457946777344,
   87.5828475952148

In [3]:
import csv

def structured_text_to_csv(structured_text, csv_file_path):
    """Write span records to a UTF-8 CSV file with a header row.

    Args:
        structured_text: Iterable of dicts with the keys ``page``, ``text``,
            ``font``, ``size``, ``origin_x``, ``origin_y`` and ``bbox``.
        csv_file_path: Destination path; an existing file is overwritten.

    Returns:
        None. The file is the only result.
    """
    # Column order of the output file; it mirrors the keys of each record.
    columns = ['page', 'text', 'font', 'size', 'origin_x', 'origin_y', 'bbox']

    # newline='' hands line-ending control to the csv module.
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as out_handle:
        sheet = csv.DictWriter(out_handle, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(structured_text)

# Example usage: write the extracted spans to output<number>.csv in the working directory.
# `csv_output_path` is read again by the next cell when the file is loaded into pandas.
csv_output_path = f'output{number}.csv'
structured_text_to_csv(structured_text, csv_output_path)

In [4]:
import pandas as pd
# Reload the span table that was just written to disk.
# `text` is forced to str and pandas' default NA markers are switched off so that
# span texts such as "NA", "null" or an empty string stay strings instead of turning
# into float NaN, which a later "".join over the texts cannot handle.
data = pd.read_csv(csv_output_path, dtype={'text': str}, keep_default_na=False)

data

Unnamed: 0,page,text,font,size,origin_x,origin_y,bbox
0,1,Licence_RefNo,Calibri,10.98,52.799999,64.979675,"(52.79999923706055, 56.74467468261719, 119.046..."
1,1,Vehicle_Registration,Calibri,10.98,139.562866,64.979675,"(139.5628662109375, 56.74467468261719, 231.557..."
2,1,Vehicle_Make&Model,Calibri,10.98,249.106995,64.979675,"(249.10699462890625, 56.74467468261719, 348.70..."
3,1,PH0500,Calibri,10.98,52.799999,79.319580,"(52.79999923706055, 71.08457946777344, 87.5828..."
4,1,R13 NDR,Calibri,10.98,139.570572,79.319580,"(139.57057189941406, 71.08457946777344, 178.94..."
...,...,...,...,...,...,...,...
2764,19,SF17 XVH,Calibri,10.98,139.572739,107.999634,"(139.57273864746094, 99.76463317871094, 182.02..."
2765,19,Skoda Octavia,Calibri,10.98,249.116898,107.999634,"(249.1168975830078, 99.76463317871094, 312.047..."
2766,19,SURR PH0942,Calibri,10.98,52.802170,122.339539,"(52.80216979980469, 114.10453796386719, 114.08..."
2767,19,GF20 WHU,Calibri,10.98,139.573822,122.339539,"(139.57382202148438, 114.10453796386719, 188.8..."


In [5]:
# Merge consecutive rows of `data` that share the same origin_x on the same page into a
# single record. The texts of a group are joined without a separator, and the group
# keeps the origin_x / origin_y taken from its first row. `page_index` numbers the
# groups within each page, starting from 0.
# NOTE(review): origin_x values are compared with exact float equality — confirm that
# spans meant to be merged really carry identical x origins.
# NOTE(review): "".join needs every `text` value to be a str; a value read back as NaN
# would raise TypeError — confirm how the CSV is loaded.
new_data = []
current_group = []
current_page = None
current_y = None
last_x = None
page_index = 0  # Initialize page index

for _, row in data.iterrows():
    # Check if we've moved to a new page
    if row['page'] != current_page:
        # If we have a current group, append it before resetting for the new page
        # (it still belongs to the previous page, so the old current_page is used)
        if current_group:
            new_data.append({
                "page": current_page,
                "page_index": page_index,
                "origin_x": last_x,
                "origin_y": current_y,
                "concatenated_text": "".join(current_group)
            })
            current_group = []
        current_page = row['page']
        last_x = None  # Reset last_x for the new page
        page_index = 0  # Reset page index for the new page

    # Check if we've moved to a new x position or if it's a new page
    # (last_x is None right after a page change, so this branch always runs then)
    if row['origin_x'] != last_x:
        # If we have a current group, append it before starting the new one
        if current_group:
            new_data.append({
                "page": current_page,
                "page_index": page_index,
                "origin_x": last_x,
                "origin_y": current_y,
                "concatenated_text": "".join(current_group)
            })
            current_group = []
            page_index += 1  # Increment page index for the new group

        # Remember where the new group starts; origin_y is only recorded here,
        # so a group reports the y of its first row.
        last_x = row['origin_x']
        current_y = row['origin_y']

    current_group.append(row['text'])

# Append the last group if it exists
if current_group:
    new_data.append({
        "page": current_page,
        "page_index": page_index,
        "origin_x": last_x,
        "origin_y": current_y,
        "concatenated_text": "".join(current_group)
    })

# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame(new_data)

# Display the new DataFrame
result_df

Unnamed: 0,page,page_index,origin_x,origin_y,concatenated_text
0,1,0,52.799999,64.979675,Licence_RefNo
1,1,1,139.562866,64.979675,Vehicle_Registration
2,1,2,249.106995,64.979675,Vehicle_Make&Model
3,1,3,52.799999,79.319580,PH0500
4,1,4,139.570572,79.319580,R13 NDR
...,...,...,...,...,...
2764,19,10,139.572739,107.999634,SF17 XVH
2765,19,11,249.116898,107.999634,Skoda Octavia
2766,19,12,52.802170,122.339539,SURR PH0942
2767,19,13,139.573822,122.339539,GF20 WHU


In [6]:
# Save the grouped table to concat_output<number>.csv. `index` is left at its default,
# so the DataFrame index is written as an extra first column.
result_df.to_csv(f'concat_output{number}.csv')

What the template should contain:


Three line/index numbers (taken after concatenation) for each of the following fields, where available, on each of the first two pages:

- VRM
- Make
- Model
- Source
- Record Type
- Date From
- Date To
- Date Received
- Council

These index numbers are supplied manually.

In [7]:
# Manually supplied template: page number -> field name -> tuple of row positions
# within that page (after concatenation), or None when no positions were supplied.
# Pages 1-4 currently all carry the same values.
# NOTE(review): confirm these positions against the PDF actually being processed;
# they are hand-entered and identical for every page.
template_1 = {
    page_number: {
        "vrm": (20, 31, 42),
        "make": (19, 30, 41),
        "model": None,
        "source": None,
        "record_type": None,
    }
    for page_number in (1, 2, 3, 4)
}

In [8]:
# Second manually supplied template, with the same layout as the first one:
# page number -> field name -> tuple of row positions within that page, or None.
# Its values are currently identical for pages 1-4.
template_2 = dict(
    (
        page_number,
        dict(
            vrm=(20, 31, 42),
            make=(19, 30, 41),
            model=None,
            source=None,
            record_type=None,
        ),
    )
    for page_number in range(1, 5)
)

In [9]:
# For every page, record the largest valid row position: the number of grouped rows
# on that page minus one (positions are counted from 0).
max_index_per_page = {}
for page in result_df['page'].unique():
    max_index_per_page[page] = result_df[result_df['page'] == page].shape[0] - 1

In [10]:
# Define the function to extend the template
def extend_template(max_index_per_page, template):
    """Extend each field's arithmetic index pattern up to the page's max index.

    For every page present in both arguments, each non-empty index sequence is
    continued with the step between its last two entries (step 1 when it has a
    single entry) for as long as the next index does not exceed the page's max
    index. The input template is not modified.

    Args:
        max_index_per_page: Mapping of page -> largest index allowed on that page.
        template: Mapping of page -> {field name -> sequence of indices or None}.

    Returns:
        A new mapping with the same layout as ``template``, limited to pages
        that also appear in ``max_index_per_page``. Non-empty index sequences
        are returned as tuples; ``None`` / empty values are kept as they are.
        Sequences whose step is zero or negative cannot grow towards the max
        index and are returned unextended.
    """
    extended_template = {}

    for page, max_index in max_index_per_page.items():
        if page not in template:
            # Pages the template does not describe are left out of the result.
            continue

        extended_page = template[page].copy()
        for key, indices in template[page].items():
            if not indices:
                # None or empty: nothing to extend, keep the copied value.
                continue

            step = indices[-1] - indices[-2] if len(indices) > 1 else 1
            extended_indices = list(indices)
            # A non-positive step would never get past max_index, so the loop
            # below would not terminate; such patterns are left as supplied.
            if step > 0:
                next_index = indices[-1] + step
                while next_index <= max_index:
                    extended_indices.append(next_index)
                    next_index += step
            extended_page[key] = tuple(extended_indices)

        extended_template[page] = extended_page

    return extended_template



# Calculate max_index_per_page for the initial dataframe: the number of grouped rows
# on each page minus one. This repeats the computation of the previous cell, so the
# cell also works when run on its own.
max_index_per_page = {}
for page in result_df['page'].unique():
    max_index_per_page[page] = result_df[result_df['page'] == page].shape[0] - 1

# Extend the template so every index pattern runs up to the last row of its page
extended_template_1 = extend_template(max_index_per_page, template_1)


In [11]:
# Inspect the extended template to check that the index patterns were continued as expected.
extended_template_1

{1: {'vrm': (20, 31, 42, 53, 64, 75, 86, 97, 108, 119, 130, 141, 152),
  'make': (19, 30, 41, 52, 63, 74, 85, 96, 107, 118, 129, 140, 151),
  'model': None,
  'source': None,
  'record_type': None},
 2: {'vrm': (20, 31, 42, 53, 64, 75, 86, 97, 108, 119, 130, 141, 152),
  'make': (19, 30, 41, 52, 63, 74, 85, 96, 107, 118, 129, 140, 151),
  'model': None,
  'source': None,
  'record_type': None},
 3: {'vrm': (20, 31, 42, 53, 64, 75, 86, 97, 108, 119, 130, 141, 152),
  'make': (19, 30, 41, 52, 63, 74, 85, 96, 107, 118, 129, 140, 151),
  'model': None,
  'source': None,
  'record_type': None},
 4: {'vrm': (20, 31, 42, 53, 64, 75, 86, 97, 108, 119, 130, 141, 152),
  'make': (19, 30, 41, 52, 63, 74, 85, 96, 107, 118, 129, 140, 151),
  'model': None,
  'source': None,
  'record_type': None}}

In [12]:
def extract_data_using_template(result_df, template, max_pages=4):
    """Pick vrm / make texts out of ``result_df`` at the positions a template lists.

    For each processed page, the rows of that page are taken in order and the
    ``concatenated_text`` values at the template's ``vrm`` and ``make``
    positions are paired up one-to-one. Positions beyond the end of the page
    are ignored. The template is used as given: no pattern is continued here,
    so pass an already extended template if the whole page should be covered.

    Args:
        result_df: DataFrame with at least the columns ``page`` and
            ``concatenated_text``.
        template: Mapping of page -> {"vrm": positions or None,
            "make": positions or None, ...}. Pages missing from the template
            are skipped instead of raising ``KeyError``.
        max_pages: Number of leading pages (in order of first appearance) to
            process. Defaults to 4; pass ``None`` to process every page.

    Returns:
        DataFrame with the columns ``page``, ``vrm``, ``make``, ``model``,
        ``source`` and ``record_type``; the last three are always ``None``.
        The columns are present even when nothing was extracted.
    """
    columns = ["page", "vrm", "make", "model", "source", "record_type"]

    def pick(texts, indices):
        # Texts at the requested positions, dropping positions past the page end.
        if not indices:
            return []
        return [texts[i] for i in indices if i < len(texts)]

    pages = result_df['page'].unique()
    if max_pages is not None:
        pages = pages[:max_pages]

    extracted_data = []
    for page in pages:
        page_template = template.get(page)
        if not page_template:
            # The template does not describe this page.
            continue

        # Row texts of this page, addressable by their position within the page.
        page_texts = result_df.loc[result_df['page'] == page, 'concatenated_text'].tolist()
        vrm_list = pick(page_texts, page_template.get("vrm"))
        make_list = pick(page_texts, page_template.get("make"))

        # zip stops at the shorter list, so only complete vrm/make pairs are kept.
        extracted_data.extend(
            {"page": page, "vrm": vrm, "make": make, "model": None, "source": None, "record_type": None}
            for vrm, make in zip(vrm_list, make_list)
        )

    # Passing the columns explicitly keeps the schema stable for an empty result.
    extracted_df = pd.DataFrame(extracted_data, columns=columns)

    return extracted_df

In [13]:
# Apply the extended template to the grouped rows and display the result.
# NOTE(review): in the recorded output the vrm and make columns each contain a mix of
# registrations, licence numbers and vehicle names, which suggests the template
# positions do not line up with this PDF's three-column layout — confirm the template.
extracted_df = extract_data_using_template(result_df, extended_template_1)
extracted_df

Unnamed: 0,page,vrm,make,model,source,record_type
0,1,Skoda Octavia,FG16 WTD,,,
1,1,BD69 EWK,PH0513,,,
2,1,PH0520,Skoda Octavia,,,
3,1,Skoda Octavia,GM19 MPV,,,
4,1,DU67 LZM,PH0538,,,
5,1,PH0544,Mercedes‐Benz C220,,,
6,1,Seat Leon,NX19 RTU,,,
7,1,NX73 XBZ,PH0552,,,
8,1,PH0557,Kia Niro,,,
9,1,Skoda Octavia,NU69 TZV,,,


In [14]:
# Save the extracted table to extracted_data<number>.csv. `index` is left at its default,
# so the DataFrame index is written as an extra first column.
extracted_df.to_csv(f"extracted_data{number}.csv")