In [46]:
import fitz  # PyMuPDF

def extract_text_with_structure(pdf_path):
    """Collect every text span of a PDF together with its layout metadata.

    The document is opened with PyMuPDF (``fitz``) and each page is read via
    ``page.get_text("dict")``.  Blocks without a ``"lines"`` entry are
    skipped; for every span of every remaining line one record is emitted.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of dicts in document order.  Each dict has the keys ``page``
        (page number, counted from 1), ``text``, ``font``, ``size``,
        ``origin_x``, ``origin_y`` and ``bbox``; everything except ``page``
        is copied from the span.
    """
    records = []
    with fitz.open(pdf_path) as document:
        for page_no, page in enumerate(document, start=1):
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # no text lines in this block
                for text_line in block["lines"]:
                    records.extend(
                        {
                            "page": page_no,
                            "text": span["text"],
                            "font": span["font"],
                            "size": span["size"],
                            "origin_x": span["origin"][0],
                            "origin_y": span["origin"][1],
                            "bbox": span["bbox"],
                        }
                        for span in text_line["spans"]
                    )
    return records

# Suffix that selects which sample PDF is processed; it is reused by later
# cells to name the CSV files they write.
number = 1
# Path of the PDF file to read
pdf_file_path = f'sample_pdf{number}.pdf'

# Extract one record per text span, then print every record on its own line
structured_text = extract_text_with_structure(pdf_file_path)
for item in structured_text:
    print(item)



{'page': 1, 'text': 'LICENSED HACKNEY AND PRIVATE HIRE VEHICLES', 'font': 'Helvetica-Bold', 'size': 11.99899959564209, 'origin_x': 30.36199951171875, 'origin_y': 41.2349853515625, 'bbox': (30.36199951171875, 28.396055221557617, 333.6966857910156, 44.918678283691406)}
{'page': 1, 'text': 'LAST REFRESHED 19/04/2024 AT 11:00:25', 'font': 'Helvetica-Bold', 'size': 11.99899959564209, 'origin_x': 565.1069946289062, 'origin_y': 41.2349853515625, 'bbox': (565.1069946289062, 28.396055221557617, 807.8870239257812, 44.918678283691406)}
{'page': 1, 'text': 'LICENCE ', 'font': 'Helvetica', 'size': 8.99899959564209, 'origin_x': 30.36199951171875, 'origin_y': 61.29901123046875, 'bbox': (30.36199951171875, 51.625083923339844, 71.86538696289062, 63.98971176147461)}
{'page': 1, 'text': 'REF', 'font': 'Helvetica', 'size': 8.99899959564209, 'origin_x': 30.36199951171875, 'origin_y': 71.3070068359375, 'bbox': (30.36199951171875, 61.633079528808594, 48.36000061035156, 73.99771118164062)}
{'page': 1, 'text':

In [47]:
# Echo the extracted span records as this cell's output.
structured_text

[{'page': 1,
  'text': 'LICENSED HACKNEY AND PRIVATE HIRE VEHICLES',
  'font': 'Helvetica-Bold',
  'size': 11.99899959564209,
  'origin_x': 30.36199951171875,
  'origin_y': 41.2349853515625,
  'bbox': (30.36199951171875,
   28.396055221557617,
   333.6966857910156,
   44.918678283691406)},
 {'page': 1,
  'text': 'LAST REFRESHED 19/04/2024 AT 11:00:25',
  'font': 'Helvetica-Bold',
  'size': 11.99899959564209,
  'origin_x': 565.1069946289062,
  'origin_y': 41.2349853515625,
  'bbox': (565.1069946289062,
   28.396055221557617,
   807.8870239257812,
   44.918678283691406)},
 {'page': 1,
  'text': 'LICENCE ',
  'font': 'Helvetica',
  'size': 8.99899959564209,
  'origin_x': 30.36199951171875,
  'origin_y': 61.29901123046875,
  'bbox': (30.36199951171875,
   51.625083923339844,
   71.86538696289062,
   63.98971176147461)},
 {'page': 1,
  'text': 'REF',
  'font': 'Helvetica',
  'size': 8.99899959564209,
  'origin_x': 30.36199951171875,
  'origin_y': 71.3070068359375,
  'bbox': (30.361999511718

In [48]:
import csv

def structured_text_to_csv(structured_text, csv_file_path):
    """Write span records to a CSV file with a fixed set of columns.

    Args:
        structured_text: Iterable of dicts whose keys are ``page``, ``text``,
            ``font``, ``size``, ``origin_x``, ``origin_y`` and ``bbox``.
        csv_file_path: Destination path; the file is created or overwritten
            and encoded as UTF-8.

    Returns:
        None.  A header row is always written, followed by one row per record.
    """
    # Column order of the output file; it mirrors the keys of each record.
    fieldnames = ['page', 'text', 'font', 'size', 'origin_x', 'origin_y', 'bbox']

    # newline='' leaves line-ending handling to the csv module.
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as handle:
        sheet = csv.DictWriter(handle, fieldnames=fieldnames)
        sheet.writeheader()
        sheet.writerows(structured_text)

# Write the extracted spans to output<number>.csv; ``number`` and
# ``structured_text`` come from the extraction cell earlier in the notebook.
csv_output_path = f'output{number}.csv'
structured_text_to_csv(structured_text, csv_output_path)

In [49]:
import pandas as pd
# Read the span table back from disk.
# keep_default_na=False stops pandas from turning span text such as "NA",
# "N/A", "null" or an empty string into NaN, and the explicit str dtype keeps
# purely numeric spans (e.g. "4") as text.  Without both, the later
# "".join(...) over the text values raises TypeError on a non-string.
data = pd.read_csv(
    csv_output_path,
    dtype={"text": str},
    keep_default_na=False,
)

data

Unnamed: 0,page,text,font,size,origin_x,origin_y,bbox
0,1,LICENSED HACKNEY AND PRIVATE HIRE VEHICLES,Helvetica-Bold,11.999,30.362000,41.234985,"(30.36199951171875, 28.396055221557617, 333.69..."
1,1,LAST REFRESHED 19/04/2024 AT 11:00:25,Helvetica-Bold,11.999,565.106995,41.234985,"(565.1069946289062, 28.396055221557617, 807.88..."
2,1,LICENCE,Helvetica,8.999,30.362000,61.299011,"(30.36199951171875, 51.625083923339844, 71.865..."
3,1,REF,Helvetica,8.999,30.362000,71.307007,"(30.36199951171875, 61.633079528808594, 48.360..."
4,1,LICENCE,Helvetica,8.999,80.545998,61.299011,"(80.5459976196289, 51.625083923339844, 122.049..."
...,...,...,...,...,...,...,...
13183,49,DIESEL,Helvetica,8.999,453.075012,325.467010,"(453.07501220703125, 315.7930908203125, 485.08..."
13184,49,16/05/2019,Helvetica,8.999,508.154999,325.467010,"(508.1549987792969, 315.7930908203125, 553.185..."
13185,49,4,Helvetica,8.999,577.851013,325.467010,"(577.8510131835938, 315.7930908203125, 582.854..."
13186,49,NO,Helvetica,8.999,634.442993,325.467010,"(634.4429931640625, 315.7930908203125, 647.941..."


In [50]:
def _concat_consecutive_spans(spans):
    """Merge runs of consecutive spans that share a page and an ``origin_x``.

    Rows are visited in frame order.  A new group starts whenever the page or
    the x-origin differs from the previous row (exact comparison); otherwise
    the row's text is appended to the current group.  Groups are numbered
    from 0 on every page.

    Args:
        spans: DataFrame with ``page``, ``text``, ``origin_x`` and
            ``origin_y`` columns.

    Returns:
        A list of dicts with the keys ``page``, ``page_index``, ``origin_x``,
        ``origin_y`` (the y-origin of the group's first span) and
        ``concatenated_text`` (the group's texts joined without a separator).
    """

    def finish(group):
        # Single place that turns an accumulated group into an output record.
        return {
            "page": group["page"],
            "page_index": group["page_index"],
            "origin_x": group["origin_x"],
            "origin_y": group["origin_y"],
            "concatenated_text": "".join(group["parts"]),
        }

    groups = []
    current = None  # group being accumulated; None before the first row

    for row in spans.itertuples(index=False):
        # A CSV round trip can yield NaN or numeric values in the text
        # column; coerce them so the join in finish() always sees strings.
        text = "" if pd.isna(row.text) else str(row.text)

        same_page = current is not None and row.page == current["page"]
        if same_page and row.origin_x == current["origin_x"]:
            current["parts"].append(text)
            continue

        # Page or x-origin changed: close the running group and open a new one.
        if current is not None:
            groups.append(finish(current))
        current = {
            "page": row.page,
            # Numbering restarts at 0 on every new page.
            "page_index": current["page_index"] + 1 if same_page else 0,
            "origin_x": row.origin_x,
            "origin_y": row.origin_y,
            "parts": [text],
        }

    # Close the final group, if any rows were seen at all.
    if current is not None:
        groups.append(finish(current))
    return groups


# One record per group of consecutive same-column spans
new_data = _concat_consecutive_spans(data)

# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame(new_data)

# Display the new DataFrame
result_df

Unnamed: 0,page,page_index,origin_x,origin_y,concatenated_text
0,1,0,30.362000,41.234985,LICENSED HACKNEY AND PRIVATE HIRE VEHICLES
1,1,1,565.106995,41.234985,LAST REFRESHED 19/04/2024 AT 11:00:25
2,1,2,30.362000,61.299011,LICENCE REF
3,1,3,80.545998,61.299011,LICENCE TYPE
4,1,4,138.794998,61.299011,ISSUE DATE EXPIRY
...,...,...,...,...,...
11633,49,132,453.075012,325.467010,DIESEL
11634,49,133,508.154999,325.467010,16/05/2019
11635,49,134,577.851013,325.467010,4
11636,49,135,634.442993,325.467010,NO


In [51]:
# Persist the grouped rows to concat_output<number>.csv.  to_csv is called
# with its defaults, so the DataFrame index is written as the first column.
result_df.to_csv(f'concat_output{number}.csv')

What the template can be:


Line/Index number for 3 of the following for the first two pages (if available, after concatenation):

- VRM
- Make
- Model
- Source
- Record Type
- Date From
- Date To
- Date Received
- Council

Supplied manually

In [52]:
# Manually supplied template: page number -> field name -> row positions.
# Each tuple lists positions within that page's rows of ``result_df``;
# extract_data_using_template looks them up with ``.iloc``.  ``None`` marks a
# field for which no positions have been supplied.
_TEMPLATE_1_PAGE_ENTRY = {
    "vrm": (20, 31, 42),
    "make": (19, 30, 41),
    "model": None,
    "source": None,
    "record_type": None,
}

# Pages 1-4 currently use identical positions; each page gets its own copy of
# the entry so a single page can be adjusted without touching the others.
template_1 = {page_no: dict(_TEMPLATE_1_PAGE_ENTRY) for page_no in range(1, 5)}



In [53]:
# Highest 0-based row position on each page of result_df, i.e. the number of
# rows belonging to that page minus one.
max_index_per_page = {
    page_no: len(result_df[result_df['page'] == page_no]) - 1
    for page_no in result_df['page'].unique()
}

In [54]:
# def extend_template(max_index_per_page, template, result_df):
#     """Extend the pattern of indexes to include the entire page"""


In [55]:
def extract_data_using_template(result_df, template, max_pages=4):
    """Extract VRM/make pairs from ``result_df`` at the positions in ``template``.

    For each of the first ``max_pages`` distinct pages of ``result_df`` that
    has an entry in ``template``, the ``"vrm"`` and ``"make"`` index tuples
    are walked in parallel.  Every index is a position within that page's
    rows, and the ``concatenated_text`` found there becomes the field value.
    Only the positions listed in the template are read; the pattern is not
    continued across the rest of the page.

    Args:
        result_df: DataFrame with at least the columns ``page`` and
            ``concatenated_text``.
        template: Mapping ``page -> {"vrm": indices, "make": indices, ...}``.
            Pages missing from the mapping are skipped, and a missing or
            ``None`` index tuple yields no rows for that page.
        max_pages: Number of leading distinct pages to consider (default 4,
            the previously hard-coded value); ``None`` considers every page.

    Returns:
        DataFrame with the columns ``vrm``, ``make``, ``model``, ``source``
        and ``record_type``.  The last three are always ``None``.  The
        columns are present even when no rows were extracted.
    """
    columns = ["vrm", "make", "model", "source", "record_type"]
    extracted_data = []

    for page in result_df['page'].unique()[:max_pages]:
        page_template = template.get(page)
        if not page_template:
            # No positions were supplied for this page.
            continue

        page_texts = result_df.loc[result_df['page'] == page, 'concatenated_text']
        row_count = len(page_texts)

        vrm_indices = page_template.get("vrm") or ()
        make_indices = page_template.get("make") or ()

        # Validate each (vrm, make) pair together: filtering the two index
        # lists independently could pair a VRM with the make of another row.
        for vrm_index, make_index in zip(vrm_indices, make_indices):
            if vrm_index < row_count and make_index < row_count:
                extracted_data.append({
                    "vrm": page_texts.iloc[vrm_index],
                    "make": page_texts.iloc[make_index],
                    "model": None,
                    "source": None,
                    "record_type": None,
                })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(extracted_data, columns=columns)

In [56]:
# Apply the manually supplied template_1 to the grouped rows and show the result.
extracted_df = extract_data_using_template(result_df, template_1)
extracted_df

Unnamed: 0,vrm,make,model,source,record_type
0,MK10 SWW,LTI TX4,,,
1,SF67 HUK,FORD,,,
2,SB54 CAB,VAUXHALL VIVARO,,,
3,DN66 HLA,VAUXHALL VIVARO,,,
4,YJ65 CHK,RENAULT TRAFIC,,,
5,T900 NYM,LTI TX4,,,
6,DIESEL,FORD ALLIED PROCAB CB54 BAX,,,
7,DIESEL,YH21 CZZ,,,
8,DIESEL,MK18 VAH,,,
9,AF14 ZZN,TOYOTA VERSO,,,


In [57]:
# Display the extraction result again (same frame as produced by the previous cell).
extracted_df

Unnamed: 0,vrm,make,model,source,record_type
0,MK10 SWW,LTI TX4,,,
1,SF67 HUK,FORD,,,
2,SB54 CAB,VAUXHALL VIVARO,,,
3,DN66 HLA,VAUXHALL VIVARO,,,
4,YJ65 CHK,RENAULT TRAFIC,,,
5,T900 NYM,LTI TX4,,,
6,DIESEL,FORD ALLIED PROCAB CB54 BAX,,,
7,DIESEL,YH21 CZZ,,,
8,DIESEL,MK18 VAH,,,
9,AF14 ZZN,TOYOTA VERSO,,,
