# Set up dependencies

In [1]:
import nest_asyncio
import os

from IPython.utils import docs
from llama_parse import LlamaParse

# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop (presumably needed by LlamaParse's loaders -- confirm).
nest_asyncio.apply()

# Paste the LlamaCloud key here, or leave it empty to keep whatever
# LLAMA_CLOUD_API_KEY is already exported in the environment.
LLAMA_CLOUD_API_KEY = ''
if LLAMA_CLOUD_API_KEY:
    # Only export a real value: writing '' would clobber a valid key that was
    # configured outside the notebook.
    os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_CLOUD_API_KEY

In [75]:
from typing import List, Tuple, Union

from enum import Enum

import re
import pandas as pd


class PDFPartsTypes(Enum):
    """
    Represents the types of PDF parts.
    """

    TEXT = 0
    TABLE = 1
    

MIN_TEXT_SIZE = 100


class MDParser:
    @staticmethod
    def _parse_md_table(md_table: str) -> pd.DataFrame | None:
        """
        Parses a Markdown table string and converts it into a Pandas DataFrame.

        :param md_table: Markdown string which represent a table
        :return: Pandas DataFrame -- parsed table
        """
        lines = md_table.strip().split("\n")
        headers = [h.strip() for h in lines[0].strip().split("|")[1:-1]]  # Split header row by '|'
        rows = [line.strip().split("|")[1:-1] for line in lines[2:]]    # Split data rows by '|'
        data = [[cell.strip() for cell in row if cell] for row in rows]  # Clean up cells        
        
        try:
            res_df = pd.DataFrame(data, columns=headers)
        except:
            res_df = None
        return res_df
    
    @staticmethod
    def _merge_tables(tables: List[pd.DataFrame]) -> List[pd.DataFrame]:
        def infer_column_types(df):
            inferred_types = []
            for col in df.columns:
                try:
                    # Try numeric conversion
                    pd.to_numeric(df[col])
                    inferred_types.append('num')
                except:
                    try:
                        # Try datetime conversion
                        pd.to_datetime(df[col], format='mixed')
                        inferred_types.append('dt')
                    except:
                        inferred_types.append('str')
            return inferred_types
        
        inx = 0
        
        while (inx + 1) < len(tables):
            # try to merge tables[inx] and tables[inx + 1]
            # check if they have the same number of columns
            # check if their columns have the similar type
            # !!! WARNING !!! It was supposed that the first table has the right columns names
            # !!! WARNING !!! and following similar table has hallucinated columns names
            if tables[inx].shape[1] == tables[inx + 1].shape[1]:
                col_types_t0 = infer_column_types(tables[inx])
                col_types_t1 = infer_column_types(tables[inx + 1])
                if col_types_t0 == col_types_t1:
                    tables[inx + 1].columns = tables[inx].columns
                    tables[inx] = pd.concat([tables[inx], tables[inx + 1]], ignore_index=True).reset_index(drop=True)
                    del tables[inx + 1]
                    
                    continue # prevent considering the next table    
            inx += 1
        return tables
    
    @staticmethod
    def _set_col_type_for_df_tables(df_tables: List[pd.DataFrame]) -> List[pd.DataFrame]:
        """
        Converts a list of pd.DataFrame tables into the same pd.DataFrame, 
        but with column type changes.
        
        :param df_tables: list of pd.DataFrame tables with only str-typed columns
        :return: list of pd.DataFrame tables with int-, float-, bool-, datatime- or str-typed columns
        """
        
        def convert_column_type(col):
            # Try to convert to integers
            try:
                return pd.to_numeric(col, errors='raise', downcast='integer')
            except (ValueError, TypeError):
                pass
        
            # Try to convert to floats
            try:
                return pd.to_numeric(col, errors='raise', downcast='float')
            except (ValueError, TypeError):
                pass
        
            # Try to convert to booleans
            bool_values = {'True', 'False', True, False}
            if set(col.dropna().unique()).issubset(bool_values):
                return col.apply(lambda x: True if x in ['True', True] else False if x in ['False', False] else x)
        
            # Try to convert to datetime
            try:
                return pd.to_datetime(col, format='mixed', errors='raise')
            except (ValueError, TypeError):
                pass
        
            # If all else fails, return the column as a string
            return col
        
        df_typed_tables: List[pd.DataFrame] = []
        
        for df in df_tables:
            df_typed = pd.DataFrame()
            for col in df.columns:
                df_typed[col] = convert_column_type(df[col])
            df_typed_tables.append(df_typed)
        
        return df_typed_tables

    @staticmethod
    def parse_md_page(md_text: str) -> List[Tuple[Union[pd.DataFrame, str], PDFPartsTypes]]:
        """
        Extracts tables and text from Markdown in the same order they appear.
        Tables are returned as Pandas DataFrames, and text parts as strings.

        :param md_text: Markdown string which represent PDF-document or PDF-page
        :return: List[Tuple[Union[pd.DataFrame, str], PDFPartsTypes]]
        """
        content = []

        # Regular expression to match Markdown tables, allowing for optional spaces and missing cells
        table_pattern = re.compile(
            r'(\|[^\n]+\|\n(?:\|[^\n]+\|\n)*)',
            re.MULTILINE
        )
        
        # Split the text by tables
        parts = table_pattern.split(md_text + "\n")

        for part in parts:
            part = part.strip()
            if table_pattern.match(part):  # It's a table
                df = MDParser._parse_md_table(part)
                # check if table parsing is successful
                content.append((df, PDFPartsTypes.TABLE) if df is not None else (part.strip(), PDFPartsTypes.TEXT))
            elif part:  # Non-empty text
                content.append((str(part), PDFPartsTypes.TEXT))

        return content

    @staticmethod
    def parse_md_doc(md_texts: List[str]) -> List[Tuple[Union[str, pd.DataFrame], PDFPartsTypes]]:
        text_parts_processed = []
        page_parts = []
        combined_md_text = ""
        collected_tables = []
        
        # Here might be a table parsing logic which use table position in text,
        # so I keep pages after collecting tables
        for md_text in md_texts:
            page_parts.append(MDParser.parse_md_page(md_text))

        # Cleaning redundant text parts and table combining procedure
        for page_part in page_parts:
            for pdata, pd_type in page_part:
                if pd_type == PDFPartsTypes.TEXT:
                    # DO NOT add to final text too small pieces of text
                    # usually they are redundant or hallucinations
                    pdata = pdata.strip()
                    combined_md_text += pdata + "\n" if (len(combined_md_text) > 0 or len(pdata) > MIN_TEXT_SIZE) else ""
                elif pd_type == PDFPartsTypes.TABLE:
                    collected_tables.append(pdata)
        collected_tables = MDParser._merge_tables(collected_tables)
        collected_tables = MDParser._set_col_type_for_df_tables(collected_tables)
        
        if combined_md_text:
            text_parts_processed.append((combined_md_text, PDFPartsTypes.TEXT))
        text_parts_processed.extend([(df_table, PDFPartsTypes.TABLE) for df_table in collected_tables])
        
        return text_parts_processed

# Parse with the most advanced parsing variant (openai-gpt4o)
(10 credits per page)

## "IKEA LAGAN User Manual.pdf"

In [76]:
# Parse the PDF to markdown with LlamaParse's vendor multimodal mode.
pdf_name = "IKEA LAGAN User Manual"
pdf_path = f"./_data_pdf_unlocking_with_python/{pdf_name}.pdf"
vendor_multimodal_model_name = "openai-gpt4o"

parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name=vendor_multimodal_model_name,
)
doc = parser.load_data(pdf_path)
print(f"Num of parsed pages: {len(doc)}")

Started parsing the file under job_id e9b98fdc-7bee-415c-ac88-d774e0e241f9
Num of parsed pages: 20


In [77]:
# Inspect the raw markdown returned for the first parsed page.
doc[0].text

'NO_CONTENT_HERE'

In [78]:
# Feed the markdown of every parsed page to the document-level parser.
pres = MDParser.parse_md_doc([page.text for page in doc])

In [79]:
# Summarise every parsed part: shape for tables, character count for text.
for inx, (part, part_type) in enumerate(pres):
    size = part.shape if part_type is PDFPartsTypes.TABLE else len(part)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TEXT 26653
1: PDFPartsTypes.TABLE (2, 2)
2: PDFPartsTypes.TABLE (3, 6)
3: PDFPartsTypes.TABLE (6, 2)
4: PDFPartsTypes.TABLE (12, 3)
5: PDFPartsTypes.TABLE (15, 2)
6: PDFPartsTypes.TABLE (34, 4)


In [12]:
# Print the combined text part, which parse_md_doc places first in its result.
print(pres[0][0])

# Contents

- Safety instructions 4
- Product description 6
- Control panel 6
- Programmes 6
- Options 7
- Before first use 8
- Daily use 9
- Hints and tips 11
- Care and cleaning 12
- Troubleshooting 13
- Technical information 15
- Environment concerns 16
- IKEA GUARANTEE 16

Subject to change without notice.

## Safety instructions

Before the installation and use of the appliance, carefully read the supplied instructions. The manufacturer is not responsible if an incorrect installation and use causes injuries and damages. Always keep the instructions with the appliance for future reference. Only a qualified service engineer can repair this appliance. Use only original spare parts.  
To prevent injury and damage to the appliance, do not try to repair the appliance. Always contact the After Sales Service.

### Children and vulnerable people safety


- Do not let persons, children included, with reduced physical sensory, reduced mental functions or lack of experience and knowledge use 

In [80]:
# Preview the first rows of the table at index 1 of the parsed result.
pres[1][0].head()

Unnamed: 0,Indicators,Description
0,Salt indicator. It is always off while the pro...,
1,End indicator.,


In [14]:
# Preview the first rows of the table at index 2 of the parsed result.
pres[2][0].head()

Unnamed: 0,Programme 1),Degree of soil Type of load,Programme phases,Duration (min),Energy (kWh),Water (l)
0,![70°](image) 2),"Heavy soil Crockery, cutlery, pots and pans",Prewash Wash 70 °C Rinses Dry,130 - 150,1.3 - 1.4,13 - 15
1,![65°](icon),Normal soil Crockery and cutlery,Prewash Wash 65 °C Rinses Dry,100 - 110,1.2 - 1.6,15 - 16
2,ECO 50° 3),Normal soil Crockery and cutlery,Prewash Wash 50 °C Rinses Dry,195,1.02,15


In [15]:
# Preview the first rows of the table at index 3 of the parsed result.
pres[3][0].head()

Unnamed: 0,Alarm code,Problem
0,• The indicator of the set programme flashes c...,The appliance does not fill with water.
1,• The end indicator flashes 1 time intermitten...,
2,• The indicator of the set programme flashes c...,The appliance does not drain the water.
3,• The end indicator flashes 2 times intermitte...,
4,• The indicator of the set programme flashes c...,The anti-flood device is on.


In [16]:
# Preview the first rows of the table at index 4 of the parsed result.
pres[4][0].head()

Unnamed: 0,Problem,Possible cause,Possible solution
0,The programme does not start.,The mains plug is not connected in the mains s...,Connect the mains plug.
1,,The appliance door is open.,Close the appliance door.
2,,The fuse in the fuse box is damaged.,Replace the fuse.
3,The appliance does not fill with water.,The water tap is closed.,Open the water tap.
4,,The water pressure is too low.,Contact your local water authority.


In [81]:
# Preview the first rows of the table at index 5 of the parsed result.
pres[5][0].head()

Unnamed: 0,Technical Information,Details
0,Dimensions,Width / Height / Depth (mm)
1,,596 / 818 - 898 / 555
2,Electrical connection,Refer to the rating plate.
3,Voltage,220-240 V
4,Frequency,50 Hz


## "PQ Series Alloy Steel Mopar Shaft Mount Rocker User Manual"

In [19]:
# Parse the PDF to markdown with LlamaParse's vendor multimodal mode.
pdf_name = "PQ Series Alloy Steel Mopar Shaft Mount Rocker User Manual"
pdf_path = f"./_data_pdf_unlocking_with_python/{pdf_name}.pdf"
vendor_multimodal_model_name = "openai-gpt4o"

parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name=vendor_multimodal_model_name,
)
doc = parser.load_data(pdf_path)
print(f"Num of parsed pages: {len(doc)}")

Started parsing the file under job_id d3b14a53-0c13-45d6-931d-b44edbd62e51
Num of parsed pages: 2


In [21]:
# Feed the markdown of every parsed page to the document-level parser.
pres = MDParser.parse_md_doc([page.text for page in doc])

In [22]:
# Summarise every parsed part: shape for tables, character count for text.
for inx, (part, part_type) in enumerate(pres):
    size = part.shape if part_type is PDFPartsTypes.TABLE else len(part)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TEXT 3050
1: PDFPartsTypes.TABLE (11, 2)


In [23]:
# Print the combined text part, which parse_md_doc places first in its result.
print(pres[0][0])

## Quick Tips

1. All pushrods in a set do not wear evenly or uniformly. Over time each will wear in a unique fashion as it “mates” with the individual rocker arm that it is paired with. Consequently, old pushrods should be replaced when new rocker arms are installed.

2. Make sure rocker arms and other components are well lubricated. Proper lubrication of all parts can help prevent premature wear and/or permanent damage.

3. Check for any interference between rockers and valve covers.

4. Always use hardened shafts. Stock or unhardened shafts will result in premature wear and rocker arm failure.

5. Make sure oil holes on shafts are facing down.

## Installation

### Step 1
After removing the rockers and other components from the package thoroughly wash each piece, and then blow dry.

### Step 2
Totally submerge the rockers in engine oil for approximately 30 minutes prior to installation to ensure the needle bearings are lubricated.
# Step 3
Begin assembly by mounting rocker arms, was

In [25]:
# Show the last rows of the table at index 1 of the parsed result.
pres[1][0].tail()

Unnamed: 0,ITEM,QUANTITY
6,Thick Spacers,8 each
7,Shim Kit,48 pcs
8,Large Hold Downs,4 each
9,Small Hold Downs,6 each
10,Extreme Pressure Lube #3,1 1/4oz. tube


## "Test-products-international 183A User Manual"

In [26]:
# Parse the PDF to markdown with LlamaParse's vendor multimodal mode.
pdf_name = "Test-products-international 183A User Manual"
pdf_path = f"./_data_pdf_unlocking_with_python/{pdf_name}.pdf"
vendor_multimodal_model_name = "openai-gpt4o"

parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name=vendor_multimodal_model_name,
)
doc = parser.load_data(pdf_path)
print(f"Num of parsed pages: {len(doc)}")

Started parsing the file under job_id 0ddcd5e6-c7e4-489b-9fbb-18cd8082dcf9
Num of parsed pages: 15


In [27]:
# Feed the markdown of every parsed page to the document-level parser.
pres = MDParser.parse_md_doc([page.text for page in doc])

In [28]:
# Summarise every parsed part: shape for tables, character count for text.
for inx, (part, part_type) in enumerate(pres):
    size = part.shape if part_type is PDFPartsTypes.TABLE else len(part)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TEXT 21174
1: PDFPartsTypes.TABLE (40, 4)
2: PDFPartsTypes.TABLE (1, 2)
3: PDFPartsTypes.TABLE (2, 3)
4: PDFPartsTypes.TABLE (13, 4)
5: PDFPartsTypes.TABLE (10, 3)
6: PDFPartsTypes.TABLE (10, 2)
7: PDFPartsTypes.TABLE (12, 5)
8: PDFPartsTypes.TABLE (1, 3)
9: PDFPartsTypes.TABLE (1, 5)
10: PDFPartsTypes.TABLE (1, 2)
11: PDFPartsTypes.TABLE (2, 5)
12: PDFPartsTypes.TABLE (5, 2)


In [29]:
# Print the combined text part, which parse_md_doc places first in its result.
print(pres[0][0])

# TABLE OF CONTENTS

## A. INTRODUCTION
1. Congratulations . . . . . . . . . . . . . . . . . .3
2. Product Description . . . . . . . . . . . . . . . .3
3. EC Declaration of Conformity . . . . . . .4

## B. SAFETY CONSIDERATIONS . . . . . . . . . . .5

## C. TECHNICAL DATA
1. Features and Benefits . . . . . . . . . . . . . .6
2. Product Applications . . . . . . . . . . . . . .7
3. Specifications . . . . . . . . . . . . . . . . . . .8-11

## D. MEASUREMENT TECHNIQUES
1. Controls and Functions . . . . . . . . . . . .12
   a) Push Buttons . . . . . . . . . . . . . . . . . .12  
   b) Rotary Switch . . . . . . . . . . . . . . . . .13  
   c) Input Jacks . . . . . . . . . . . . . . . . . . .13  
   d) Disable Auto Power Off . . . . . . . . . .13  
2. Step by Step Procedures:  
   a) Measuring DC Volts . . . . . . . . . . . . .14  
   b) Measuring AC Volts . . . . . . . . . . . . .15  
   c) Measuring DC Amps . . . . . . . . . . . . .16  
   d) Measuring AC Amps . . . . . . . . . . . . .17  


In [30]:
# Preview the first rows of the table at index 1 of the parsed result.
pres[1][0].head()

Unnamed: 0,Range,Resolution,Accuracy,Impedance
0,40mV,0.01mV,±(0.5% + 2 digits),10MΩ
1,400mV,0.1mV,,
2,400mV,0.1mV,±(0.8% + 2 digits),10MΩ
3,4V,0.001V,,
4,40V,0.01V,±(0.5% + 2 digits),


In [31]:
# Preview the first rows of the table at index 2 of the parsed result.
pres[2][0].head()

Unnamed: 0,Range,"0.1 ~ 99.9% (0.5Hz to 500kHz, Width > 2uS)"
0,Accuracy,((0.1% + 0.05% / kHz) +1 Count


In [32]:
# Preview the first rows of the table at index 3 of the parsed result.
pres[3][0].head()

Unnamed: 0,Test Voltage,Max Test Current,Over Load Protection
0,2.7V,Approx. 1mA,600 V DC or Peak AC
1,0.6V,< 30Ω,600 V DC or Peak AC


In [33]:
# Preview the first rows of the table at index 4 of the parsed result.
pres[4][0].head()

Unnamed: 0,Range,Resolution,Accuracy,Overload Protection
0,40nF,0.01nF,,
1,400nF,0.1nF,,
2,4uF,0.001nF,±(3.0% +10 digits),600V DC or AC Peak
3,40uF,0.01uF,,
4,400uF,0.1uF,,


In [34]:
# Preview the first rows of the table at index 5 of the parsed result.
pres[5][0].head()

Unnamed: 0,Range,Resolution,Accuracy
0,CENTIGRADE,,
1,-40° to 10°C,0.1°,±(3.0% + 5°C)
2,10° to 200°C,0.1°,±(1.0% + 3°C)
3,200° to 400°C,0.1°,±(2.0% + 5°C)
4,400° to 1300°C,1°C,±(3.0% + 7°C)


In [35]:
# Preview the first rows of the table at index 6 of the parsed result.
pres[6][0].head()

Unnamed: 0,Specification,Details
0,Max. Volt. between any Input and Ground,1000V
1,Fuse Protection,mA: 0.5Amp/600VAC A: 10Amp/600VAC
2,Display Type,"4,000 Count, 2 times per second update"
3,Operating Temp.,0° to 40°C (32° to 104°F)
4,Storage Temp.,-10° to 50°C (14° to 122°F)


In [36]:
# Preview the first rows of the table at index 7 of the parsed result.
pres[7][0].head()

Unnamed: 0,FUNCTION,BLACK TEST LEAD,RED TEST LEAD,MINIMUM READING,MAXIMUM READING
0,mV,COM,VΩHz,0.1mV,400.0mV
1,V,COM,VΩHz,0.001V,1000V
2,V~,COM,VΩHz,0.001V,750V
3,40µA,COM,µAmA,0.01µA,40µA
4,µA,COM,µAmA,0.1µA,4000µA


In [37]:
# Preview the first rows of the table at index 8 of the parsed result.
pres[8][0].head()

Unnamed: 0,Instrument set-up:,BLACK TEST LEAD,RED TEST LEAD
0,FUNCTION,COM,VΩHz


In [38]:
# Preview the first rows of the table at index 9 of the parsed result.
pres[9][0].head()

Unnamed: 0,Instrument set-up:,BLACK TEST LEAD,RED TEST LEAD,MINIMUM READING,MAXIMUM READING
0,CAP,COM,VΩHz,0.01nF,4000uF


In [39]:
# Preview the first rows of the table at index 10 of the parsed result.
pres[10][0].head()

Unnamed: 0,REL,Activate REL mode (page 24).
0,HOLD,Freeze the reading on the LCD.


In [40]:
# Preview the first rows of the table at index 11 of the parsed result.
pres[11][0].head()

Unnamed: 0,FUNCTION,BLACK TEST LEAD,RED TEST LEAD,MINIMUM READING,MAXIMUM READING
0,Hz,COM,VΩHz,0.001Hz,10MHz
1,TEMP,COM,μmATEMP,0.1°F / 0.1°C,2462°F / 1350°C


In [41]:
# Preview the first rows of the table at index 12 of the parsed result.
pres[12][0].head()

Unnamed: 0,Problem,Probable Causes
0,**Does not power up**,- Dead or defective battery
1,,- Broken wire from battery snap to PCB
2,**Won't display current readings**,- Open fuse
3,,- Open test lead
4,,- Improperly connected to circuit under test


## "Test-products-international 565C1 User Manual"

In [42]:
# Parse the PDF to markdown with LlamaParse's vendor multimodal mode.
pdf_name = "Test-products-international 565C1 User Manual"
pdf_path = f"./_data_pdf_unlocking_with_python/{pdf_name}.pdf"
vendor_multimodal_model_name = "openai-gpt4o"

parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name=vendor_multimodal_model_name,
)
doc = parser.load_data(pdf_path)
print(f"Num of parsed pages: {len(doc)}")

Started parsing the file under job_id d77d1cd4-0fd7-4f29-b4fd-2cc4ae1c2ac3
Num of parsed pages: 4


In [43]:
# Feed the markdown of every parsed page to the document-level parser.
pres = MDParser.parse_md_doc([page.text for page in doc])

In [44]:
# Summarise every parsed part: shape for tables, character count for text.
for inx, (part, part_type) in enumerate(pres):
    size = part.shape if part_type is PDFPartsTypes.TABLE else len(part)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TEXT 5935
1: PDFPartsTypes.TABLE (1, 2)


In [45]:
# Print the combined text part, which parse_md_doc places first in its result.
print(pres[0][0])

![TPI Logo](https://www.tpi-thevalueleader.com)

Visit TPI, The Value Leader™ at:  
[www.tpi-thevalueleader.com](http://www.tpi-thevalueleader.com)

# TPI 565

![TPI 565 Device](image-of-device)

**Hotwire Air Velocity Meter**

----

Test Products International, Inc.  
Headquarters:  
9615 SW Allen Blvd.  
Beaverton, OR 97005  
USA  
503-520-9197  •  Fax: 503-520-1225  
e-mail: info@tpi-thevalueleader.com

Test Products International, Ltd.  
342 Bronte St. South Unit #9  
Milton, Ontario L9T 5B7  
Canada  
905-693-8558  •  Fax: 905-693-0888  
e-mail: info@tpicanada.com

Test Products International UK Ltd.  
Longley House, East Park  
Crawley, West Sussex RH10 6AP  
England  
Tel: +44 (0)1293 561212  
Fax: +44 (0)1293 813465  
e-mail: info@tpi-uk.com

----

555M © 2004
# Contents

3 Introduction  
4 Instrument Overview  
4 Operating Instructions  
5 Volume Measurements CFM  
5 Recording Data  
6 RS232 Interface  
6 Specifications  
7 Calibration & Service  
7 Guarantee  
7 Troubleshooti

In [46]:
# Preview the first rows of the table at index 1 of the parsed result.
pres[1][0].head()

Unnamed: 0,Problem,Solution
0,Unit will not turn on,"Battery voltage is low, change batteries."


# Test pd.DataFrame table columns typing

In [82]:
# Parse the orders PDF used to check column type inference.
vendor_multimodal_model_name = "openai-gpt4o"

parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name=vendor_multimodal_model_name,
)
doc = parser.load_data("../Markdown_parsing/_data_test_pdfs_custom/autozaz_details_orders - orders_real.pdf")
print(f"Num of parsed pages: {len(doc)}")

Started parsing the file under job_id 5d117049-2278-4335-9dc5-e352bca5d48e
Num of parsed pages: 3


In [85]:
# Parse all pages, then summarise every part: shape for tables, length for text.
pres = MDParser.parse_md_doc([page.text for page in doc])
for inx, (part, part_type) in enumerate(pres):
    size = part.shape if part_type is PDFPartsTypes.TABLE else len(part)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TABLE (100, 7)


In [84]:
# Display the single table returned for this document.
pres[0][0]

Unnamed: 0,order_id,product_name,price_per_unit,units_sold,amount_of_discount_total,deal_date,customer_name
0,1,Alternator,350,5,157.5,2024-01-01,Emily Johnson
1,2,Engine Block,500,5,100.0,2024-01-02,Emily Johnson
2,3,Engine Block,500,4,120.0,2024-01-03,Scott Nilson
3,4,Clutch Plate,180,1,7.2,2024-01-04,Scott Nilson
4,5,Transmission Gear,300,9,189.0,2024-01-05,Scott Nilson
...,...,...,...,...,...,...,...
95,96,Engine Block,500,9,90.0,2023-06-23,Emily Johnson
96,97,Fuel Injector,250,4,40.0,2023-06-24,Emily Johnson
97,98,Radiator,200,2,16.0,2023-06-25,Emily Johnson
98,99,Fuel Injector,250,9,67.5,2023-06-26,Emily Johnson


In [86]:
# Inspect the column dtypes assigned by MDParser._set_col_type_for_df_tables.
pres[0][0].dtypes

order_id                              int8
product_name                        object
price_per_unit                       int16
units_sold                            int8
amount_of_discount_total           float64
deal_date                   datetime64[ns]
customer_name                       object
dtype: object