In [3]:
from enum import Enum


class PDFPartsTypes(Enum):
    """
    Represents the types of PDF parts.
    """

    TEXT = 0
    TABLE = 1

In [4]:
test_list = [(0, PDFPartsTypes.TEXT), (1, PDFPartsTypes.TABLE), (2, PDFPartsTypes.TABLE), (3, PDFPartsTypes.TABLE), (4, PDFPartsTypes.TEXT)]

In [5]:
[pdf_part for pdf_part, part_type in test_list if part_type is PDFPartsTypes.TABLE]

[1, 2, 3]

In [6]:
[pdf_part for pdf_part, part_type in test_list if part_type is PDFPartsTypes.TEXT]

[0, 4]

In [7]:
type(PDFPartsTypes.TEXT)

<enum 'PDFPartsTypes'>

In [8]:
PDFPartsTypes.TEXT

<PDFPartsTypes.TEXT: 0>

In [17]:
from typing import List, Tuple, Union
from enum import Enum

import pandas as pd


def parse_md_text(md_text: str) -> List[Tuple[Union[str, pd.DataFrame], PDFPartsTypes]]:
    print("OK")
    return [("0", PDFPartsTypes.TEXT), ("1", PDFPartsTypes.TABLE), (pd.DataFrame(), PDFPartsTypes.TABLE), (None, PDFPartsTypes.TABLE), ("4", PDFPartsTypes.TEXT)]

In [18]:
parse_md_text(1)

OK


[('0', <PDFPartsTypes.TEXT: 0>),
 ('1', <PDFPartsTypes.TABLE: 1>),
 (Empty DataFrame
  Columns: []
  Index: [],
  <PDFPartsTypes.TABLE: 1>),
 (None, <PDFPartsTypes.TABLE: 1>),
 ('4', <PDFPartsTypes.TEXT: 0>)]

In [16]:
parse_md_text("3")

OK


[('0', <PDFPartsTypes.TEXT: 0>),
 ('1', <PDFPartsTypes.TABLE: 1>),
 (Empty DataFrame
  Columns: []
  Index: [],
  <PDFPartsTypes.TABLE: 1>),
 ('3', <PDFPartsTypes.TABLE: 1>),
 ('4', <PDFPartsTypes.TEXT: 0>)]

In [19]:
import markdown
from bs4 import BeautifulSoup
import pandas as pd


def markdown_to_html(md_text):
    """
    Convert Markdown text to HTML using the markdown library.
    Enable the 'tables' extension to properly parse tables.
    """
    return markdown.markdown(md_text, extensions=['tables'])


def extract_html_tables(html):
    """
    Extract all tables from HTML content using BeautifulSoup.
    Returns a list of BeautifulSoup table objects.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    return tables


def html_tables_to_dataframes(html_tables):
    """
    Convert a list of BeautifulSoup table objects to Pandas DataFrames.
    Returns a list of DataFrames.
    """
    dataframes = []
    for table in html_tables:
        # Convert the BeautifulSoup table back to a string for pandas to read
        table_str = str(table)
        df = pd.read_html(table_str)[0]  # Convert the first (and likely only) table to DataFrame
        dataframes.append(df)
    return dataframes


def extract_tables_from_markdown(md_text):
    """
    Extract tables from Markdown text and return them as a list of Pandas DataFrames.
    """
    # Step 1: Convert Markdown to HTML
    html = markdown_to_html(md_text)

    # Step 2: Extract HTML tables
    html_tables = extract_html_tables(html)

    # Step 3: Convert HTML tables to Pandas DataFrames
    dataframes = html_tables_to_dataframes(html_tables)

    return dataframes

In [20]:
sample_md = """
# Sample Markdown Document with Tables

Here is a table of employees:

| Name    | Age | Department |
|---------|-----|------------|
| Alice   | 30  | HR         |
| Bob     | 25  | Engineering|
| Charlie | 35  | Marketing  |

Another table of products:

| Product | Price | Quantity |
|---------|-------|----------|
| Book    | 12.99 | 100      |
| Pen     | 1.99  | 500      |
| Laptop  | 999.99| 50       |
"""

# Extract tables
tables = extract_tables_from_markdown(sample_md)

# Print DataFrames
for idx, df in enumerate(tables, start=1):
    print(f"Table {idx}:")
    print(df)
    print("\n")

Table 1:
      Name  Age   Department
0    Alice   30           HR
1      Bob   25  Engineering
2  Charlie   35    Marketing


Table 2:
  Product   Price  Quantity
0    Book   12.99       100
1     Pen    1.99       500
2  Laptop  999.99        50




In [21]:
markdown_to_html(sample_md)

'<h1>Sample Markdown Document with Tables</h1>\n<p>Here is a table of employees:</p>\n<table>\n<thead>\n<tr>\n<th>Name</th>\n<th>Age</th>\n<th>Department</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>Alice</td>\n<td>30</td>\n<td>HR</td>\n</tr>\n<tr>\n<td>Bob</td>\n<td>25</td>\n<td>Engineering</td>\n</tr>\n<tr>\n<td>Charlie</td>\n<td>35</td>\n<td>Marketing</td>\n</tr>\n</tbody>\n</table>\n<p>Another table of products:</p>\n<table>\n<thead>\n<tr>\n<th>Product</th>\n<th>Price</th>\n<th>Quantity</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>Book</td>\n<td>12.99</td>\n<td>100</td>\n</tr>\n<tr>\n<td>Pen</td>\n<td>1.99</td>\n<td>500</td>\n</tr>\n<tr>\n<td>Laptop</td>\n<td>999.99</td>\n<td>50</td>\n</tr>\n</tbody>\n</table>'

In [23]:
import markdown
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Tuple, Union

def markdown_to_html(md_text):
    """
    Convert Markdown text to HTML using the markdown library.
    Enable the 'tables' extension to properly parse tables.
    """
    return markdown.markdown(md_text, extensions=['tables'])

def extract_html_tables_and_text(html):
    """
    Extract text and tables from HTML content using BeautifulSoup.
    Return a list of text and DataFrames in the order they are found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    result = []
    
    # Iterate over all elements in the HTML to preserve order
    for element in soup.body.children:
        if element.name == 'p':  # Paragraph text
            result.append((element.get_text(), "text"))
        elif element.name == 'table':  # Table
            # Convert the table into a DataFrame
            table_str = str(element)
            df = pd.read_html(table_str)[0]  # Convert HTML table to DataFrame
            result.append((df, "table"))
    
    return result

def extract_text_and_tables_from_markdown(md_text):
    """
    Extract both text and tables from Markdown text and return them in the order
    they appear as a list of tuples (text or DataFrame, type).
    """
    # Step 1: Convert Markdown to HTML
    html = markdown_to_html(md_text)
    
    # Step 2: Extract text and tables from HTML in order
    elements_in_order = extract_html_tables_and_text(html)
    
    return elements_in_order

In [24]:
sample_md = """
# Sample Markdown Document

This is some introductory text.

Here is a table of employees:

| Name    | Age | Department |
|---------|-----|------------|
| Alice   | 30  | HR         |
| Bob     | 25  | Engineering|
| Charlie | 35  | Marketing  |

This is some text after the table.

Another table of products:

| Product | Price | Quantity |
|---------|-------|----------|
| Book    | 12.99 | 100      |
| Pen     | 1.99  | 500      |
| Laptop  | 999.99| 50       |

Some final text after the second table.
"""

# Extract text and tables
elements = extract_text_and_tables_from_markdown(sample_md)

# Print the extracted elements
for element, elem_type in elements:
    if elem_type == "text":
        print("Text:")
        print(element)
    elif elem_type == "table":
        print("Table:")
        print(element)
    print("\n")

AttributeError: 'NoneType' object has no attribute 'children'

In [25]:
import markdown
from bs4 import BeautifulSoup
import pandas as pd
import re

def extract_tables_and_text(md_text: str):
    """
    Extracts tables and text from Markdown and returns them in the same order.
    Tables are returned as DataFrames, while text parts remain as strings.
    """
    html = markdown.markdown(md_text, extensions=['tables'])
    soup = BeautifulSoup(html, 'html.parser')
    
    content = []
    
    # Identify table and non-table elements
    elements = soup.find_all(['p', 'table'])
    
    for elem in elements:
        if elem.name == 'table':
            df = pd.read_html(str(elem))[0]  # Convert table to DataFrame
            content.append(df)
        else:
            text = elem.get_text(strip=True)
            if text:
                content.append(text)
    
    return content

In [27]:
# Example usage
md_text = """
Here is some text before the table.

| Name    | Age | Department |
|---------|-----|------------|
| Alice   | 30  | HR         |
| Bob     | 25  | Engineering|
| Charlie | 35  | Marketing  |

This is more text after the table.
"""
result = extract_tables_and_text(md_text)

# Output the result
for inx_, item in enumerate(result):
    print(f"Item no: {inx_}")
    print(item)
    print("\n")

Item no: 0
Here is some text before the table.


Item no: 1
      Name  Age   Department
0    Alice   30           HR
1      Bob   25  Engineering
2  Charlie   35    Marketing


Item no: 2
This is more text after the table.




In [28]:
md_text = """
# Sample Markdown Document

This is some introductory text.

Here is a table of employees:

| Name    | Age | Department |
|---------|-----|------------|
| Alice   | 30  | HR         |
| Bob     | 25  | Engineering|
| Charlie | 35  | Marketing  |

This is some text after the table.

Another table of products:

| Product | Price | Quantity |
|---------|-------|----------|
| Book    | 12.99 | 100      |
| Pen     | 1.99  | 500      |
| Laptop  | 999.99| 50       |

Some final text after the second table.
"""

result = extract_tables_and_text(md_text)

# Output the result
for inx_, item in enumerate(result):
    print(f"Item no: {inx_}")
    print(item)
    print("\n")

Item no: 0
This is some introductory text.


Item no: 1
Here is a table of employees:


Item no: 2
      Name  Age   Department
0    Alice   30           HR
1      Bob   25  Engineering
2  Charlie   35    Marketing


Item no: 3
This is some text after the table.


Item no: 4
Another table of products:


Item no: 5
  Product   Price  Quantity
0    Book   12.99       100
1     Pen    1.99       500
2  Laptop  999.99        50


Item no: 6
Some final text after the second table.




In [7]:
import re
import pandas as pd
from typing import List, Union

def parse_table(md_table: str) -> pd.DataFrame:
    """
    Parses a Markdown table string and converts it into a Pandas DataFrame.
    """
    lines = md_table.strip().split("\n")
    headers = [h.strip() for h in lines[0].split("|")[1:-1]]  # Skip first and last empty strings
    rows = [line.split("|")[1:-1] for line in lines[2:]]  # Data rows (skip header separator)
    data = [[cell.strip() for cell in row] for row in rows]
    return pd.DataFrame(data, columns=headers)

def extract_tables_and_text(md_text: str) -> List[Union[str, pd.DataFrame]]:
    """
    Extracts tables and text from Markdown in the same order they appear.
    Tables are returned as Pandas DataFrames, and text parts as strings.
    """
    content = []
    
    # Regular expression for matching Markdown tables
    table_pattern = re.compile(
        r'((?:\|.+\|\n)+\|[-\s]+[-\|]+\|\n(?:\|.+\|\n)+)', re.MULTILINE
    )
    
    # Split the text by tables
    parts = table_pattern.split(md_text)
    
    for part in parts:
        part = part.strip()
        if table_pattern.match(part):  # It's a table
            df = parse_table(part)
            content.append(df)
        elif part:  # Non-empty text
            content.append(part)
    
    return content

In [13]:
# Example usage
md_text = """
# this is a paragraph
## this is a subparagraph
## this is a list
- a1
- a2
- a3
- b0
Here is some text before the table.

| Name    | Age | Department |
|---------|-----|------------|
| Alice   | 30  | HR         |
| Bob     | 25  | Engineering|
| Charlie | 35  | Marketing  |

This is more text after the table.

Here is another table:

| Product | Price | Quantity |
|---------|-------|----------|
| Book    | 12.99 | 100      |
| Pen     | 1.99  | 500      |
| Laptop  | 999.99| 50       |

Some final text after the second table.
"""
result = extract_tables_and_text(md_text)

# Output the result
for inx, item in enumerate(result):
    print(f"Item no: {inx} ------------------------------------------------------------------")
    print(f"Type of item: {type(item)}")
    print(item)
    print("\n")

Item no: 0 ------------------------------------------------------------------
Type of item: <class 'str'>
# this is a paragraph
## this is a subparagraph
## this is a list
- a1
- a2
- a3
- b0
Here is some text before the table.


Item no: 1 ------------------------------------------------------------------
Type of item: <class 'pandas.core.frame.DataFrame'>
      Name Age   Department
0    Alice  30           HR
1      Bob  25  Engineering
2  Charlie  35    Marketing


Item no: 2 ------------------------------------------------------------------
Type of item: <class 'str'>
This is more text after the table.

Here is another table:


Item no: 3 ------------------------------------------------------------------
Type of item: <class 'pandas.core.frame.DataFrame'>
  Product   Price Quantity
0    Book   12.99      100
1     Pen    1.99      500
2  Laptop  999.99       50


Item no: 4 ------------------------------------------------------------------
Type of item: <class 'str'>
Some final te

In [11]:
result[1]

Unnamed: 0,Name,Age,Department
0,Alice,30,HR
1,Bob,25,Engineering
2,Charlie,35,Marketing


In [14]:
import re
import pandas as pd
from typing import List, Union

def parse_table(md_table: str) -> pd.DataFrame:
    """
    Parses a Markdown table string and converts it into a Pandas DataFrame.
    """
    lines = md_table.strip().split("\n")
    headers = [h.strip() for h in re.split(r'\s*\|\s*', lines[0]) if h]  # Split header row by '|'
    rows = [re.split(r'\s*\|\s*', line) for line in lines[2:]]  # Split data rows by '|'
    data = [[cell.strip() for cell in row if cell] for row in rows]  # Clean up cells
    return pd.DataFrame(data, columns=headers)

def extract_tables_and_text(md_text: str) -> List[Union[str, pd.DataFrame]]:
    """
    Extracts tables and text from Markdown in the same order they appear.
    Tables are returned as Pandas DataFrames, and text parts as strings.
    """
    content = []
    
    # Improved regular expression to match Markdown tables, allowing for optional spaces and missing cells
    table_pattern = re.compile(
        r'((?:\|\s*.+?\s*\|\s*\n)+\|\s*[-:\s|]+\s*\|\s*\n(?:\|\s*.+?\s*\|\s*\n)+)', re.MULTILINE
    )
    
    # Split the text by tables
    parts = table_pattern.split(md_text)
    
    for part in parts:
        part = part.strip()
        if table_pattern.match(part):  # It's a table
            df = parse_table(part)
            content.append(df)
        elif part:  # Non-empty text
            content.append(part)
    
    return content

In [23]:
# Example usage
md_text = """
This is some text before the table.

|Name|Age|Department|
|----|---|----------|
|Alice|30|HR|
| Bob | 25|Engineering|
|Charlie|35 | Marketing|

More text after the table.

|Product|Price|Quantity|
|-------|-----|--------|
|Book|12.99|100|
| Pen |1.99|500|
|Laptop|999.99| 50|
# this is a paragraph
## this is a subparagraph
## this is a list
- a1
- a2
- a3
- b0
Here is some text before the table.

| Name    | Age | Department |
|---------|-----|------------|
| Alice   | 30  | HR         |
| Bob     | 25  | Engineering|
| Charlie | 35  | Marketing  |

This is more text after the table.

Here is another table:

| Product | Price | Quantity |
|---------|-------|----------|
| Book    | 12.99 | 100      |
| Pen     | 1.99  | 500      |
| Laptop  | 999.99| 50       |

Some final text after the second table.
Final text after the second table.
| Product | Price | Quantity |
|---------|-------|----------------------|
| Book    | 12.99 | 100  |


| Pen     | 1.99  | 500     |
| Laptop  | 999.99| 50                             |


# example of broken table

| Product | Price | Quantity |
|---------|-------|----------------------|
| Book    | 12.99  100  |


| Pen     | 1.99  | 500     |
| Laptop  | 999.99| 50                             |
"""
result = extract_tables_and_text(md_text)

# Output the result
for inx, item in enumerate(result):
    print(f"Item no: {inx} ------------------------------------------------------------------")
    print(f"Type of item: {type(item)}")
    print(item)
    print()

Item no: 0 ------------------------------------------------------------------
Type of item: <class 'str'>
This is some text before the table.

Item no: 1 ------------------------------------------------------------------
Type of item: <class 'pandas.core.frame.DataFrame'>
      Name Age   Department
0    Alice  30           HR
1      Bob  25  Engineering
2  Charlie  35    Marketing

Item no: 2 ------------------------------------------------------------------
Type of item: <class 'str'>
More text after the table.

Item no: 3 ------------------------------------------------------------------
Type of item: <class 'pandas.core.frame.DataFrame'>
  Product   Price Quantity
0    Book   12.99      100
1     Pen    1.99      500
2  Laptop  999.99       50

Item no: 4 ------------------------------------------------------------------
Type of item: <class 'str'>
# this is a paragraph
## this is a subparagraph
## this is a list
- a1
- a2
- a3
- b0
Here is some text before the table.

Item no: 5 -

In [24]:
result[-1]

Unnamed: 0,Product,Price,Quantity
0,Book,12.99 100,
1,,,
2,,,
3,Pen,1.99,500.0
4,Laptop,999.99,50.0


In [26]:
md_text = """| Product | Price | Quantity |
|---------|-------|----------------------|
| Book    | 12.99  
100  |


| Pen     | 1.99  | 500     |
| Laptop  | 999.99| 50                             |
"""
result = extract_tables_and_text(md_text)

# Output the result
for inx, item in enumerate(result):
    print(f"Item no: {inx} ------------------------------------------------------------------")
    print(f"Type of item: {type(item)}")
    print(item)
    print()

Item no: 0 ------------------------------------------------------------------
Type of item: <class 'str'>
| Product | Price | Quantity |
|---------|-------|----------------------|
| Book    | 12.99  
100  |


| Pen     | 1.99  | 500     |
| Laptop  | 999.99| 50                             |



In [27]:
list1 = [1, 2, 3]
list2 = [4, 5, 6]

list1.extend(list2)
print(list1)  # Output: [1, 2, 3, 4, 5, 6]


[1, 2, 3, 4, 5, 6]
