依赖需求:
    camelot --> pdf表格识别 https://pypi.org/project/camelot-py/
    pandas --> 数据处理 https://pypi.org/project/pandas/
    pdfplumber --> pdf文本识别 https://pypi.org/project/pdfplumber/
    akshare --> 数据api https://pypi.org/project/akshare/
    openpyxl --> Excel写入引擎 https://pypi.org/project/openpyxl/

In [1]:
import camelot

# Page range of the report to scan for tables (camelot page-range string).
page_range = '62-69'

# Parse the PDF with camelot's 'lattice' flavor.
tables = camelot.read_pdf(
    '分众传媒：2020年年度报告.PDF',
    pages=page_range,
    flavor='lattice',
)

# Save every extracted table to its own CSV file, numbered from 1 in the
# order camelot returned them.
for i, table in enumerate(tables):
    table.to_csv('table_{}.csv'.format(i + 1))

# Also export the whole table list in CSV format through camelot's exporter.
tables.export('分众传媒：2020年年度报告.csv', f='csv')

仅执行读取，尚未加入“跨页表格是否连续”的判断

In [3]:
import pdfplumber

def extract_text_around_tables(pdf_path, page_range):
    """Describe, for each pair of adjacent pages, whether their tables look continuous.

    For every consecutive pair of pages in the range, the text below the last
    table of the first page and the text above the first table of the second
    page are extracted and their lines counted. More than two lines in total
    is reported as "separate tables", otherwise as "continuous tables". A page
    without tables contributes zero lines.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        page_range: Inclusive, 1-based page range written as 'start-end'.

    Returns:
        A list with one message string per adjacent page pair. The page
        numbers in the messages are 1-based positions inside ``page_range``,
        not page numbers of the PDF.
    """
    def _line_count(text):
        # A falsy extraction result (no text) counts as zero lines.
        return len(text.split('\n')) if text else 0

    continuity_info = []
    with pdfplumber.open(pdf_path) as pdf:
        first, last = map(int, page_range.split('-'))
        pages = [pdf.pages[n] for n in range(first - 1, last)]

        for idx, (current_page, next_page) in enumerate(zip(pages, pages[1:])):
            tables_here = current_page.find_tables()
            tables_next = next_page.find_tables()

            # Text between the bottom edge of the last table and the page bottom.
            trailing_text = ""
            if tables_here:
                bottom = tables_here[-1].bbox[3]
                below_table = (0, bottom, current_page.width, current_page.height)
                trailing_text = current_page.crop(below_table).extract_text()

            # Text between the page top and the top edge of the first table.
            leading_text = ""
            if tables_next:
                top = tables_next[0].bbox[1]
                above_table = (0, 0, next_page.width, top)
                leading_text = next_page.crop(above_table).extract_text()

            total_lines = _line_count(trailing_text) + _line_count(leading_text)

            if total_lines > 2:
                message = f"Page {idx+1} and Page {idx+2} might have separate tables due to {total_lines} lines of text between them."
            else:
                message = f"Page {idx+1} and Page {idx+2} might have continuous tables with only {total_lines} lines of text between."
            continuity_info.append(message)

    return continuity_info


# Run the continuity check on pages 62-69 of the annual report.
continuity_check = extract_text_around_tables('分众传媒：2020年年度报告.PDF', '62-69')
# Bare expression: displayed as the cell output when run in a notebook.
continuity_check


['Page 1 and Page 2 might have separate tables due to 3 lines of text between them.',
 'Page 2 and Page 3 might have separate tables due to 5 lines of text between them.',
 'Page 3 and Page 4 might have continuous tables with only 2 lines of text between.',
 'Page 4 and Page 5 might have separate tables due to 5 lines of text between them.',
 'Page 5 and Page 6 might have separate tables due to 7 lines of text between them.',
 'Page 6 and Page 7 might have separate tables due to 5 lines of text between them.',
 'Page 7 and Page 8 might have separate tables due to 5 lines of text between them.']

判断跨页表格是否连续：
第1、2页的判断失败（两页之间有3行文字，被判为独立表格）；可将阈值放宽为3行（总行数 <= 3 视为连续）。

In [6]:
import camelot
import pandas as pd
import pdfplumber

def extract_text_around_tables(pdf_path, page_range, *, max_gap_lines=3):
    """Find adjacent page pairs whose tables appear to continue across the page break.

    For every consecutive pair of pages in the range that both contain at
    least one table, the text below the last table of the first page and the
    text above the first table of the second page are extracted. If the two
    snippets together have at most ``max_gap_lines`` lines, the pair is
    reported as continuous.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        page_range: Inclusive, 1-based page range, either 'start-end' or a
            single page number such as '62'.
        max_gap_lines: Largest total number of text lines between the two
            tables that still counts as a continuation.

    Returns:
        A list of ``(k, k + 1)`` tuples. ``k`` is the 1-based position of the
        first page of the pair inside ``page_range`` (so ``k == 1`` is the
        range's start page), not a page number of the PDF.

    Raises:
        ValueError: If ``page_range`` is not numeric or starts below page 1.
        IndexError: If the range extends past the last page of the PDF.
    """
    first, _, last = page_range.partition('-')
    start = int(first)
    end = int(last) if last else start
    if start < 1:
        # start - 1 would become a negative index and silently wrap around
        # to the end of the document.
        raise ValueError(f"page_range must start at page 1 or later, got {page_range!r}")

    def _line_count(text):
        # A falsy extraction result (no text) counts as zero lines.
        return len(text.split('\n')) if text else 0

    continuity_info = []
    with pdfplumber.open(pdf_path) as pdf:
        pages = [pdf.pages[i] for i in range(start - 1, end)]
        if len(pages) < 2:
            return continuity_info

        # Carry each page's tables forward to the next iteration so table
        # detection runs once per page instead of twice.
        current_tables = pages[0].find_tables()
        for position, (current_page, next_page) in enumerate(zip(pages, pages[1:]), start=1):
            next_tables = next_page.find_tables()
            if current_tables and next_tables:
                # bbox is (x0, top, x1, bottom): index 3 is the bottom edge of
                # the last table, index 1 the top edge of the first table.
                last_bottom = current_tables[-1].bbox[3]
                first_top = next_tables[0].bbox[1]
                text_after_last_table = current_page.crop(
                    (0, last_bottom, current_page.width, current_page.height)
                ).extract_text()
                text_before_first_table = next_page.crop(
                    (0, 0, next_page.width, first_top)
                ).extract_text()
                total_lines = _line_count(text_after_last_table) + _line_count(text_before_first_table)

                if total_lines <= max_gap_lines:
                    continuity_info.append((position, position + 1))
            current_tables = next_tables
    return continuity_info

def read_and_merge_tables(pdf_path, page_range, output_path):
    """Extract tables with camelot, merge those split across pages, and save them to Excel.

    ``extract_text_around_tables`` reports which adjacent pages look like they
    share one table. Its result uses 1-based positions inside ``page_range``,
    so they are converted to PDF page numbers and matched against each camelot
    table's own ``page`` attribute. When page ``p`` is flagged as continuing
    onto ``p + 1``, the last table on ``p`` and the first table on ``p + 1``
    are concatenated; a table that keeps continuing over further pages keeps
    growing in the same group.

    NOTE(review): the continuity check is based on pdfplumber's table
    detection while the tables themselves come from camelot; this assumes both
    agree on which table is last/first on a page — confirm on real documents.

    Args:
        pdf_path: Path of the PDF file.
        page_range: Inclusive, 1-based page range such as '62-69'.
        output_path: Path of the Excel workbook to write; each (merged) table
            goes to its own sheet named ``Table_1``, ``Table_2``, ...

    Returns:
        None. The result is written to ``output_path``.
    """
    continuous_pages = extract_text_around_tables(pdf_path, page_range)
    tables = camelot.read_pdf(pdf_path, pages=page_range, flavor='lattice')

    # Position k inside the range corresponds to PDF page (start_page + k - 1).
    start_page = int(page_range.partition('-')[0])
    continued_pages = {start_page + first_pos - 1 for first_pos, _ in continuous_pages}

    # Walk the tables in reading order: by page, then by order on the page.
    ordered_tables = sorted(tables, key=lambda t: (t.page, t.order))

    groups = []
    previous_page = None
    for table in ordered_tables:
        # A table continues the previous group only if it is the first table
        # on its page, the previous table sits on the page right before it,
        # and that page was flagged as running over the page break.
        continues_previous = (
            previous_page is not None
            and table.page == previous_page + 1
            and previous_page in continued_pages
        )
        if continues_previous:
            groups[-1].append(table.df)
        else:
            groups.append([table.df])
        previous_page = table.page

    merged_dfs = [
        pd.concat(group, ignore_index=True) if len(group) > 1 else group[0]
        for group in groups
    ]

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for index, df in enumerate(merged_dfs):
            df.to_excel(writer, sheet_name=f'Table_{index+1}', index=False)


# Input PDF, output workbook and page range for the merge run below.
pdf_path = '分众传媒：2020年年度报告.PDF'
output_path = '合并资产负债表_改.xlsx'
page_range = '62-69'


# Extract the tables from the page range, merge them, and write the workbook.
read_and_merge_tables(pdf_path, page_range, output_path)


In [4]:
import akshare as ak
import pandas as pd


# Fetch the "资产负债表" report for stock code sh600600 through akshare's
# Sina financial-report API.
stock_financial_report_sina_df = ak.stock_financial_report_sina(stock="sh600600", symbol="资产负债表")

# Alternative export to an Excel workbook, kept disabled:
# with pd.ExcelWriter('stock_sse_summary.xlsx') as writer:
#     stock_financial_report_sina_df.to_excel(writer, sheet_name='SSE Summary', index=False)

# Write the report as JSON Lines: one JSON object per row.
stock_financial_report_sina_df.to_json('stock_financial_report_sina.json', orient='records', lines=True)
# The message names the format that was actually written above (JSON).
print("汇总数据已经成功写入JSON文件。")


汇总数据已经成功写入Excel文件。


In [None]:
import subprocess

# Start the `java` executable with the argument `My` as a child process.
# NOTE(review): Popen returns immediately and `pipe` is never waited on or
# read here — confirm whether the exit status is needed (e.g. pipe.wait()).
pipe = subprocess.Popen(["java","My"])