# In [None]:
import os
import re
import pandas as pd
import pdfplumber
from pathlib import Path

# Names of the fields parsed out of the invoice text, in the order they appear
# in a result row (between the leading file-name and trailing status columns).
_INVOICE_FIELDS = (
    '发票号码',
    '开票日期',
    '供货单位',
    '供货单位税号',
    '购买方',
    '购买方税号',
    '品名',
    '不含税金额',
    '税额',
    '价税合计',
)


def _first_group(pattern, text):
    """Return capture group 1 of the first match of *pattern* in *text*, or ''."""
    match = re.search(pattern, text)
    return match.group(1) if match else ''


def _parse_invoice_text(text):
    """Extract the invoice fields from the plain text of one invoice.

    Every field is searched independently; a field whose pattern does not
    match is returned as an empty string.
    """
    # Item names: every run of text enclosed between two asterisks.
    items = (item.strip() for item in re.findall(r'\*([^*]+)\*', text))

    return {
        '发票号码': _first_group(r'发票号码[：:]\s*(\d+)', text),
        '开票日期': _first_group(
            r'开票日期[：:]\s*(\d{4}年\d{1,2}月\d{1,2}日)', text
        ),
        # Seller / buyer name: a "名称：" label within 100 characters after the
        # "销" / "购" marker character.
        # NOTE(review): the [\s\S]{0,100} window is greedy, so when several
        # "名称：" labels fall inside the window the LAST one is captured --
        # confirm this matches the real invoice layout.
        '供货单位': _first_group(
            r'销[\s\S]{0,100}名称[：:]\s*([^\s\n]+)', text
        ),
        # Seller / buyer tax number: same approach with a 150-character
        # window (the same greedy-window caveat applies).
        '供货单位税号': _first_group(
            r'销[\s\S]{0,150}统一社会信用代码/纳税人识别号[：:]\s*([A-Z0-9]+)',
            text,
        ),
        '购买方': _first_group(
            r'购[\s\S]{0,100}名称[：:]\s*([^\s\n]+)', text
        ),
        '购买方税号': _first_group(
            r'购[\s\S]{0,150}统一社会信用代码/纳税人识别号[：:]\s*([A-Z0-9]+)',
            text,
        ),
        '品名': '; '.join(item for item in items if item),
        # Amount excluding tax: the first ¥ amount after "合计".
        '不含税金额': _first_group(r'合\s*计[^¥]*¥([\d,]+\.?\d*)', text),
        # Tax amount: the second ¥ amount after "合计".
        # NOTE(review): [^¥]* may cross line breaks, so if the "合计" row holds
        # only one ¥ amount the next ¥ amount in the text is captured instead
        # -- TODO confirm against tax-exempt invoices.
        '税额': _first_group(
            r'合\s*计[^¥]*¥[\d,]+\.?\d*[^¥]*¥([\d,]+\.?\d*)', text
        ),
        # Total including tax: the first ¥ amount after "价税合计".
        '价税合计': _first_group(r'价税合计[^¥]*¥([\d,]+\.?\d*)', text),
    }


def extract_invoice_info(pdf_path):
    """Extract the key fields of one PDF invoice.

    The text of all pages is concatenated and searched with regular
    expressions. This function never raises for a broken PDF: any error while
    opening or parsing the file is reported in the status column so that a
    batch run can continue with the next file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys '文件名', the ten invoice fields listed in
        ``_INVOICE_FIELDS`` and '状态'. On success '状态' is '成功' and fields
        that were not found are ''. On failure every invoice field is '' and
        '状态' is '失败: <error message>'.
    """
    # Start from an all-blank row so success and failure rows always share
    # exactly the same keys in the same order.
    row = {'文件名': os.path.basename(pdf_path)}
    row.update(dict.fromkeys(_INVOICE_FIELDS, ''))

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for a page without a text layer
            # (depending on the pdfplumber version); treat that as an empty
            # page instead of failing the whole invoice with a TypeError.
            text = ''.join(
                (page.extract_text() or '') + '\n' for page in pdf.pages
            )
        row.update(_parse_invoice_text(text))
        row['状态'] = '成功'
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job.
        row['状态'] = f'失败: {e}'

    return row

def process_pdf_folder(folder_path, output_excel='发票汇总.xlsx'):
    """Extract every PDF invoice in a folder and save the summary to Excel.

    Args:
        folder_path: Directory searched (non-recursively) for PDF files.
        output_excel: File name of the workbook; it is joined onto
            ``folder_path``.

    Returns:
        A pandas DataFrame with one row per PDF file, or ``None`` when the
        folder contains no PDF files (no workbook is written in that case).
    """
    # Compare the extension case-insensitively so files such as "X.PDF" are
    # not skipped on case-sensitive file systems, and sort the paths so the
    # row order of the workbook is the same on every run.
    pdf_files = sorted(
        path
        for path in Path(folder_path).glob('*')
        if path.suffix.lower() == '.pdf'
    )

    if not pdf_files:
        print(f"在 {folder_path} 中没有找到PDF文件")
        return None

    total = len(pdf_files)
    print(f"找到 {total} 个PDF文件，开始处理...")

    # Extract the fields of each invoice, reporting progress as we go.
    results = []
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"正在处理 ({i}/{total}): {pdf_file.name}")
        results.append(extract_invoice_info(str(pdf_file)))

    # One row per invoice, written next to the PDFs.
    df = pd.DataFrame(results)
    output_path = os.path.join(folder_path, output_excel)
    df.to_excel(output_path, index=False, engine='openpyxl')

    succeeded = sum(1 for r in results if r['状态'] == '成功')
    print("\n处理完成!")
    print(f"成功: {succeeded} 个")
    print(f"失败: {len(results) - succeeded} 个")
    print(f"结果已保存到: {output_path}")

    return df

# Script entry point
if __name__ == "__main__":
    # Folder that contains the PDF invoices -- change this to your own folder.
    folder_path = r"C:\Users\Admin\Desktop\张思远2025发票\全部发票"

    # To use the directory of this script instead, uncomment the next line.
    # folder_path = os.path.dirname(os.path.abspath(__file__))

    # Banner: a title framed by two 50-character rules, then the target folder.
    print("=" * 50, "PDF发票信息提取工具", "=" * 50, sep="\n")
    print(f"目标文件夹: {folder_path}\n")

    # Run the batch extraction; the result is None when no PDF was found.
    df = process_pdf_folder(folder_path)

    # Show a preview of the first five rows when there is anything to show.
    if not (df is None or df.empty):
        print("\n前5条结果预览:")
        print(df.head(5).to_string())