In [5]:
import re
from pypdf import PdfReader
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        One string made of each page's extracted text, joined with no
        separator between pages (empty string for a PDF with no pages).
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # Extract while the file is still open, and join once instead of
        # growing a string with += on every page.
        return "".join(page.extract_text() for page in reader.pages)

def _search_field(pattern, text, label):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises:
        ValueError: If the pattern does not occur in the text; the message
            names the missing field and the pattern that was tried.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(
            f"Invoice field '{label}' not found in text (pattern: {pattern!r})"
        )
    return match.group(1)


def parse_facture(text):
    """Extract the invoice number, date and totals from invoice text.

    Args:
        text: Text of an invoice containing the labels "Facture n°",
            "Date :", "Montant Total HT :" and "Montant Total TTC :".

    Returns:
        A dict with keys "numero" (str of digits), "date" (str in
        dd/mm/yyyy form as written in the text), "montant_ht" (float) and
        "montant_ttc" (float).

    Raises:
        ValueError: If any of the four fields cannot be found, instead of
            the opaque AttributeError an unguarded ``re.search`` would give.
    """
    numero_pattern = r"Facture\s*n°(\d+)"
    date_pattern = r"Date\s*:\s*(\d{2}/\d{2}/\d{4})"
    montant_ht_pattern = r"Montant\s*Total\s*HT\s*:\s*(\d+\.\d{2})"
    montant_ttc_pattern = r"Montant\s*Total\s*TTC\s*:\s*(\d+\.\d{2})"

    numero = _search_field(numero_pattern, text, "numero")
    date = _search_field(date_pattern, text, "date")
    # Amount patterns only admit digits, a dot and two decimals, so float()
    # cannot fail on a successful match.
    montant_ht = float(_search_field(montant_ht_pattern, text, "montant_ht"))
    montant_ttc = float(_search_field(montant_ttc_pattern, text, "montant_ttc"))

    return {
        "numero": numero,
        "date": date,
        "montant_ht": montant_ht,
        "montant_ttc": montant_ttc
    }

def save_to_csv(data, csv_path):
    """Write *data* to *csv_path* as CSV, without the DataFrame index column.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor (the
            caller in this file passes a list of dicts, one per invoice).
        csv_path: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(csv_path, index=False)

# Example usage: extract one invoice PDF and export its fields to a CSV file.
pdf_path = "../Data/Facture_001.pdf"
csv_path = "factures.csv"

# Read the PDF text, parse it into one record, and wrap that record in a list
# so the resulting CSV has a single data row.
text = extract_text_from_pdf(pdf_path)
facture_data = [parse_facture(text)]
save_to_csv(facture_data, csv_path)

# Confirmation message (French): reports the CSV path the data was saved to.
print(f"Les données des factures ont été enregistrées dans {csv_path}")


Les données des factures ont été enregistrées dans factures.csv
