In [1]:
import spacy
import PyPDF2
import pandas as pd

In [2]:
# Load the `de_core_news_sm` spaCy pipeline once at module level;
# process_text_with_spacy() below reads this global `nlp` object.
nlp = spacy.load("de_core_news_sm")

def extract_text_from_pdf_with_ocr(pdf_path):
    """Return the extractable text of every page of a PDF, in page order.

    Despite the name, this function performs no OCR: it only returns what
    ``PyPDF2``'s ``extract_text()`` yields for each page.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: The per-page texts joined with a single newline. A page for
        which no text is returned contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        # Extract inside the `with` block, while the underlying file is open.
        # `or ''` keeps a page with no extractable text from breaking the join.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page (plain concatenation did exactly that).
    return '\n'.join(page_texts)

In [13]:
def process_text_with_spacy(text, person_labels=('PER', 'PERSON')):
    """Collect the person entities that the global ``nlp`` pipeline finds in ``text``.

    The original check compared the entity label only against ``'PERSON'``.
    spaCy's German pipelines label people as ``'PER'`` (``'PERSON'`` is the
    label used by the English pipelines), so with ``de_core_news_sm`` no entity
    ever matched and the result was always empty. Both labels are accepted
    by default.

    Args:
        text: Raw text to analyse.
        person_labels: Entity labels that count as a person.

    Returns:
        dict: Maps each distinct person string to a dict with
        ``'Identity Number'`` (1-based, in order of first appearance) and
        ``'Attributes'`` (a list holding the entity text once per occurrence).
    """
    doc = nlp(text)

    person_data = {}

    for entity in doc.ents:
        if entity.label_ not in person_labels:
            continue

        person_name = entity.text
        record = person_data.get(person_name)

        if record is None:
            # First sighting: the next free number becomes this person's id.
            person_data[person_name] = {
                'Identity Number': len(person_data) + 1,
                'Attributes': [person_name],
            }
        else:
            # Repeat sighting: record the occurrence on the existing entry.
            record['Attributes'].append(person_name)

    return person_data

In [14]:
def save_to_excel(data, excel_path):
    """Write the per-person records to an Excel sheet, one row per person.

    Args:
        data: Mapping of person name to a dict with the keys
            ``'Identity Number'`` and ``'Attributes'`` (a list of strings),
            as produced by ``process_text_with_spacy``.
        excel_path: Destination path handed to ``DataFrame.to_excel``.

    The sheet has the columns ``Identity Number``, ``Person`` and
    ``Attributes`` (the attribute list joined with ``', '``). The index is
    not written.
    """
    # Pin the columns explicitly so that an empty `data` still produces a
    # sheet with a header row instead of a completely blank workbook.
    columns = ['Identity Number', 'Person', 'Attributes']

    rows = [
        {
            'Identity Number': record['Identity Number'],
            'Person': person_name,
            'Attributes': ', '.join(record['Attributes']),
        }
        for person_name, record in data.items()
    ]

    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(excel_path, index=False)

In [15]:
if __name__ == "__main__":
    # Locations of the source PDF and of the workbook to be written.
    pdf_path = "/Users/pankajrathi/PycharmProjects/projcv/work/Mietvertrag 2 neu 13.OG-1.pdf"
    excel_path = "/Users/pankajrathi/PycharmProjects/projcv/work/j8.xlsx"

    # Pipeline: PDF -> raw text -> per-person records -> Excel sheet.
    pdf_text = extract_text_from_pdf_with_ocr(pdf_path)
    extracted_data = process_text_with_spacy(pdf_text)
    save_to_excel(extracted_data, excel_path)

    # Report where the result was stored.
    print(f"Data extracted and saved to {excel_path}")

Data extracted and saved to /Users/pankajrathi/PycharmProjects/projcv/work/j8.xlsx
