In [51]:
%%capture
!pip install dkpro-cassis pdfplumber PyMuPDF stanza

In [2]:
from cassis import *
from pathlib import Path
from pprint import pprint
import pandas as pd


In [109]:
import re
import pdfplumber
import stanza

# Initialize the Stanza pipeline used for German sentence segmentation.
# `nlp` is passed to segment_text_with_stanza() further down in this cell.
stanza.download("de")  # Download German models
# Only the "tokenize" processor is requested; the sentence boundaries read
# later via `doc.sentences` come from it.
nlp = stanza.Pipeline("de", processors="tokenize")

def extract_pdf_text(pdf_path):
    """Return the text of all pages of a PDF as one string.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``pdfplumber.open``.

    Returns:
        The page texts joined with a single space. Pages for which
        ``extract_text()`` yields ``None`` or an empty string are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Call extract_text() once per page and reuse the result for both the
        # emptiness check and the join (it used to be called twice per page).
        page_texts = (page.extract_text() for page in pdf.pages)
        # The generator is consumed here, while the PDF is still open.
        return " ".join(text for text in page_texts if text)

def split_into_paragraphs(text):
    """Break ``text`` at blank lines and return the non-empty chunks, stripped."""
    # A paragraph boundary is a newline, optional whitespace, and another newline.
    stripped_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", text))
    return [chunk for chunk in stripped_chunks if chunk]

def clean_paragraph(paragraph):
    """Normalise one paragraph of extracted PDF text into a single line.

    The steps, in order: drop the standalone words "Ja" and "Nein", strip
    line-leading list markers ("a)", "1."), turn bullet glyphs into "; "
    (so a later semicolon split separates the items), and collapse line
    breaks into single spaces.

    Note: a later notebook cell defines another ``clean_paragraph`` that
    returns a list instead; whichever cell ran last is the one in effect.

    Args:
        paragraph: Raw paragraph text, possibly spanning several lines.

    Returns:
        The cleaned paragraph as one stripped string.
    """
    # Bullet glyphs are written as escapes on purpose: a literal glyph (often a
    # private-use Symbol/Wingdings code point in PDF text) is invisible in many
    # editors, and an empty pattern would make re.sub insert "; " at every
    # character position.
    # NOTE(review): confirm this set covers the bullets used in the source PDFs.
    bullet_pattern = "[\u2022\u2023\u25aa\u25cf\u25e6\uf0a7\uf0b7\uf0d8]"

    # Step 1: Remove "Ja" and "Nein"
    cleaned_paragraph = re.sub(r"\bJa\b|\bNein\b", "", paragraph)

    # Step 2: Remove lettered numbering like "a)", "b)", "c)" at line starts
    cleaned_paragraph = re.sub(r"^\s*[a-zA-Z]\)\s*", "", cleaned_paragraph, flags=re.MULTILINE)

    # Step 3: Remove numeric numbering like "1.", "2.", etc. at line starts
    cleaned_paragraph = re.sub(r"^\s*\d+\.\s*", "", cleaned_paragraph, flags=re.MULTILINE)

    # Step 4: Replace each bullet glyph with the connector "; "
    cleaned_paragraph = re.sub(bullet_pattern, "; ", cleaned_paragraph)

    # Step 5: Merge line breaks (and the whitespace around them) into one space
    cleaned_paragraph = re.sub(r"\s*\n\s*", " ", cleaned_paragraph).strip()

    return cleaned_paragraph

def split_by_semicolon(text):
    """Cut ``text`` at every semicolon and return the non-blank, stripped pieces."""
    pieces = []
    # Whitespace directly around a semicolon is consumed by the separator.
    for fragment in re.split(r"\s*;\s*", text):
        fragment = fragment.strip()
        if fragment:
            pieces.append(fragment)
    return pieces

def segment_text_with_stanza(text, nlp_pipeline):
    """Run ``nlp_pipeline`` on ``text`` and return the text of each sentence found.

    Args:
        text: The string to segment.
        nlp_pipeline: A callable returning a document object that exposes a
            ``sentences`` iterable whose items have a ``text`` attribute.

    Returns:
        A list with one string per detected sentence, in document order.
    """
    sentence_texts = []
    for segment in nlp_pipeline(text).sentences:
        sentence_texts.append(segment.text)
    return sentence_texts

# Paths
pdf_path = "1_Harmonisiert_oeffentliche_submission_gesunde_ernaehrung (1).pdf"

# Step 1: Extract text from PDF
pdf_text = extract_pdf_text(pdf_path)

# Step 2: Split text into paragraphs
paragraphs = split_into_paragraphs(pdf_text)

# Step 3: Process each paragraph.
# Sentences are collected across ALL processed paragraphs so that the CSV
# written after the loop contains every one of them. Writing the file inside
# the loop overwrote it on each iteration and kept only the last paragraph.
collected_sentences = []
for i, paragraph in enumerate(paragraphs[:5], start=1):  # Limit to the first 5 paragraphs
    print(f"\nProcessing paragraph {i}...")

    # Clean the paragraph (uses the clean_paragraph defined in this cell,
    # which returns a single string)
    cleaned_paragraph = clean_paragraph(paragraph)

    # Step 4: Split by semicolon
    semicolon_split_sentences = split_by_semicolon(cleaned_paragraph)

    # Step 5: Segment each part using the pretrained Stanza pipeline
    all_sentences = []
    for sentence in semicolon_split_sentences:
        sentences = segment_text_with_stanza(sentence, nlp)
        all_sentences.extend(sentences)
    collected_sentences.extend(all_sentences)

    # Output results for this paragraph
    print(f"Total sentences in paragraph {i}: {len(all_sentences)}")
    for j, sentence in enumerate(all_sentences, start=1):
        print(f"Sentence {j}: {sentence}")

# Save all collected sentences to CSV, once.
# NOTE(review): the output name says "pflichtenheft" while the input PDF is
# the "oeffentliche_submission" document - confirm the intended file name.
df = pd.DataFrame(collected_sentences, columns=['text'])
df.to_csv("1_Harmonisiert_pflichtenheft.csv", index=False, encoding="utf-8")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-12-04 17:25:54 INFO: Downloaded file to C:\Users\Hamza\stanza_resources\resources.json
2024-12-04 17:25:54 INFO: Downloading default packages for language: de (German) ...
2024-12-04 17:25:56 INFO: File exists: C:\Users\Hamza\stanza_resources\de\default.zip
2024-12-04 17:26:02 INFO: Finished downloading models and saved to C:\Users\Hamza\stanza_resources
2024-12-04 17:26:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-12-04 17:26:02 INFO: Downloaded file to C:\Users\Hamza\stanza_resources\resources.json
2024-12-04 17:26:02 INFO: Loading these models for language: de (German):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |

2024-12-04 17:26:02 INFO: Using device: cpu
2024-12-04 17:26:02 INFO: Loading: tokenize
2024-12-04 17:26:02 INFO: Loading: mwt
2024-12-04 17:26:02 INFO: Done loading processors!



Processing paragraph 1...
Total sentences in paragraph 1: 231
Sentence 1: Öffentliche Submission Gesunde Ernährung «Kalte Linie Biel» in den Kitas und Tagesschulen der Stadt Biel Vergebende Stelle Stadt Biel Direktion Bildung, Kultur und Sport Abteilung Schule & Sport Zentralstrasse 60 2501 Biel Verfahrensart Offenes Verfahren gemäss Art.
Sentence 2: 4 Verordnung über das öffentliche Beschaffungswesen (ÖBV
Sentence 3: BSG 731.21) Wichtig Die Implementierung des Reglements «Gesunde Ernährung an den Bieler Betreuungsstrukturen» hat seinen Ursprung in einer städtischen Volksinitiative.
Sentence 4: Der politische Prozess hat gezeigt, dass sowohl eine stadteigene Produktion (interne Produktion) als auch eine Produktion durch einen privaten Anbieter (externe Produktion) in Frage kommt.
Sentence 5: Deshalb wird dieses Geschäft «unter Vorbehalt» als «Offenes Verfahren» öffentlich ausgeschrieben:
Sentence 6: Die eingereichten Offerten wer- den zwar wie üblich gemäss ÖBV bewertet.
Sentence 7: D

In [108]:
import re
from cassis import *
from pathlib import Path
import fitz  # PyMuPDF
import pdfplumber
import csv


# File paths for the three annotated documents.
# Each `typesystem_path_N` is a directory and each `xmi_path_N` a file name
# inside it: load_xmi() opens "<typesystem_path_N>/TypeSystem.xml" and
# "<typesystem_path_N>/<xmi_path_N>".
typesystem_path_1 = Path("./annotations/4_Harmonisiert_ausschreibung_herstellung")
xmi_path_1 = "4_Harmonisiert_ausschreibung_herstellung.xmi"

typesystem_path_2 = Path("./annotations/2_Harmonisiert_pflichtenheft")
xmi_path_2 = "2_Harmonisiert_pflichtenheft.xmi"

# The third document is only used for named-entity extraction in main().
typesystem_path_3 = Path("./annotations/1_Harmonisiert_oeffentliche_submission_gesunde_ernaehrung")
xmi_path_3 = "1_Harmonisiert_oeffentliche_submission_gesunde_ernaehrung.xmi"

# Function to clean paragraphs (removes Ja, Nein, numbering, bullet points, and extra spaces)
def clean_paragraph(paragraph):
    """Split a block of annotated text into cleaned fragments.

    The standalone words "Ja" and "Nein" are removed first. The text is then
    split, in order, on line-leading lettered markers ("a)"), line-leading
    numeric markers ("1."), bullet glyphs, en dashes surrounded by
    whitespace, and semicolons. Line breaks inside each fragment are finally
    collapsed to single spaces and blank fragments are dropped.

    Note: an earlier notebook cell defines another ``clean_paragraph`` that
    returns a single string; whichever cell ran last is the one in effect.

    Args:
        paragraph: Raw text, possibly spanning several lines.

    Returns:
        A list of stripped, non-empty fragment strings in original order.
    """
    # Bullet glyphs are written as escapes on purpose: a literal glyph (often a
    # private-use Symbol/Wingdings code point in PDF text) is invisible in many
    # editors, and an empty pattern makes re.split cut between every character.
    # NOTE(review): confirm this set covers the bullets used in the documents.
    bullet_pattern = "[\u2022\u2023\u25aa\u25cf\u25e6\uf0a7\uf0b7\uf0d8]"

    # Step 1: Remove "Ja" and "Nein"
    text = re.sub(r"\bJa\b|\bNein\b", "", paragraph)

    # Steps 2-6: successive splits; every pattern is applied to each fragment
    # produced by the previous one.
    split_steps = (
        (r"^\s*[a-zA-Z]\)\s*", re.MULTILINE),  # Step 2: "a)", "b)", ... at line start
        (r"^\s*\d+\.\s*", re.MULTILINE),       # Step 3: "1.", "2.", ... at line start
        (bullet_pattern, 0),                   # Step 4: bullet glyphs
        (r"\s+\u2013\s+", 0),                  # Step 5: en dash (U+2013) between spaces
        (r"\s*;\s*", 0),                       # Step 6: semicolons
    )
    fragments = [text]
    for pattern, flags in split_steps:
        fragments = [
            piece
            for fragment in fragments
            for piece in re.split(pattern, fragment, flags=flags)
        ]

    # Step 7: Remove line breaks and strip spaces; drop blank fragments
    return [
        re.sub(r"\s*\n\s*", " ", fragment).strip()
        for fragment in fragments
        if fragment.strip()
    ]



# Step 1: Load TypeSystem and XMI File
def load_xmi(typesystem_path, xmi_path):
    """Load a CAS from an XMI file together with its type system.

    Args:
        typesystem_path: Directory (path object supporting ``/``) that holds
            ``TypeSystem.xml`` and the XMI file.
        xmi_path: Name of the XMI file relative to ``typesystem_path``.

    Returns:
        Whatever ``load_cas_from_xmi`` returns for the opened XMI stream.
    """
    with open(typesystem_path / "TypeSystem.xml", "rb") as typesystem_stream:
        loaded_typesystem = load_typesystem(typesystem_stream)

    # The XMI can only be parsed against the type system loaded above.
    with open(typesystem_path / xmi_path, "rb") as xmi_stream:
        return load_cas_from_xmi(xmi_stream, typesystem=loaded_typesystem)


# Collect criterion id, criterion type and covered text from NamedEntity annotations
def process_named_entitie(named_entitie):
    """Turn named-entity annotations into a column-oriented dict.

    Args:
        named_entitie: Iterable of annotations exposing ``identifier`` (a
            string), ``value`` and ``get_covered_text()``.

    Returns:
        A dict with the keys "text", "criterion_id" and "criterion_type",
        each mapping to a list with one entry per annotation. The criterion
        id is the part of ``identifier`` after its last "#".
    """
    criterion_ids = []
    criterion_types = []
    covered_texts = []

    for entity in named_entitie:
        # Keep only what follows the last "#" of the identifier.
        criterion_ids.append(entity.identifier.split("#")[-1])
        criterion_types.append(entity.value)
        covered_texts.append(entity.get_covered_text())

    return {
        "text": covered_texts,
        "criterion_id": criterion_ids,
        "criterion_type": criterion_types,
    }
# Step 2: Extract Sentences from XMI
def extract_sentences_from_cas(cas,xml_type):
    """Return ``cas.select(xml_type)``: the annotations of the given type name."""
    return cas.select(xml_type)

# Step 3: Process and Clean Sentences
def process_sentences(sentences , skip_count=0):
    """Clean sentence annotations and emit one record per cleaned fragment.

    Args:
        sentences: Sliceable sequence of annotations exposing ``begin``,
            ``end`` and ``get_covered_text()``.
        skip_count: Number of leading annotations to ignore.

    Returns:
        A list of dicts with the keys "begin", "end", "type", "text" and
        "identifier". All fragments that come from the same annotation share
        that annotation's span and therefore the same identifier.
    """
    records = []
    for annotation in sentences[skip_count:]:
        # clean_paragraph yields zero or more fragments for one annotation.
        fragments = clean_paragraph(annotation.get_covered_text())
        records.extend(
            {
                "begin": annotation.begin,
                "end": annotation.end,
                "type": "SENTENCE",  # Similar to NamedEntity
                "text": fragment.strip(),
                "identifier": f"sentence-{annotation.begin}-{annotation.end}",
            }
            for fragment in fragments
        )
    return records

# Step 4: Output the Results
def output_results(sentence_annotations, limit=240):
    """Print the number of sentence records followed by their texts.

    Args:
        sentence_annotations: Sequence of dicts that each have a "text" key.
        limit: Maximum number of sentences to print. Defaults to 240, the
            cap that used to be hard-coded; pass ``None`` to print all.
    """
    # The reported total always covers every record, not just the printed ones.
    print(f"SENTENCE NUMBERS : {len(sentence_annotations)}")
    for i, sentence in enumerate(sentence_annotations[:limit], start=1):
        print(f"Sentence {i}: {sentence['text']} ")

def save_sentences_to_csv(sentences, file_name):
    """Write the "text" column of the sentence records to a UTF-8 CSV file.

    Args:
        sentences: Records accepted by ``pd.DataFrame``; only the "text"
            column is kept.
        file_name: Destination path of the CSV file.
    """
    text_frame = pd.DataFrame(sentences, columns=['text'])
    # No index column: the file holds just the header and the sentence texts.
    text_frame.to_csv(file_name, encoding="utf-8", index=False)


# Function to save named entities (criterion_id, criterion_type, and text) to CSV
def save_named_entities_to_csv(named_entities, file_name):
    """Write named-entity data to a UTF-8 CSV file.

    Args:
        named_entities: Data accepted by ``pd.DataFrame`` that provides the
            columns "criterion_id", "criterion_type" and "text".
        file_name: Destination path of the CSV file.
    """
    column_order = ['criterion_id', 'criterion_type', 'text']
    entity_frame = pd.DataFrame(named_entities, columns=column_order)
    # Columns are written in the order above, without an index column.
    entity_frame.to_csv(file_name, encoding="utf-8", index=False)

# Main function to load XMI, extract sentences, clean, and display results
def main():
    """Export cleaned sentences and named-entity annotations to CSV and print them.

    For documents 1 and 2 (paths ``*_1`` and ``*_2``) both the Sentence and
    the NamedEntity annotations are exported; for document 3 only the
    NamedEntity annotations. Each XMI file is parsed once and the resulting
    CAS is reused for both annotation types.
    """
    sentence_xml_type = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
    namedentity_xml_type = "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"

    # Load each document exactly once (documents 1 and 2 used to be parsed twice).
    cas_1 = load_xmi(typesystem_path_1, xmi_path_1)
    cas_2 = load_xmi(typesystem_path_2, xmi_path_2)
    cas_3 = load_xmi(typesystem_path_3, xmi_path_3)

    # Sentences: the first 20 Sentence annotations of each document are skipped.
    # NOTE(review): presumably front matter - confirm that 20 fits both files.
    sentences_1 = extract_sentences_from_cas(cas_1, sentence_xml_type)
    sentence_cleaned_1 = process_sentences(sentences_1, skip_count=20)

    sentences_2 = extract_sentences_from_cas(cas_2, sentence_xml_type)
    sentence_cleaned_2 = process_sentences(sentences_2, skip_count=20)

    # Named-entity annotations for all three documents.
    ne_cleaned_1 = process_named_entitie(extract_sentences_from_cas(cas_1, namedentity_xml_type))
    ne_cleaned_2 = process_named_entitie(extract_sentences_from_cas(cas_2, namedentity_xml_type))
    ne_cleaned_3 = process_named_entitie(extract_sentences_from_cas(cas_3, namedentity_xml_type))

    # Save files into CSV.
    # NOTE(review): the named-entity file names for documents 2 and 1 end in
    # "ausschreibung" although those documents are the "pflichtenheft" and the
    # "oeffentliche_submission" files. The names are left unchanged because
    # other code may read them - confirm before renaming.
    save_sentences_to_csv(sentence_cleaned_1, "4_Harmonisiert_ausschreibung.csv")
    save_sentences_to_csv(sentence_cleaned_2, "2_Harmonisiert_pflichtenheft.csv")
    save_named_entities_to_csv(ne_cleaned_1, "named_entity_annotations_4_Harmonisiert_ausschreibung.csv")
    save_named_entities_to_csv(ne_cleaned_2, "named_entity_annotations_2_Harmonisiert_ausschreibung.csv")
    save_named_entities_to_csv(ne_cleaned_3, "named_entity_annotations_1_Harmonisiert_ausschreibung.csv")

    # Optional console output.
    print("\nResults for 4_Harmonisiert_ausschreibung_herstellung:")
    output_results(sentence_cleaned_1)

    print("\nResults for 2_Harmonisiert_pflichtenheft:")
    output_results(sentence_cleaned_2)

    # The labels below name the document each annotation set really comes from.
    print("\nAnnotations for 4_Harmonisiert_ausschreibung_herstellung:")
    print(ne_cleaned_1)
    print("\nAnnotations for 2_Harmonisiert_pflichtenheft:")
    print(ne_cleaned_2)
    print("\nAnnotations for 1_Harmonisiert_oeffentliche_submission_gesunde_ernaehrung:")
    print(ne_cleaned_3)

# Run the main function.
# The guard keeps main() from running if this code is imported as a module;
# the captured output below shows that it does run when the cell is executed.
if __name__ == "__main__":
    main()



Results for 4_Harmonisiert_ausschreibung_herstellung:
SENTENCE NUMBERS : 182
Sentence 1: ANGABEN ZUM SUBMISSIONSVERFAHREN 
Sentence 2: 1 VERFAHREN Die Auftragsvergabe erfolgt im offenen Verfahren im Staatsvertragsbereich gemäss den Bestimmungen der Interkantonalen Vereinbarung über das öffentliche Beschaffungswesen (IVöB) und der Submissionsverord- nung des Kantons Zürich (SubmV). 
Sentence 3: 2 AUFTRAGGEBERIN Stadt Illnau-Effretikon Abteilung Bildung, Bereich Betreuung Stadthaus, Märtplatz 29, 8307 Effretikon 
Sentence 4: 3 ZUSTÄNDIGE PERSON Franzisca Keel Leiterin Bereich Betreuung Stadthaus, Märtplatz 29, 8307 Effretikon 052 354 23 83 / franzisca.keel@ilef.ch 
Sentence 5: GEGENSTAND DER AUSSCHREIBUNG 
Sentence 6: 1 AUSGANGSLAGE Der Auftrag zur Herstellung und Lieferung der Mittagsverpflegung für drei schulergänzende Betreuungseinrich- tungen der Volksschule und für die zwei städtischen Kindertagesstätten der Stadt Illnau-Effretikon wird neu vergeben. 
Sentence 7: Pro Standort und T