In [None]:
# Reference: "Reading and Writing XML Files in Python with Pandas" (Stack Abuse)
# https://stackabuse.com/reading-and-writing-xml-files-in-python-with-pandas/

In [47]:
import os
import unicodedata
from lxml import etree
from collections import Counter
import pandas as pd

# Directory paths: XML files are read from input_dir (walked recursively by
# the processing loop) and the results are written below output_dir, using
# the same relative sub-directory layout.
input_dir = './bgh_urteile/'
output_dir = './normalized_bgh_urteile/'

# Create output directory if it doesn't exist; exist_ok=True makes re-running
# this cell harmless.
os.makedirs(output_dir, exist_ok=True)


In [48]:
# Function to normalize text using unicodedata
def normalize_text(text):
    """Return ``text`` converted to Unicode normalization form NFKC.

    NFKC replaces compatibility characters (ligatures, full-width letters,
    ...) with their plain equivalents and then composes base characters with
    their combining marks.

    Args:
        text: The string to normalize.

    Returns:
        The NFKC-normalized string.
    """
    nfkc_text = unicodedata.normalize('NFKC', text)
    return nfkc_text

# Function to create 10-grams from text
def create_ngrams(text, n=10):
    """Return every run of ``n`` consecutive words of ``text``.

    ``text`` is split on arbitrary whitespace and each window of ``n`` words
    is re-joined with single spaces, so the returned n-grams never contain
    newlines, tabs or repeated blanks even if ``text`` does.

    Args:
        text: The string to split into words.
        n: Number of words per n-gram (default 10). Must be at least 1.

    Returns:
        A list of n-gram strings in order of appearance; empty when ``text``
        has fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check ``n=0``
            would yield ``len(words) + 1`` empty strings, which downstream
            code would then treat as "duplicated n-grams".
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# Function to extract all text content from an XML element
def extract_text_and_elements(root):
    """Collect the text-bearing nodes below ``root`` together with their text.

    Every node yielded by ``root.iter()`` (``root`` itself included) whose
    ``.text`` is non-empty is kept. Only ``.text`` is inspected; text stored
    in ``.tail`` is not collected.

    Args:
        root: An element object offering ``iter()`` and whose nodes expose
            a ``text`` attribute.

    Returns:
        A ``(texts, elements)`` pair of equally long lists: ``texts[i]`` is
        the whitespace-stripped text of ``elements[i]``. A node whose text
        consists of whitespace only is still included, with ``''`` as its
        text entry.
    """
    elements = [node for node in root.iter() if node.text]
    texts = [node.text.strip() for node in elements]
    return texts, elements

# Function to remove duplicated 10-grams from text
def remove_duplicated_ngrams(text, duplicated_ngrams, ngram_tracker=None):
    """Collapse n-grams that are repeated inside ``text`` to one occurrence.

    Each entry of ``duplicated_ngrams`` is looked up in ``text`` as an exact,
    non-overlapping substring. If it occurs more than once, all occurrences
    except the last are replaced with the empty string. Repetitions are only
    detected within this single ``text``; an n-gram that appears once here
    and again in some other text is left alone.

    Because matching is by exact substring, an n-gram is only found when the
    words in ``text`` are separated exactly as they are in the n-gram string.

    Args:
        text: The string to clean.
        duplicated_ngrams: Iterable of candidate n-gram strings. Empty
            strings are ignored.
        ngram_tracker: Unused. Accepted (and now optional) only so existing
            three-argument calls keep working; it is neither read nor
            modified.

    Returns:
        A ``(new_text, deleted_ngrams)`` tuple, where ``deleted_ngrams``
        lists, in iteration order, the n-grams for which at least one
        occurrence was removed.
    """
    deleted_ngrams = []
    for ngram in duplicated_ngrams:
        # '' "occurs" len(text) + 1 times but replacing it changes nothing;
        # skip it so it is not reported as a deletion.
        if not ngram:
            continue
        occurrences = text.count(ngram)
        if occurrences > 1:
            # str.replace substitutes from the left, so limiting the count to
            # occurrences - 1 keeps exactly the last occurrence in place.
            text = text.replace(ngram, '', occurrences - 1)
            deleted_ngrams.append(ngram)
    return text, deleted_ngrams

In [49]:
# Batch job: for every XML file below input_dir, NFKC-normalize the element
# texts, remove 10-grams that are repeated inside an element, and write the
# result to the mirrored location below output_dir.

# Defined up front so that later cells can reference all_ngrams even when no
# XML file was found. It is rebuilt for every file, so after the loop it
# holds the n-grams of the last file processed.
all_ngrams = []

# Files that could not be read or parsed; reported once the walk is finished.
skipped_files = []

for root, dirs, files in os.walk(input_dir):
    for filename in files:
        # Only XML documents are processed; names containing 'checkpoint'
        # are ignored (presumably Jupyter checkpoint copies).
        if not filename.endswith('.xml') or 'checkpoint' in filename:
            continue

        input_path = os.path.join(root, filename)

        # Parse the XML file. A single malformed or unreadable document must
        # not abort the whole batch, so report it and move on.
        try:
            tree = etree.parse(input_path)
        except (etree.XMLSyntaxError, OSError) as exc:
            print(f"Skipping {input_path}: {exc}")
            skipped_files.append(input_path)
            continue
        root_element = tree.getroot()

        # Mirror the input sub-directory layout below output_dir
        normalized_filename = f"normalized_{filename}"
        relpath = os.path.relpath(root, input_dir)
        output_subdir = os.path.join(output_dir, relpath)
        os.makedirs(output_subdir, exist_ok=True)
        output_path = os.path.join(output_subdir, normalized_filename)

        # Normalize text in XML elements
        for element in root_element.iter():
            if element.text:
                element.text = normalize_text(element.text)

        # Extract normalized text and the elements carrying it
        normalized_texts, normalized_elements = extract_text_and_elements(root_element)

        # Create 10-grams from normalized text
        all_ngrams = []
        for text in normalized_texts:
            all_ngrams.extend(create_ngrams(text))

        # Count the 10-grams and keep those seen more than once in this file
        ngram_counts = Counter(all_ngrams)
        duplicated_ngrams = [ngram for ngram, count in ngram_counts.items() if count > 1]

        # Track deleted n-grams
        deleted_ngrams = []

        # Copy of the counts handed to remove_duplicated_ngrams
        ngram_tracker = ngram_counts.copy()

        # Remove duplicated 10-grams from the text-bearing elements. Every
        # element in normalized_elements was selected because its text is
        # non-empty, so no further check is needed here.
        for element in normalized_elements:
            element.text, deleted = remove_duplicated_ngrams(element.text, duplicated_ngrams, ngram_tracker)
            deleted_ngrams.extend(deleted)

        # Save the modified XML file
        tree.write(output_path, pretty_print=True, xml_declaration=True, encoding='UTF-8')

        # Print deleted n-grams
        if deleted_ngrams:
            print(f"Deleted n-grams in file {filename}:")
            for ngram in deleted_ngrams:
                print(ngram)

print("Normalization and duplicate removal completed for all files.")
if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that could not be parsed:")
    for skipped_path in skipped_files:
        print(skipped_path)

Deleted n-grams in file bgh_20100223_XI-ZR-190-09.xml:
Gesetzes zur Umsetzung der Verbraucherkreditrichtlinie, des zivilrechtlichen Teils der Zahlungsdiensterichtlinie
Deleted n-grams in file bgh_20100211_III-ZR-12-09.xml:
dass die im Prospekt werbend herausgestellte Mittelverwendungskontrolle bislang nicht stattgefunden
Deleted n-grams in file bgh_20100112_3-StR-439-09.xml:
so schon Tröndle/Fischer, StGB 49. Aufl. (1999) § 66 Rdn.
Deleted n-grams in file bgh_20100211_III-ZR-11-09.xml:
dass die im Prospekt werbend herausgestellte Mittelverwendungskontrolle bislang nicht stattgefunden
Deleted n-grams in file bgh_20100222_II-ZR-287-07.xml:
ob die Übereignung der Zylinder von der Beklagten zu 1
Deleted n-grams in file bgh_20100211_III-ZR-7-09.xml:
dass die im Prospekt werbend herausgestellte Mittelverwendungskontrolle bislang nicht stattgefunden
Deleted n-grams in file bgh_20100218_IX-ZA-39-09.xml:
Beschl. v. 16. Juli 2009 aaO S. 1779 f Rn.
Deleted n-grams in file bgh_20100113_3-StR-507-0

In [None]:
# Optional: show the 10-grams as a one-column table.
# Note: all_ngrams is rebuilt for each file in the processing loop, so this
# displays the n-grams of the last file processed only.

# Lift pandas' truncation limits so that all columns and rows are rendered
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

df_ngrams = pd.DataFrame(all_ngrams, columns=['10-Gram'])
display(df_ngrams)