In [12]:
# "¬"Zeichen entfernen und Wörter verbinden
def reflow(infile, outfile, encoding="utf-8"):
    """Rejoin words that were split across line breaks with a "¬" marker.

    For every line of ``infile`` that ends with "¬", the last
    space-separated fragment is taken off the line and, with the "¬"
    removed, glued to the front of the following line.  Lines that do not
    end with "¬" are copied unchanged.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the text file to write; it is overwritten.
        encoding: Text encoding used for both files.  It is explicit because
            the "¬" marker is non-ASCII, so relying on the platform default
            can fail to match it.
    """
    with open(infile, encoding=encoding) as source, \
            open(outfile, "w", encoding=encoding) as dest:
        holdover = ""
        # Iterate lazily instead of loading every line with readlines().
        for line in source:
            line = line.rstrip("\n")
            if line.endswith("¬"):
                # A line without any space yields lin == "" and the whole
                # line as the fragment.
                lin, _, e = line.rpartition(" ")
                fragment = e[:-1]  # drop the trailing "¬"
            else:
                lin, fragment = line, ""
            dest.write(f"{holdover}{lin}\n")
            holdover = fragment
        # If the very last line ended with "¬" there is no following line to
        # attach the fragment to; write it out instead of silently losing it.
        if holdover:
            dest.write(f"{holdover}\n")

if __name__ == "__main__":
    # Cell driver: join the "¬"-split words of the raw text and store the
    # result as the "Modified" file.
    reflow(
        infile="/Users/wanjagerber/desktop/txt/brescia.txt",
        outfile="/Users/wanjagerber/desktop/txt/bresciaModified.txt",
    )


In [13]:
#Seitenzahlen entfernen

import re

def remove_three_digit_numbers(infile, outfile, encoding="utf-8"):
    """Copy ``infile`` to ``outfile`` without standalone three-digit numbers.

    Every run of exactly three digits delimited by word boundaries is
    deleted, whatever it stands for (the pattern cannot tell a page number
    from any other three-digit number).  Afterwards each run of whitespace
    other than newlines is collapsed to a single space, so the line
    structure of the input is preserved.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the text file to write; it is overwritten.
        encoding: Text encoding used for both files.
    """
    with open(infile, encoding=encoding) as source, \
            open(outfile, "w", encoding=encoding) as dest:
        for line in source:
            # Remove all three-digit numbers from the line
            line_without_numbers = re.sub(r'\b\d{3}\b', '', line)
            # Collapse horizontal whitespace only.  [^\S\n] means "whitespace
            # that is not a newline"; a lookaround-guarded \s+ would still
            # match the trailing "\n" (the lookahead succeeds at end of
            # string) and merge all lines into one.
            line_without_whitespace = re.sub(r'[^\S\n]+', ' ', line_without_numbers)
            dest.write(line_without_whitespace)

if __name__ == "__main__":
    # This cell removes page numbers, so it must run the function defined in
    # this cell.  The previous call to reflow() only copied the file (no "¬"
    # is left at this stage) and the numbers were never removed.
    remove_three_digit_numbers("/Users/wanjagerber/desktop/txt/bresciaModified.txt", "/Users/wanjagerber/desktop/txt/bresciaReady.txt")


In [14]:
#Zeilenumbruch nach jedem Satz

import re

def process_text(input_file, output_file):
    """Rewrite a UTF-8 text file so that a line break follows each period.

    Two substitutions are applied to the whole file content:

    1. every newline that is not directly preceded by "." becomes a space;
    2. every run of whitespace directly following a "." becomes exactly one
       newline.

    Only "." counts as a sentence end.  No newline is appended at the end of
    the output.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write; it is overwritten.
    """
    soft_break = re.compile(r'(?<!\.)\n')
    gap_after_period = re.compile(r'(?<=\.)\s+')

    with open(input_file, 'r', encoding='utf-8') as reader:
        raw = reader.read()

    # Unwrap lines that end in the middle of a sentence.
    unwrapped = soft_break.sub(' ', raw)
    # Replacing each gap with "\n" gives the same string as splitting on the
    # gap and joining the pieces with "\n".
    one_sentence_per_line = gap_after_period.sub('\n', unwrapped)

    with open(output_file, 'w', encoding='utf-8') as writer:
        writer.write(one_sentence_per_line)

if __name__ == "__main__":
    # Cell driver: turn the cleaned text into one sentence per line.
    input_path, output_path = (
        '/Users/wanjagerber/desktop/txt/bresciaReady.txt',
        '/Users/wanjagerber/desktop/txt/bresciaFinal.txt',
    )
    process_text(input_file=input_path, output_file=output_path)

            

In [17]:
# Umwandlung von txt zu xml

import xml.etree.ElementTree as ET

input_file = "/Users/wanjagerber/desktop/txt/bresciaFinal.txt"
output_file = "/Users/wanjagerber/desktop/txt/bresciaFinal.xml"

# Load the complete text first; line breaks inside it are kept as they are.
with open(input_file, "r", encoding="utf-8") as file:
    content = file.read()

# Build the document skeleton <root><body>…</body></root> and store the whole
# text as the single text node of <body>.
root = ET.Element("root")
body = ET.SubElement(root, "body")
body.text = content

# Serialise the tree as UTF-8 with an XML declaration.
tree = ET.ElementTree(root)
tree.write(output_file, encoding="utf-8", xml_declaration=True)


In [18]:
# Named Entity Recognition, direkt in xml-Datei abspeichern

import spacy
import xml.etree.ElementTree as ET

# Load the spaCy pipeline stored in a local directory.
# NOTE(review): this comment used to say "English model"; the path alone does
# not show the pipeline's language or label set — TODO confirm.
nlp = spacy.load('/users/wanjagerber/downloads/output/model-best')  # Adjust as needed

# Define function for entity replacement
def replace_entities(input_text):
    """Wrap every recognised entity in ``input_text`` in an XML-style tag.

    The text is run through the module-level ``nlp`` pipeline.  Each distinct
    entity string whose label has a tag assigned below is then tagged at
    every whole-word occurrence in the text, not only where the pipeline
    found it.  If the same string was recognised with different labels, the
    label of its last occurrence is used.

    All entities are replaced in one pass over the text, longest entity
    first, so text that has already been tagged is never scanned again.
    This prevents nested tags when one entity is contained in another
    (e.g. "Bach" inside "Johann Bach") and prevents matches inside the
    inserted tag names.

    Args:
        input_text: Plain text to annotate.

    Returns:
        The text with entities wrapped in tags.  Characters such as "&" or
        "<" in the text are returned as they are; no XML escaping is done
        here.
    """
    tag_for_label = {
        "PERSON": "persName",
        "LOCATION": "placeName",
        "DATE": "date",
        "INSTRUMENT": "instrument",
        "MUSIC": "music",
        # Add more label -> tag pairs for other entity types as needed
    }

    doc = nlp(input_text)
    # Entity string -> label; a later span with the same text overrides an
    # earlier one.
    ls_ents = {str(x): x.label_ for x in doc.ents}
    tag_for_entity = {
        ent: tag_for_label[label]
        for ent, label in ls_ents.items()
        if ent and label in tag_for_label
    }
    if not tag_for_entity:
        return input_text

    # Longest alternative first so that the regex prefers "Johann Bach" over
    # "Bach" when both could match at the same position.
    alternatives = sorted(tag_for_entity, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(ent) for ent in alternatives) + r')\b'
    )

    def _wrap(match):
        # A replacement function inserts the entity text literally; a
        # replacement string would interpret backslashes in it as escapes.
        ent = match.group(0)
        tag = tag_for_entity[ent]
        return f"<{tag}>{ent}</{tag}>"

    return pattern.sub(_wrap, input_text)

# Local import: this cell is the only place that needs XML escaping.
from xml.sax.saxutils import escape

# Path to your XML file
input_file = "/users/wanjagerber/desktop/txt/bresciaFinal.xml"
output_file = "/users/wanjagerber/desktop/txt/bresciaNER.xml"  # Update output file path

# Parsing the XML file
tree = ET.parse(input_file)
root = tree.getroot()

# Concatenate the stripped text of every element that has text, each piece
# followed by one space.
body_text = "".join(
    elem.text.strip() + " " for elem in root.iter() if elem.text is not None
)

# The parser has turned "&amp;", "&lt;" and "&gt;" back into plain characters.
# Escape them again before tagging so that the entity tags inserted below are
# the only markup in the result.
output_text = replace_entities(escape(body_text))

# Write a well-formed XML document: declaration plus the same <root><body>
# structure as the input file.  Writing the bare tagged text would produce a
# file without a root element that XML parsers reject.
with open(output_file, "w", encoding="utf-8") as f:
    f.write('<?xml version="1.0" encoding="utf-8"?>\n')
    f.write(f"<root><body>{output_text}</body></root>\n")

print("Replacement done and saved to:", output_file)


Replacement done and saved to: /users/wanjagerber/desktop/txt/bresciaNER.xml
