In [1]:
import spacy
import csv
import re

In [2]:

# Loaded spaCy pipelines keyed by model name, so that repeated calls to
# process_text() reuse a pipeline instead of calling spacy.load() every time.
_NLP_CACHE = {}


def process_text(input_text, model_name='de_core_news_sm'):
    """Tokenise, lemmatise and POS-tag a text with a spaCy pipeline.

    Args:
        input_text: The text to analyse.
        model_name: Name of the spaCy pipeline to load. Defaults to the
            German model 'de_core_news_sm'.

    Returns:
        A tuple ``(token_ids, tokens, lemmas, pos_tags)`` of four parallel
        lists with one entry per token whose POS tag is not "SPACE".
        ``token_ids`` holds each kept token's index in the spaCy document
        (``token.i``), so the ids have gaps wherever a "SPACE" token was
        dropped.
    """
    # Load the requested pipeline once and reuse it afterwards.
    nlp = _NLP_CACHE.get(model_name)
    if nlp is None:
        nlp = spacy.load(model_name)
        _NLP_CACHE[model_name] = nlp

    # Process the input text
    doc = nlp(input_text)

    # Parallel lists holding the per-token information
    token_ids, tokens, lemmas, pos_tags = [], [], [], []

    for token in doc:
        # Tokens tagged "SPACE" are left out of the output.
        if token.pos_ == "SPACE":
            continue
        token_ids.append(token.i)
        tokens.append(token.text)
        lemmas.append(token.lemma_)
        pos_tags.append(token.pos_)

    return token_ids, tokens, lemmas, pos_tags



In [3]:
# Read the input text from a file
def read_input_file(file_path):
    """Read a UTF-8 encoded text file and return its contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a string, or None when the file does not exist,
        cannot be read, or is not valid UTF-8. In those cases a message
        describing the problem is printed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except (OSError, UnicodeDecodeError) as error:
        # Only I/O and decoding failures are handled here. A bare `except:`
        # would also swallow KeyboardInterrupt and hide programming errors.
        print(f"An error occurred while reading the file: {error}")
    return None



In [4]:
# Save output to a CSV file
def save_output_to_csv(file_path, token_ids, tokens, lemmas, pos_tags):
    """Write token annotations to a semicolon-delimited, UTF-8 encoded CSV file.

    The file starts with the header row ``Token ID;Token;Lemma;POS Tag``,
    followed by one row per token.

    Args:
        file_path: Path of the CSV file to create or overwrite.
        token_ids: Token ids, one per row.
        tokens: Token texts, parallel to ``token_ids``.
        lemmas: Lemmas, parallel to ``token_ids``.
        pos_tags: POS tags, parallel to ``token_ids``.

    Returns:
        None. Success or failure is reported with a printed message. Nothing
        is written when the four lists differ in length.
    """
    columns = (token_ids, tokens, lemmas, pos_tags)

    # Check the lengths before touching the file, so that misaligned input
    # never leaves a partially written CSV behind.
    if len({len(column) for column in columns}) > 1:
        print("An error occurred while saving the output to a CSV file: "
              "the token lists have different lengths.")
        return

    try:
        # newline='' lets the csv module control the line endings itself.
        with open(file_path, 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file, delimiter=';')
            writer.writerow(['Token ID', 'Token', 'Lemma', 'POS Tag'])
            writer.writerows(zip(*columns))
    except (OSError, UnicodeError, csv.Error) as error:
        # Report the expected I/O, encoding and CSV failures with their cause
        # instead of swallowing every exception.
        print(f"An error occurred while saving the output to a CSV file: {error}")
        return

    print(f"Output saved to '{file_path}' successfully.")



In [8]:
# Example usage
# NOTE: both paths are absolute and machine-specific; adjust them before
# running the notebook elsewhere.
input_file_path = '/Users/guhr/Desktop/Diss_Korpus/Diss_Korpus_202303_bereiningt/von_Saar_Ferdinand_Ausser_Dienst.txt'
output_file_path = '/Users/guhr/Desktop/Diss_jupyter/for_annotation/von_Saar_Ferdinand_Ausser_Dienst_for_anno.csv'

input_text = read_input_file(input_file_path)

# read_input_file() returns None when the file could not be read. re.sub()
# raises TypeError on None, so the clean-up only runs when text was loaded;
# otherwise input_text stays None and the `if input_text:` check in the next
# cell skips processing.
if input_text is not None:
    # Collapse line breaks into single spaces.
    single_spaces_text = re.sub(r'\n+', ' ', input_text)
    # Normalise runs of two or more hyphens to a spaced double hyphen.
    single_spaces_text = re.sub(r'--+', ' -- ', single_spaces_text)
    # Spell out ampersands.
    single_spaces_text = re.sub(r'&', 'und', single_spaces_text)
    # Replace angle brackets and slashes with spaces.
    single_spaces_text = re.sub(r'[<>/]', ' ', single_spaces_text)
    # Detach a trailing "'s", dropping the apostrophe.
    single_spaces_text = re.sub(r"'s", ' s', single_spaces_text)
    # Turn semicolons into full stops.
    single_spaces_text = re.sub(r';', '.', single_spaces_text)
    # Blank out every digit.
    single_spaces_text = re.sub(r'\d', ' ', single_spaces_text)
    # Squeeze every whitespace run into a single space.
    input_text = re.sub(r'\s+', ' ', single_spaces_text)



In [9]:
# Run the spaCy analysis and write the CSV only when there is text to process;
# None or an empty string is falsy and skips this step.
if input_text:
    # process_text() returns four parallel lists, one entry per kept token.
    token_ids, tokens, lemmas, pos_tags = process_text(input_text)

    save_output_to_csv(output_file_path, token_ids, tokens, lemmas, pos_tags)

Output saved to '/Users/guhr/Desktop/Diss_jupyter/for_annotation/von_Saar_Ferdinand_Ausser_Dienst_for_anno.csv' successfully.
