## Chose the extension of the training data: automatically preannotate, manually control and correct the false positives

In [1]:
# this function iterates over a folder with txt files and copies and pastes all the files whose names are written in a list with file names to store them in a new folder
# used for subcorpora creation

import os
import shutil

def copy_files(source_dir, destination_dir, file_list):
    # Create the destination directory if it doesn't exist
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)

    # Iterate through the list of filenames
    for filename in file_list:
        source_file_path = os.path.join(source_dir, filename)
        destination_file_path = os.path.join(destination_dir, filename)
        
        # Check if the file exists in the source directory
        if os.path.exists(source_file_path):
            # Copy the file to the destination directory
            shutil.copy2(source_file_path, destination_file_path)
            print(f"Copied '{filename}' to '{destination_dir}'")
        else:
            print(f"File '{filename}' not found in '{source_dir}'")

# Example usage
source_directory = "/Users/sguhr/Downloads/corpora-master/DNov"
destination_directory = "/Users/sguhr/Downloads/corpora-master/DNovXML"
file_names_list = ["BH.txt", "BR.txt", "DC.txt", "DS.txt", "ED.txt", "GE.txt", "HT.txt", "LD.txt", "MC.txt", "NN.txt", "OCS.txt", "OMF.txt", "OT.txt", "PP.txt", "TTC.txt"] # List of file names to copy

copy_files(source_directory, destination_directory, file_names_list)


Copied 'BH.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'BR.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'DC.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'DS.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'ED.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'GE.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'HT.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'LD.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'MC.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'NN.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'OCS.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'OMF.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'OT.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'PP.txt' to '/Users/sguhr/Downloads/corpora-master/DNovXML'
Copied 'TTC.txt' to '/Users/sguhr/Downloads/corpora-master/D

# Take the corpus texts and prepare them for becoming XML files
1) No "&", "$", "<", ">"
2) Normalize all "" to "" and ''
2) change ´s to 's
3) no \n in the texts if not as double \n\n for real paragraphs

In [2]:
corpus_directory = destination_directory

In [3]:
# Generate a corpus by loading all the txt files from the chosen directory 
# and list the names of the first 10 txt files 
corpus = os.listdir(corpus_directory)
corpus[:10]

['OMF.txt',
 'GE.txt',
 'TTC.txt',
 'BH.txt',
 'NN.txt',
 'MC.txt',
 'LD.txt',
 'OT.txt',
 'BR.txt',
 'OCS.txt']

In [4]:
#to delete the .DS_Store file that always pops up with Mac

import os

#corpus_directory = "/path/to/your/corpus/directory"
file_to_delete = ".DS_Store"
file_path = os.path.join(corpus_directory, file_to_delete)

if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_to_delete} has been deleted.")
else:
    print(f"{file_to_delete} does not exist in the specified directory.")

.DS_Store does not exist in the specified directory.


In [5]:
# Print how many txt files are in the corpus; if .DS_Store file was deleted, you have to read the corpus in again as the revised version without the .DS_Store file
corpus_length = len(corpus)
print(corpus_length)

15


In [6]:
txt_folder = corpus_directory


Now, I want to iterate over the txt files and change some strings with the help of regular expressions to have cleaned input files.

In [7]:
import os
import re

def process_txt_files(txt_folder, replacements):
    # Iterate over each txt file in the folder
    for filename in os.listdir(txt_folder):
        if filename.endswith('.txt'):
            txt_file_path = os.path.join(txt_folder, filename)
            # Read the text file
            with open(txt_file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Iterate over replacements to find and replace regex matches
            for regex_pattern, replacement in replacements.items():
                text = re.sub(regex_pattern, replacement, text)
      

            # Write the modified text back to the file
            with open(txt_file_path, 'w', encoding='utf-8') as file:
                file.write(text)


In [8]:
replacements = {
    r'&': 'and',
    r'\n\n': r'\n',
    r'\s+': ' ',
    r'<<': '',
    r'>>': '',
    r'‹': '«',
    r'›': '»',
    r'<poem>': '',
    r'»': '» ',
    r'»  ': '» ',
    r'«': ' «',
    r'  «': ' «',
    r"´s": "'s",
    r'„': '» ',
    r'“':' «',
    r'–':' – ',
    r'  –  ':' – ',
    r'…':' … ',
    r'  …  ':' … ',
    r'</title> ': r'</title>\n\n',
    r'</scene> ': r'</scene>\n\n',
    r'</non-scene> ': r'</non-scene>\n\n',
    r'</section> ': r'</section>\n\n',
    r'</chapter> ': r'</chapter>\n\n',
    r'<scene>': r'\n\n<scene>',
    r'<non-scene>': r'\n\n<non-scene>',
    r'<section>': r'\n\n<section>',
    r'<chapter>': r'\n\n<chapter>',
}

#control if there are </chapter>Chapter</chapter> errors or other!


In [9]:
process_txt_files(txt_folder, replacements)


Take the chosen files and automatically transform them into xml files.

In [10]:
import os

def remove_ds_store_file(source_dir):
    # Remove the .DS_Store file if it exists
    ds_store_path = os.path.join(source_dir, ".DS_Store")
    if os.path.exists(ds_store_path):
        os.remove(ds_store_path)
        print(".DS_Store file removed")


def transform_to_xml(source_dir, destination_dir):
    # Extract the folder name from the source directory
    source_folder_name = os.path.basename(os.path.normpath(source_dir))
    
    # Construct the destination directory name by adding '_XML' to the source folder name
    destination_folder_name = source_folder_name + "_XML"
    destination_dir = os.path.join(destination_dir, destination_folder_name)
    
    # Create the destination directory if it doesn't exist
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)
           
    # Iterate through each file in the source directory
    for filename in os.listdir(source_dir):
        source_file_path = os.path.join(source_dir, filename)
        
        # Check if the file is a regular file
        if os.path.isfile(source_file_path):
            # Read the content of the file
            with open(source_file_path, 'r') as file:
                content = file.read()
            
            # Construct the XML content
            xml_content = f"""<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>{filename}</title>
            </titleStmt>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
{content}
        </body>
    </text>
</TEI>
"""

            # Write the XML content to a new file in the destination directory
            xml_file_path = os.path.join(destination_dir, filename.replace('.txt', '.xml'))
            with open(xml_file_path, 'w') as xml_file:
                xml_file.write(xml_content)
            
            print(f"File '{filename}' transformed to XML and saved as '{xml_file_path}'")

# Example usage
source_directory = corpus_directory
destination_directory = "/Users/sguhr/Downloads/corpora-master/DNov_XML"

# Optionally remove the .DS_Store file
remove_ds_store_file(source_directory)

# Transform files to XML
transform_to_xml(source_directory, destination_directory)


File 'OMF.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Subcorpora_XML/DNovXML_XML/OMF.xml'
File 'GE.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Subcorpora_XML/DNovXML_XML/GE.xml'
File 'TTC.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Subcorpora_XML/DNovXML_XML/TTC.xml'
File 'BH.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Subcorpora_XML/DNovXML_XML/BH.xml'
File 'NN.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Subcorpora_XML/DNovXML_XML/NN.xml'
File 'MC.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Subcorpora_XML/DNovXML_XML/MC.xml'
File 'LD.txt' transformed to XML and saved as '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/Su

In [11]:
print("Ready.")

Ready.


Enriching the new XML files with <chapter number=1> element-attribute combinations. 

In [14]:
import re
import os

# 1. Define the folder path containing your .xml files
#folder_path = "/Users/sguhr/Downloads/corpora-master/DNov_XML"  # 
folder_path = destination_directory

# 2. Define the regex pattern to match the word "CHAPTER" (uppercase only, standalone word)
chapter_pattern = re.compile(r'\bCHAPTER\b')

# 3. Function to wrap "CHAPTER" in a numbered <chapter> tag
def wrap_chapters(xml_text):
    chapter_count = [0]  # Using a list to allow mutation inside nested function

    def replacer(match):
        chapter_count[0] += 1
        return f'<chapter number="{chapter_count[0]}">CHAPTER</chapter>'

    return chapter_pattern.sub(replacer, xml_text)

# 4. Process each XML file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.xml'):
        file_path = os.path.join(folder_path, filename)

        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Apply the transformation
        enriched_content = wrap_chapters(content)

        # Write to a new file with "_enriched" added to the name
        new_file_path = os.path.join(folder_path, filename.replace('.xml', '_enriched.xml'))
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(enriched_content)

        print(f"Processed: {filename}")


Processed: BH.xml
Processed: MC.xml
Processed: NN.xml
Processed: LD.xml
Processed: GE.xml
Processed: OMF.xml
Processed: TTC.xml
Processed: HT.xml
Processed: ED.xml
Processed: OCS.xml
Processed: DC.xml
Processed: PP.xml
Processed: DS.xml
Processed: BR.xml
Processed: OT.xml
