The script below defines a class `TextChunkExtractor` designed to process Latin text files by splitting them into non-overlapping chunks based on a specified token count. The class performs preprocessing, tokenization, and chunking of texts, and saves the processed files into a designated directory.
Key Components

    Initialization:
        `directory_to_read`: The directory containing the text files to be processed.
        `directory_to_write`: The directory where the processed files will be saved. If it does not exist, it will be created.
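        `threshold_to_slice`: The token-count threshold. Only texts with strictly more tokens than this value are split into chunks; shorter texts are written out as a single file.
        `chunk_size`: The number of tokens in each non-overlapping chunk. The last chunk of a text may be shorter when the token count is not a multiple of `chunk_size`.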

    Methods:
        `count_files(directory)`: Counts the number of .txt files in the given directory.
        `read_file(filepath)`: Reads the content of a file.
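        `preprocess(text)`: Removes every character that is neither a word character (letters, digits, underscore) nor whitespace, i.e. it strips punctuation. Digits are kept.
        `tokenize_latin_text(text)`: Converts the text to lowercase, applies `preprocess`, and tokenizes the result using CLTK's LatinWordTokenizer.
        `extract_chunks()`: Processes each text file. Texts that exceed the threshold are split into chunks saved as separate files (`<name>_chunk1.txt`, `<name>_chunk2.txt`, ...); all other texts are saved as a single tokenized file under their original name.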

In [1]:
from pathlib import Path
import re
from cltk.tokenizers import LatinWordTokenizer

class TextChunkExtractor:
    """Split Latin text files into non-overlapping chunks of tokens.

    Every ``*.txt`` file in ``directory_to_read`` is lowercased, stripped of
    punctuation and tokenized. Texts with more than ``threshold_to_slice``
    tokens are written to ``directory_to_write`` as a series of
    ``<stem>_chunk<N>.txt`` files; all other texts are written as a single
    file that keeps the original file name.
    """

    def __init__(self, directory_to_read, directory_to_write, threshold_to_slice, chunk_size):
        """
        Initialize the TextChunkExtractor with directories and chunking parameters.

        Parameters:
            directory_to_read (str): The directory containing the texts to slice.
            directory_to_write (str): The directory to write the results. If it doesn't exist, it will be created.
            threshold_to_slice (int): The token count threshold above which texts will be split into chunks.
            chunk_size (int): The size of each non-overlapping chunk. Must be at least 1.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        # Validate before touching the filesystem: a chunk size of 0 would
        # crash inside range() half-way through a run, and a negative one
        # would silently produce no output for long texts.
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        self.directory_to_read = Path(directory_to_read)
        self.directory_to_write = Path(directory_to_write)
        self.threshold_to_slice = threshold_to_slice
        self.chunk_size = chunk_size

        # The tokenizer is created on first use and then shared by all files.
        self._tokenizer = None

        # Create the directory if it does not exist
        self.directory_to_write.mkdir(parents=True, exist_ok=True)
        print(f"Directory {self.directory_to_write} is ready!")

    def count_files(self, directory):
        """Count the number of .txt files directly inside the given directory."""
        return sum(1 for _ in Path(directory).glob("*.txt"))

    def read_file(self, filepath):
        """Read a UTF-8 encoded file and return its content as a string.

        ``filepath`` may be a ``str`` or a ``pathlib.Path``.
        """
        return Path(filepath).read_text(encoding='utf-8')

    def preprocess(self, text):
        """Strip punctuation from the text and return the cleaned text.

        Every character that is neither a word character (letters, digits,
        underscore) nor whitespace is removed. Digits are kept.
        """
        return re.sub(r'[^\w\s]', '', text)

    def tokenize_latin_text(self, text):
        """Lowercase the text, strip punctuation and tokenize it as Latin.

        Returns the list of tokens produced by CLTK's ``LatinWordTokenizer``.
        """
        # getattr keeps this working for instances built without __init__.
        tokenizer = getattr(self, "_tokenizer", None)
        if tokenizer is None:
            # Building the tokenizer once instead of once per file.
            tokenizer = self._tokenizer = LatinWordTokenizer()
        return tokenizer.tokenize(self.preprocess(text.lower()))

    def extract_chunks(self):
        """Extract chunks from text files and save them in the specified directory.

        Texts with more than ``threshold_to_slice`` tokens are split into
        consecutive chunks of ``chunk_size`` tokens (the last chunk may be
        shorter) and saved as ``<stem>_chunk<N>.txt``, numbered from 1.
        Other texts are saved as one file under their original name. Tokens
        are joined with single spaces in every output file.
        """
        # Materialise the listing up front so that files written during this
        # run are never picked up as input when both directories coincide;
        # sorting makes the processing order reproducible.
        source_files = sorted(self.directory_to_read.glob("*.txt"))

        for file_path in source_files:
            tokens = self.tokenize_latin_text(self.read_file(file_path))

            if len(tokens) > self.threshold_to_slice:
                starts = range(0, len(tokens), self.chunk_size)
                for number, start in enumerate(starts, start=1):
                    chunk_text = " ".join(tokens[start:start + self.chunk_size])
                    chunk_file_name = f"{file_path.stem}_chunk{number}.txt"
                    (self.directory_to_write / chunk_file_name).write_text(chunk_text, encoding='utf-8')
            else:
                (self.directory_to_write / file_path.name).write_text(" ".join(tokens), encoding='utf-8')

        print(f"""
        Every file has been written successfully.
        The new directory (path={self.directory_to_write}) contains {self.count_files(self.directory_to_write)} text samples.""")


# Example usage:
# Source corpus to slice, and destination folder for the resulting samples.
directory_to_read = "../../corpora/corpus_imposters/"
directory_to_write = "../../corpora/corpus_chunks/"

# Texts longer than 500 tokens are cut into chunks of 500 tokens each.
extractor = TextChunkExtractor(
    directory_to_read=directory_to_read,
    directory_to_write=directory_to_write,
    threshold_to_slice=500,
    chunk_size=500,
)
extractor.extract_chunks()

Directory ../../corpora/corpus_kestemont_chunks/ already exists!

        Every file has been written successfully.
        The new directory (path=../../corpora/corpus_kestemont_chunks/) contains 6080 text samples.
