In [9]:
import os
import re

def read_text_file(file_path, encoding='utf-8'):
    """Load a text file and return its lines.

    Args:
        file_path: Path of the file to open for reading.
        encoding: Text encoding used to decode the file (UTF-8 by default).

    Returns:
        A list with one string per line, each keeping its line terminator;
        an empty list for an empty file.
    """
    with open(file_path, mode='r', encoding=encoding) as handle:
        # Iterating a text file yields exactly the lines readlines() would.
        lines = list(handle)
    return lines

def clean_text(text):
    """Strip digits and symbols from ``text`` and tidy spacing after sentence marks.

    Characters that survive are letters of any script, whitespace, and the
    punctuation marks ``. , ? !``. Everything else (digits, underscores and
    other symbols) is deleted. Afterwards, any run of whitespace that follows
    ``.``, ``?`` or ``!`` is collapsed to a single space, so a newline directly
    after one of those marks becomes a space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    kept_punctuation = '.,?!'

    # Use str.isalpha() rather than an ASCII-only class such as [a-zA-Z]:
    # the ASCII class silently deleted accented letters and corrupted
    # non-English words. Digits are not alphabetic, so they are still removed.
    # Standalone combining marks are not letters either and are dropped.
    text = ''.join(
        char for char in text
        if char.isalpha() or char.isspace() or char in kept_punctuation
    )

    # Normalise spacing after sentence-ending punctuation to one space.
    text = re.sub(r'([.?!])\s+', r'\1 ', text)

    return text

In [10]:
def categorize_sentences(sentences):
    """Sort sentences into coherent, digit-bearing and nonsensical groups.

    Each sentence is stripped of surrounding whitespace; sentences that are
    empty after stripping are ignored. A sentence containing any digit goes to
    the digit-bearing group regardless of its length. Otherwise
    ``is_coherent`` decides between the coherent and nonsensical groups.

    Args:
        sentences: Iterable of strings to classify.

    Returns:
        A 3-tuple of lists ``(coherent, with_numbers, nonsensical)`` holding
        the stripped sentences in their original relative order.
    """
    coherent, with_numbers, nonsensical = [], [], []

    for raw in sentences:
        stripped = raw.strip()
        if not stripped:
            continue  # blank entries belong to no bucket

        # The digit test takes priority over the coherence test.
        if any(symbol.isdigit() for symbol in stripped):
            bucket = with_numbers
        elif is_coherent(stripped):
            bucket = coherent
        else:
            bucket = nonsensical
        bucket.append(stripped)

    return coherent, with_numbers, nonsensical

In [11]:
def is_coherent(sentence, min_words=3):
    """Report whether ``sentence`` has enough words to count as coherent.

    This is a placeholder heuristic: it only counts whitespace-separated
    tokens and does not inspect their content.

    Args:
        sentence: The string to evaluate.
        min_words: Minimum number of whitespace-separated words required.
            Defaults to 3, the threshold that was previously hard-coded.

    Returns:
        True if the sentence has at least ``min_words`` words, else False.
    """
    return len(sentence.split()) >= min_words

In [12]:
def create_text_file(file_path, lines, encoding='utf-8'):
    """Write ``lines`` to ``file_path``, separated by newlines.

    The file is created or overwritten. No newline is written after the last
    line, and an empty ``lines`` produces an empty file.

    Args:
        file_path: Destination path.
        lines: Iterable of strings to write, one per output line.
        encoding: Text encoding for the output file. Defaults to UTF-8 so the
            output matches the default used by ``read_text_file`` instead of
            depending on the platform's locale encoding.
    """
    with open(file_path, 'w', encoding=encoding) as file:
        file.write('\n'.join(lines))

def main(input_folder="input", output_folder="output"):
    """Clean every ``.txt`` file in ``input_folder`` and write categorized output.

    For each input file, three files are written to ``output_folder``:
    ``<name>_clean_text.txt``, ``<name>_text_with_numbers.txt`` and
    ``<name>_nonsensical_text.txt``. Each starts with a header line followed
    by the lines in that category. A progress message is printed per file and
    a summary message at the end.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.txt`` files.
        output_folder: Directory receiving the output files; created if it
            does not exist.
    """
    # exist_ok avoids the check-then-create race and handles nested paths.
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Data Reading
    # Sorted so files are processed in the same order on every run.
    input_files = sorted(
        name for name in os.listdir(input_folder) if name.endswith('.txt')
    )
    for input_file in input_files:
        input_path = os.path.join(input_folder, input_file)
        input_lines = read_text_file(input_path)

        # Step 2: Cleaning
        # clean_text deletes digits, so lines containing numbers must be set
        # aside *before* cleaning; otherwise the "text with numbers" category
        # could never receive anything.
        numbered_lines = []
        lines_to_clean = []
        for line in input_lines:
            if any(char.isdigit() for char in line):
                numbered_lines.append(line.strip())
            else:
                lines_to_clean.append(line)
        cleaned_lines = [clean_text(line) for line in lines_to_clean]

        # Step 3: Organization
        clean_text_result, leftover_numbered, nonsensical_text = categorize_sentences(cleaned_lines)
        # Keep anything the categorizer still flags as numeric as well.
        text_with_numbers = numbered_lines + leftover_numbered

        # Step 4: File Creation
        file_name_without_extension = os.path.splitext(input_file)[0]
        output_file_clean_text = os.path.join(output_folder, file_name_without_extension + "_clean_text.txt")
        output_file_text_with_numbers = os.path.join(output_folder, file_name_without_extension + "_text_with_numbers.txt")
        output_file_nonsensical_text = os.path.join(output_folder, file_name_without_extension + "_nonsensical_text.txt")

        create_text_file(output_file_clean_text, ["Clean Text:"] + clean_text_result)
        create_text_file(output_file_text_with_numbers, ["Text with Numbers:"] + text_with_numbers)
        create_text_file(output_file_nonsensical_text, ["Nonsensical Text:"] + nonsensical_text)

        print("Files created successfully for '{}'.".format(input_file))

    print("All files processed and saved in the '{}' folder.".format(output_folder))

# Run the pipeline when this cell/script is executed directly; the guard keeps
# main() from firing if the code is imported as a module instead.
if __name__ == "__main__":
    main()

Files created successfully for 'Afrikaans_temp.txt'.
Files created successfully for 'bulayo-lo-talifhaho.txt'.
Files created successfully for 'Zadie Smith - White Teeth-Vintage (2001).txt'.
All files processed and saved in the 'output' folder.
