#### Text Preprocessing Notebook
This notebook preprocesses a plain-text book downloaded from Project Gutenberg.
It reads the text file, collapses extra whitespace, splits the text into fixed-size character chunks, and saves the chunks to a single output file, one chunk per line.


In [1]:
import os

In [2]:
def preprocess_text(file_path, chunk_size=250):
    """
    Preprocess the text data: clean and split into chunks.

    The file is read in full, every run of whitespace (spaces, tabs, new
    lines) is collapsed into a single space, and the result is cut into
    consecutive slices of ``chunk_size`` characters. Slicing is purely
    character-based, so a chunk boundary may fall in the middle of a word.

    Args:
    - file_path (str): Path to the text file.
    - chunk_size (int): Number of characters in each chunk. Must be >= 1.

    Returns:
    - List of text chunks. Every chunk holds exactly ``chunk_size``
      characters except possibly the last one. An empty or whitespace-only
      file yields an empty list.

    Raises:
    - ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Fail fast with a clear message: a step of 0 would make range() raise a
    # cryptic error, and a negative step would silently produce no chunks.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
    # byte-order mark if the file has one. With plain 'utf-8' the mark would
    # survive as '\ufeff', which str.split() does not treat as whitespace, so
    # it would end up inside the first chunk.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        text = file.read()

    # Basic text cleaning: remove extra spaces and new lines
    text = ' '.join(text.split())

    # Split the text into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    return chunks

# Where the raw book lives and where the cleaned result should go
input_file = "../artifacts/text_data/book.txt"  # Adjust path as needed
output_file = "../artifacts/text_data/preprocessed_text.txt"

# Clean the book and cut it into fixed-size character chunks
text_chunks = preprocess_text(input_file)

# Persist the result: every chunk goes on its own line of the output file
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(f"{chunk}\n" for chunk in text_chunks)

# Report where the result was written and how many chunks it contains
print(f"Preprocessed text saved to '{output_file}' with {len(text_chunks)} chunks.")


Preprocessed text saved to '../artifacts/text_data/preprocessed_text.txt' with 471 chunks.
