In [1]:
import re
import os

In [2]:
def _tidy_whitespace(text):
    """Collapse horizontal whitespace and surplus blank lines, keeping line breaks."""
    text = re.sub(r'[^\S\n]+', ' ', text)      # any whitespace run except newlines -> one space
    text = re.sub(r' ?\n ?', '\n', text)       # drop padding on either side of a line break
    return re.sub(r'\n{3,}', '\n\n', text)     # allow at most one blank line in a row


def clean_text(text):
    """Remove markup, URLs and navigation boilerplate from extracted text.

    Line structure is preserved: newlines are normalized to ``\\n`` and runs of
    blank lines are reduced to a single blank line, while spaces/tabs inside a
    line are collapsed to one space. Keeping the line breaks is what lets the
    line-oriented rules below (number-only lines, a bare "Contents" heading,
    single-line menu/copyright patterns) work without reaching across
    unrelated sentences.

    Args:
        text: Raw text to clean (``str``).

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Normalize newlines (CRLF / lone CR -> LF)
    text = re.sub(r'\r\n|\r', '\n', text)

    # Strip HTML tags *before* URLs: otherwise the URL pattern swallows the
    # closing '>' of e.g. <a href="http://..."> and leaves a dangling '<a href="'.
    text = re.sub(r'<[^>]+>', '', text)

    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)

    # Collapse spacing up front so the single-space boilerplate patterns match
    # input such as "Page  3 of  10".
    text = _tidy_whitespace(text)

    # Remove common boilerplate/navigation patterns
    boilerplate_patterns = [
        r"Page \d+ of \d+",                              # Page numbers
        r"Back to Top",
        r"Home(.*?)Sitemap",                             # Menu text ('.' stops at end of line)
        r"©.*?All rights reserved",
        r"Download PDF",
        r"\bDisclaimer\b",
        r"Table of Contents",
        r"^[ \t]*Contents[ \t]*$\n?",                    # Generic header: only a line of its own
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.MULTILINE)

    # Remove lines with only numbers (e.g., page headers), including runs of
    # consecutive such lines and ones at the very start or end of the text.
    text = re.sub(r'^[ \t]*\d+[ \t]*$\n?', '', text, flags=re.MULTILINE)

    # Removals can leave double spaces or stacked blank lines; tidy once more,
    # then strip the ends.
    return _tidy_whitespace(text).strip()


In [3]:
def clean_file(input_file, output_file):
    """Read ``input_file``, run it through ``clean_text`` and save the result.

    Problems are reported with a printed message rather than an exception:
    a missing input path, a failure while reading, or a failure while writing
    each print an ``[❌ ERROR]`` line. On success a ``[✅ DONE]`` line is
    printed. The function returns ``None`` in every case.

    Args:
        input_file: Path of the text file to read (decoded as UTF-8, with
            undecodable bytes silently dropped).
        output_file: Path the cleaned text is written to (UTF-8).
    """
    source_missing = not os.path.exists(input_file)
    if source_missing:
        print(f"[❌ ERROR] Input file not found: {input_file}")
        return None

    # Read phase: any failure is reported and ends the run.
    try:
        with open(input_file, "r", encoding="utf-8", errors="ignore") as source:
            contents = source.read()
    except Exception as read_err:
        print(f"[❌ ERROR] Failed to read input file: {read_err}")
        return None

    # Cleaning happens outside the try blocks, so errors raised by
    # clean_text propagate to the caller.
    result = clean_text(contents)

    # Write phase: the success message is printed inside the same try, so a
    # failure at any point here is reported as a write error.
    try:
        with open(output_file, "w", encoding="utf-8") as sink:
            sink.write(result)
        print(f"[✅ DONE] Cleaned text saved to: {output_file}")
    except Exception as write_err:
        print(f"[❌ ERROR] Failed to write output file: {write_err}")



In [5]:
# Driver: run the cleaning step on the corpus file when this cell/script is
# executed as the main module.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific Windows path — it must be
    # adjusted before this runs anywhere else.
    INPUT_FILE = r"E:\Income Tax Fine-tuning\indian_tax_code_final_corpus.txt"
    # Relative path, so the output lands in the current working directory.
    OUTPUT_FILE = "indian_tax_code_cleaned.txt"
    # clean_file reports success or failure by printing; it returns nothing.
    clean_file(INPUT_FILE, OUTPUT_FILE)

[✅ DONE] Cleaned text saved to: indian_tax_code_cleaned.txt
