# Text Preprocessing Tutorial for AG News Classification

## Overview

This tutorial demonstrates comprehensive text preprocessing techniques following methodologies from:
- Jurafsky & Martin (2023): "Speech and Language Processing"
- Manning et al. (2008): "Introduction to Information Retrieval"
- Devlin et al. (2019): "BERT: Pre-training of Deep Bidirectional Transformers"

### Learning Objectives
1. Understand text preprocessing pipeline components
2. Apply various cleaning and normalization techniques
3. Implement tokenization strategies for transformer models
4. Compare preprocessing effects on model performance
5. Create custom preprocessing pipelines

Author: Võ Hải Dũng  
Email: vohaidung.work@gmail.com  
Date: 2025

## 1. Setup and Imports

In [None]:
# Standard library imports
import os
import re
import string
import sys
import time
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

# Data manipulation
import numpy as np
import pandas as pd

# NLP libraries
import nltk
from transformers import AutoTokenizer
import spacy

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Project setup
PROJECT_ROOT = Path("../..").resolve()
sys.path.insert(0, str(PROJECT_ROOT))

# Project imports
from src.data.preprocessing.text_cleaner import (
    TextCleaner, 
    CleaningConfig,
    get_minimal_cleaner,
    get_standard_cleaner,
    get_aggressive_cleaner
)
from src.data.preprocessing.tokenization import (
    TokenizerWrapper,
    TokenizationConfig
)
from src.data.preprocessing.feature_extraction import (
    FeatureExtractor,
    FeatureConfig
)
from src.data.datasets.ag_news import AGNewsDataset, AGNewsConfig
from src.utils.reproducibility import set_seed
from configs.constants import DATA_DIR, AG_NEWS_CLASSES

# Set random seed
set_seed(42)

# Download NLTK data if needed.
# nltk.data.find() expects the resource path including its category
# directory (e.g. 'tokenizers/punkt'). Each resource is checked on its own
# so that an already-installed 'punkt' does not cause the others to be skipped.
# NOTE(review): newer NLTK releases appear to load 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
}
for package_id, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id)

print("Environment ready for preprocessing tutorial")

## 2. Load Sample Data

In [None]:
# Load sample data for preprocessing
dataset_config = AGNewsConfig(
    data_dir=DATA_DIR / "processed",
    max_samples=1000,
    use_cache=False
)

# Best-effort load: the tutorial stays runnable without the processed dataset
# on disk, so any loading failure falls back to built-in samples. Catching
# Exception (instead of a bare except) lets KeyboardInterrupt / SystemExit
# propagate, and the reason is printed so a misconfiguration is not hidden.
texts, labels = [], []
try:
    dataset = AGNewsDataset(dataset_config, split="train")
    texts = dataset.texts[:100]  # Use first 100 samples
    labels = dataset.labels[:100]
except Exception as exc:
    print(f"Could not load AG News dataset ({type(exc).__name__}: {exc})")

if len(texts) > 0:
    print(f"Loaded {len(texts)} sample texts")
else:
    # Fallback sample texts (also used when the dataset loads but is empty)
    texts = [
        "Wall Street #1 stocks rise; S&P 500 hits NEW record HIGH!!!",
        "Scientists discover potential cure for COVID-19 using mRNA technology.",
        "Manchester United defeats Chelsea 3-2 in Premier League match.",
        "Apple Inc. announces new iPhone 15 with revolutionary features.",
        "NASA's Mars rover finds evidence of ancient water on the Red Planet."
    ]
    labels = [2, 3, 1, 2, 3]  # Business, Sci/Tech, Sports, Business, Sci/Tech
    print(f"Using {len(texts)} fallback sample texts")

# Display sample texts (long texts are cut to 100 characters)
print("\nSample texts before preprocessing:")
for i, text in enumerate(texts[:3]):
    print(f"{i+1}. {text[:100]}..." if len(text) > 100 else f"{i+1}. {text}")

## 3. Basic Text Cleaning

In [None]:
def demonstrate_cleaning_steps(text: str) -> Dict[str, str]:
    """
    Apply each basic cleaning operation to the raw input independently.

    Every step works on the original ``text`` (the steps are not chained),
    so the individual effects can be compared side by side.

    Args:
        text: Input text

    Returns:
        Dictionary mapping each step name to the text produced by that step,
        in the order the steps are listed below
    """
    # Translation table that deletes every character in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)

    return {
        # Untouched input, kept for reference
        'original': text,
        # Case folding
        'lowercase': text.lower(),
        # Anything that looks like <...> is treated as an HTML tag
        'no_html': re.sub(r'<[^>]+>', '', text),
        # http:// or https:// followed by non-whitespace
        'no_urls': re.sub(r'http[s]?://\S+', '', text),
        # Non-whitespace runs surrounding an '@'
        'no_emails': re.sub(r'\S+@\S+', '', text),
        # Strip ASCII punctuation
        'no_punctuation': text.translate(punctuation_table),
        # Drop digit runs
        'no_numbers': re.sub(r'\d+', '', text),
        # Collapse whitespace runs to single spaces and trim the ends
        'normalized_whitespace': ' '.join(text.split()),
        # Keep only code points below 128
        'ascii_only': ''.join(filter(str.isascii, text)),
    }

# Run the step-by-step demo on a text that mixes a URL, an email address,
# a price and a hashtag
sample_text = "Check out this URL: https://example.com! Email us at info@example.com. Price: $99.99 #awesome"
cleaning_steps = demonstrate_cleaning_steps(sample_text)

print("Text Cleaning Steps Demonstration")
print("=" * 50)
# One entry per cleaning step: name on its own line, result indented below
for step_name, result in cleaning_steps.items():
    print(f"\n{step_name}:\n  {result}")

## 4. Using Text Cleaner Classes

In [None]:
# Create different cleaning configurations
cleaning_configs = {
    'minimal': get_minimal_cleaner(),
    'standard': get_standard_cleaner(),
    'aggressive': get_aggressive_cleaner()
}

# Apply different cleaning strategies
test_text = texts[0] if texts else sample_text

print("Comparing Cleaning Strategies")
print("=" * 50)
print(f"\nOriginal text ({len(test_text)} chars):")
print(f"  {test_text}")

cleaning_results = {}
for strategy_name, cleaner in cleaning_configs.items():
    cleaned = cleaner.clean(test_text)
    cleaning_results[strategy_name] = cleaned

    # An empty input has nothing to reduce; avoid dividing by zero
    reduction_pct = (1 - len(cleaned) / len(test_text)) * 100 if len(test_text) else 0.0

    print(f"\n{strategy_name.capitalize()} cleaning ({len(cleaned)} chars):")
    print(f"  {cleaned}")
    print(f"  Character reduction: {reduction_pct:.1f}%")

# Measure cleaning speed on up to 100 texts. The reported count and the
# per-text figure use the number of texts actually timed, which is smaller
# than 100 when the fallback samples are in use.
timing_texts = texts[:100]
num_timed = len(timing_texts)

print("\nCleaning Speed Comparison:")
for strategy_name, cleaner in cleaning_configs.items():
    # perf_counter is the high-resolution clock intended for short intervals
    start_time = time.perf_counter()
    for text in timing_texts:
        _ = cleaner.clean(text)
    elapsed = time.perf_counter() - start_time

    print(f"  {strategy_name}: {elapsed*1000:.2f}ms for {num_timed} texts")
    if num_timed:
        print(f"    ({elapsed*1000/num_timed:.2f}ms per text)")

## 5. Tokenization Strategies

In [None]:
# Compare different tokenization methods on one sentence
test_sentence = "The quick brown fox jumps over the lazy dog. It's a beautiful day!"

print("Tokenization Methods Comparison")
print("=" * 50)
print(f"\nOriginal: {test_sentence}\n")

# 1. Simple whitespace tokenization: split on runs of whitespace only
whitespace_tokens = test_sentence.split()
print(f"Whitespace tokenization ({len(whitespace_tokens)} tokens):\n  {whitespace_tokens}")

# 2. NLTK word tokenization
from nltk.tokenize import word_tokenize
nltk_tokens = word_tokenize(test_sentence)
print(f"\nNLTK tokenization ({len(nltk_tokens)} tokens):\n  {nltk_tokens}")

# 3. Transformer tokenization (BERT); the sentence is lowercased before
#    being handed to the uncased tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_tokens = bert_tokenizer.tokenize(test_sentence.lower())
print(f"\nBERT tokenization ({len(bert_tokens)} tokens):\n  {bert_tokens}")

# 4. Subword tokenization example: a single long word run through BERT
complex_word = "unbelievably"
subword_pieces = bert_tokenizer.tokenize(complex_word)
print(f"\nSubword tokenization of '{complex_word}':\n  BERT: {subword_pieces}")

# Demonstrate handling of special cases with the NLTK tokenizer
special_cases = ["don't", "U.S.A.", "email@example.com", "#hashtag", "$100.50"]

print("\nSpecial Cases Handling:")
for case in special_cases:
    case_tokens = word_tokenize(case)
    print(f"  '{case}' -> {case_tokens}")

## 6. Transformer Model Tokenization

In [None]:
# Tokenize sample texts through the project's tokenizer wrapper
from src.data.preprocessing.tokenization import TokenizerWrapper, TokenizationConfig

# Tokenization settings: fixed-length, truncated sequences
tokenizer_config = TokenizationConfig(
    model_name='bert-base-uncased',
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Wrapper built from the config above
tokenizer_wrapper = TokenizerWrapper(tokenizer_config)

# A short text, a long text, and the first dataset sample (or a default)
sample_texts_for_tokenization = [
    "This is a short text.",
    "This is a much longer text that might need to be truncated if it exceeds the maximum length limit set in the configuration.",
    texts[0] if texts else "Default text for tokenization example."
]

print("Transformer Tokenization Examples")
print("=" * 50)

for i, text in enumerate(sample_texts_for_tokenization):
    # Encode one text; the result is indexed by 'input_ids' / 'attention_mask'
    encoded = tokenizer_wrapper.encode_single(text)

    # Header shows at most the first 50 characters of the input
    if len(text) > 50:
        print(f"\nText {i+1}: {text[:50]}...")
    else:
        print(f"\nText {i+1}: {text}")
    print(f"  Original length: {len(text)} characters")
    print(f"  Token IDs shape: {encoded['input_ids'].shape}")
    print(f"  Attention mask shape: {encoded['attention_mask'].shape}")

    # Round-trip the first row of ids back to text, dropping special tokens
    decoded = tokenizer_wrapper.tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
    if len(decoded) > 50:
        print(f"  Decoded: {decoded[:50]}...")
    else:
        print(f"  Decoded: {decoded}")

    # First 20 tokens of the first row, special tokens included
    leading_ids = encoded['input_ids'][0][:20]
    tokens_with_special = tokenizer_wrapper.tokenizer.convert_ids_to_tokens(leading_ids)
    print(f"  First 20 tokens: {tokens_with_special}")

# Encode all sample texts in a single call
print("\nBatch Tokenization:")
batch_encoded = tokenizer_wrapper.encode_batch(sample_texts_for_tokenization)
for field_name in ('input_ids', 'attention_mask'):
    print(f"  Batch {field_name} shape: {batch_encoded[field_name].shape}")

## 7. Feature Extraction

In [None]:
# Extract various features from text with the project's FeatureExtractor
from src.data.preprocessing.feature_extraction import FeatureExtractor, FeatureConfig

# Enable every feature group except sentiment
feature_config = FeatureConfig(
    extract_length_features=True,
    extract_pos_features=True,
    extract_entity_features=True,
    extract_sentiment_features=False,  # Requires additional models
    extract_readability_features=True
)

# Extractor configured as above
feature_extractor = FeatureExtractor(feature_config)

# Three example sentences to extract features from
test_texts_for_features = [
    "Apple Inc. announced record profits in Q4 2024.",
    "The team won the championship after a thrilling final match.",
    "Researchers at MIT developed a new quantum computing algorithm."
]

print("Feature Extraction Examples")
print("=" * 50)

for text in test_texts_for_features:
    features = feature_extractor.extract(text)

    print(f"\nText: {text}\nFeatures:")

    # Each group is printed only when the extractor returned it
    if 'length_features' in features:
        print("  Length features:")
        length_features = features['length_features']
        for key in length_features:
            print(f"    {key}: {length_features[key]}")

    if 'pos_features' in features:
        print("  POS tag distribution:")
        # Only the first five entries are shown
        first_pos_entries = list(features['pos_features'].items())[:5]
        for tag, count in first_pos_entries:
            print(f"    {tag}: {count}")

    if 'entities' in features:
        print(f"  Named entities: {features['entities']}")

    if 'readability' in features:
        print(f"  Readability score: {features['readability']:.2f}")

## 8. Custom Preprocessing Pipeline

In [None]:
class CustomPreprocessingPipeline:
    """
    Custom preprocessing pipeline for AG News classification.

    Chains a text cleaner with a tokenizer and keeps running averages of
    text lengths and token counts over every text processed so far.
    """

    def __init__(self,
                 cleaning_strategy: str = 'standard',
                 tokenizer_model: str = 'bert-base-uncased',
                 max_length: int = 128):
        """
        Initialize preprocessing pipeline.

        Args:
            cleaning_strategy: Cleaning strategy to use ('minimal' or
                'aggressive'); any other value selects the standard cleaner
            tokenizer_model: Tokenizer model name
            max_length: Maximum sequence length
        """
        # Initialize cleaner
        if cleaning_strategy == 'minimal':
            self.cleaner = get_minimal_cleaner()
        elif cleaning_strategy == 'aggressive':
            self.cleaner = get_aggressive_cleaner()
        else:
            self.cleaner = get_standard_cleaner()

        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
        self.max_length = max_length

        # Track statistics (running averages, updated once per text)
        self.stats = {
            'texts_processed': 0,
            'avg_original_length': 0,
            'avg_cleaned_length': 0,
            'avg_token_count': 0
        }

    def _update_stats(self, original_length: int, cleaned_length: int,
                      token_count: int) -> None:
        """Fold the measurements of one processed text into the running averages."""
        self.stats['texts_processed'] += 1
        n = self.stats['texts_processed']
        for key, value in (
            ('avg_original_length', original_length),
            ('avg_cleaned_length', cleaned_length),
            ('avg_token_count', token_count),
        ):
            # Incremental mean: new_avg = (old_avg * (n - 1) + value) / n
            self.stats[key] = (self.stats[key] * (n - 1) + value) / n

    def preprocess(self, text: str) -> Dict[str, Any]:
        """
        Apply full preprocessing pipeline.

        Args:
            text: Input text

        Returns:
            Dictionary with the original and cleaned text, the tokenizer's
            'input_ids' and 'attention_mask', the non-padding token count,
            and the character lengths before and after cleaning
        """
        # Original text stats
        original_length = len(text)

        # Clean text
        cleaned_text = self.cleaner.clean(text)
        cleaned_length = len(cleaned_text)

        # Tokenize to a fixed-length sequence (padded / truncated to max_length)
        encoded = self.tokenizer(
            cleaned_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Count actual tokens (excluding padding)
        token_count = (encoded['attention_mask'][0] == 1).sum().item()

        # Update statistics
        self._update_stats(original_length, cleaned_length, token_count)

        return {
            'original_text': text,
            'cleaned_text': cleaned_text,
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'token_count': token_count,
            'original_length': original_length,
            'cleaned_length': cleaned_length
        }

    def preprocess_batch(self, texts: List[str]) -> Dict[str, Any]:
        """
        Preprocess batch of texts.

        Each text goes through ``preprocess`` individually, so the running
        statistics are updated once per text.

        Args:
            texts: List of input texts

        Returns:
            Dictionary with the concatenated 'input_ids' and 'attention_mask'
            tensors plus the per-text 'cleaned_texts' and 'token_counts'.
            An empty input yields tensors with zero rows and empty lists.
        """
        # torch is only needed here, so it is imported inside the method
        import torch

        # torch.cat rejects an empty list; return an empty batch instead
        if len(texts) == 0:
            return {
                'input_ids': torch.empty((0, self.max_length), dtype=torch.long),
                'attention_mask': torch.empty((0, self.max_length), dtype=torch.long),
                'cleaned_texts': [],
                'token_counts': []
            }

        results = [self.preprocess(text) for text in texts]

        # Stack tensors along the first dimension
        batch_result = {
            'input_ids': torch.cat([r['input_ids'] for r in results]),
            'attention_mask': torch.cat([r['attention_mask'] for r in results]),
            'cleaned_texts': [r['cleaned_text'] for r in results],
            'token_counts': [r['token_count'] for r in results]
        }

        return batch_result

    def get_statistics(self) -> Dict[str, float]:
        """Get a copy of the preprocessing statistics (safe for callers to modify)."""
        return self.stats.copy()

# Build a pipeline with the standard cleaner and run it on a few samples
pipeline = CustomPreprocessingPipeline(
    cleaning_strategy='standard',
    tokenizer_model='bert-base-uncased',
    max_length=128
)

print("Custom Pipeline Processing")
print("=" * 50)

# Process the first three sample texts
for i, text in enumerate(texts[:3]):
    result = pipeline.preprocess(text)

    # Previews are limited to the first 50 characters
    original_preview = text[:50]
    cleaned_preview = result['cleaned_text'][:50]

    print(f"\nText {i+1}:")
    print(f"  Original ({result['original_length']} chars): {original_preview}...")
    print(f"  Cleaned ({result['cleaned_length']} chars): {cleaned_preview}...")
    print(f"  Tokens: {result['token_count']}")
    print(f"  Input shape: {result['input_ids'].shape}")

# Running averages accumulated over the texts processed above
stats = pipeline.get_statistics()
print("\nPipeline Statistics:")
for key in stats:
    print(f"  {key}: {stats[key]:.1f}")

## 9. Preprocessing Impact Analysis

In [None]:
# Analyze impact of different preprocessing strategies
strategies = ['minimal', 'standard', 'aggressive']
analysis_results = []

print("Preprocessing Impact Analysis")
print("=" * 50)

for strategy in strategies:
    # Create a fresh pipeline so its statistics cover this strategy only
    pipeline = CustomPreprocessingPipeline(cleaning_strategy=strategy)

    # Process texts, timing each call with perf_counter (the high-resolution
    # clock intended for measuring short durations)
    processing_times = []
    for text in texts[:50]:  # Use first 50 texts
        start_time = time.perf_counter()
        pipeline.preprocess(text)
        processing_times.append(time.perf_counter() - start_time)

    # Get statistics
    stats = pipeline.get_statistics()

    # Guard the derived metrics so an empty (or all-empty) sample set reports
    # zeros instead of raising ZeroDivisionError / producing NaN
    avg_original_length = stats['avg_original_length']
    if avg_original_length:
        reduction_ratio = 1 - stats['avg_cleaned_length'] / avg_original_length
    else:
        reduction_ratio = 0.0
    if processing_times:
        avg_processing_time = float(np.mean(processing_times)) * 1000  # Convert to ms
    else:
        avg_processing_time = 0.0

    # Store results
    analysis_results.append({
        'strategy': strategy,
        'avg_original_length': avg_original_length,
        'avg_cleaned_length': stats['avg_cleaned_length'],
        'avg_token_count': stats['avg_token_count'],
        'reduction_ratio': reduction_ratio,
        'avg_processing_time': avg_processing_time
    })

# Display results
df_results = pd.DataFrame(analysis_results)
print("\nComparative Analysis:")
print(df_results.to_string(index=False))

# Recommendations
print("\nRecommendations:")
print("""
1. Minimal cleaning: Best for transformer models that handle raw text well
2. Standard cleaning: Balanced approach for most use cases
3. Aggressive cleaning: Useful for classical ML models or noisy data
4. Consider domain-specific requirements (e.g., keeping numbers for financial news)
5. Test different strategies on validation set to find optimal approach
""")

## 10. Save Preprocessing Configuration

In [None]:
# Assemble the chosen preprocessing configuration section by section
cleaning_settings = {
    'strategy': 'standard',
    'lowercase': True,
    'remove_urls': True,
    'remove_emails': True,
    'remove_html': True,
    'remove_special_chars': False,
    'remove_numbers': False,
    'remove_punctuation': False,
    'normalize_whitespace': True
}
tokenization_settings = {
    'model_name': 'bert-base-uncased',
    'max_length': 128,
    'padding': 'max_length',
    'truncation': True,
    'return_tensors': 'pt'
}
feature_settings = {
    'extract_length_features': True,
    'extract_pos_features': False,
    'extract_entity_features': True,
    'extract_readability_features': False
}

optimal_config = {
    'cleaning': cleaning_settings,
    'tokenization': tokenization_settings,
    'features': feature_settings,
    # One record per strategy from the impact analysis table
    'analysis': df_results.to_dict('records')
}

# Write the configuration as JSON under <project>/outputs/preprocessing
import json
output_dir = PROJECT_ROOT / "outputs" / "preprocessing"
config_path = output_dir / "optimal_config.json"
output_dir.mkdir(parents=True, exist_ok=True)

with open(config_path, 'w') as f:
    # default=str turns any value json cannot serialize into its string form
    json.dump(optimal_config, f, indent=2, default=str)

print(f"Preprocessing configuration saved to: {config_path}")

print("\nSummary:")
print("""
Key Preprocessing Insights:
1. Text cleaning reduces noise but may remove useful information
2. Transformer tokenizers handle subwords effectively
3. Feature extraction can provide additional signals for models
4. Pipeline efficiency is important for large-scale processing
5. Different tasks may require different preprocessing strategies

Next Steps:
- Apply preprocessing to full dataset
- Test impact on model performance
- Optimize pipeline for production use
- Consider domain-specific preprocessing needs
""")