In [19]:
# Khmer Text Preprocessing with Improved Stopword Removal
import pandas as pd
import khmernltk
from tqdm import tqdm
import subprocess
import sys
import os
import re
import unicodedata

# Define paths
# BASE_DIR is the absolute path of the parent of the current working
# directory (os.getcwd()); it is NOT derived from this file's own location,
# so the process must be started from a direct child folder of the project root.
# DATA_DIR is the "FYP-Data-Preprocessing" folder under BASE_DIR.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(BASE_DIR, "FYP-Data-Preprocessing")

# Input corpus, output folder, metadata and stopword list, all resolved
# against BASE_DIR / DATA_DIR. The two report files are written into the
# processed-texts output folder itself.
ORIGINAL_TEXTS_DIR = os.path.join(BASE_DIR, "original_articles/texts")
PROCESSED_TEXTS_DIR = os.path.join(BASE_DIR, "Preprocess_articles")
METADATA_PATH = os.path.join(BASE_DIR, "original_articles/metadata.csv")
STOPWORDS_PATH = os.path.join(DATA_DIR, "Khmer-Stop-Word-1000.txt")
STATS_OUTPUT_PATH = os.path.join(PROCESSED_TEXTS_DIR, "stopword_removal_stats.txt")
DEBUG_OUTPUT_PATH = os.path.join(PROCESSED_TEXTS_DIR, "stopword_debug.txt")

# Create output directory if it doesn't exist (runs as an import-time side effect)
os.makedirs(PROCESSED_TEXTS_DIR, exist_ok=True)

# Load metadata to get category information
if os.path.exists(METADATA_PATH):
    # Read docId as text. The lookup key used during processing is the stem
    # of each text file's name, which is always a str; if pandas inferred a
    # numeric dtype for docId the keys would never match (and any leading
    # zeros in the ids would be lost), leaving every category as 'unknown'.
    metadata_df = pd.read_csv(METADATA_PATH, dtype={'docId': str})
    # Create a dictionary for quick lookup: docId -> category
    doc_categories = dict(zip(metadata_df['docId'], metadata_df['category']))
else:
    print(f"Warning: Metadata file not found at {METADATA_PATH}. Categories will be set to 'unknown'.")
    metadata_df = None
    doc_categories = {}

# Define Khmer character sets
# Each constant is a set of single characters, used for membership tests and
# for building the removal table in clean_khmer_text.
# KHCONST: Khmer consonants followed by the independent vowels (despite the
# name, it is not consonants only).
KHCONST = set(u'កខគឃងចឆជឈញដឋឌឍណតថទធនបផពភមយរលវឝឞសហឡអឣឤឥឦឧឨឩឪឫឬឭឮឯឰឱឲឳ')
# KHVOWEL: dependent vowel signs; the literal starts with two characters that
# render invisibly, and ends with the escaped signs U+17C6..U+17C8.
KHVOWEL = set(u'឴឵ាិីឹឺុូួើឿៀេែៃោៅ\u17c6\u17c7\u17c8')
# KHSUB: the single subscript-forming sign.
KHSUB = set(u'្')
# KHDIAC: diacritic signs U+17C9..U+17D0. Defined here but not referenced by
# any function in this cell.
KHDIAC = set(u"\u17c9\u17ca\u17cb\u17cc\u17cd\u17ce\u17cf\u17d0")
# KHSYM: Khmer punctuation/symbols plus the guillemets « and ».
KHSYM = set('៕។៛ៗ៚៙៘៖«»')
# KHNUMBER / ARABIC_NUMBER: Khmer and ASCII decimal digits.
KHNUMBER = set(u'០១២៣៤៥៦៧៨៩')
ARABIC_NUMBER = set('0123456789')
# LATIN_CHARS: unaccented ASCII letters only.
LATIN_CHARS = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
# PUNCTUATION: ASCII punctuation (some characters repeated, which a set
# ignores) followed by a run of non-ASCII, non-Khmer marks.
# NOTE(review): those trailing marks look like Ethiopic punctuation,
# presumably copied from another language's list -- confirm they are wanted.
PUNCTUATION = set('"!@#$%^&*()-_+=[]{};\'"\\|,.<>?/`~፡.,፣;፤፥፦፧፪፠፨')

def normalize_khmer_text(text):
    """Normalize Khmer text for consistent processing.

    Applies Unicode NFC normalization, then removes every character whose
    Unicode general category starts with 'C' (control, format, ...), such as
    zero-width spaces. Control characters that are whitespace (newline, tab,
    carriage return, ...) are replaced by a single space instead of being
    deleted, so words on adjacent lines are not fused together.

    Args:
        text: Raw input string; may be empty or None.

    Returns:
        The normalized string, or "" when *text* is empty or None.
    """
    if not text:
        return ""

    # Unicode normalization
    normalized = unicodedata.normalize('NFC', text)

    kept = []
    for char in normalized:
        if not unicodedata.category(char).startswith('C'):
            kept.append(char)
        elif char.isspace():
            # '\n', '\t', '\r' etc. are category 'Cc'; deleting them outright
            # would join the last word of one line to the first of the next.
            kept.append(' ')
        # Any other 'C*' character (zero-width space, BOM, ...) is dropped.

    return ''.join(kept)

def clean_khmer_text(text):
    """Delete symbols, digits, Latin letters and punctuation from *text*.

    The text is first passed through normalize_khmer_text. Every character
    found in KHSYM, KHNUMBER, ARABIC_NUMBER, LATIN_CHARS or PUNCTUATION is
    then deleted (not replaced), and finally each run of whitespace is
    collapsed to one space with leading/trailing whitespace trimmed.

    Args:
        text: Raw input string; may be empty or None.

    Returns:
        The cleaned string, or "" when *text* is empty or None.
    """
    if not text:
        return ""

    normalized = normalize_khmer_text(text)

    # Mapping a code point to None in a translation table deletes it.
    unwanted = set().union(KHSYM, KHNUMBER, ARABIC_NUMBER, LATIN_CHARS, PUNCTUATION)
    deletion_table = str.maketrans(dict.fromkeys(unwanted))
    stripped = normalized.translate(deletion_table)

    collapsed = re.sub(r'\s+', ' ', stripped)
    return collapsed.strip()

def normalize_word(word):
    """Normalize a single word for consistent comparison.

    Applies Unicode NFC normalization, removes every character whose Unicode
    general category starts with 'C' (control/format characters such as a
    zero-width space or a byte-order mark), and trims surrounding whitespace.

    Args:
        word: The token to normalize; may be empty or None.

    Returns:
        The normalized token, or "" when *word* is empty or None.
    """
    if not word:
        return ""

    # Normalize unicode
    word = unicodedata.normalize('NFC', word)

    # Remove invisible characters and control characters
    word = ''.join(char for char in word
                   if not unicodedata.category(char).startswith('C'))

    # Trim last: an invisible character at either end is not whitespace, so
    # trimming before removing it would leave the adjacent spaces in place
    # and the result would not compare equal to the bare word.
    return word.strip()

def segment_khmer_text(text):
    """Split Khmer *text* into words and return them joined by single spaces.

    Tokens come from khmernltk.word_tokenize; each one is passed through
    normalize_word and tokens that normalize to an empty string are dropped.
    If anything in that pipeline raises, the error is printed and the input
    text is returned unchanged.
    """
    try:
        raw_tokens = khmernltk.word_tokenize(text)
        # filter(None, ...) discards tokens that normalized to "".
        usable_tokens = filter(None, map(normalize_word, raw_tokens))
        return ' '.join(usable_tokens)
    except Exception as e:
        print(f"Error in segmentation: {e}")
        # Best effort: hand back the unsegmented input
        return text

def load_khmer_stopwords(file_path):
    """Read a one-stopword-per-line UTF-8 file into a set of normalized words.

    Blank lines are skipped and every remaining line is passed through
    normalize_word. A summary of the load (line counts, duplicates, and up to
    five examples of entries sharing a normalized form) is printed.

    Args:
        file_path: Path of the stopword text file.

    Returns:
        The set of normalized stopwords. If anything goes wrong while
        loading, the error is printed and an empty set is returned.
    """
    try:
        line_total = 0
        blank_total = 0
        # normalized form -> every stripped spelling that produced it, in file order
        spellings_by_form = {}

        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                line_total += 1
                spelling = raw_line.strip()
                if not spelling:
                    blank_total += 1
                    continue

                form = normalize_word(spelling)
                if not form:
                    continue
                spellings_by_form.setdefault(form, []).append(spelling)

        # One key per distinct normalized form; the summed list lengths give
        # the number of entries before de-duplication.
        stopwords_set = set(spellings_by_form)
        kept_total = sum(len(spellings) for spellings in spellings_by_form.values())
        duplicate_total = kept_total - len(stopwords_set)

        print("\n--- Stopword Loading Debug Info ---")
        print(f"Total lines in file: {line_total}")
        print(f"Empty lines skipped: {blank_total}")
        print(f"Words after normalization: {kept_total}")
        print(f"Unique words after removing duplicates: {len(stopwords_set)}")
        print(f"Duplicates removed: {duplicate_total}")

        # Forms reached by more than one line of the file
        repeated_forms = {
            form: spellings
            for form, spellings in spellings_by_form.items()
            if len(spellings) > 1
        }
        if repeated_forms:
            print("\nWords that normalized to the same form:")
            for form, spellings in list(repeated_forms.items())[:5]:
                print(f"  '{form}' from: {spellings}")
            not_shown = len(repeated_forms) - 5
            if not_shown > 0:
                print(f"  ...and {not_shown} more.")

        print(f"\nSuccessfully loaded {len(stopwords_set)} stopwords")
        return stopwords_set
    except Exception as e:
        print(f"Error loading stopwords: {e}")
        # Fall back to "no stopwords" rather than aborting the caller
        return set()
def create_stopword_variations(stopword):
    """Return the set of Unicode-normalization variants of *stopword*.

    The result holds the word as given plus its NFD, NFKC and NFKD forms;
    identical forms collapse because the result is a set. Empty strings are
    left out, so an empty *stopword* yields an empty set.
    """
    candidates = {stopword}
    for form in ('NFD', 'NFKC', 'NFKD'):
        candidates.add(unicodedata.normalize(form, stopword))

    # Keep only non-empty variants
    return {candidate for candidate in candidates if candidate}

def load_khmer_stopwords_with_variations(file_path):
    """Load the stopword file and widen it with normalization variants.

    Every stopword returned by load_khmer_stopwords is expanded through
    create_stopword_variations; the union of all variants is returned and
    its size is printed.
    """
    base_stopwords = load_khmer_stopwords(file_path)

    # set().union() with no arguments is simply an empty set, so an empty
    # base list needs no special case.
    expanded_stopwords = set().union(
        *(create_stopword_variations(stopword) for stopword in base_stopwords)
    )

    print(f"Expanded to {len(expanded_stopwords)} stopword variations")
    return expanded_stopwords

def remove_stopwords(text, stopwords_set, debug_info=None):
    """Remove stopwords from segmented (space-separated) Khmer text.

    A token is removed only when its normalized form (see normalize_word) is
    an exact member of *stopwords_set*.

    Substring matching is deliberately NOT used. Testing
    ``stopword in token or token in stopword`` against every stopword drops
    any content word that merely contains a short stopword, and any word
    that happens to be a fragment of a longer stopword. It also counts those
    as "stopwords removed" and costs one full scan of the stopword list per
    token. Exact set membership is both correct and O(1) per token.

    Args:
        text: Segmented text whose tokens are separated by whitespace; may
            be empty or None.
        stopwords_set: Set of normalized stopwords.
        debug_info: Optional dict. When given, its 'missed_stopwords' entry
            is set to the list of kept tokens made up solely of characters
            in KHCONST, KHVOWEL or KHSUB (a rough "could this be a
            stopword?" heuristic for manual review).

    Returns:
        A tuple ``(filtered_text, removed_count)``: the kept tokens joined
        by single spaces, and the number of tokens removed as stopwords.
    """
    words = text.split() if text else []

    removed_count = 0
    filtered_words = []
    missed_stopwords = []

    for word in words:
        # Normalize the word for comparison
        normalized_word = normalize_word(word)

        if not normalized_word:
            # Token held only invisible characters: nothing to keep, and it
            # is not a stopword either, so it is not counted.
            continue

        if normalized_word in stopwords_set:
            removed_count += 1
            continue

        filtered_words.append(word)

        # Debug: record kept tokens that consist purely of Khmer letters
        if debug_info is not None:
            if all(char in KHCONST or char in KHVOWEL or char in KHSUB for char in normalized_word):
                missed_stopwords.append(word)

    if debug_info is not None:
        debug_info['missed_stopwords'] = missed_stopwords

    # Join the filtered words back into a string
    return ' '.join(filtered_words), removed_count

def preprocess_text(text, stopwords_set, debug_info=None):
    """Run the full pipeline on one text: clean, segment, drop stopwords.

    Returns the pair produced by remove_stopwords: the filtered text and the
    number of stopwords removed. *debug_info* is forwarded to
    remove_stopwords unchanged.
    """
    segmented_text = segment_khmer_text(clean_khmer_text(text))
    result_text, stopword_hits = remove_stopwords(segmented_text, stopwords_set, debug_info)
    return result_text, stopword_hits

# Main function to process all text files
def process_khmer_text_files():
    """Preprocess every ``.txt`` file in ORIGINAL_TEXTS_DIR.

    For each file the text before the first blank line (the title) is joined
    to the rest (the content) with a space, the result is run through
    preprocess_text, and the output is written under the same file name in
    PROCESSED_TEXTS_DIR. A file that fails at any step is reported on stdout
    and skipped. Afterwards the statistics and debug reports are written via
    save_statistics and save_debug_information.

    Only files whose output was actually written contribute to the counters,
    the per-file statistics and the debug details, so the totals in the
    report always describe the same set of files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order, the report and the chosen sample file reproducible.
    text_files = sorted(f for f in os.listdir(ORIGINAL_TEXTS_DIR) if f.endswith('.txt'))
    print(f"Found {len(text_files)} text files to process")

    # Try to load stopwords with variations
    try:
        khmer_stopwords = load_khmer_stopwords_with_variations(STOPWORDS_PATH)
        print(f"Loaded and expanded {len(khmer_stopwords)} stopword variations")
    except Exception as e:
        print(f"Warning: Could not load stopwords: {e}")
        khmer_stopwords = set()

    processed_count = 0
    processing_details = []
    debug_details = []
    total_stopwords_removed = 0
    sample_file = None  # first file whose output was written successfully

    for filename in tqdm(text_files, desc="Processing Khmer text files"):
        try:
            # The document id is the file name without its extension
            doc_id = os.path.splitext(filename)[0]

            input_path = os.path.join(ORIGINAL_TEXTS_DIR, filename)
            with open(input_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Title is everything before the first blank line; when there is
            # no blank line the whole text is the title and content is "".
            title, _, content = text.partition('\n\n')
            combined_text = f"{title} {content}"

            debug_info = {'missed_stopwords': []}
            processed_text, stopwords_removed = preprocess_text(combined_text, khmer_stopwords, debug_info)

            output_path = os.path.join(PROCESSED_TEXTS_DIR, filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(processed_text)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

        # Bookkeeping happens only after the output exists on disk, so a
        # failed file can never inflate the totals or appear in the report.
        processed_count += 1
        total_stopwords_removed += stopwords_removed
        if sample_file is None:
            sample_file = filename

        processing_details.append({
            'filename': filename,
            'doc_id': doc_id,
            'category': doc_categories.get(doc_id, 'unknown'),
            'stopwords_removed': stopwords_removed
        })

        if debug_info['missed_stopwords']:
            debug_details.append({
                'filename': filename,
                'missed_stopwords': debug_info['missed_stopwords'][:10]  # Limit to first 10
            })

    # Save statistics
    save_statistics(processed_count, total_stopwords_removed, processing_details, khmer_stopwords)

    # Save debug information; the sample is a file known to have an output
    save_debug_information(debug_details, sample_file)

    print(f"Successfully processed {processed_count} out of {len(text_files)} files")

def save_statistics(processed_count, total_stopwords_removed, processing_details, khmer_stopwords):
    """Write the stopword-removal report to STATS_OUTPUT_PATH.

    The report has a summary header (files processed, stopwords loaded,
    total and average stopwords removed) followed by one entry per item of
    *processing_details*, each a dict with the keys 'filename', 'doc_id',
    'category' and 'stopwords_removed'. The average is reported as 0.00
    when no file was processed.
    """
    average_removed = total_stopwords_removed / processed_count if processed_count > 0 else 0
    entry_rule = "-" * 50 + "\n"

    with open(STATS_OUTPUT_PATH, 'w', encoding='utf-8') as report:
        report.writelines([
            "================================\n",
            "Khmer Text Preprocessing Statistics\n",
            "================================\n\n",
            f"Total files processed: {processed_count}\n",
            f"Total stopwords loaded (with variations): {len(khmer_stopwords)}\n",
            f"Total stopwords removed across all files: {total_stopwords_removed}\n",
            f"Average stopwords removed per file: {average_removed:.2f}\n",
            "\n" + "=" * 50 + "\n\n",
            "Per-file Details:\n",
            entry_rule,
        ])

        for entry in processing_details:
            report.writelines([
                f"File: {entry['filename']} (ID: {entry['doc_id']}, Category: {entry['category']})\n",
                f"  Stopwords removed: {entry['stopwords_removed']}\n",
                entry_rule,
            ])

    print(f"\nStopword removal statistics saved to: {STATS_OUTPUT_PATH}")

def save_debug_information(debug_details, sample_file):
    """Write the "potentially missed stopwords" report to DEBUG_OUTPUT_PATH.

    At most the first ten entries of *debug_details* are listed, each with
    at most ten words. When *sample_file* is truthy, a before/after sample
    (first 50 whitespace-separated words of the original and of the
    processed file) is appended; any error while producing the sample is
    written into the report instead of being raised.
    """
    with open(DEBUG_OUTPUT_PATH, 'w', encoding='utf-8') as report:
        report.write("=====================================\n")
        report.write("Debug: Potentially Missed Stopwords\n")
        report.write("=====================================\n\n")

        # Cap the listing at ten files to keep the report readable
        for entry in debug_details[:10]:
            report.write(f"File: {entry['filename']}\n")
            report.write("Potentially missed stopwords:\n")
            numbered = enumerate(entry['missed_stopwords'][:10], 1)
            for rank, token in numbered:
                report.write(f"  {rank}. '{token}'\n")
            report.write("-" * 30 + "\n")

        if sample_file:
            report.write("\n\nSAMPLE PROCESSING EXAMPLE\n")
            report.write("========================\n")

            try:
                source_path = os.path.join(ORIGINAL_TEXTS_DIR, sample_file)
                with open(source_path, 'r', encoding='utf-8') as source:
                    before_text = source.read()

                result_path = os.path.join(PROCESSED_TEXTS_DIR, sample_file)
                with open(result_path, 'r', encoding='utf-8') as result:
                    after_text = result.read()

                report.write(f"File: {sample_file}\n")

                report.write("\nOriginal words (first 50):\n")
                report.write(' '.join(before_text.split()[:50]) + "\n")

                report.write("\nProcessed words (first 50):\n")
                report.write(' '.join(after_text.split()[:50]) + "\n")

            except Exception as e:
                report.write(f"Error showing sample: {e}\n")

    print(f"Debug information saved to: {DEBUG_OUTPUT_PATH}")

# Entry point: run the whole preprocessing pipeline when this code is
# executed directly (not when it is imported as a module).
if __name__ == "__main__":
    process_khmer_text_files()

Found 15000 text files to process

--- Stopword Loading Debug Info ---
Total lines in file: 1040
Empty lines skipped: 0
Words after normalization: 1040
Unique words after removing duplicates: 984
Duplicates removed: 56

Words that normalized to the same form:
  'កាន់' from: ['កាន់', 'កាន់']
  'ច្បាស់ណាស់' from: ['ច្បាស់ណាស់', 'ច្បាស់ណាស់']
  'ទទួលយក' from: ['ទទួលយក', 'ទទួលយក']
  'ទាំងនេះ' from: ['ទាំងនេះ', 'ទាំងនេះ']
  'ទាំងពីរ' from: ['ទាំងពីរ', 'ទាំងពីរ']
  ...and 50 more.

Successfully loaded 984 stopwords
Expanded to 984 stopword variations
Loaded and expanded 984 stopword variations


Processing Khmer text files: 100%|██████████| 15000/15000 [08:49<00:00, 28.31it/s]


Stopword removal statistics saved to: /Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles/stopword_removal_stats.txt
Debug information saved to: /Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles/stopword_debug.txt
Successfully processed 15000 out of 15000 files



