# Khmer Text Preprocessing

This notebook preprocesses Khmer language text from article datasets by:
- Removing Khmer punctuations
- Removing special characters
- Removing spaces (normalizing to single spaces)
- Removing numbers (both Khmer and Arabic)

The cleaned text will be saved back to the original JSON files.


In [56]:
# Import necessary libraries
import json
import re
import os
import pandas as pd
import khmernltk
from tqdm.notebook import tqdm 
import json

In [57]:
import subprocess
import sys

# List of required libraries
required_libraries = ['json', 're', 'os', 'pandas', 'khmernltk', 'tqdm']

# Function to install a library
def install_library(library):
  try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", library])
  except Exception as e:
    print(f"Error installing {library}: {e}")

# Check and install missing libraries
for lib in required_libraries:
  try:
    __import__(lib)
  except ImportError:
    print(f"{lib} is not installed. Installing...")
    if lib == 'json' or lib == 're' or lib == 'os':
      print(f"{lib} is a built-in library and does not require installation.")
    elif lib == 'pandas':
      install_library('pandas')
    elif lib == 'khmernltk':
      install_library('khmer-nltk')
    elif lib == 'tqdm':
      install_library('tqdm')

In [58]:
# Define paths
ORIGINAL_TEXTS_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/texts'
PROCESSED_TEXTS_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles'
METADATA_PATH = '/Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/metadata.csv'

# Create output directory if it doesn't exist
os.makedirs(PROCESSED_TEXTS_DIR, exist_ok=True)

# Load metadata to get category information
metadata_df = pd.read_csv(METADATA_PATH)
# Create a dictionary for quick lookup: docId -> category
doc_categories = dict(zip(metadata_df['docId'], metadata_df['category']))


In [59]:
# Define functions for the three main preprocessing steps

def clean_khmer_text(text):
  """
  Clean Khmer text by removing unwanted characters and normalizing spaces.
  """
  # First, normalize all whitespace (spaces, tabs, newlines) to single spaces
  text = re.sub(r'\s+', ' ', text)
  
  # Define pattern for unwanted characters
  # Include all punctuation marks, symbols, numbers and English characters
  pattern = r"[។៘ៗ៕៚៙៖«»៛!@#$%^&*()_\-+=\[\]{};:'\"\\\|<>?/`~០-៩0-9a-zA-Z,.]"
  
  # Remove unwanted characters
  text = re.sub(pattern, "", text)
  
  # Normalize spaces again after character removal
  text = re.sub(r'\s+', ' ', text)
  
  return text.strip()

def remove_stopwords_from_unsegmented(text, stopwords_set):
  """
  Improved stopword removal that handles Khmer text better
  """
  if not stopwords_set:
    return text
  
  # First normalize the text to ensure consistent spacing
  text = ' ' + text + ' '  # Add spaces at beginning and end for boundary matching
  
  # Sort stopwords by length (longest first) to prevent partial matches
  sorted_stopwords = sorted(stopwords_set, key=len, reverse=True)
  
  # Replace each stopword with a space
  for word in sorted_stopwords:
    # Need to add space boundaries for Khmer since \b doesn't work well
    word_with_boundaries = ' ' + word + ' '
    text = text.replace(word_with_boundaries, ' ')
  
  # Clean up any resulting multiple spaces
  text = re.sub(r'\s+', ' ', text)
  
  return text.strip()

def segment_khmer_text(text):
  """
  Apply word segmentation to Khmer text using Khmer-NLTK
  """
  try:
    # Apply word segmentation using khmernltk
    segmented_text = khmernltk.word_tokenize(text)
    
    # Join with spaces to create a segmented string
    return ' '.join(segmented_text)
  except Exception as e:
    print(f"Error in segmentation: {e}")
    # If segmentation fails, return the original text
    return text

def load_khmer_stopwords(file_path):
  """
  Load Khmer stopwords from either a text file or an Excel file.
  
  Args:
      file_path (str): Path to the file containing Khmer stopwords
      
  Returns:
      set: A set of Khmer stopwords
  """
  try:
    # Check the file extension
    if file_path.endswith('.txt'):
      # For text files
      with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
      
      # Filter out comment lines and empty lines, and strip whitespace
      stopwords = []
      for line in lines:
        line = line.strip()
        if line and not line.startswith('//'):
          stopwords.append(line)
      
      return set(stopwords)
    
    elif file_path.endswith('.xlsx'):
      # For Excel files
      # Read stopwords from Excel file
      stopwords_df = pd.read_excel(file_path)
      
      # Extract stopwords from the DataFrame
      stopwords_list = stopwords_df.iloc[:, 0].tolist()
      
      # Clean the stopwords list (remove NaN values and convert to string)
      stopwords_list = [str(word).strip() for word in stopwords_list if isinstance(word, str)]
      
      return set(stopwords_list)
    
    else:
      raise ValueError(f"Unsupported file format for {file_path}. Use .txt or .xlsx.")
  
  except Exception as e:
    print(f"Error loading stopwords: {e}")
    # Return an empty set if loading fails
    return set()

def normalize_file_encoding(input_path, encoding='utf-8'):
  """
  Read file with proper encoding and normalize line endings
  """
  try:
    with open(input_path, 'r', encoding=encoding) as f:
      text = f.read()
    
    # Normalize line endings to \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    
    return text
  except UnicodeDecodeError:
    # Try with different encoding if utf-8 fails
    try:
      with open(input_path, 'r', encoding='utf-16') as f:
        text = f.read()
      return text.replace('\r\n', '\n').replace('\r', '\n')
    except:
      # If all else fails, read as binary and decode with errors ignored
      with open(input_path, 'rb') as f:
        text = f.read().decode('utf-8', errors='ignore')
      return text.replace('\r\n', '\n').replace('\r', '\n')

def preprocess_text(text, stopwords_set):
  """
  Apply all preprocessing steps to a text in the order:
  1. Segment words (to identify word boundaries first)
  2. Clean text (remove punctuation while preserving word structure)
  3. Remove stopwords
  """
  segmented = segment_khmer_text(text)
  cleaned = clean_khmer_text(segmented)
  filtered = remove_stopwords_from_unsegmented(cleaned, stopwords_set)
  return filtered


# Function to process a specific number of files
def process_khmer_text_files(max_files=None, output_dir=None):
  """
  Process Khmer text files with preprocessing steps.
  
  Args:
      max_files (int, optional): Maximum number of files to process. If None, process all files.
      output_dir (str, optional): Directory to save processed files. If None, use default.
  
  Returns:
      tuple: (processed_count, total_files, processing_time)
  """
  import time
  start_time = time.time()
  
  # Use specified output directory or default
  output_directory = output_dir if output_dir is not None else PROCESSED_TEXTS_DIR
  os.makedirs(output_directory, exist_ok=True)
  
  # Get list of all text files
  text_files = [f for f in os.listdir(ORIGINAL_TEXTS_DIR) if f.endswith('.txt')]
  
  # Limit the number of files if max_files is specified
  if max_files is not None and max_files > 0:
    text_files = text_files[:min(max_files, len(text_files))]
  
  print(f"Found {len(text_files)} text files to process")
  
  # Try to load stopwords from text file instead of Excel
  try:
    stopwords_path = "khmer_stopwords.txt"  # Use text file instead of Excel
    with open(stopwords_path, 'r', encoding='utf-8') as f:
      khmer_stopwords = set(f.read().splitlines())
    print(f"Loaded {len(khmer_stopwords)} Khmer stopwords")
  except Exception as e:
    print(f"Warning: Could not load stopwords: {e}")
    # Try loading from Excel as fallback
    try:
      stopwords_path = "khmer stopwords-corpus-385.xlsx"
      khmer_stopwords = load_khmer_stopwords(stopwords_path)
      print(f"Loaded {len(khmer_stopwords)} Khmer stopwords from Excel")
    except Exception:
      khmer_stopwords = set()
  
  # Process each file
  processed_count = 0
  
  # Update this part in your process_khmer_text_files function
  for filename in tqdm(text_files, desc="Processing Khmer text files"):
    try:
      # Get docId from filename
      doc_id = os.path.splitext(filename)[0]
      
      # Read the original file with encoding normalization
      input_path = os.path.join(ORIGINAL_TEXTS_DIR, filename)
      text = normalize_file_encoding(input_path)
      
      # Split into title and content
      parts = text.split('\n\n', 1)
      title = parts[0] if parts else ""
      content = parts[1] if len(parts) > 1 else ""
      
      # Process both title and content
      processed_title = preprocess_text(title, khmer_stopwords)
      processed_content = preprocess_text(content, khmer_stopwords)
      
      # Save the processed text
      output_path = os.path.join(output_directory, filename)
      with open(output_path, 'w', encoding='utf-8') as f:
        # Save the processed title and content
        f.write(f"{processed_title}\n\n{processed_content}")
      
      processed_count += 1
      
    except Exception as e:
      print(f"Error processing {filename}: {str(e)}")
  
  # Calculate processing time
  processing_time = time.time() - start_time
  
  print(f"Successfully processed {processed_count} out of {len(text_files)} files")
  print(f"Processing time: {processing_time:.2f} seconds")
  
  # Show a sample of a processed file
  if processed_count > 0:
    # Display sample comparison (same as before)
    sample_file = text_files[0]
    print(f"\nSample comparison for file: {sample_file}")
    
    # Read original
    with open(os.path.join(ORIGINAL_TEXTS_DIR, sample_file), 'r', encoding='utf-8') as f:
      original_text = f.read()
    
    # Read processed
    with open(os.path.join(output_directory, sample_file), 'r', encoding='utf-8') as f:
      processed_text = f.read()
    
    # Split to get title and content
    original_parts = original_text.split('\n\n', 1)
    processed_parts = processed_text.split('\n\n', 1)
    
    original_title = original_parts[0] if original_parts else ""
    original_content = original_parts[1] if len(original_parts) > 1 else ""
    
    processed_title = processed_parts[0] if processed_parts else ""
    processed_content = processed_parts[1] if len(processed_parts) > 1 else ""
    
    print("\nOriginal title:")
    print(original_title)
    
    print("\nProcessed title:")
    print(processed_title)
    
    print("\nOriginal content (first 100 chars):")
    print(original_content[:100] + "...")
    
    print("\nProcessed content (first 100 chars):")
    print(processed_content[:100] + "...")
    
    # Print statistics
    original_title_word_count = len(original_title.split())
    processed_title_word_count = len(processed_title.split())
    
    original_content_word_count = len(original_content.split())
    processed_content_word_count = len(processed_content.split())
    
    print(f"\nTitle word count: {original_title_word_count} → {processed_title_word_count}")
    print(f"Content word count: {original_content_word_count} → {processed_content_word_count}")
  
  return processed_count, len(text_files), processing_time

# Function to run tests on different batch sizes
def run_batch_tests():
  """
  Run preprocessing on different batch sizes (10, 100, 1000, 5000, 10000, 15000)
  and compare performance
  """
  batch_sizes = [10, 100, 1000, 5000, 10000, 15000]
  results = []
  
  for batch_size in batch_sizes:
    print(f"\n{'='*50}")
    print(f"Testing with batch size: {batch_size}")
    print(f"{'='*50}\n")
    
    # Create a new output directory for this batch
    batch_output_dir = f"{PROCESSED_TEXTS_DIR}_{batch_size}"
    
    # Process the batch
    processed, total, proc_time = process_khmer_text_files(max_files=batch_size, output_dir=batch_output_dir)
    
    # Store results
    results.append((batch_size, processed, proc_time))
  
  # Print summary of results
  print("\n\n")
  print(f"{'='*70}")
  print("Batch Processing Results Summary")
  print(f"{'='*70}")
  print(f"{'Batch Size':<15}{'Files Processed':<20}{'Processing Time':<20}{'Time per File':<15}")
  print(f"{'-'*70}")
  
  for batch_size, processed, proc_time in results:
    time_per_file = proc_time / processed if processed > 0 else 0
    print(f"{batch_size:<15}{processed:<20}{proc_time:.2f}s{'':<10}{time_per_file:.4f}s{'':<5}")

# Run a specific batch size test
def run_specific_test(size):
  """
  Process a specific number of files
  
  Args:
      size (int): Number of files to process
  """
  print(f"Processing {size} files...")
  process_khmer_text_files(max_files=size)


  
# Choose how to run the tests

# Option 1: Run all batch sizes (10, 100, 1000, 5000, 10000, 15000)
# WARNING: This may take a long time to complete
# run_batch_tests()

# Option 2: Run a specific batch size (uncomment and modify the line below)
# run_specific_test(100)  # Process 100 articles

# Option 3: Run the default processing (all files)
process_khmer_text_files()

Found 15000 text files to process
Loaded 380 Khmer stopwords


Processing Khmer text files:   0%|          | 0/15000 [00:00<?, ?it/s]

Successfully processed 15000 out of 15000 files
Processing time: 376.76 seconds

Sample comparison for file: health600.txt

Original title:
អភិបាលខេត្តរតនគិរីបញ្ជាឱ្យបិទទីតាំងផលិតស្រាសទូទាំងខេត្ត ក្រោយពុលស្លាប់មនុស្ស ៣នាក់

Processed title:
អភិបាល ខេត្ត រតនគិរី បញ្ជា បិទ ទីតាំង ផលិត ស្រាស ខេត្ត ពុល ស្លាប់ មនុស្ស នាក់

Original content (first 100 chars):
អភិបាលខេត្តរតនគិរីបញ្ជាឱ្យបិទទីតាំងផលិតស្រាស នៅទូទាំងខេត្តភ្លាមៗ ក្រោយមានពលរដ្ឋចំនួន ៣នាក់បានស្លាប់ដ...

Processed content (first 100 chars):
អភិបាល ខេត្ត រតនគិរី បញ្ជា បិទ ទីតាំង ផលិត ស្រាស នៅ ខេត្ត ភ្លាម ពលរដ្ឋ នាក់ បាន ស្លាប់ សង្ស័យ ពុល ស្...

Title word count: 3 → 13
Content word count: 60 → 172


(15000, 15000, 376.75803685188293)