In [1]:
# MODULE 1: Data Loading
# Load Data from Github
# Convert from JSON to Txt for easy structuring and Preprocessing
# ============================================================================

# Libraries Import
import os
import json
import pandas as pd
import re
import urllib.request
import zipfile
import tempfile
import shutil
import time
import khmernltk
from tqdm import tqdm
import subprocess
import sys
import unicodedata

# Using local paths relative to current project
TEXTS_DIR = 'raw_articles'
METADATA_PATH = 'metadata.csv'

# Create output directories if they don't exist
os.makedirs(TEXTS_DIR, exist_ok=True)

def download_and_extract_github_zip():
    """Download the articles.zip file from GitHub and extract it"""
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "articles.zip")
    
    # Simple download using urllib
    github_url = "https://raw.githubusercontent.com/RithDarapong/FinalYearProject/main/articles.zip"
    print(f"Downloading from: {github_url}")
    try:
        urllib.request.urlretrieve(github_url, zip_path)
    except Exception as e:
        print(f"Download failed: {e}")
        print("Trying alternative URL...")
        alt_url = "https://raw.githubusercontent.com/RithDarapong/FinalYearProject/main/articles.zip"
        urllib.request.urlretrieve(alt_url, zip_path)
    
    # Extract the zip file
    print("Extracting ZIP file...")
    with zipfile.ZipFile(zip_path) as zip_ref:
        zip_ref.extractall(temp_dir)
    
    # Find articles directory containing JSON files
    articles_dir = temp_dir
    for root, dirs, files in os.walk(temp_dir):
        if any(f.endswith('.json') for f in files):
            articles_dir = root
            break
    
    print(f"Extracted files to {articles_dir}")
    return articles_dir

def count_words(text):
    """Count the number of words in a text."""
    if not text:
        return 0
    words = re.findall(r'\b\w+\b', text)
    return len(words)

def process_articles():
    """Process all article JSON files and create metadata and text files."""
    start_time = time.time()
    
    # Check if articles directory exists locally, if not download
    if os.path.exists('articles'):
        articles_dir = 'articles'
        print(f"Using local articles directory: {articles_dir}")
    else:
        # Download and extract the articles from GitHub
        articles_dir = download_and_extract_github_zip()
    
    # Get all JSON files in the articles directory
    json_files = [f for f in os.listdir(articles_dir) if f.endswith('.json')]
    
    if not json_files:
        raise Exception(f"No JSON files found in {articles_dir}")
    
    print(f"Found {len(json_files)} JSON files: {json_files}")
    
    # List to store all article metadata
    metadata = []
    index = 1
    
    # Process each JSON file
    for json_file in json_files:
        category = os.path.splitext(json_file)[0]  # Get category from filename
        category_counter = 1  # Initialize counter for each category
        
        # Load the JSON file
        with open(os.path.join(articles_dir, json_file), 'r', encoding='utf-8') as f:
            articles = json.load(f)
        
        print(f"Processing {json_file} with {len(articles)} articles")
        
        # Process each article in the JSON file
        for article in articles:
            # Extract required fields
            title = article.get('title', '')
            content = article.get('content', '')
            url = article.get('url', '')
            
            # Create docId in the format {category}+{numberOrder}
            doc_id = f"{category}{category_counter}"
            category_counter += 1
            
            # Calculate counts
            char_count = len(content)
            word_count = count_words(content)
            
            # Create text file with title and content
            text_path = os.path.join(TEXTS_DIR, f"{doc_id}.txt")
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(f"{title}\n\n{content}")
            
            # Add to metadata
            metadata.append({
                'index': index,
                'docId': doc_id,
                'category': category,
                'charCount': char_count,
                'wordCount': word_count,
                'url': url
            })
            
            index += 1
    
    # Create metadata DataFrame and save to CSV
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(METADATA_PATH, index=False)
    
    # Clean up the temporary directory only if we downloaded
    if not os.path.exists('articles'):
        shutil.rmtree(articles_dir)
    
    elapsed_time = time.time() - start_time
    print(f"Processed {len(metadata)} articles in {elapsed_time:.2f} seconds")
    print(f"Metadata saved to {METADATA_PATH}")
    print(f"Text files saved to {TEXTS_DIR}")
    
    return metadata_df

# Execute the processing function
if __name__ == "__main__":
    try:
        metadata_df = process_articles()
        
        # Display the first few rows of the metadata
        print("\nMetadata sample:")
        print(metadata_df.head())
        
        # Display statistics per category
        category_stats = metadata_df.groupby('category').agg({
            'index': 'count',
            'charCount': ['mean', 'min', 'max'],
            'wordCount': ['mean', 'min', 'max']
        })
        print("\nCategory statistics:")
        print(category_stats)
        
    except Exception as e:
        print(f"Error: {e}")

Downloading from: https://raw.githubusercontent.com/RithDarapong/FinalYearProject/main/articles.zip
Extracting ZIP file...
Extracted files to /var/folders/9n/ttnnmpy11wb3xzbvs4djr1yr0000gn/T/tmpup5jb7ge/articles
Found 6 JSON files: ['health.json', 'sport.json', 'technology.json', 'economic.json', 'politic.json', 'environment.json']
Processing health.json with 2500 articles
Processing sport.json with 2500 articles
Processing technology.json with 2500 articles
Processing economic.json with 2500 articles
Processing politic.json with 2500 articles
Processing environment.json with 2500 articles
Processed 15000 articles in 22.17 seconds
Metadata saved to metadata.csv
Text files saved to raw_articles

Metadata sample:
   index    docId category  charCount  wordCount  \
0      1  health1   health       2320        846   
1      2  health2   health       2760       1021   
2      3  health3   health        739        267   
3      4  health4   health       1454        541   
4      5  health5  

In [2]:
# MODULE 2: Data Preprocessing
# Removing Khmer punctuations
# Removing special characters
# Removing spaces (normalizing to single spaces)
# Removing numbers (both Khmer and Arabic)
# Perform Word Segmentation
# Removing Stop words Removal
# ============================================================================
import pandas as pd
import khmernltk
from tqdm import tqdm
import subprocess
import sys
import os
import re
import unicodedata

# Define local paths
ORIGINAL_TEXTS_DIR = "raw_articles"
PROCESSED_TEXTS_DIR = 'preprocessed_articles'
METADATA_PATH = 'metadata.csv'
STOPWORDS_PATH = "Khmer-Stop-Word-1000.txt"
# STOPWORDS_PATH = "khmer_stopword.txt"
STATS_OUTPUT_PATH = "stopword_removal_stats.txt"
DEBUG_OUTPUT_PATH = "stopword_debug.txt"

# Create output directory if it doesn't exist
os.makedirs(PROCESSED_TEXTS_DIR, exist_ok=True)

# Load metadata to get category information
metadata_df = pd.read_csv(METADATA_PATH)
# Create a dictionary for quick lookup: docId -> category
doc_categories = dict(zip(metadata_df['docId'], metadata_df['category']))

# Define Khmer character sets
KHCONST = set(u'កខគឃងចឆជឈញដឋឌឍណតថទធនបផពភមយរលវឝឞសហឡអឣឤឥឦឧឨឩឪឫឬឭឮឯឰឱឲឳ')
KHVOWEL = set(u'឴឵ាិីឹឺុូួើឿៀេែៃោៅ\u17c6\u17c7\u17c8')
KHSUB = set(u'្')
KHDIAC = set(u"\u17c9\u17ca\u17cb\u17cc\u17cd\u17ce\u17cf\u17d0")
KHSYM = set('៕។៛ៗ៚៙៘៖«»')
KHNUMBER = set(u'០១២៣៤៥៦៧៨៩')
ARABIC_NUMBER = set('0123456789')
LATIN_CHARS = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
PUNCTUATION = set('!@#$%^&*()_+=[]{};\'"\\|,.<>?/`~፡.,፣;፤፥፦፧፪፠፨')

def normalize_khmer_text(text):
    """Normalize Khmer text for consistent processing"""
    if not text:
        return ""
    
    # Unicode normalization
    normalized = unicodedata.normalize('NFC', text)
    
    # Remove invisible characters and control characters
    normalized = ''.join(char for char in normalized 
                        if not unicodedata.category(char).startswith('C'))
    
    return normalized

def clean_khmer_text(text):
    if not text:
        return ""
    
    # First normalize the text
    text = normalize_khmer_text(text)
      
    # Characters to remove: symbols, numbers, Latin chars, punctuation
    chars_to_remove = KHSYM | KHNUMBER | ARABIC_NUMBER | LATIN_CHARS | PUNCTUATION
    
    # Create translation table (faster than regex for character removal)
    translation_table = str.maketrans('', '', ''.join(chars_to_remove))
    
    # Apply translation to remove unwanted characters
    text = text.translate(translation_table)
    
    # Normalize spaces (remove excessive spaces)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def normalize_word(word):
    """Normalize a single word for consistent comparison"""
    if not word:
        return ""
    
    # Normalize unicode
    word = unicodedata.normalize('NFC', word)
    
    # Remove any leading/trailing whitespace
    word = word.strip()
    
    # Remove any remaining invisible characters
    word = ''.join(char for char in word 
                   if not unicodedata.category(char).startswith('C'))
    
    return word

def segment_khmer_text(text):
    """
    Apply word segmentation to Khmer text using Khmer-NLTK
    """
    try:
        # Append title to content before segmentation
        # Apply word segmentation using khmernltk
        segmented_text = khmernltk.word_tokenize(text)
        
        # Normalize each token
        normalized_tokens = [normalize_word(token) for token in segmented_text]
        
        # Filter out empty tokens
        normalized_tokens = [token for token in normalized_tokens if token]
        
        # Join with spaces to create a segmented string
        return ' '.join(normalized_tokens)
    except Exception as e:
        print(f"Error in segmentation: {e}")
        # If segmentation fails, return the original text
        return text

def load_khmer_stopwords(file_path):
    """
    Load Khmer stopwords from a text file with normalization
    """
    try:
        # Initialize an empty list for stopwords
        stopwords_list = []
        
        # Open and read the text file with UTF-8 encoding for Khmer characters
        with open(file_path, 'r', encoding='utf-8') as file:
            # Read each line and add to stopwords list
            for line in file:
                word = line.strip()
                if word:  # Check if the word is not empty
                    # Normalize the stopword
                    normalized_word = normalize_word(word)
                    if normalized_word:
                        stopwords_list.append(normalized_word)
        
        # Create set for faster lookup
        stopwords_set = set(stopwords_list)
        
        print(f"Successfully loaded {len(stopwords_set)} stopwords")
        return stopwords_set
    except Exception as e:
        print(f"Error loading stopwords: {e}")
        # Return an empty set if loading fails
        return set()

def create_stopword_variations(stopword):
    """Create variations of a stopword for better matching"""
    variations = set()
    
    # Add the original
    variations.add(stopword)
    
    # Add different normalizations
    variations.add(unicodedata.normalize('NFD', stopword))
    variations.add(unicodedata.normalize('NFKC', stopword))
    variations.add(unicodedata.normalize('NFKD', stopword))
    
    # Remove variations that are empty or too short
    variations = {var for var in variations if var and len(var) > 0}
    
    return variations

def load_khmer_stopwords_with_variations(file_path):
    """Load stopwords and create variations for better matching"""
    base_stopwords = load_khmer_stopwords(file_path)
    
    # Create expanded set with variations
    expanded_stopwords = set()
    
    for stopword in base_stopwords:
        variations = create_stopword_variations(stopword)
        expanded_stopwords.update(variations)
    
    print(f"Expanded to {len(expanded_stopwords)} stopword variations")
    return expanded_stopwords

def remove_stopwords(text, stopwords_set, debug_info=None):
    """
    Remove stopwords from segmented Khmer text with improved matching
    """
    # Split the text into words
    words = text.split()
    
    # Filter out stopwords and count the removed ones
    removed_count = 0
    filtered_words = []
    missed_stopwords = []
    
    for word in words:
        # Normalize the word for comparison
        normalized_word = normalize_word(word)
        
        if normalized_word in stopwords_set:
            removed_count += 1
        else:
            # Check if word contains any stopwords as substrings
            found_as_substring = False
            for stopword in stopwords_set:
                if stopword in normalized_word or normalized_word in stopword:
                    removed_count += 1
                    found_as_substring = True
                    break
            
            if not found_as_substring:
                filtered_words.append(word)
                
                # Debug: Check if this might be a missed stopword
                if debug_info is not None:
                    # Check for potential stopwords based on character composition
                    if all(char in KHCONST or char in KHVOWEL or char in KHSUB for char in normalized_word):
                        missed_stopwords.append(word)
    
    if debug_info is not None:
        debug_info['missed_stopwords'] = missed_stopwords
    
    # Join the filtered words back into a string
    return ' '.join(filtered_words), removed_count

def preprocess_text(text, stopwords_set, debug_info=None):
    """
    Apply all preprocessing steps to a text
    """
    cleaned = clean_khmer_text(text)
    segmented = segment_khmer_text(cleaned)
    filtered, removed_count = remove_stopwords(segmented, stopwords_set, debug_info)
    return filtered, removed_count

# Main function to process all text files
def process_khmer_text_files():
    """
    Process all Khmer text files with improved stopword removal
    """
    # Get list of all text files
    text_files = [f for f in os.listdir(ORIGINAL_TEXTS_DIR) if f.endswith('.txt')]
    print(f"Found {len(text_files)} text files to process")
    
    # Try to load stopwords with variations
    try:
        # Use the correct stopwords path (text file)
        khmer_stopwords = load_khmer_stopwords_with_variations(STOPWORDS_PATH)
        print(f"Loaded and expanded {len(khmer_stopwords)} stopword variations")
    except Exception as e:
        print(f"Warning: Could not load stopwords: {e}")
        khmer_stopwords = set()
    
    # Process each file
    processed_count = 0
    processing_details = []
    debug_details = []
    total_stopwords_removed = 0
    
    for filename in tqdm(text_files, desc="Processing Khmer text files"):
        try:
            # Get docId from filename
            doc_id = os.path.splitext(filename)[0]
            
            # Read the original file
            input_path = os.path.join(ORIGINAL_TEXTS_DIR, filename)
            with open(input_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            # Combine title and content
            parts = text.split('\n\n', 1)
            title = parts[0] if parts else ""
            content = parts[1] if len(parts) > 1 else ""
            
            # Combine title and content as you mentioned
            combined_text = f"{title} {content}"
            
            # Process the combined text with debug info
            debug_info = {'missed_stopwords': []}
            processed_text, stopwords_removed = preprocess_text(combined_text, khmer_stopwords, debug_info)
            
            total_stopwords_removed += stopwords_removed
            
            # Store statistics
            processing_details.append({
                'filename': filename,
                'doc_id': doc_id,
                'category': doc_categories.get(doc_id, 'unknown'),
                'stopwords_removed': stopwords_removed
            })
            
            # Store debug information
            if debug_info['missed_stopwords']:
                debug_details.append({
                    'filename': filename,
                    'missed_stopwords': debug_info['missed_stopwords'][:10]  # Limit to first 10
                })
            
            # Save the processed text
            output_path = os.path.join(PROCESSED_TEXTS_DIR, filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(processed_text)
            
            processed_count += 1
            
        except Exception as e:
            print(f"Error processing {filename}: {e}")
    
    # Save statistics
    save_statistics(processed_count, total_stopwords_removed, processing_details, khmer_stopwords)
    
    # Save debug information
    save_debug_information(debug_details, text_files[0] if text_files else None)
    
    print(f"Successfully processed {processed_count} out of {len(text_files)} files")

def save_statistics(processed_count, total_stopwords_removed, processing_details, khmer_stopwords):
    """Save stopword removal statistics to a text file"""
    with open(STATS_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write("================================\n")
        f.write("Khmer Text Preprocessing Statistics\n")
        f.write("================================\n\n")
        f.write(f"Total files processed: {processed_count}\n")
        f.write(f"Total stopwords loaded (with variations): {len(khmer_stopwords)}\n")
        f.write(f"Total stopwords removed across all files: {total_stopwords_removed}\n")
        f.write(f"Average stopwords removed per file: {total_stopwords_removed/processed_count if processed_count > 0 else 0:.2f}\n")
        f.write("\n" + "="*50 + "\n\n")
        f.write("Per-file Details:\n")
        f.write("-" * 50 + "\n")
        
        for detail in processing_details:
            f.write(f"File: {detail['filename']} (ID: {detail['doc_id']}, Category: {detail['category']})\n")
            f.write(f"  Stopwords removed: {detail['stopwords_removed']}\n")
            f.write("-" * 50 + "\n")
    
    print(f"\nStopword removal statistics saved to: {STATS_OUTPUT_PATH}")

def save_debug_information(debug_details, sample_file):
    """Save debug information about potentially missed stopwords"""
    with open(DEBUG_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write("=====================================\n")
        f.write("Debug: Potentially Missed Stopwords\n")
        f.write("=====================================\n\n")
        
        for detail in debug_details[:10]:  # Limit to first 10 files for clarity
            f.write(f"File: {detail['filename']}\n")
            f.write("Potentially missed stopwords:\n")
            for i, word in enumerate(detail['missed_stopwords'][:10], 1):
                f.write(f"  {i}. '{word}'\n")
            f.write("-" * 30 + "\n")
        
        # Show sample processing
        if sample_file:
            f.write("\n\nSAMPLE PROCESSING EXAMPLE\n")
            f.write("========================\n")
            
            try:
                with open(os.path.join(ORIGINAL_TEXTS_DIR, sample_file), 'r', encoding='utf-8') as original:
                    original_text = original.read()
                
                with open(os.path.join(PROCESSED_TEXTS_DIR, sample_file), 'r', encoding='utf-8') as processed:
                    processed_text = processed.read()
                
                f.write(f"File: {sample_file}\n")
                f.write("\nOriginal words (first 50):\n")
                original_words = original_text.split()[:50]
                f.write(' '.join(original_words) + "\n")
                
                f.write("\nProcessed words (first 50):\n")
                processed_words = processed_text.split()[:50]
                f.write(' '.join(processed_words) + "\n")
                
            except Exception as e:
                f.write(f"Error showing sample: {e}\n")
    
    print(f"Debug information saved to: {DEBUG_OUTPUT_PATH}")

# Run the processing function
if __name__ == "__main__":
    process_khmer_text_files()

Found 15000 text files to process
Successfully loaded 988 stopwords
Expanded to 988 stopword variations
Loaded and expanded 988 stopword variations


Processing Khmer text files:   0%|          | 0/15000 [00:00<?, ?it/s]| 2025-06-14 19:26:07,396 | [1;32mINFO[0m | khmer-nltk | Loaded model from /Users/socheata/Library/Python/3.9/lib/python/site-packages/khmernltk/word_tokenize/sklearn_crf_ner_10000.sav |
Processing Khmer text files: 100%|██████████| 15000/15000 [08:34<00:00, 29.17it/s]


Stopword removal statistics saved to: stopword_removal_stats.txt
Debug information saved to: stopword_debug.txt
Successfully processed 15000 out of 15000 files



