In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

# Define paths
PROCESSED_TEXTS_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles'
METADATA_PATH = '/Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/metadata.csv'
OUTPUT_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/TF_IDF_Features'
MODEL_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Models'
# Create output directory if needed
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define tokenizer at module level (not inside a function)
def tokenizer_split(text):
    """Simple tokenizer that splits on spaces (for preprocessed Khmer text)"""
    return text.split()

# Step 1: Load training documents
def load_processed_documents():
    """Load preprocessed Khmer documents with their categories"""
    print("Step 1: Loading processed documents...")
    start_time = time.time()

    # Load metadata
    metadata_df = pd.read_csv(METADATA_PATH)
    doc_categories = dict(zip(metadata_df['docId'], metadata_df['category']))

    # Initialize lists to store document data
    doc_ids = []
    doc_texts = []
    categories = []

    # Load all text files
    text_files = [f for f in os.listdir(PROCESSED_TEXTS_DIR) if f.endswith('.txt')]

    for filename in text_files:
        # Get docId from filename
        doc_id = os.path.splitext(filename)[0]

        # Skip if no category is available
        if doc_id not in doc_categories:
            continue

        # Read text content
        with open(os.path.join(PROCESSED_TEXTS_DIR, filename), 'r', encoding='utf-8') as f:
            text = f.read()

        # Replace double newlines with space (combine title and body)
        text = text.replace('\n\n', ' ')

        # Store document information
        doc_ids.append(doc_id)
        doc_texts.append(text)
        categories.append(doc_categories[doc_id])

    elapsed_time = time.time() - start_time
    print(f"Loaded {len(doc_ids)} documents in {elapsed_time:.2f} seconds")

    return doc_ids, doc_texts, categories

# Step 2: Create pandas DataFrame
def create_dataframe(doc_ids, doc_texts, categories):
    """Create a pandas DataFrame containing document data"""
    print("\nStep 2: Creating pandas DataFrame...")

    # Create DataFrame
    df = pd.DataFrame({
        'id': doc_ids,
        'text': doc_texts,
        'cat': categories
    })

    # Display category distribution
    print("Category distribution:")
    category_counts = df['cat'].value_counts()
    for cat, count in category_counts.items():
        print(f"  - {cat}: {count} documents ({count/len(df)*100:.1f}%)")

    return df

# Step 3 & 4: Create TF-IDF vectorizer, fit on data, and save
def build_and_save_tfidf(texts):
    """Build TF-IDF vectorizer, fit on all texts, and save the vectorizer"""
    print("\nStep 3 & 4: Creating and fitting TF-IDF vectorizer...")
    start_time = time.time()

    # Create TF-IDF vectorizer (using the module-level tokenizer)
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=tokenizer_split,  # Use simple space-based tokenization
        token_pattern=None,        # Disable default token pattern to use custom tokenizer
        encoding='utf-8',           # Ensure proper handling of UTF-8 (for Khmer)
        ngram_range=(1, 1),         # Use unigrams and bigrams
        min_df=2,                   # Minimum document frequency
        max_df=0.85,                # Maximum document frequency
        sublinear_tf=True           # Apply sublinear TF scaling
    )

    # Fit the vectorizer on all texts
    tfidf_vectorizer.fit(texts)

    # Get vocabulary size
    vocab_size = len(tfidf_vectorizer.vocabulary_)

    # Save the fitted vectorizer
    vectorizer_path = os.path.join(OUTPUT_DIR, 'tfidf_vectorizer.pkl')
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)

    elapsed_time = time.time() - start_time
    print(f"TF-IDF vectorizer created with {vocab_size} features in {elapsed_time:.2f} seconds")
    print(f"Vectorizer saved to: {vectorizer_path}")

    return tfidf_vectorizer

# Step 5: Transform texts to TF-IDF features
def transform_to_tfidf(tfidf_vectorizer, texts):
    """Transform texts to TF-IDF feature matrix"""
    print("\nStep 5: Transforming texts to TF-IDF features...")
    start_time = time.time()

    # Transform texts to TF-IDF features
    features = tfidf_vectorizer.transform(texts)

    elapsed_time = time.time() - start_time
    print(f"Created feature matrix with shape: {features.shape}")
    print(f"Transformation completed in {elapsed_time:.2f} seconds")

    return features

# Step 6: Split data into training and validation sets
def split_data(df, test_size=0.3):
    """Split data into training and validation sets"""
    print(f"\nStep 6: Splitting data into training ({100-test_size*100:.0f}%) and validation ({test_size*100:.0f}%) sets...")

    # Split the dataset
    train_x, valid_x, train_y, valid_y = train_test_split(
        df['text'],
        df['cat'],
        test_size=test_size,
        random_state=42,
        stratify=df['cat']  # Maintain class distribution in both sets
    )

    print(f"Training set: {len(train_x)} documents")
    print(f"Validation set: {len(valid_x)} documents")

    return train_x, valid_x, train_y, valid_y

# Step 7: Encode category labels
def encode_labels(train_y, valid_y):
    """Encode category labels to integers"""
    print("\nStep 7: Encoding category labels...")

    # Create and fit label encoder
    encoder = preprocessing.LabelEncoder()
    train_y_encoded = encoder.fit_transform(train_y)
    valid_y_encoded = encoder.transform(valid_y)

    # Save the encoder
    encoder_path = os.path.join(OUTPUT_DIR, 'label_encoder.pkl')
    with open(encoder_path, 'wb') as f:
        pickle.dump(encoder, f)

    # Display label mapping
    label_mapping = {i: label for i, label in enumerate(encoder.classes_)}
    print("Label mapping:")
    for index, label in label_mapping.items():
        print(f"  {index}: {label}")

    print(f"Label encoder saved to: {encoder_path}")

    return train_y_encoded, valid_y_encoded, encoder

# Step 8: Transform train and validation sets with TF-IDF
def transform_train_valid_sets(tfidf_vectorizer, train_x, valid_x):
    """Transform training and validation sets with TF-IDF"""
    print("\nStep 8: Transforming training and validation sets...")

    # Transform training and validation sets
    xtrain_tfidf = tfidf_vectorizer.transform(train_x)
    xvalid_tfidf = tfidf_vectorizer.transform(valid_x)

    print(f"Training features: {xtrain_tfidf.shape}")
    print(f"Validation features: {xvalid_tfidf.shape}")

    # Save the TF-IDF matrices
    np.savez(os.path.join(OUTPUT_DIR, 'train_features.npz'),
             data=xtrain_tfidf.data,
             indices=xtrain_tfidf.indices,
             indptr=xtrain_tfidf.indptr,
             shape=xtrain_tfidf.shape)

    np.savez(os.path.join(OUTPUT_DIR, 'valid_features.npz'),
             data=xvalid_tfidf.data,
             indices=xvalid_tfidf.indices,
             indptr=xvalid_tfidf.indptr,
             shape=xvalid_tfidf.shape)

    print("Features saved to output directory")

    return xtrain_tfidf, xvalid_tfidf

# Main function
def main():
    print("=" * 80)
    print("TF-IDF FEATURE EXTRACTION FOR KHMER TEXT CLASSIFICATION".center(80))
    print("=" * 80)

    # Steps 1: Load documents
    doc_ids, doc_texts, categories = load_processed_documents()

    # Step 2: Create DataFrame
    df = create_dataframe(doc_ids, doc_texts, categories)

    # Steps 3 & 4: Create, fit and save TF-IDF vectorizer
    tfidf_vectorizer = build_and_save_tfidf(df['text'])

    # Step 5: Transform to TF-IDF features
    features = transform_to_tfidf(tfidf_vectorizer, df['text'])

    # Step 6: Split data
    train_x, valid_x, train_y, valid_y = split_data(df)

    # Step 7: Encode labels
    train_y_encoded, valid_y_encoded, encoder = encode_labels(train_y, valid_y)

    # Step 8: Transform train and validation sets
    xtrain_tfidf, xvalid_tfidf = transform_train_valid_sets(tfidf_vectorizer, train_x, valid_x)

    # Save additional metadata for convenience
    metadata = {
        'num_documents': len(doc_ids),
        'num_features': features.shape[1],
        'num_classes': len(encoder.classes_),
        'class_distribution': df['cat'].value_counts().to_dict(),
        'train_size': len(train_x),
        'valid_size': len(valid_x),
        'classes': encoder.classes_.tolist()
    }

    with open(os.path.join(OUTPUT_DIR, 'tfidf_training_metadata.pkl'), 'wb') as f:
        pickle.dump(metadata, f)

    print("\nTF-IDF feature extraction and training data preparation complete!")
    print("=" * 80)
    print("\nReady for model training with the following files:")
    print(f"1. TF-IDF Vectorizer: {os.path.join(OUTPUT_DIR, 'tfidf_vectorizer.pkl')}")
    print(f"2. Label Encoder: {os.path.join(OUTPUT_DIR, 'label_encoder.pkl')}")
    print(f"3. Training Features: {os.path.join(OUTPUT_DIR, 'train_features.npz')}")
    print(f"4. Validation Features: {os.path.join(OUTPUT_DIR, 'valid_features.npz')}")
    print(f"5. Training Metadata: {os.path.join(OUTPUT_DIR, 'tfidf_training_metadata.pkl')}")
    print("=" * 80)

if __name__ == "__main__":
    main()

            TF-IDF FEATURE EXTRACTION FOR KHMER TEXT CLASSIFICATION             
Step 1: Loading processed documents...
Loaded 15000 documents in 2.24 seconds

Step 2: Creating pandas DataFrame...
Category distribution:
  - health: 2500 documents (16.7%)
  - environment: 2500 documents (16.7%)
  - technology: 2500 documents (16.7%)
  - economic: 2500 documents (16.7%)
  - sport: 2500 documents (16.7%)
  - politic: 2500 documents (16.7%)

Step 3 & 4: Creating and fitting TF-IDF vectorizer...
TF-IDF vectorizer created with 21708 features in 1.07 seconds
Vectorizer saved to: /Users/socheata/Documents/FYP-Khmer-Classification/TF_IDF_Features/tfidf_vectorizer.pkl

Step 5: Transforming texts to TF-IDF features...
Created feature matrix with shape: (15000, 21708)
Transformation completed in 1.02 seconds

Step 6: Splitting data into training (70%) and validation (30%) sets...
Training set: 10500 documents
Validation set: 4500 documents

Step 7: Encoding category labels...
Label mapping:
  0: e