In [1]:
def tokenizer_split(text):
    """Tokenize pre-segmented text by cutting it at whitespace.

    The input is expected to be preprocessed Khmer text whose words are
    already separated, so no language-aware segmentation happens here.
    Runs of whitespace count as a single separator, and leading/trailing
    whitespace produces no empty tokens.

    Returns:
        A list of token strings (empty when *text* has no tokens).
    """
    # NOTE(review): nothing in this file calls this function directly;
    # presumably the pickled vectorizer references it by name, so it has to
    # stay importable at module level -- confirm before renaming or moving.
    tokens = text.split()
    return tokens

import os
import pickle
import dill
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd

# Use correct macOS paths
# NOTE: these are absolute paths under one user's home directory; they must be
# edited before the script can run on any other machine.

# Directory holding the pickled vectorizer, label encoder and training
# metadata; the per-category features_<category>.txt files are written here.
OUTPUT_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/TF_IDF_Features'
# Directory of preprocessed article texts, one <docId>.txt file per document.
PROCESSED_TEXTS_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles'
# CSV with (at least) 'docId' and 'category' columns, used to label each text.
# NOTE(review): 'orginal_articles' presumably matches the on-disk directory
# name -- confirm before "correcting" the spelling.
METADATA_PATH = '/Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/metadata.csv'

def _load_labelled_documents(texts_dir, metadata_path):
    """Return (texts, labels) for each .txt file in *texts_dir* listed in the metadata CSV."""
    # Read docId as text: file stems are always strings, so an integer-parsed
    # column would never match and every document would be silently skipped.
    metadata_df = pd.read_csv(metadata_path, dtype={'docId': str})
    doc_categories = dict(zip(metadata_df['docId'], metadata_df['category']))

    texts = []
    labels = []
    # sorted() makes the processing order independent of the filesystem's
    # directory-listing order, so repeated runs give identical results.
    for filename in sorted(os.listdir(texts_dir)):
        if not filename.endswith('.txt'):
            continue
        doc_id = os.path.splitext(filename)[0]
        if doc_id not in doc_categories:
            # Text file without a metadata row: no label, so it cannot be used.
            continue
        with open(os.path.join(texts_dir, filename), 'r', encoding='utf-8') as f:
            text = f.read()
        # Blank lines separate paragraphs; join them into one token stream.
        texts.append(text.replace('\n\n', ' '))
        labels.append(doc_categories[doc_id])
    return texts, labels


def _mean_per_feature(matrix):
    """Return the column-wise mean of a sparse *matrix* as a flat 1-D ndarray."""
    # mean() yields np.matrix for sparse matrices and ndarray for sparse
    # arrays; asarray + ravel handles both (``.A1`` exists only on np.matrix).
    return np.asarray(matrix.mean(axis=0)).ravel()


def print_top_features_by_category(top_n=10):
    """Print the top TF-IDF features per category and save full rankings.

    Loads the fitted vectorizer and label encoder from OUTPUT_DIR,
    re-vectorizes every labelled text found in PROCESSED_TEXTS_DIR, and ranks
    features per category by their mean TF-IDF value over that category's
    documents.

    For each category known to the label encoder this:
      * writes every feature with a non-zero mean, best first, to
        ``OUTPUT_DIR/features_<category>.txt`` as ``feature<TAB>value`` lines;
      * prints a table of the ``top_n`` highest-ranked features.

    Args:
        top_n: Number of features to print per category (default 10). Only
            affects the printed table; saved files always hold all features.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
        OSError: If the vectorizer, label encoder, metadata CSV or text
            directory cannot be read, or an output file cannot be written.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    print(f"\nTOP {top_n} TF-IDF FEATURES BY CATEGORY")
    print("=" * 70)

    # Load the necessary files.
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # OUTPUT_DIR at artefacts produced by this project's own training run.
    with open(os.path.join(OUTPUT_DIR, 'tfidf_vectorizer.pkl'), 'rb') as f:
        vectorizer = pickle.load(f)
    feature_names = vectorizer.get_feature_names_out()
    with open(os.path.join(OUTPUT_DIR, 'label_encoder.pkl'), 'rb') as f:
        encoder = pickle.load(f)
    categories = list(encoder.classes_)

    # Training metadata is optional: it only feeds the informational class
    # distribution printout, so any failure here is deliberately non-fatal.
    try:
        with open(os.path.join(OUTPUT_DIR, 'tfidf_training_metadata.pkl'), 'rb') as f:
            metadata = pickle.load(f)
        if 'class_distribution' in metadata:
            print("Category distribution:")
            for cat, count in metadata['class_distribution'].items():
                print(f"  - {cat}: {count} documents")
    except Exception:
        print("Metadata not available.")

    # Load original documents and categories
    document_texts, document_categories = _load_labelled_documents(
        PROCESSED_TEXTS_DIR, METADATA_PATH
    )
    if not document_texts:
        # Fail with a clear message instead of an opaque error from
        # vectorizer.transform() on an empty corpus.
        print(f"No labelled documents found in {PROCESSED_TEXTS_DIR}")
        return

    X_all = vectorizer.transform(document_texts)

    for category in categories:
        print(f"\nTOP FEATURES FOR CATEGORY: {category}")
        print("=" * 70)
        category_indices = [i for i, cat in enumerate(document_categories) if cat == category]
        if not category_indices:
            print(f"No documents found for category: {category}")
            continue

        category_importance = _mean_per_feature(X_all[category_indices])
        # One descending sort serves both the saved ranking and the printed table.
        ranked_indices = category_importance.argsort()[::-1]

        # Save all features (not just the top ones) for this category;
        # features that never occur in the category (mean 0) are omitted.
        feature_lines = [
            f"{feature_names[idx]}\t{category_importance[idx]:.8f}"
            for idx in ranked_indices
            if category_importance[idx] > 0
        ]

        # Write to file
        out_path = os.path.join(OUTPUT_DIR, f"features_{category}.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(feature_lines))
        print(f"Saved all features for category '{category}' to {out_path}")

        # Print the top_n table
        print(f"{'Rank':<6}{'Feature':<40}{'TF-IDF Value':<14}")
        print("-" * 70)
        for rank, idx in enumerate(ranked_indices[:top_n], start=1):
            feature = feature_names[idx]
            importance = category_importance[idx]
            print(f"{rank:<6}{feature:<40}{importance:.8f}")

    print("\n" + "=" * 70)
    print("Feature analysis complete!")

# Run the analysis only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    print_top_features_by_category()


TOP 10 TF-IDF FEATURES BY CATEGORY
Category distribution:
  - health: 2500 documents
  - environment: 2500 documents
  - technology: 2500 documents
  - economic: 2500 documents
  - sport: 2500 documents
  - politic: 2500 documents

TOP FEATURES FOR CATEGORY: economic
Saved all features for category 'economic' to /Users/socheata/Documents/FYP-Khmer-Classification/TF_IDF_Features/features_economic.txt
Rank  Feature                                 TF-IDF Value  
----------------------------------------------------------------------
1     កម្ពុជា                                 0.05733325
2     សេដ្ឋកិច្ច                              0.04205128
3     វិស័យ                                   0.03978192
4     ឆ្នាំ                                   0.03863973
5     ដុល្លារ                                 0.03812835
6     ពាណិជ្ជកម្ម                             0.03724382
7     លាន                                     0.03623067
8     ប្រទេស                                  0.03621327
9     ឯក