In [1]:
import os

# Paths
# `input_folder` holds the per-category feature files (features_<category>.txt);
# filtered results are written to a "Selected_Features" directory inside it.
input_folder = "/Users/socheata/Documents/FYP-Khmer-Classification/TF_IDF_Features"
output_folder = os.path.join(input_folder, "Selected_Features")
# exist_ok=True: do not fail when the output directory already exists.
os.makedirs(output_folder, exist_ok=True)

# Category names; each one selects a features_<category>.txt input file and
# names the matching unique_features_<category>.txt output file.
categories = ["economic", "environment", "health", "politic", "sport", "technology"]

# Read the stopword list: one entry per line, surrounding whitespace removed,
# blank lines ignored. A set gives fast membership tests during filtering.
stopwords_path = "/Users/socheata/Documents/FYP-Khmer-Classification/FYP-Data-Preprocessing/khmer_stopwords.txt"
with open(stopwords_path, "r", encoding="utf-8") as f:
    stopwords = {entry for entry in map(str.strip, f) if entry}

# Load all features per category with their scores.
#
# Each input line is parsed as "word<TAB>score"; when the score column is
# missing the score defaults to 0. The result maps every category name to a
# list of (word, score, raw_line) tuples in file order.
features_per_cat = {}
for cat in categories:
    path = os.path.join(input_folder, f"features_{cat}.txt")
    features = []
    with open(path, "r", encoding="utf-8") as fin:
        for line_no, line in enumerate(fin, start=1):
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise be recorded as the feature ""
                # with score 0 and could end up in the common-feature set.
                continue
            parts = stripped.split('\t')
            word = parts[0]
            try:
                score = float(parts[1]) if len(parts) > 1 else 0
            except ValueError as err:
                # Same exception type as before, but say where the bad value is.
                raise ValueError(
                    f"{path}:{line_no}: invalid score {parts[1]!r}"
                ) from err
            features.append((word, score, line))
    features_per_cat[cat] = features

# Build one set of words per category, then intersect them: a word is
# "common" only if it occurs in the feature list of every category.
all_feature_words = [
    {entry[0] for entry in cat_features}
    for cat_features in features_per_cat.values()
]
common_features = set.intersection(*all_feature_words)

# Write the words shared by all categories to a text file, one word per
# line, in sorted order.
common_features_path = os.path.join(output_folder, "common_features_removed.txt")
with open(common_features_path, "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in sorted(common_features))

# For each category: drop words that are common to all categories or that are
# stopwords, rank the rest by score (highest first), write them to
# unique_features_<category>.txt as "word<TAB>score", and report the count.
remaining_features = {}
for cat in categories:
    filtered_features = []
    for word, score, _ in features_per_cat[cat]:
        if word in common_features or word in stopwords:
            continue
        filtered_features.append((word, score))

    # sorted() is stable, so entries with equal scores keep their input order.
    filtered_features = sorted(filtered_features, key=lambda pair: pair[1], reverse=True)
    remaining_features[cat] = [pair[0] for pair in filtered_features]

    output_path = os.path.join(output_folder, f"unique_features_{cat}.txt")
    with open(output_path, "w", encoding="utf-8") as fout:
        fout.writelines(f"{word}\t{score}\n" for word, score in filtered_features)

    print(f"{cat}: {len(remaining_features[cat])} unique features after removing {len(common_features)} common features and {len(stopwords)} stopwords")

# Show up to ten of the highest-ranked remaining words for every category,
# then report where the list of common features was written.
for cat in categories:
    top_words = remaining_features[cat][:10]
    print(f"\nTop 10 unique features for {cat}:")
    # end="" because every joined entry already carries its own newline;
    # an empty list therefore prints nothing at all.
    print("".join(f"- {word}\n" for word in top_words), end="")

print(f"\nAll common features ({len(common_features)}) saved to {common_features_path}")

economic: 7691 unique features after removing 4239 common features and 380 stopwords
environment: 8467 unique features after removing 4239 common features and 380 stopwords
health: 5715 unique features after removing 4239 common features and 380 stopwords
politic: 8253 unique features after removing 4239 common features and 380 stopwords
sport: 6079 unique features after removing 4239 common features and 380 stopwords
technology: 5782 unique features after removing 4239 common features and 380 stopwords

Top 10 unique features for economic:
- នាំចេញ
- រ៉ាឌី
- អត្ថប
- អង្ករ
- ពន្ធដារ
- ប្រជាកសិករ
- ប៊ីលាន
- បូរ៉ាវី
- រដ្ឋាករ
- កសិផល

Top 10 unique features for environment:
- សេចក្ដីថ្លៃថ្នូរ
- ហិកតារ
- ព្រៃឡង់
- ចំណាំ
- អ្នកបញ្ចូល
- ប្រជា
- វារី
- ការកាប់
- ទន្ទ្រាន
- អាដហុក

Top 10 unique features for health:
- ចូរ
- បុត្រធីតា
- វីតាមីន
- វ៉េស្ទឡាញន៍
- ណត្សឡាញន៍
- មត្តេយ្យ
- អាកប្បកិរិយា
- មាតាបិតា
- ការគេង
- អូមី

Top 10 unique features for politic:
- រង្ស៊ី
- ធិបតី
- គជ
- ទណ្ឌិត
- ទៀ