In [None]:
# Khmer Stop-Word Removal System - Impact Analysis

import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load stopword lists
# Only the first TOP_N entries of each list are shown as a sample.
stopwords_dir = "../data/stopwords"
TOP_N = 20


def read_top_entries(filename, limit=TOP_N, first_field_only=False):
    """Return up to `limit` non-empty entries from the start of a stopword file.

    Parameters
    ----------
    filename : str
        File name inside `stopwords_dir`.
    limit : int
        Maximum number of entries to return.
    first_field_only : bool
        If True, keep only the text before the first tab on each line
        (presumably the file is "word<TAB>score" -- confirm against the
        script that writes it).

    Returns
    -------
    list[str]
        Entries with surrounding whitespace (including the newline) removed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    entries = []
    with open(os.path.join(stopwords_dir, filename), 'r', encoding='utf-8') as f:
        # Iterate lazily so only the lines actually needed are read.
        for line in f:
            if len(entries) >= limit:
                break
            entry = line.split('\t')[0] if first_field_only else line
            # Always strip: a line without a tab would otherwise keep its
            # trailing newline and corrupt the comma-joined output below.
            entry = entry.strip()
            if entry:  # blank lines are not candidates
                entries.append(entry)
    return entries


freq_candidates = read_top_entries("frequency_candidates.txt", first_field_only=True)
ling_candidates = read_top_entries("linguistic_candidates.txt")
final_stopwords = read_top_entries("final_stopword_list.txt")

print("=== TOP FREQUENCY CANDIDATES ===")
print(', '.join(freq_candidates))
print("\n=== TOP LINGUISTIC CANDIDATES ===")
print(', '.join(ling_candidates))
print("\n=== FINAL STOPWORD LIST (Sample) ===")
print(', '.join(final_stopwords))

In [None]:
# Visualization: Vocabulary Reduction
# Vocabulary sizes are hard-coded here (presumably copied from an earlier
# experiment run -- confirm the source before reusing these numbers).
vocab_before_removal = 636
vocab_after_removal = 625
# Derive the removed count so the pie chart can never drift from the bar chart.
removed_terms = vocab_before_removal - vocab_after_removal

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart
# Labels follow the "WITHOUT/WITH Stopword Removal" wording used by the
# similarity heatmaps: the larger vocabulary is the one that still contains
# stopwords, i.e. the one measured without removal.
categories = ['Without Stopword Removal', 'With Stopword Removal']
values = [vocab_before_removal, vocab_after_removal]
ax1.bar(categories, values, color=['skyblue', 'lightcoral'])
ax1.set_ylabel('Vocabulary Size')
ax1.set_title('Vocabulary Reduction Impact')
for i, v in enumerate(values):
    # Place the numeric value just above each bar.
    ax1.text(i, v + 5, str(v), ha='center')

# Pie chart: share of the original vocabulary that was removed vs. kept.
ax2.pie([removed_terms, vocab_after_removal],
        labels=['Removed\nStopwords', 'Retained\nTerms'],
        autopct='%1.1f%%', colors=['lightcoral', 'skyblue'])
ax2.set_title('Vocabulary Composition')

plt.tight_layout()
plt.show()

In [None]:
# Visualization: Document Similarity Heatmaps
# Pairwise similarity between the three document sets, hard-coded for both
# conditions (row/column order: Articles, Corpus, News).
similarity_without = np.array([[1.0, 0.941, 0.347],
                               [0.941, 1.0, 0.373],
                               [0.347, 0.373, 1.0]])

similarity_with = np.array([[1.0, 0.939, 0.335],
                            [0.939, 1.0, 0.36],
                            [0.335, 0.36, 1.0]])

doc_labels = ['Articles', 'Corpus', 'News']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

heatmap_panels = [
    (ax1, similarity_without, 'Blues', 'Similarity WITHOUT Stopword Removal'),
    (ax2, similarity_with, 'Greens', 'Similarity WITH Stopword Removal'),
]
for panel_ax, matrix, cmap, title in heatmap_panels:
    sns.heatmap(
        matrix,
        annot=True,
        # Three decimals: the default annotation format rounds 0.941 and 0.939
        # to the same "0.94", hiding the very difference being compared.
        fmt='.3f',
        cmap=cmap,
        # Fixed shared range so equal values get equal colour intensity in
        # both panels instead of each panel auto-scaling to its own min/max.
        vmin=0.0,
        vmax=1.0,
        xticklabels=doc_labels,
        yticklabels=doc_labels,
        ax=panel_ax,
    )
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Average over distinct document pairs only. The diagonal holds each
# document's similarity with itself (always 1.0); including it inflates both
# averages and dilutes the reported change.
pair_idx = np.triu_indices_from(similarity_without, k=1)  # strictly above the diagonal
mean_without = similarity_without[pair_idx].mean()
mean_with = similarity_with[pair_idx].mean()

print(f"Average similarity without: {mean_without:.3f}")
print(f"Average similarity with: {mean_with:.3f}")
print(f"Similarity change: {mean_with - mean_without:.3f}")