In [None]:
# Notebook 02: Stopword Identification Process
# Objective: Demonstrate frequency analysis and linguistic rule application for stopword detection.

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

# Make the project root importable before pulling in the local packages.
sys.path.append('..')

from src.stopword_detection.frequency_analyzer import FrequencyAnalyzer
from src.stopword_detection.linguistic_rules import LinguisticRules

In [None]:
# Set up the inputs for the pipeline: the directory of segmented text that
# later cells pass to the analyzers, and the two detector objects.
segmented_dir = "../data/segmented"

# The frequency analyzer is handed the project config file; the linguistic
# rules object is constructed with no arguments.
freq_analyzer = FrequencyAnalyzer(
    config_path="../config/config.yaml",
)
ling_rules = LinguisticRules()



In [None]:
print("=== STOPWORD IDENTIFICATION PIPELINE ===")

# Load the detection settings. The encoding is pinned so the read does not
# depend on the platform's default locale encoding.
with open("../config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

detection_cfg = config['stopword_detection']
print(f"Frequency threshold: {detection_cfg['frequency_threshold']}")
print(f"Min document frequency: {detection_cfg['min_doc_frequency']}")

# Frequency Analysis
print("\nRunning frequency analysis...")
freq_candidates = freq_analyzer.analyze_corpus(segmented_dir)

print(f"Found {len(freq_candidates)} frequency-based candidates")
print("\n=== TOP 15 FREQUENCY CANDIDATES ===")

# Take the first 15 candidates in the order the analyzer returned them.
# NOTE(review): calling these the "top" 15 presumes analyze_corpus returns
# candidates ranked best-first -- confirm against FrequencyAnalyzer.
freq_df = pd.DataFrame(freq_candidates[:15])
if freq_df.empty:
    # A frame built from an empty list has no columns, so the column
    # selection below would raise KeyError.
    print("No frequency candidates found.")
else:
    # Print every row of the 15-row frame so the table matches its heading
    # (previously only the first 10 rows were shown).
    print(freq_df[['word', 'doc_frequency_ratio', 'total_count']])

In [None]:
# Histogram of the document-frequency ratio of every frequency candidate,
# with the configured threshold marked by a dashed red vertical line.
doc_freq_ratios = [candidate['doc_frequency_ratio'] for candidate in freq_candidates]
threshold = config['stopword_detection']['frequency_threshold']

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(doc_freq_ratios, bins=20, edgecolor='black')
ax.set_title('Distribution of Document Frequency Ratios')
ax.set_xlabel('Document Frequency Ratio')
ax.set_ylabel('Number of Words')
ax.axvline(
    x=threshold,
    color='red',
    linestyle='--',
    label=f"Threshold ({threshold})",
)
ax.legend()
plt.show()

In [None]:
# Linguistic rules analysis: build the vocabulary from the segmented corpus,
# then ask the rule set which vocabulary entries it considers stopwords.
print("Building vocabulary for linguistic analysis...")
vocabulary = ling_rules.build_vocabulary(segmented_dir)
vocabulary_size = len(vocabulary)
print(f"Total vocabulary size: {vocabulary_size}")

print("\nApplying linguistic rules...")
ling_candidates = ling_rules.identify_linguistic_stopwords(vocabulary)
candidate_count = len(ling_candidates)
print(f"Found {candidate_count} linguistic candidates")

# Show the candidates in sorted order as one comma-separated line.
print("\n=== LINGUISTIC CANDIDATES ===")
candidate_listing = ', '.join(sorted(ling_candidates))
print(candidate_listing)

In [None]:
# Group the linguistic candidates by grammatical function.
# Reference word lists, one per grammatical category.
pronouns = ['ខ្ញុំ', 'អ្នក', 'គាត់', 'យើង', 'ពួកគេ', 'វា']
prepositions = ['នៅ', 'ក្នុង', 'លើ', 'ក្រោម', 'ខាង', 'ជិត', 'ឆ្ងាយ']
conjunctions = ['និង', 'ឬ', 'ប៉ុន្តែ', 'ដូច្នេះ', 'ព្រោះ', 'ទោះបី']
particles = ['ជា', 'ដែល', 'អំពី', 'ដោយ', 'សម្រាប់', 'ពី']

# Display label -> reference list, in the order the categories are reported.
category_lexicons = {
    'Pronouns': pronouns,
    'Prepositions': prepositions,
    'Conjunctions': conjunctions,
    'Particles': particles,
}

# For each category keep the candidates that appear in its reference list,
# preserving the iteration order of ling_candidates.
categories = {
    label: [w for w in ling_candidates if w in lexicon]
    for label, lexicon in category_lexicons.items()
}

# Report only the categories that matched at least one candidate.
for category, words in categories.items():
    if not words:
        continue
    print(f"\n{category}: {', '.join(words)}")

In [None]:
# Compare the two candidate sources: words flagged by both methods versus
# words flagged by only one of them.
freq_words = {c['word'] for c in freq_candidates}
ling_words = set(ling_candidates)

overlap = freq_words & ling_words
only_freq = freq_words - ling_words
only_ling = ling_words - freq_words

# Sets have no defined iteration order, so sort before printing. Without
# this, which 10 overlap words are sampled (and the order of every listed
# word) can change from one kernel run to the next.
overlap_preview = ', '.join(sorted(overlap)[:10])
only_ling_listing = ', '.join(sorted(only_ling))

print("=== CANDIDATE OVERLAP ANALYSIS ===")
print(f"Words in both lists: {len(overlap)} ({overlap_preview})")
print(f"Only frequency-based: {len(only_freq)}")
print(f"Only linguistic: {len(only_ling)} ({only_ling_listing})")

In [None]:
# Final stopword list statistics
# The final list is the union of both candidate sets, sorted so that its
# order is the same on every run (a bare list(set) has arbitrary order).
final_stopwords = sorted(freq_words.union(ling_words))
print("\n=== FINAL STOPWORD LIST ===")
print(f"Total stopwords: {len(final_stopwords)}")

# Coverage is the stopword count as a percentage of the vocabulary size.
# Guard the division: an empty vocabulary would raise ZeroDivisionError.
if len(vocabulary) > 0:
    print(f"Coverage: {len(final_stopwords)/len(vocabulary)*100:.1f}% of vocabulary")
else:
    print("Coverage: n/a (vocabulary is empty)")

In [None]:
# Save final list: one stopword per line, in sorted order, UTF-8 encoded.
output_path = Path("../data/stopwords/final_stopword_list.txt")
# Create the target directory if it is missing; opening the file for writing
# would otherwise fail with FileNotFoundError on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('w', encoding='utf-8') as f:
    f.writelines(f"{word}\n" for word in sorted(final_stopwords))
print("Final stopword list saved!")

# Build the summary from the values computed in this run rather than from
# hard-coded figures, so it stays correct when the corpus or config changes.
threshold = config['stopword_detection']['frequency_threshold']
if len(vocabulary) > 0:
    coverage_text = f"~{len(final_stopwords) / len(vocabulary) * 100:.0f}%"
else:
    # Avoid dividing by zero when no vocabulary was built.
    coverage_text = "n/a"

print("\n## Key Insights")
print(f"\n1. **Frequency Threshold**: {threshold} ({threshold:.0%} document frequency) captured {len(freq_candidates)} candidates")
print(f"2. **Linguistic Coverage**: {len(ling_candidates)} grammatical function words identified")
print("3. **Overlap**: Some words appear in both lists (strong stopword indicators)")
print("4. **Unique Contributions**: Frequency and linguistic methods complement each other")
print(f"5. **Final Coverage**: {len(final_stopwords)} stopwords covering {coverage_text} of vocabulary")