In [4]:
# FastText Feature Extraction with Enhanced Visualization and External Feature Storage

import os
import sys
import subprocess
import numpy as np
import pandas as pd
import pickle
from gensim.models.fasttext import load_facebook_vectors
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import gzip
import shutil
import json

# 1. Install required packages if missing
def install(package):
    """Install ``package`` with pip, using the interpreter running this script.

    Raises:
        subprocess.CalledProcessError: if the pip process exits with a
            non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# pip distribution names; only scikit-learn is imported under a different
# module name ("sklearn").
required = ["gensim", "numpy", "pandas", "matplotlib", "scikit-learn"]
for pkg in required:
    module_name = "sklearn" if pkg == "scikit-learn" else pkg
    try:
        __import__(module_name)
    except ImportError:
        # Not importable in this environment: install it through pip.
        install(pkg)
# NOTE(review): the third-party imports at the top of this cell execute before
# this check, so a missing package would already have raised ImportError
# there — consider moving this bootstrap above those imports.

# 2. Paths and filenames
# FastText model: gzip archive, extracted binary, and the download URL.
MODEL_GZ_PATH = 'cc.km.300.bin.gz'
MODEL_PATH = 'cc.km.300.bin'
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz"

# Inputs: metadata table and the directory holding one "<docId>.txt" per document.
METADATA_PATH = '../../orginal_articles/metadata.csv'
TEXT_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles'

# Output directory for FastText features (outside code dir); created on first run.
FASTTEXT_FEATURE_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/FastText_Features'
os.makedirs(FASTTEXT_FEATURE_DIR, exist_ok=True)


def _feature_path(filename):
    """Return the path of ``filename`` inside FASTTEXT_FEATURE_DIR."""
    return os.path.join(FASTTEXT_FEATURE_DIR, filename)


# Serialized arrays / pickles.
EMBEDDINGS_PATH = _feature_path('embeddings.npy')
LABELS_PATH = _feature_path('labels.npy')
DOCIDS_PATH = _feature_path('doc_ids.npy')
WORD_EMBEDDINGS_PATH = _feature_path('word_embeddings_per_doc.pkl')

# Plot images.
TSNE_PLOT_PATH = _feature_path('khmer_embeddings.png')
EMBEDDING_DIST_PLOT_PATH = _feature_path('embedding_value_distribution.png')
DOC_TSNE_PLOT_PATH = _feature_path('document_tsne_by_category.png')
EMBEDDING_NORMS_PLOT_PATH = _feature_path('embedding_norms.png')
WORD_EMBEDDINGS_TSNE_PATH = _feature_path('word_embeddings_tsne.png')

# JSON dump of the first document's word vectors.
WORD_EMBEDDINGS_JSON_PATH = _feature_path('word_embeddings.json')

# 3. Load metadata
# Each metadata row is later used to locate a "<docId>.txt" file and to label
# it with its category. Any read/parse failure ends the script with status 1.
try:
    metadata = pd.read_csv(METADATA_PATH)
except Exception as e:
    print(f"Error loading metadata: {e}")
    sys.exit(1)
required_cols = {'docId', 'category', 'wordCount'}
# Report only the columns that are actually absent, so the message points at
# the real problem instead of listing every required column.
missing_cols = required_cols.difference(metadata.columns)
if missing_cols:
    print(f"Metadata missing required columns: {sorted(missing_cols)}")
    sys.exit(1)

# 4. Use local FastText model files if present
# If the extracted binary is missing, decompress it from the local .gz archive.
if not os.path.exists(MODEL_PATH):
    if not os.path.exists(MODEL_GZ_PATH):
        raise FileNotFoundError(f"Required FastText model files not found: {MODEL_PATH} or {MODEL_GZ_PATH}")
    print(f"Extracting {MODEL_GZ_PATH} ...")
    # Extract to a temporary name and move it into place only once complete.
    # Otherwise an interrupted extraction would leave a truncated MODEL_PATH
    # that the existence check above would accept on the next run.
    partial_path = MODEL_PATH + '.part'
    try:
        with gzip.open(MODEL_GZ_PATH, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, MODEL_PATH)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # on failure, remove the incomplete leftover.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Extracted to {MODEL_PATH}")

# 5. Load pretrained FastText model
# A loader failure is reported and ends the script with exit status 1.
try:
    model = load_facebook_vectors(MODEL_PATH)
except Exception as load_error:
    print(f"Error loading FastText model: {load_error}")
    sys.exit(1)

# 6. Function to compute document embedding
def compute_doc_embedding(tokens, model, verbose=False):
    """Average the word vectors of ``tokens`` into one document vector.

    Args:
        tokens: sequence of words to look up.
        model: object exposing ``get_vector(word)`` (raising ``KeyError`` for
            words it cannot embed) and ``vector_size``.
        verbose: when true, print each found word with the first five vector
            components, followed by a coverage summary.

    Returns:
        The element-wise mean of all per-token vectors. A token that raises
        ``KeyError`` contributes a zero vector to the mean. With no tokens at
        all, a zero vector of length ``model.vector_size`` is returned (and no
        summary is printed).
    """
    collected = []
    unknown = []
    for token in tokens:
        try:
            vec = model.get_vector(token)
        except KeyError:
            # Missing words still count toward the average, as zeros.
            collected.append(np.zeros(model.vector_size))
            unknown.append(token)
        else:
            collected.append(vec)
            if verbose:
                print(f"Word: {token} -> Vector: {vec[:5]}...")  # Show first 5 dimensions

    if len(collected) == 0:
        return np.zeros(model.vector_size)

    if verbose:
        found_count = len(tokens) - len(unknown)
        print(f"\nSummary:")
        print(f"Total tokens: {len(tokens)}")
        print(f"Tokens with embeddings: {found_count}")
        print(f"Tokens missing embeddings: {len(unknown)}")
        if unknown:
            print(f"Missing words (first 10): {unknown[:10]}")
    return np.mean(collected, axis=0)

# 7. Visualize word embeddings using t-SNE
def visualize_word_embeddings(tokens, model, output_path):
    """Project the word vectors of ``tokens`` to 2-D with t-SNE and save a plot.

    Args:
        tokens: sequence of words; words for which ``model.get_vector`` raises
            ``KeyError`` are left out of the plot.
        model: object exposing ``get_vector(word)``.
        output_path: file path the figure is written to.

    Returns:
        None. Prints a message and returns without plotting when fewer than
        two word vectors are available.
    """
    word_vectors = []
    valid_tokens = []
    for w in tokens:
        try:
            word_vectors.append(model.get_vector(w))
        except KeyError:
            continue
        valid_tokens.append(w)
    if not word_vectors:
        print("No valid word embeddings found for visualization.")
        return
    if len(word_vectors) < 2:
        # A single point cannot be embedded relative to anything else.
        print("Need at least two word embeddings for t-SNE visualization.")
        return
    word_vectors = np.array(word_vectors)
    # scikit-learn requires perplexity < n_samples, so short documents would
    # fail with the fixed value of 5; clamp it to the number of points.
    perplexity = min(5, len(valid_tokens) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    word_tsne = tsne.fit_transform(word_vectors)
    plt.figure(figsize=(12, 8))
    plt.scatter(word_tsne[:, 0], word_tsne[:, 1], alpha=0.7)
    # Label every point with its word (repeated words are labelled each time).
    # NOTE(review): the recorded cell output shows repeated warnings raised at
    # plt.tight_layout() — possibly missing font glyphs for these labels; confirm.
    for i, word in enumerate(valid_tokens):
        plt.annotate(word, (word_tsne[i, 0], word_tsne[i, 1]), fontsize=9)
    plt.title("t-SNE Visualization of Word Embeddings")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Word embeddings visualization saved to {output_path}")

# 8. Analyze embedding norms
def plot_embedding_norms(tokens, model, output_path):
    """Save a histogram of the vector norms of the words in ``tokens``.

    Args:
        tokens: sequence of words; words for which ``model.get_vector`` raises
            ``KeyError`` are skipped.
        model: object exposing ``get_vector(word)``.
        output_path: file path the figure is written to.

    Returns:
        None. Prints a message and returns without plotting when no word has
        a vector.
    """
    lengths = []
    for token in tokens:
        try:
            vec = model.get_vector(token)
        except KeyError:
            continue
        lengths.append(np.linalg.norm(vec))

    if len(lengths) == 0:
        print("No valid word embeddings found for norm analysis.")
        return

    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=30, alpha=0.7, color='blue', edgecolor='black')
    plt.xlabel("Norm")
    plt.ylabel("Frequency")
    plt.title("Distribution of Word Embedding Norms")
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Embedding norms plot saved to {output_path}")

# 9. Save word embeddings to JSON
def save_word_embeddings(tokens, model, output_path):
    """Write a ``{word: vector}`` mapping for ``tokens`` to a JSON file.

    Args:
        tokens: sequence of words; a repeated word is stored once.
        model: object exposing ``get_vector(word)``.
        output_path: destination JSON file path.

    Each vector is stored as a list of numbers; a word for which
    ``model.get_vector`` raises ``KeyError`` is stored as ``null``.
    """
    word_embeddings = {}
    for w in tokens:
        try:
            word_embeddings[w] = model.get_vector(w).tolist()
        except KeyError:
            word_embeddings[w] = None
    # Write UTF-8 explicitly and keep non-ASCII keys literal, so the words stay
    # readable in the file instead of becoming \uXXXX escapes; the parsed JSON
    # content is unchanged.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(word_embeddings, f, indent=4, ensure_ascii=False)
    print(f"Word embeddings saved to {output_path}")

# For demonstration, process the first document
# The first metadata row selects the file; if it is missing, sample_words is
# left empty and the per-word reports are skipped.
first_doc_id = metadata.iloc[0]['docId']
first_text_path = os.path.join(TEXT_DIR, f"{first_doc_id}.txt")
if not os.path.exists(first_text_path):
    sample_words = []
    print(f"\nFirst document file not found: {first_text_path}")
else:
    with open(first_text_path, 'r', encoding='utf-8') as first_file:
        # Files are whitespace-separated tokens.
        sample_words = first_file.read().split()
    print(f"\nDetails for first document ({first_doc_id}):")
    # Called for its verbose per-word printout; the embedding itself is unused.
    _ = compute_doc_embedding(sample_words, model, verbose=True)
    visualize_word_embeddings(sample_words, model, WORD_EMBEDDINGS_TSNE_PATH)
    plot_embedding_norms(sample_words, model, EMBEDDING_NORMS_PLOT_PATH)
    save_word_embeddings(sample_words, model, WORD_EMBEDDINGS_JSON_PATH)

# 10. Read tokenized text files, compute embeddings, and collect word embeddings for all docs
doc_embeddings = []
labels = []
doc_ids = []
word_embeddings_per_doc = {}


def _vector_or_zeros(word):
    """Return the model vector for ``word``, or a zero vector on KeyError."""
    try:
        return model.get_vector(word)
    except KeyError:
        return np.zeros(model.vector_size)


for idx, row in metadata.iterrows():
    doc_id, category = row['docId'], row['category']
    text_path = os.path.join(TEXT_DIR, f"{doc_id}.txt")
    # Documents without a readable text file are reported and left out of
    # every output list.
    if not os.path.exists(text_path):
        print(f"Warning: File {text_path} not found, skipping.")
        continue
    try:
        with open(text_path, 'r', encoding='utf-8') as f:
            tokens = f.read().split()
    except Exception as e:
        print(f"Error reading {text_path}: {e}")
        continue
    doc_embeddings.append(compute_doc_embedding(tokens, model))
    labels.append(category)
    doc_ids.append(doc_id)
    # Collect word embeddings for this document (one entry per distinct word)
    word_embeddings_per_doc[doc_id] = {word: _vector_or_zeros(word) for word in tokens}

# Convert the three parallel lists to arrays for plotting and saving.
doc_embeddings = np.array(doc_embeddings)
labels = np.array(labels)
doc_ids = np.array(doc_ids)
print(f"\nLoaded {len(doc_embeddings)} documents with embeddings.")

# 11. Print all words from the first document
# Heading and token list go out on separate lines in a single call.
print("\nAll words from the first document:", sample_words, sep="\n")

# 12. t-SNE of all document embeddings, colored by category
# Category names are encoded as integer ids so they can drive the colormap.
le = LabelEncoder()
label_ids = le.fit_transform(labels)
n_docs = len(doc_embeddings)
if n_docs < 2:
    # t-SNE needs at least two samples; nothing meaningful to plot.
    print("Skipping document t-SNE plot: need at least two documents.")
else:
    # scikit-learn requires perplexity < n_samples; clamp for tiny corpora.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(5, n_docs - 1))
    doc_tsne = tsne.fit_transform(doc_embeddings)
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(doc_tsne[:, 0], doc_tsne[:, 1], c=label_ids, cmap='tab10', alpha=0.7)
    plt.title("t-SNE of Document Embeddings by Category")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    # num=None yields one handle per distinct label id. The default
    # num="auto" may pick a reduced set of values when there are many
    # categories, which would pair handles with the wrong class names.
    handles, _ = scatter.legend_elements(num=None)
    plt.legend(handles, le.classes_, title="Category")
    plt.tight_layout()
    plt.savefig(DOC_TSNE_PLOT_PATH)
    plt.close()
    print("Saved:", DOC_TSNE_PLOT_PATH)

# 13. Save document embeddings, labels, doc_ids, and word embeddings per doc
# The three arrays go to .npy files; the per-document word-vector dict is pickled.
for array_path, array in (
    (EMBEDDINGS_PATH, doc_embeddings),
    (LABELS_PATH, labels),
    (DOCIDS_PATH, doc_ids),
):
    np.save(array_path, array)
with open(WORD_EMBEDDINGS_PATH, 'wb') as pickle_file:
    pickle.dump(word_embeddings_per_doc, pickle_file)
print(
    f"\nDocument embeddings saved to {EMBEDDINGS_PATH}",
    f"Labels saved to {LABELS_PATH}",
    f"Document IDs saved to {DOCIDS_PATH}",
    f"Word embeddings per document saved to {WORD_EMBEDDINGS_PATH}",
    sep="\n",
)




Details for first document (health1):
Word: ប្រហែស -> Vector: [-0.03345599  0.05267933  0.04053002  0.08429828  0.00405834]...
Word: ធ្វើឲ្យ -> Vector: [-0.00600505 -0.0061434   0.02829137  0.00874665  0.03501422]...
Word: កម្ពុជា -> Vector: [ 0.00321709  0.01240032 -0.00574631 -0.01219825 -0.00409323]...
Word: គ្រោះថ្នាក់ -> Vector: [ 0.00820211  0.03495497 -0.00070203  0.01174884 -0.02036592]...
Word: ឆ្លង -> Vector: [ 0.03519922 -0.00709001  0.00515401  0.10655373 -0.08541018]...
Word: រាលដាល -> Vector: [-0.07181112 -0.02998904 -0.10582854  0.04541619 -0.06229345]...
Word: កូវីដ -> Vector: [-3.6070496e-04 -2.0839243e-04  4.0976651e-05 -8.5223204e-05
 -5.0289149e-05]...
Word: កូវីដ -> Vector: [-3.6070496e-04 -2.0839243e-04  4.0976651e-05 -8.5223204e-05
 -5.0289149e-05]...
Word: កំណើន -> Vector: [0.05983778 0.04880498 0.04236346 0.03764829 0.03964679]...
Word: ការរកឃើញ -> Vector: [ 0.01063065 -0.01037034  0.00764738  0.01192325 -0.00881259]...
Word: កូវីដ -> Vector: [-3.6070496e-04 -

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_l

Word embeddings visualization saved to /Users/socheata/Documents/FYP-Khmer-Classification/FastText_Features/word_embeddings_tsne.png
Embedding norms plot saved to /Users/socheata/Documents/FYP-Khmer-Classification/FastText_Features/embedding_norms.png
Word embeddings saved to /Users/socheata/Documents/FYP-Khmer-Classification/FastText_Features/word_embeddings.json

Loaded 15000 documents with embeddings.

All words from the first document:
['ប្រហែស', 'ធ្វើឲ្យ', 'កម្ពុជា', 'គ្រោះថ្នាក់', 'ឆ្លង', 'រាលដាល', 'កូវីដ', 'កូវីដ', 'កំណើន', 'ការរកឃើញ', 'កូវីដ', 'នៅ', 'ខែកក្កដា', 'កន្លងមក', 'បាន', 'ធ្វើឲ្យ', 'ការព្រួយបារម្ភ', 'ខ្លាំង', 'លទ្ធភាព', 'អាចនឹង', 'ការរាលដាល', 'ជា', 'សហគមន៍', 'នៅ', 'ប្រទេស', 'កម្ពុជា', 'មកដល់', 'ពេលនេះ', 'មិន', 'កើតមាន', 'ក្រសូង', 'សុខាភិបាល', 'បាន', 'ព្រមាន', 'ការរាលដាល', 'ជា', 'សហគមន៍', 'មិន', 'ប្រាកដ', 'ជា', 'មិន', 'កើតមាន', 'នៅ', 'ប្រទេស', 'កម្ពុជា', 'ជំរុញ', 'ឲ្យ', 'ប្រជាជន', 'កម្ពុជា', 'បង្កើន', 'ការប្រុងប្រយ័ត្ន', 'ពិសេស', 'ឈប់', 'សម្រាក', 'រយៈពេល', 'ប្រាំ', 'ថ្ងៃ