In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.decomposition import TruncatedSVD
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib
import warnings
warnings.filterwarnings('ignore')

# Fix Khmer font rendering issues
import matplotlib as mpl
import matplotlib.font_manager as fm

def setup_khmer_font():
    """Configure matplotlib so Khmer text can be rendered in plots.

    Uses the macOS "Khmer MN" font file when it exists; otherwise installs the
    ``font-noto`` package with pip and uses the path it exposes as
    ``font_noto.REGULAR``. The chosen file is registered with matplotlib's
    font manager and its family name is placed first in ``font.sans-serif``.

    Returns:
        bool: True when the font was configured, False when any step failed
        (a warning is printed and the default fonts stay in effect).
    """
    # Local imports: only needed for the pip fallback below.
    import subprocess
    import sys

    try:
        # Try Mac Khmer font
        khmer_font_path = '/Library/Fonts/Khmer MN.ttc'
        if not os.path.exists(khmer_font_path):
            print("Khmer MN font not found. Installing Noto Sans Khmer...")
            # Plain-Python replacement for the notebook-only ``!pip install``
            # shell escape; installs into the interpreter running this code.
            subprocess.check_call(
                [sys.executable, '-m', 'pip', 'install', 'font-noto']
            )
            import font_noto
            khmer_font_path = font_noto.REGULAR

        # Register the file itself: building a FontProperties object does not
        # add it to the font manager, so the family name set in rcParams could
        # not be resolved for a font that was only just installed.
        fm.fontManager.addfont(khmer_font_path)
        khmer_font_name = fm.FontProperties(fname=khmer_font_path).get_name()

        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.sans-serif'] = [khmer_font_name] + plt.rcParams['font.sans-serif']
        print(f"Khmer font configured successfully: {khmer_font_name}")
        return True
    except Exception as e:
        # Deliberately broad: font setup is best-effort and must not stop the
        # rest of the script (pip, import and font-parsing errors all land here).
        print(f"Warning: Could not configure Khmer font: {e}")
        print("Visualizations may not display Khmer characters correctly")
        return False

# Configure plot fonts once, before any figure is created
has_khmer_font = setup_khmer_font()

# Inputs are read from FASTTEXT_DIR; trained artefacts are written to MODELS_DIR
FASTTEXT_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/FastText_Features'
MODELS_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Models/fasttext_models'
os.makedirs(MODELS_DIR, exist_ok=True)

# Document-level arrays, read in the order embeddings -> labels -> doc ids.
# NOTE(review): the original notes describe these as (n_samples, 300),
# (n_samples,) and (n_samples,) — not checked here, confirm against the files.
print("Loading FastText features and labels...")
X, y, doc_ids = (
    np.load(os.path.join(FASTTEXT_DIR, npy_name))
    for npy_name in ('embeddings.npy', 'labels.npy', 'doc_ids.npy')
)
label_encoder = joblib.load(os.path.join(FASTTEXT_DIR, 'label_encoder.pkl'))

# Per-document word embeddings, used further down for the extra features.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# files produced by this project.
print("Loading word-level embeddings from pickle file...")
with open(os.path.join(FASTTEXT_DIR, 'word_embeddings_per_doc.pkl'), 'rb') as pkl_file:
    word_embeddings_per_doc = pickle.load(pkl_file)
print(f"Loaded word embeddings for {len(word_embeddings_per_doc)} documents")

# Integer-coded labels are mapped back to their string names; anything else
# is kept as loaded
y = label_encoder.inverse_transform(y) if np.issubdtype(y.dtype, np.integer) else y
categories = [*label_encoder.classes_]
print(f"Loaded {X.shape[0]} samples with {X.shape[1]}-dim FastText features.")
print(f"Categories: {categories}")

# Function to extract additional features from word embeddings
def extract_word_embedding_features(doc_ids, word_emb_dict):
    """Build a fixed-width statistical feature row for each document.

    For every id in ``doc_ids`` the word vectors stored under ``str(doc_id)``
    are stacked and summarised column-wise. Only the leading dimensions of
    each statistic are kept (plain truncation, not a learned reduction):

    * mean      - first 50 dimensions
    * variance  - first 20 dimensions
    * max       - first 15 dimensions
    * min       - first 15 dimensions
    * number of entries stored for the document (1 value)

    Args:
        doc_ids: Iterable of document identifiers; each is looked up by its
            ``str()`` form.
        word_emb_dict: Mapping ``doc id (str) -> {word: vector}``.

    Returns:
        numpy.ndarray of shape ``(len(doc_ids), 101)``. Documents that are
        missing from ``word_emb_dict`` or have no words get an all-zero row.
        Statistics shorter than their slot (embeddings with fewer dimensions
        than the slot size) are zero-padded so every row has the same width.
        An empty ``doc_ids`` yields an array of shape ``(0, 101)``.
    """
    # (statistic, number of leading dimensions kept)
    stat_specs = ((np.mean, 50), (np.var, 20), (np.max, 15), (np.min, 15))
    n_features = sum(size for _, size in stat_specs) + 1  # +1: word count

    rows = []
    for doc_id in doc_ids:
        doc_words = word_emb_dict.get(str(doc_id))
        if not doc_words:
            # Unknown document or no word vectors: zero placeholder row
            rows.append(np.zeros(n_features))
            continue

        word_vectors = np.array(list(doc_words.values()))
        # Each statistic is computed only on the columns that are kept
        parts = [
            _leading_dims(stat(word_vectors[:, :size], axis=0), size)
            for stat, size in stat_specs
        ]
        parts.append(np.array([len(doc_words)]))
        rows.append(np.concatenate(parts))

    if not rows:
        # Keep the result 2-D so callers can use .shape[1] and stack it
        return np.zeros((0, n_features))
    return np.array(rows)


def _leading_dims(stat_vector, size):
    """Return the first ``size`` entries of ``stat_vector``, zero-padded if shorter."""
    head = stat_vector[:size]
    missing = size - head.shape[0]
    if missing > 0:
        head = np.pad(head, (0, missing))
    return head

# Word-level statistics, one row per document id
print("Extracting enhanced features from word embeddings...")
word_level_features = extract_word_embedding_features(doc_ids, word_embeddings_per_doc)
print(f"Generated word-level statistical features: {word_level_features.shape[1]} dimensions")

# Append the word-level columns to the right of the document embeddings
X_enhanced = np.concatenate([X, word_level_features], axis=1)
print(f"Combined feature set dimensions: {X_enhanced.shape[1]}")

# Stratified 80/20 split; doc ids are split alongside so that test rows can be
# traced back to their documents later
split_parts = train_test_split(
    X_enhanced,
    y,
    doc_ids,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test, doc_ids_train, doc_ids_test = split_parts
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Standardise with statistics learned from the training rows only, then apply
# the same transform to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

# Report how many training samples each class has, most frequent first
class_counts = Counter(y_train)
print("\nClass distribution in training set:")
for label, n_samples in class_counts.most_common():
    print(f"{label}: {n_samples}")

# Oversample with SMOTE whenever the per-class counts are not all equal.
# NOTE(review): the resampled set is what the later grid search and model
# training receive, so their cross-validation folds contain synthetic
# samples — consider resampling inside each fold instead.
classes_are_uneven = len(set(class_counts.values())) > 1
if classes_are_uneven:
    print("\nApplying SMOTE to balance classes...")
    smote = SMOTE(random_state=42)
    try:
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
        print(f"After SMOTE: {X_train_balanced.shape[0]} samples")
    except Exception as e:
        # Best effort: on any failure keep the original (imbalanced) data
        print(f"SMOTE failed: {e}. Continuing with imbalanced data.")
    else:
        # Resampling worked: train on the balanced set from here on
        X_train_scaled, y_train = X_train_balanced, y_train_balanced

# For MNB, we need non-negative features. Both sets are shifted by the same
# scalar: the smallest value found anywhere in the (scaled) training matrix.
mnb_shift = X_train_scaled.min()
X_train_shifted = X_train_scaled - mnb_shift
# A test value below the training minimum would still be negative after the
# shift, so clip at zero to keep the test matrix non-negative as well.
X_test_shifted = np.clip(X_test_scaled - mnb_shift, 0, None)

# Rest of your existing functions...
# Function to generate learning curves
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=5, n_jobs=None, 
                        train_sizes=np.linspace(.1, 1.0, 5)):
    # NOTE(review): the body of this function is missing — only the placeholder
    # comment below remains. A def with no statements is a syntax error, which
    # is why the file stops with "IndentationError: expected an indented
    # block" at the next definition. Restore the original implementation here.
    # Nothing in the visible part of the file calls this function.
    # ...existing code...

# Function to train and evaluate a model with cross-validation
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    # NOTE(review): the body of this function is missing — only the placeholder
    # comment below remains, so the definition is not valid Python until the
    # original implementation is restored.
    # Contract visible from the callers further down: the return value is
    # indexed with the keys 'model_name', 'cv_mean', 'accuracy', 'precision',
    # 'recall', 'f1' and 'training_time', and the passed-in estimator objects
    # are saved and used for error analysis afterwards (presumably fitted in
    # place by this function — confirm when restoring the body).
    # ...existing code...

print("\n" + "="*80)
print("Training models with enhanced FastText features")
print("="*80)

# Hyperparameter tuning for SVM
print("\nPerforming grid search for SVM hyperparameters...")
# One sub-grid per kernel: the linear kernel ignores gamma, so crossing it
# with every gamma value would only refit identical models (9 of the 24
# candidates in a single combined grid).
param_grid_svm = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.01, 0.1],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
]

# 3-fold search scored by weighted F1; probability=True is required because
# the estimator is later used in a soft-voting ensemble.
grid_svm = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid_svm,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)
grid_svm.fit(X_train_scaled, y_train)
print(f"Best SVM parameters: {grid_svm.best_params_}")
svm_tuned = grid_svm.best_estimator_

# SVM with Enhanced Features
svm_results = train_and_evaluate_model(
    "SVM (Enhanced FastText)", svm_tuned, X_train_scaled, y_train, X_test_scaled, y_test
)

# Random forest on the standardised features: 100 trees, unlimited depth,
# sqrt(n_features) candidates per split, bootstrapped samples, all CPU cores
rf_settings = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_settings)
rf_results = train_and_evaluate_model(
    model_name="RandomForest (Enhanced FastText)",
    model=rf_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

# Multinomial naive Bayes is trained on the shifted copies of the features
# rather than on the standardised ones
mnb_model = MultinomialNB()
mnb_results = train_and_evaluate_model(
    model_name="MNB (Enhanced FastText)",
    model=mnb_model,
    X_train=X_train_shifted,
    y_train=y_train,
    X_test=X_test_shifted,
    y_test=y_test,
)

# Soft-voting ensemble of the tuned SVM and the random forest; the SVM's
# predicted probabilities count twice as much as the forest's
voting_members = [('svm', svm_tuned), ('rf', rf_model)]
ensemble_model = VotingClassifier(
    estimators=voting_members,
    voting='soft',
    weights=[2, 1],
)

ensemble_results = train_and_evaluate_model(
    model_name="Ensemble (SVM+RF)",
    model=ensemble_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

# Compare models: one row per model, restricted to the reported columns
results = [svm_results, rf_results, mnb_results, ensemble_results]
model_comparison = pd.DataFrame(results)
print("\nModel Comparison (Enhanced FastText Features):")
comparison_cols = ['model_name', 'cv_mean', 'accuracy', 'precision', 'recall', 'f1', 'training_time']
print(model_comparison[comparison_cols])

# Plot model comparison: one group of bars per metric, one bar per model
metrics = ['cv_mean', 'accuracy', 'precision', 'recall', 'f1']
metric_labels = ['CV Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score']
model_names = [result['model_name'] for result in results]

plt.figure(figsize=(15, 8))
x = np.arange(len(metrics))
width = 0.2  # Smaller width since we have more models
colors = ['royalblue', 'forestgreen', 'firebrick', 'darkorange']

# Offset that centres each group of bars on its tick (1.5 for four models)
bar_center = (len(results) - 1) / 2
for i, result in enumerate(results):
    values = [result[metric] for metric in metrics]
    plt.bar(x + (i - bar_center)*width, values, width, label=result['model_name'], color=colors[i])

# Keep the zoomed-in 0.7-1.0 view when every score fits in it; otherwise lower
# the floor so that bars of weaker models are not cut off or hidden entirely.
lowest_score = min(result[metric] for result in results for metric in metrics)
y_floor = 0.7 if lowest_score >= 0.7 else max(0.0, lowest_score - 0.05)

plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Model Performance Comparison: Enhanced FastText Features')
plt.xticks(x, metric_labels)
plt.legend(loc='lower left', bbox_to_anchor=(0, -0.15), ncol=2)
plt.ylim(y_floor, 1.0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Leave a strip at the bottom of the figure for the legend placed below the axes
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.savefig(os.path.join(MODELS_DIR, 'enhanced_fasttext_model_comparison.png'))
plt.show()

# Pick the model with the highest test accuracy (first one wins on ties)
all_accuracies = [r['accuracy'] for r in results]
best_model_idx = np.argmax(all_accuracies)
best_model_name = results[best_model_idx]['model_name']

# Estimator objects in the same order as `results`, so the winning index
# selects the matching estimator
candidate_models = (svm_tuned, rf_model, mnb_model, ensemble_model)
best_model = candidate_models[best_model_idx]

# Persist everything needed at prediction time: model, label encoder, scaler
production_artefacts = (
    (best_model, 'best_model.joblib'),
    (label_encoder, 'label_encoder.joblib'),
    (scaler, 'feature_scaler.joblib'),
)
for artefact, file_name in production_artefacts:
    joblib.dump(artefact, os.path.join(MODELS_DIR, file_name))

# Save the feature-extraction function as well.
# NOTE(review): pickle stores a function as a reference to its module and
# name, not its code, so this file can only be loaded where
# extract_word_embedding_features is importable under the same module name.
with open(os.path.join(MODELS_DIR, 'feature_extraction_func.pkl'), 'wb') as func_file:
    pickle.dump(extract_word_embedding_features, func_file)

print(f"Best model ({best_model_name}) saved for production use")
print("Also saved: label encoder, feature scaler, and feature extraction function")

# Error analysis
def analyze_errors(model, X_test, y_test, categories, doc_ids_test, top_n=10):
    # NOTE(review): the body of this function is missing — only the placeholder
    # comment below remains, so the definition is not valid Python until the
    # original implementation is restored.
    # The visible callers pass the best model, the matching test features,
    # y_test, categories and doc_ids_test, and ignore the return value.
    # ...existing code...

print("\n" + "="*80)
print("Error Analysis for Best Model")
print("="*80)

# MultinomialNB was trained on the shifted features; every other model was
# trained on the standardised ones, so analyse with the matching test matrix
X_test_for_best = X_test_shifted if isinstance(best_model, MultinomialNB) else X_test_scaled
analyze_errors(best_model, X_test_for_best, y_test, categories, doc_ids_test)

print("\n" + "="*80)
print("Training and Evaluation Complete!")
print("="*80)
print(f"All models and supporting files saved in: {MODELS_DIR}")
print(f"Best model: {best_model_name}")
print("\nUse the following code to load and use the best model for prediction:")

# Copy-paste snippet shown to the user. It must be runnable on its own, so it
# imports MultinomialNB, which step 5 of the snippet uses in an isinstance
# check (without the import the snippet fails with a NameError).
usage_lines = [
    "```python",
    "import joblib",
    "import pickle",
    "import numpy as np",
    "from sklearn.naive_bayes import MultinomialNB",
    "# Load components",
    "model = joblib.load('path/to/best_model.joblib')",
    "label_encoder = joblib.load('path/to/label_encoder.joblib')",
    "scaler = joblib.load('path/to/feature_scaler.joblib')",
    "with open('path/to/feature_extraction_func.pkl', 'rb') as f:",
    "    extract_features = pickle.load(f)",
    "# For prediction:",
    "# 1. Get document embeddings (X_new) and word embeddings (word_emb)",
    "# 2. Extract word-level features",
    "word_features = extract_features(doc_ids, word_emb)",
    "# 3. Combine features",
    "X_combined = np.hstack((X_new, word_features))",
    "# 4. Scale features",
    "X_scaled = scaler.transform(X_combined)",
    "# 5. For MNB, shift values to be non-negative",
    "if isinstance(model, MultinomialNB):",
    "    X_scaled = X_scaled - X_scaled.min()",
    "# 6. Predict",
    "y_pred = model.predict(X_scaled)",
    "labels = label_encoder.inverse_transform(y_pred)",
    "```",
]
print("\n".join(usage_lines))

IndentationError: expected an indented block (913131084.py, line 161)