# In [4]:
import os
import numpy as np
import pickle
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Project root shared by the artefact locations below.
_PROJECT_ROOT = '/Users/socheata/Documents/FYP-Khmer-Classification'

# Where the selected TF-IDF features and the label encoder are read from
# (the trailing slash is kept as in the stored path).
OUTPUT_DIR = f'{_PROJECT_ROOT}/TF_IDF_Features/Selected_Features/'

# Where trained models and comparison plots are written.
MODEL_DIR = f'{_PROJECT_ROOT}/Models'

# Create the model directory up front so later writes cannot fail on a
# missing folder; an already existing directory is fine.
os.makedirs(MODEL_DIR, exist_ok=True)

def _load_sparse_matrix(path):
    """Rebuild a CSR matrix from an ``.npz`` archive of its raw components.

    The archive must contain the ``data``, ``indices``, ``indptr`` and
    ``shape`` arrays of the matrix.
    """
    # np.load keeps the underlying zip file open for .npz archives; the
    # context manager closes it once the arrays have been read into memory.
    with np.load(path) as archive:
        return csr_matrix(
            (archive['data'], archive['indices'], archive['indptr']),
            shape=tuple(archive['shape']),
        )


def load_data():
    """Load the selected TF-IDF features and rebuild the matching labels.

    The feature matrices are read from ``train_features.npz`` and
    ``valid_features.npz`` in ``OUTPUT_DIR``. The labels are recreated here:
    every ``.txt`` file in the preprocessed-articles folder whose stem is a
    ``docId`` in ``metadata.csv`` contributes its category, the categories are
    split 70/30 (stratified, ``random_state=42``) and then encoded with the
    pickled label encoder.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid, label_encoder)``.

    Raises:
        ValueError: if the number of rebuilt labels differs from the number
            of rows in the corresponding feature matrix.
    """
    from sklearn.model_selection import train_test_split

    PROCESSED_TEXTS_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/Preprocess_articles'
    METADATA_PATH = '/Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/metadata.csv'

    X_train = _load_sparse_matrix(os.path.join(OUTPUT_DIR, 'train_features.npz'))
    X_valid = _load_sparse_matrix(os.path.join(OUTPUT_DIR, 'valid_features.npz'))
    with open(os.path.join(OUTPUT_DIR, 'label_encoder.pkl'), 'rb') as f:
        label_encoder = pickle.load(f)

    metadata_df = pd.read_csv(METADATA_PATH)
    doc_categories = dict(zip(metadata_df['docId'], metadata_df['category']))

    # NOTE(review): the labels only line up with the stored feature rows if
    # this listing order and split match the feature-extraction step --
    # os.listdir() returns entries in arbitrary order; confirm against that
    # script before relying on the result.
    document_categories = []
    for filename in os.listdir(PROCESSED_TEXTS_DIR):
        if not filename.endswith('.txt'):
            continue
        doc_id = os.path.splitext(filename)[0]
        if doc_id in doc_categories:
            document_categories.append(doc_categories[doc_id])

    # The split indices depend only on the number of samples, the stratify
    # labels and the seed, so the article texts do not need to be read to
    # reproduce the same partition of the categories.
    train_categories, valid_categories = train_test_split(
        document_categories, test_size=0.3, random_state=42,
        stratify=document_categories,
    )
    y_train = label_encoder.transform(train_categories)
    y_valid = label_encoder.transform(valid_categories)

    # Fail early with a clear message instead of deep inside model fitting.
    for split_name, features, labels in (
        ('train', X_train, y_train),
        ('valid', X_valid, y_valid),
    ):
        if features.shape[0] != len(labels):
            raise ValueError(
                f"{split_name} features have {features.shape[0]} rows but "
                f"{len(labels)} labels were rebuilt; the document set or "
                f"split no longer matches the saved features."
            )

    return X_train, X_valid, y_train, y_valid, label_encoder

def _select_best_model(make_model, candidates, X_train, y_train, X_valid, y_valid):
    """Fit one model per candidate value and keep the most accurate one.

    Returns the fitted winner together with its validation predictions and
    accuracy, so the caller does not have to refit it.
    """
    best_model, best_pred, best_accuracy = None, None, None
    for value in candidates:
        model = make_model(value)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        # The first candidate is always accepted (even at accuracy 0.0);
        # the strict '>' afterwards keeps the earliest candidate on ties.
        if best_model is None or accuracy > best_accuracy:
            best_model, best_pred, best_accuracy = model, y_pred, accuracy
    return best_model, best_pred, best_accuracy


def _evaluate_and_save(model, y_pred, accuracy, y_valid, class_names, filename):
    """Build the result entry for one model and pickle it into MODEL_DIR."""
    # Passing the full label set keeps the report rows and the confusion
    # matrix aligned with class_names even if a class is absent from both
    # the validation labels and the predictions.
    labels = np.arange(len(class_names))
    entry = {
        'model': model,
        'accuracy': accuracy,
        'report': classification_report(
            y_valid, y_pred, labels=labels, target_names=class_names,
            output_dict=True,
        ),
        'confusion_matrix': confusion_matrix(y_valid, y_pred, labels=labels),
        'y_pred': y_pred,
    }
    with open(os.path.join(MODEL_DIR, filename), 'wb') as f:
        pickle.dump(model, f)
    return entry


def train_and_evaluate_models(X_train, X_valid, y_train, y_valid, label_encoder):
    """Tune, evaluate and save a MultinomialNB and a LinearSVC classifier.

    For each model a small grid is tried (``alpha`` for MultinomialNB, ``C``
    for LinearSVC) and the value with the highest validation accuracy wins.
    The winning models are pickled to ``mnb_model.pkl`` and ``svm_model.pkl``
    in ``MODEL_DIR``.

    NOTE: the validation set is used both to pick the hyper-parameter and to
    report the final metrics, so the reported scores are optimistic.

    Args:
        X_train, X_valid: feature matrices for the two splits.
        y_train, y_valid: encoded labels for the two splits.
        label_encoder: fitted encoder; its ``classes_`` supply the class
            names used in the reports.

    Returns:
        dict: ``{'mnb': {...}, 'svm': {...}}`` where each entry holds
        ``'model'``, ``'accuracy'``, ``'report'``, ``'confusion_matrix'``
        and ``'y_pred'``.
    """
    class_names = label_encoder.classes_
    searches = (
        ('mnb', 'mnb_model.pkl',
         lambda alpha: MultinomialNB(alpha=alpha),
         [0.01, 0.1, 0.5, 1.0]),
        ('svm', 'svm_model.pkl',
         lambda c: LinearSVC(C=c, dual=False, max_iter=10000),
         [0.1, 1.0, 10.0]),
    )

    results = {}
    for key, filename, make_model, candidates in searches:
        model, y_pred, accuracy = _select_best_model(
            make_model, candidates, X_train, y_train, X_valid, y_valid
        )
        results[key] = _evaluate_and_save(
            model, y_pred, accuracy, y_valid, class_names, filename
        )
    return results

def display_results(results, class_names):
    """Print a side-by-side summary of both models and save comparison plots.

    ``results`` is read under the ``'mnb'`` and ``'svm'`` keys; each entry
    must provide ``'accuracy'``, ``'report'`` and ``'confusion_matrix'``.
    Two figures are written into ``MODEL_DIR``: ``confusion_matrices.png``
    and ``metrics_comparison.png``.
    """
    models = (('mnb', 'MNB'), ('svm', 'SVM'))
    rule = "=" * 70

    # --- Text summary ---------------------------------------------------
    print("\n" + rule)
    print("MODEL COMPARISON")
    print(rule)
    print("\nAccuracy Comparison:")
    for key, tag in models:
        print(f"{tag}: {results[key]['accuracy']:.4f}")

    print("\nWeighted Average Metrics Comparison:")
    weighted = {key: results[key]['report']['weighted avg'] for key, _ in models}
    metrics = ['precision', 'recall', 'f1-score']
    print(f"{'Metric':<15} {'MNB':<10} {'SVM':<10}")
    print("-" * 35)
    for metric in metrics:
        mnb_value = weighted['mnb'][metric]
        svm_value = weighted['svm'][metric]
        print(f"{metric:<15} {mnb_value:.4f}    {svm_value:.4f}")

    # --- Confusion matrices, one panel per model ------------------------
    plt.figure(figsize=(16, 7))
    for panel, (key, tag) in enumerate(models, start=1):
        plt.subplot(1, 2, panel)
        sns.heatmap(
            results[key]['confusion_matrix'],
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
        )
        plt.title(f'{tag} Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('True')
    plt.tight_layout()
    plt.savefig(os.path.join(MODEL_DIR, 'confusion_matrices.png'))
    plt.close()

    # --- Per-class metrics, one panel per metric ------------------------
    plt.figure(figsize=(14, 8))
    for panel, metric in enumerate(metrics, start=1):
        plt.subplot(1, 3, panel)
        per_class = {
            key: [results[key]['report'][cls][metric] for cls in class_names]
            for key, _ in models
        }
        positions = np.arange(len(class_names))
        bar_width = 0.35
        # The two bars of a class sit left and right of its tick position.
        plt.bar(positions - bar_width / 2, per_class['mnb'], bar_width, label='MNB')
        plt.bar(positions + bar_width / 2, per_class['svm'], bar_width, label='SVM')
        pretty = metric.capitalize()
        plt.xlabel('Categories')
        plt.ylabel(pretty)
        plt.title(f'Per-Class {pretty} Comparison')
        plt.xticks(positions, class_names, rotation=45)
        plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(MODEL_DIR, 'metrics_comparison.png'))
    plt.close()
    print("Plots saved to Models directory.")

def main():
    """Run the whole pipeline: load data, train both models, report results."""
    dataset = load_data()
    X_train, X_valid, y_train, y_valid, label_encoder = dataset
    results = train_and_evaluate_models(
        X_train, X_valid, y_train, y_valid, label_encoder
    )
    class_names = label_encoder.classes_
    display_results(results, class_names)
    print("\nMODEL TRAINING AND EVALUATION COMPLETE")

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

# Output captured alongside this cell (kept for reference, not code):
# FileNotFoundError: [Errno 2] No such file or directory: '/Users/socheata/Documents/FYP-Khmer-Classification/TF_IDF_Features/Selected_Features/train_features.npz'
#
# KeyError: 'train_labels'