In [None]:
# ========================================
# 0. Setup
# ========================================
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Pick the compute device once: use the GPU when torch reports CUDA as
# available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

In [None]:
# ========================================
# 1. Load embeddings + labels
# ========================================
# Embedding table read from CSV. It is later used row-for-row next to the
# labelled dataset, so both files are presumed to share the same row order
# -- TODO confirm against the script that produced the embeddings.
emb_df = pd.read_csv("embeddings/7-geminiApi-m/embeddings_int.csv")

# Original dataset; its "Label" column supplies the classification target.
orig_df = pd.read_csv("data/En-Ba-Dataset(20k_4)/dataset_cleaned.csv")

if "Label" not in orig_df.columns:
    raise ValueError("Original dataset must have a 'Label' column.")

# Ensure alignment: features and labels are paired purely by position, so the
# row counts must match. Raise explicitly (rather than `assert`) so the check
# still runs under `python -O` and the message reports both counts.
if len(emb_df) != len(orig_df):
    raise ValueError(
        "Row mismatch between embeddings and dataset! "
        f"embeddings={len(emb_df)}, dataset={len(orig_df)}"
    )

In [None]:
# ========================================
# 2. Prepare embeddings for training
# ========================================
# X: every column of the embeddings frame as a 2-D array (all columns are
#    used as features -- presumably the CSV holds no index/id column; verify).
# y: the "Label" column of the original dataset, paired with X by row position.
X, y = emb_df.values, orig_df["Label"].values

# Quick sanity summary of what will be fed to the models.
print("Feature shape:", X.shape)
print("Labels shape:", y.shape)
print("Unique labels:", np.unique(y))

In [None]:
# ========================================
# 3. Train-Test Split
# ========================================
# Hold out 20% of the rows for testing. `stratify=y` keeps the label
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

train_shape, test_shape = X_train.shape, X_test.shape
print("Train size:", train_shape, "Test size:", test_shape)

In [None]:
# ========================================
# 4. Train multiple models
# ========================================
# Seed for every stochastic estimator below. Without it RandomForest
# (bootstrap / feature sampling), XGBoost (row / column subsampling) and the
# MLP (weight init / shuffling) give different accuracies on every run, which
# makes the comparison chart non-reproducible. 42 matches the seed already
# used for the train/test split.
SEED = 42

# Candidate classifiers, keyed by the display name used in logs and plots.
models = {
    "LogReg": LogisticRegression(max_iter=1000, solver="lbfgs", n_jobs=-1),
    "RandomForest": RandomForestClassifier(
        n_estimators=200, n_jobs=-1, random_state=SEED
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",
        # `device` is already either "cuda" or "cpu" (set in the setup cell),
        # so it can be passed through unchanged.
        device=device,
        n_jobs=-1,
        eval_metric="mlogloss",
        random_state=SEED,
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(512, 256), max_iter=50, random_state=SEED
    ),
    # "SVM": SVC(kernel="linear"),
}

# results maps model name -> (test accuracy, test-set predictions); the
# comparison cell reads the accuracy back out of this tuple.
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = (acc, preds)
    print(f"{name} Accuracy: {acc:.4f}")
    # Per-class precision / recall / F1 on the held-out test set.
    print(classification_report(y_test, preds))

In [None]:
# ========================================
# 5. Compare Model Accuracies
# ========================================
# Keep only the accuracy (first element) of each (accuracy, predictions) entry.
accs = {name: acc for name, (acc, _) in results.items()}

# --- Accuracy Bar Chart with Labels ---
# Drawn through an explicit Figure/Axes pair instead of the pyplot state
# machine; the rendered chart is the same.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(list(accs.keys()), list(accs.values()))
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparison on Test Set")

# Write each bar's value just above its top edge, centred horizontally.
for rect in bars:
    top = rect.get_height()
    centre_x = rect.get_x() + rect.get_width() / 2
    ax.text(centre_x, top + 0.01, f"{top:.3f}", ha="center", va="bottom")

# Accuracy lies in [0, 1]; the extra 0.05 leaves headroom for the labels.
ax.set_ylim(0, 1.05)
plt.show()