## Train Base Classifiers and save them

In [1]:
import torch
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score
import joblib
import sys
import numpy as np


# --- Input artefacts -------------------------------------------------------
# Directory holding the saved feature files for each split.
CLIP_FEATURES_DIR = "clip_features_new"
TRAIN = f"{CLIP_FEATURES_DIR}/train_features.pt"
VAL = f"{CLIP_FEATURES_DIR}/val_features.pt"

# Objects loaded with joblib further down: the scaler and PCA are applied via
# `.transform`, and LIME is used as a column index into the scaled features.
SCALER = "scaler_model_new.joblib"
PCA = "pca_model_new.joblib"
LIME = "top_k_lime_indices.joblib"

# --- Filtering -------------------------------------------------------------
# A class is kept for training only if it has at least this many samples.
MIN_SAMPLES = 3

In [2]:
def load_features(file_path):
    """Load a saved feature bundle and return its four components.

    Args:
        file_path: Path to a file written with ``torch.save`` that contains a
            mapping with the keys ``"image_features"``, ``"text_features"``,
            ``"filenames"`` and ``"labels"``.

    Returns:
        A 4-tuple ``(image_features, text_features, filenames, labels)``.

    Raises:
        KeyError: If any of the four expected keys is missing.
    """
    # Map everything onto the CPU: the notebook converts these tensors with
    # `.numpy()`, which does not work on CUDA tensors, and a bundle saved on a
    # GPU machine would otherwise fail to load on a CPU-only one.
    data = torch.load(file_path, map_location="cpu")
    return data["image_features"], data["text_features"], data["filenames"], data["labels"]


# Load the train and validation feature bundles.
# load_features returns (image_features, text_features, filenames, labels);
# the filenames entry is not needed here and is discarded into `_`.
train_img_features, train_txt_features, _, train_labels = load_features(TRAIN)
val_img_features, val_txt_features, _, val_labels = load_features(VAL)

In [3]:
def _as_feature_matrix(img_feats, txt_feats):
    """Join image and text features along dim 1 and return a 2D NumPy matrix."""
    joined = torch.cat((img_feats, txt_feats), dim=1)
    # Collapse everything after the sample dimension: (samples x features)
    return joined.view(joined.size(0), -1).numpy()


# Raw feature matrices: image features followed by text features
X_train = _as_feature_matrix(train_img_features, train_txt_features)
X_val = _as_feature_matrix(val_img_features, val_txt_features)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

# Labels as NumPy arrays
y_train, y_val = train_labels.numpy(), val_labels.numpy()

# Previously saved objects: scaler, PCA, and the LIME column selection
# (presumably top-k feature indices, going by the file name - confirm)
scaler, pca, lime = (joblib.load(path) for path in (SCALER, PCA, LIME))

# Both derived feature sets start from the scaled features
X_train_scaled, X_val_scaled = scaler.transform(X_train), scaler.transform(X_val)

# Variant 1: PCA projection of the scaled features
X_train_pca, X_val_pca = pca.transform(X_train_scaled), pca.transform(X_val_scaled)

# Variant 2: the scaled columns selected by `lime`
X_train_lime, X_val_lime = X_train_scaled[:, lime], X_val_scaled[:, lime]

print(f"X_train_pca shape: {X_train_pca.shape}")
print(f"X_val_pca shape: {X_val_pca.shape}")
print(f"X_train_lime shape: {X_train_lime.shape}")
print(f"X_val_lime shape: {X_val_lime.shape}")

X_train shape: (23548, 1024)
X_val shape: (11708, 1024)
X_train_pca shape: (23548, 488)
X_val_pca shape: (11708, 488)
X_train_lime shape: (23548, 250)
X_val_lime shape: (11708, 250)


In [4]:
# Count the training samples available for every class label
unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))

# Keep only the classes that have at least MIN_SAMPLES training samples
valid_classes = list(unique[counts >= MIN_SAMPLES])

# True for each training row whose label is one of the kept classes
mask = np.isin(y_train, valid_classes)

# Apply the same row selection to both feature variants and to the labels
X_train_pca, X_train_lime, y_train = (
    X_train_pca[mask],
    X_train_lime[mask],
    y_train[mask],
)

In [5]:
# Number of classes that met the MIN_SAMPLES threshold and remain in the training set
print(len(valid_classes))

80


In [6]:
from classifiers import (
    SVMClassifier, RBFClassifier, RandomForestClassifier, NaiveBayesClassifier, 
    LogisticRegressionClassifier, LDAClassifier, KNNClassifier, DecisionTreeClassifier,
    AdaBoostClassifier, GBMClassifier, XGBoostClassifier
)


# Output directory for every model trained in this cell. Per the recorded
# output, each save also writes a label_encoder.joblib into the same directory.
MODEL_DIR = "models_pca_cal_sig_new"

# One instance of each base classifier, keyed by its display name
classifiers = {
    "SVM": SVMClassifier(),
    "RBF": RBFClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": NaiveBayesClassifier(),
    "Logistic Regression": LogisticRegressionClassifier(),
    "LDA": LDAClassifier(),
    "KNN": KNNClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GBMClassifier(),
    "XGBoost": XGBoostClassifier()
}


# Train each classifier on the PCA features, score it on the validation set,
# then persist it. `results` maps classifier name -> validation accuracy.
results = {}
for name, clf in classifiers.items():
    print(f"Training {name}...")
    clf.train(X_train_pca, y_train)

    # Announce the evaluation before predicting so the log shows the step
    # that is actually in progress while prediction runs.
    print(f"Evaluating {name}...")
    y_pred = clf.classify(X_val_pca)
    accuracy = accuracy_score(y_val, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

    clf.save(model_dir=MODEL_DIR)

Training SVM...
Evaluating SVM...
SVM Accuracy: 0.3143
Model saved to: models_pca_cal_sig_new/SVM.joblib
Label encoder saved to: models_pca_cal_sig_new/label_encoder.joblib
Training RBF...
Evaluating RBF...
RBF Accuracy: 0.3170
Model saved to: models_pca_cal_sig_new/RBF.joblib
Label encoder saved to: models_pca_cal_sig_new/label_encoder.joblib
Training Random Forest...
Evaluating Random Forest...
Random Forest Accuracy: 0.3036
Model saved to: models_pca_cal_sig_new/RandomForest.joblib
Label encoder saved to: models_pca_cal_sig_new/label_encoder.joblib
Training Naive Bayes...
Evaluating Naive Bayes...
Naive Bayes Accuracy: 0.3083
Model saved to: models_pca_cal_sig_new/NaiveBayes.joblib
Label encoder saved to: models_pca_cal_sig_new/label_encoder.joblib
Training Logistic Regression...
Evaluating Logistic Regression...
Logistic Regression Accuracy: 0.2748
Model saved to: models_pca_cal_sig_new/LogisticRegression.joblib
Label encoder saved to: models_pca_cal_sig_new/label_encoder.joblib
Tr

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating XGBoost...
XGBoost Accuracy: 0.3098
Model saved to: models_pca_cal_sig_new/XGBoost.joblib
Label encoder saved to: models_pca_cal_sig_new/label_encoder.joblib


In [7]:
# Validation accuracy of every trained classifier, keyed by classifier name
print(results)

{'SVM': 0.3143149982917663, 'RBF': 0.31704817218995557, 'Random Forest': 0.3036385377519645, 'Naive Bayes': 0.3082507687051589, 'Logistic Regression': 0.2747693884523403, 'LDA': 0.32319781346088144, 'KNN': 0.30509053638537753, 'Decision Tree': 0.2372736590365562, 'AdaBoost': 0.20259651520327981, 'Gradient Boosting': 0.20037581141100103, 'XGBoost': 0.30978817902289035}


In [8]:
# Reload a saved SVM from disk into a fresh wrapper instance.
# NOTE(review): this reads from "models_lime_cal", not the
# "models_pca_cal_sig_new" directory the training loop in this notebook writes to -
# presumably a model trained on the LIME-selected features in another run;
# confirm that directory exists before running this cell on a fresh checkout.
svm = SVMClassifier()
svm.load(model_dir="models_lime_cal")


Loaded model from: models_lime_cal/SVM.joblib
Loaded label encoder from: models_lime_cal/label_encoder.joblib


In [9]:
# Sanity-check the reloaded SVM on a single validation row of the
# LIME-selected features (row index 1), reshaped to a one-row 2D input.
sample = X_val_lime[1].reshape(1, -1)

# Predicted label for that row
print(svm.classify(sample))

# Per-class scores for the same row (presumably probabilities - confirm)
res = svm.classify_proba(sample)

# NOTE(review): in the recorded run classify printed [56] while this argmax
# printed [55]; presumably classify returns a decoded label whereas argmax is
# a column position - confirm against the classifier's label encoder.
print(np.argmax(res, axis=1))
print(res)
print(len(res[0]))



[56]
[55]
[[1.32778957e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 4.34440149e-04 1.84140776e-03
  3.31307334e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.84092580e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 2.31898676e-04 0.00000000e+00 2.41757061e-04
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.02258620e-02
  0.00000000e+00 4.93778696e-03 2.27655886e-02 5.43810693e-02
  9.01563267e-03 9.96506192e-03 7.16584686e-02 7.18028745e-04
  5.56139165e-03 4.55012527e-03 1.23169402e-03 3.64412632e-01
  2.13124243e-01 0.00000000e+00 1.88541679e-02 7.35137062e-04
  9.64770472e-03 4.53648166e-04 3.15951256e-04 0.00000000e+0