## Train Base Classifiers and Save Them

In [None]:
import torch
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score
import joblib
import sys
import numpy as np


# --- Configuration: artifact locations and the class-size threshold ---

# Directory that holds the precomputed CLIP feature dumps.
CLIP_FEATURES_DIR = "clip_features_new"
TRAIN = f"{CLIP_FEATURES_DIR}/train_features.pt"
VAL = f"{CLIP_FEATURES_DIR}/val_features.pt"

# Previously fitted preprocessing artifacts, loaded with joblib further down.
SCALER = "scaler_model_new.joblib"
PCA = "pca_model_new.joblib"
# Column indices used to pick a feature subset out of the scaled matrix.
LIME = "top_k_lime_indices.joblib"

# Classes with fewer training samples than this are dropped before training.
MIN_SAMPLES = 3

In [2]:
def load_features(file_path, map_location="cpu"):
    """Load a saved feature dictionary and return its four components.

    Args:
        file_path: Path to a file written with ``torch.save`` that contains a
            dict with the keys ``"image_features"``, ``"text_features"``,
            ``"filenames"`` and ``"labels"``.
        map_location: Device onto which stored tensors are mapped while
            loading. Defaults to ``"cpu"`` so that files saved from a GPU
            session can be loaded on a CPU-only machine and the tensors can be
            converted with ``.numpy()`` afterwards.

    Returns:
        Tuple ``(image_features, text_features, filenames, labels)`` taken
        from the loaded dict.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    # Without map_location, tensors are restored to the device they were saved
    # from, which breaks on CPU-only hosts and on the later .numpy() calls.
    data = torch.load(file_path, map_location=map_location)
    return data["image_features"], data["text_features"], data["filenames"], data["labels"]


# Load train and validation features.
# load_features returns (image_features, text_features, filenames, labels);
# the filenames element is discarded into `_` for both splits.
train_img_features, train_txt_features, _, train_labels = load_features(TRAIN)
val_img_features, val_txt_features, _, val_labels = load_features(VAL)

In [3]:
def _as_feature_matrix(img_features, txt_features):
    """Concatenate image and text features along dim 1 and return a 2D NumPy array."""
    combined = torch.cat((img_features, txt_features), dim=1)
    # Collapse everything after the sample axis into a single feature axis
    return combined.view(combined.size(0), -1).numpy()


# One (samples x features) matrix per split, image features first, text second
X_train = _as_feature_matrix(train_img_features, train_txt_features)
X_val = _as_feature_matrix(val_img_features, val_txt_features)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

# Labels as NumPy arrays
y_train, y_val = train_labels.numpy(), val_labels.numpy()

# Previously saved preprocessing artifacts: scaler, PCA model, LIME column indices
scaler, pca, lime = (joblib.load(artifact) for artifact in (SCALER, PCA, LIME))

# Scale both splits with the loaded scaler (transform only, no refitting here)
X_train_scaled, X_val_scaled = (scaler.transform(m) for m in (X_train, X_val))

# View 1: PCA projection of the scaled features
X_train_pca, X_val_pca = (pca.transform(m) for m in (X_train_scaled, X_val_scaled))

# View 2: the columns of the scaled features selected by the LIME indices
X_train_lime = X_train_scaled[:, lime]
X_val_lime = X_val_scaled[:, lime]

print(f"X_train_pca shape: {X_train_pca.shape}")
print(f"X_val_pca shape: {X_val_pca.shape}")
print(f"X_train_lime shape: {X_train_lime.shape}")
print(f"X_val_lime shape: {X_val_lime.shape}")

X_train shape: (7919, 1024)
X_val shape: (1985, 1024)
X_train_pca shape: (7919, 563)
X_val_pca shape: (1985, 563)
X_train_lime shape: (7919, 250)
X_val_lime shape: (1985, 250)


In [4]:
# Tally how many training samples carry each label
unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))

# Labels that occur at least MIN_SAMPLES times in the training split
valid_classes = [label for label, n in zip(unique, counts) if n >= MIN_SAMPLES]

# Boolean row selector: True where the sample's label is one of the kept classes
mask = np.isin(y_train, valid_classes)

# Drop rows of under-represented classes from both feature views and the labels.
# Note: X_train and X_train_scaled are left unfiltered by this cell.
X_train_pca, X_train_lime, y_train = (
    X_train_pca[mask],
    X_train_lime[mask],
    y_train[mask],
)

In [5]:
# Number of classes that have at least MIN_SAMPLES training samples.
print(len(valid_classes))

77


In [None]:
from classifiers import (
    AdaBoostClassifier,
    DecisionTreeClassifier,
    GBMClassifier,
    KNNClassifier,
    LDAClassifier,
    LogisticRegressionClassifier,
    NaiveBayesClassifier,
    RBFClassifier,
    RandomForestClassifier,
    SVMClassifier,
    XGBoostClassifier,
)

# Display name -> classifier class; the order here is the training order.
_BASE_CLASSIFIERS = (
    ("SVM", SVMClassifier),
    ("RBF", RBFClassifier),
    ("Random Forest", RandomForestClassifier),
    ("Naive Bayes", NaiveBayesClassifier),
    ("Logistic Regression", LogisticRegressionClassifier),
    ("LDA", LDAClassifier),
    ("KNN", KNNClassifier),
    ("Decision Tree", DecisionTreeClassifier),
    ("AdaBoost", AdaBoostClassifier),
    ("Gradient Boosting", GBMClassifier),
    ("XGBoost", XGBoostClassifier),
)

# Instantiate every base classifier with no constructor arguments
classifiers = {label: factory() for label, factory in _BASE_CLASSIFIERS}

# Fit each classifier on the PCA features, score it on the validation split,
# record the accuracy under its display name, then persist the model.
results = {}
for name, clf in classifiers.items():
    print(f"Training {name}...")
    clf.train(X_train_pca, y_train)

    y_pred = clf.classify(X_val_pca)
    print(f"Evaluating {name}...")

    results[name] = accuracy = accuracy_score(y_val, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    clf.save(model_dir="models_pca_cal_sig_new")

Training SVM...
Evaluating SVM...
SVM Accuracy: 0.6050
Model saved to: models_pca_cal_sig/SVM.joblib
Label encoder saved to: models_pca_cal_sig/label_encoder.joblib
Training RBF...
Evaluating RBF...
RBF Accuracy: 0.6191
Model saved to: models_pca_cal_sig/RBF.joblib
Label encoder saved to: models_pca_cal_sig/label_encoder.joblib
Training Random Forest...
Evaluating Random Forest...
Random Forest Accuracy: 0.6000
Model saved to: models_pca_cal_sig/RandomForest.joblib
Label encoder saved to: models_pca_cal_sig/label_encoder.joblib
Training Naive Bayes...
Evaluating Naive Bayes...
Naive Bayes Accuracy: 0.5557
Model saved to: models_pca_cal_sig/NaiveBayes.joblib
Label encoder saved to: models_pca_cal_sig/label_encoder.joblib
Training Logistic Regression...
Evaluating Logistic Regression...
Logistic Regression Accuracy: 0.6186
Model saved to: models_pca_cal_sig/LogisticRegression.joblib
Label encoder saved to: models_pca_cal_sig/label_encoder.joblib
Training LDA...
Evaluating LDA...
LDA Accu

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating XGBoost...
XGBoost Accuracy: 0.6131
Model saved to: models_pca_cal_sig/XGBoost.joblib
Label encoder saved to: models_pca_cal_sig/label_encoder.joblib


In [7]:
# Validation accuracy of each classifier, keyed by its display name.
print(results)

{'SVM': 0.6050377833753149, 'RBF': 0.6191435768261965, 'Random Forest': 0.6, 'Naive Bayes': 0.5556675062972293, 'Logistic Regression': 0.618639798488665, 'LDA': 0.5889168765743074, 'KNN': 0.6231738035264484, 'Decision Tree': 0.46750629722921916, 'AdaBoost': 0.4080604534005038, 'Gradient Boosting': 0.4896725440806045, 'XGBoost': 0.6130982367758187}


In [8]:
# Reload a previously saved SVM for a quick sanity check.
# NOTE(review): "models_lime_cal" is not the directory the training loop in this
# notebook writes to ("models_pca_cal_sig_new"); presumably it comes from a
# separate run trained on the LIME-selected features — confirm it exists before
# running this cell on a fresh checkout.
svm = SVMClassifier()
svm.load(model_dir="models_lime_cal")


Loaded model from: models_lime_cal/SVM.joblib
Loaded label encoder from: models_lime_cal/label_encoder.joblib


In [None]:
# Test classify / classify_proba on a single validation row.
# This uses row index 1 (the second sample) of X_val_lime, i.e. the
# LIME-selected feature columns rather than the PCA projection.


# reshape(1, -1) turns the 1-D row into a single-sample 2-D batch
print(svm.classify(X_val_lime[1].reshape(1, -1)))

res = svm.classify_proba(X_val_lime[1].reshape(1, -1))

# NOTE(review): argmax yields a column position within the probability row;
# presumably it has to be mapped through the saved label encoder before it can
# be compared with the label printed by classify() — confirm.
print(np.argmax(res, axis=1))
print(res)
# Length of the probability row for this one sample
print(len(res[0]))



[19]
[[6.28693762e-02 0.00000000e+00 5.58591085e-03 3.14590865e-04
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.50066036e-02
  1.83105044e-03 0.00000000e+00 6.76077263e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 3.52029596e-03 1.04010378e-02
  6.40007774e-03 1.22605285e-02 7.63347156e-01 8.46811707e-02
  8.92850365e-03 0.00000000e+00 3.57720119e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.29685341e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 5.43786459e-03
  0.00000000e+00 7.33809704e-04 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.58155377e-04 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 2.06152347e-04 0.00000000e+00
  5.87558153e-04 0.00000000e+00 3.77940849e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 3.02326280e-03 2.61095091e-04 8.56488486e-03
  0