## Train Base Classifiers and save them

In [1]:
import torch
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score
import joblib
import sys
import numpy as np


CLIP_FEATURES_DIR = "clip_features"
TRAIN = CLIP_FEATURES_DIR + "/train_features.pt"
VAL = CLIP_FEATURES_DIR + "/val_features.pt"
SCALER = "scaler_model.joblib"
PCA = "pca_model.joblib"
LIME = "top_k_lime_indices.joblib"

MIN_SAMPLES = 3

In [2]:
def load_features(file_path):
    data = torch.load(file_path)
    return data["image_features"], data["text_features"], data["filenames"], data["labels"]


# Load train and validation features
train_img_features, train_txt_features, _, train_labels = load_features(TRAIN)
val_img_features, val_txt_features, _, val_labels = load_features(VAL)

In [3]:
# Combine image and text features for training
X_train = torch.cat((train_img_features, train_txt_features), dim=1)
X_val = torch.cat((val_img_features, val_txt_features), dim=1)

# Flatten features into a 2D matrix (samples x features)
X_train = X_train.view(X_train.size(0), -1).numpy()
X_val = X_val.view(X_val.size(0), -1).numpy()

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

# Convert labels to NumPy arrays
y_train = train_labels.numpy()
y_val = val_labels.numpy()  

# Load scaler and PCA models
scaler = joblib.load(SCALER)
pca = joblib.load(PCA)
lime = joblib.load(LIME)

# Scale and transform the features
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

X_train_pca = pca.transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

X_train_lime = X_train_scaled[:, lime]
X_val_lime = X_val_scaled[:, lime]

print(f"X_train_pca shape: {X_train_pca.shape}")
print(f"X_val_pca shape: {X_val_pca.shape}")
print(f"X_train_lime shape: {X_train_lime.shape}")
print(f"X_val_lime shape: {X_val_lime.shape}")

X_train shape: (7919, 1024)
X_val shape: (1985, 1024)
X_train_pca shape: (7919, 563)
X_val_pca shape: (1985, 563)
X_train_lime shape: (7919, 250)
X_val_lime shape: (1985, 250)


In [None]:
unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))

# Filter mask for valid classes
valid_classes = [cls for cls, count in class_counts.items() if count >= MIN_SAMPLES]

# Create mask for training examples with valid classes
mask = np.isin(y_train, valid_classes)

# Filter data
X_train_pca = X_train_pca[mask]
X_train_lime = X_train_lime[mask]

y_train = y_train[mask]

In [None]:
print(len(valid_classes))

77


In [None]:
from classifiers import (
    SVMClassifier, RBFClassifier, RandomForestClassifier, NaiveBayesClassifier, 
    LogisticRegressionClassifier, LDAClassifier, KNNClassifier, DecisionTreeClassifier,
    AdaBoostClassifier, GBMClassifier, XGBoostClassifier
)


# Initialize classifiers
classifiers = {
    "SVM": SVMClassifier(),
    "RBF": RBFClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": NaiveBayesClassifier(),
    "Logistic Regression": LogisticRegressionClassifier(),
    "LDA": LDAClassifier(),
    "KNN": KNNClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GBMClassifier(),
    "XGBoost": XGBoostClassifier()
}


# Train and evaluate classifiers
results = {}
for name, clf in classifiers.items():
    print(f"Training {name}...")
    
    clf.train(X_train_lime, y_train)
        
    y_pred = clf.classify(X_val_lime)
    print(f"Evaluating {name}...")
    accuracy = accuracy_score(y_val, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    clf.save(model_dir="models_lime")

Training SVM...
Evaluating SVM...
SVM Accuracy: 0.6398
Model saved to: models_lime_cal/SVM.joblib
Label encoder saved to: models_lime_cal/label_encoder.joblib
Training RBF...
Evaluating RBF...
RBF Accuracy: 0.6398
Model saved to: models_lime_cal/RBF.joblib
Label encoder saved to: models_lime_cal/label_encoder.joblib
Training Random Forest...
Evaluating Random Forest...
Random Forest Accuracy: 0.6045
Model saved to: models_lime_cal/RandomForest.joblib
Label encoder saved to: models_lime_cal/label_encoder.joblib
Training Naive Bayes...
Evaluating Naive Bayes...
Naive Bayes Accuracy: 0.4448
Model saved to: models_lime_cal/NaiveBayes.joblib
Label encoder saved to: models_lime_cal/label_encoder.joblib
Training Logistic Regression...
Evaluating Logistic Regression...
Logistic Regression Accuracy: 0.5783
Model saved to: models_lime_cal/LogisticRegression.joblib
Label encoder saved to: models_lime_cal/label_encoder.joblib
Training LDA...
Evaluating LDA...
LDA Accuracy: 0.5390
Model saved to: m

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating XGBoost...
XGBoost Accuracy: 0.6146
Model saved to: models_lime_cal/XGBoost.joblib
Label encoder saved to: models_lime_cal/label_encoder.joblib


In [7]:
print(results)

{'SVM': 0.6397984886649875, 'RBF': 0.6397984886649875, 'Random Forest': 0.6045340050377834, 'Naive Bayes': 0.4448362720403023, 'Logistic Regression': 0.5783375314861461, 'LDA': 0.5390428211586902, 'KNN': 0.6181360201511334, 'Decision Tree': 0.39093198992443323, 'AdaBoost': 0.3783375314861461, 'Gradient Boosting': 0.5138539042821159, 'XGBoost': 0.6146095717884131}


In [8]:
svm = SVMClassifier()
svm.load(model_dir="models_lime_cal")


Loaded model from: models_lime_cal/SVM.joblib
Loaded label encoder from: models_lime_cal/label_encoder.joblib


In [9]:
# Test the classify method on the first sample of X_val_pca


print(svm.classify(X_val_pca[1].reshape(1, -1)))

res = svm.classify_proba(X_val_pca[1].reshape(1, -1))

print(res)
print(len(res[0]))



ValueError: X has 563 features, but SVC is expecting 250 features as input.