# Model Evaluation

In [1]:
import os
import pandas as pd
import torch
from torchvision.datasets import ImageFolder
from transformers import AutoModelForImageClassification, SiglipForImageClassification
from tqdm.notebook import tqdm

from src.transformers import test_transforms

# Probe CUDA once, report it, and pick the device every model and batch is moved to.
_has_cuda = torch.cuda.is_available()
print("CUDA Available:", _has_cuda)
device = torch.device("cuda") if _has_cuda else torch.device("cpu")

CUDA Available: True


In [2]:
# Directory constants: model checkpoints and the images to classify.
CHECKPOINT_DIR = "checkpoints"
TEST_SET_DIR = "test_set"


def _image_number(sample):
    """Return N for a ``(path, class_index)`` sample whose file is named ``img<N>.<ext>``."""
    file_name = os.path.basename(sample[0])
    stem = file_name.split(".")[0]
    return int(stem[3:])  # skip the three leading characters ("img")


# Build the test dataset, then reorder its samples in place by the number in
# each file name so that img2 comes before img10.
test_ds = ImageFolder(TEST_SET_DIR, transform=test_transforms)
test_ds.samples.sort(key=_image_number)

In [3]:
# Show the dataset's (path, class index) pairs to check that files are in numeric order.
test_ds.samples

[('test_set\\test_set\\img1.jpeg', 0),
 ('test_set\\test_set\\img2.jpeg', 0),
 ('test_set\\test_set\\img3.jpeg', 0),
 ('test_set\\test_set\\img4.png', 0),
 ('test_set\\test_set\\img5.jpeg', 0),
 ('test_set\\test_set\\img6.jpeg', 0),
 ('test_set\\test_set\\img7.jpeg', 0),
 ('test_set\\test_set\\img8.jpeg', 0),
 ('test_set\\test_set\\img9.jpeg', 0),
 ('test_set\\test_set\\img10.jpeg', 0),
 ('test_set\\test_set\\img11.jpeg', 0),
 ('test_set\\test_set\\img12.jpeg', 0),
 ('test_set\\test_set\\img13.jpeg', 0),
 ('test_set\\test_set\\img14.jpeg', 0),
 ('test_set\\test_set\\img15.jpeg', 0),
 ('test_set\\test_set\\img16.jpeg', 0),
 ('test_set\\test_set\\img17.jpeg', 0),
 ('test_set\\test_set\\img18.jpeg', 0),
 ('test_set\\test_set\\img19.jpeg', 0),
 ('test_set\\test_set\\img20.jpeg', 0),
 ('test_set\\test_set\\img21.jpeg', 0),
 ('test_set\\test_set\\img22.jpeg', 0),
 ('test_set\\test_set\\img23.jpeg', 0),
 ('test_set\\test_set\\img24.jpeg', 0),
 ('test_set\\test_set\\img25.jpeg', 0),
 ('test_se

In [4]:
# Checkpoint directory of every model to evaluate, keyed by model identifier.
# Note that the "*_full" identifiers point at the "wit" checkpoint directories.
MODELS = dict(
    siglip2_other="checkpoints/siglip2/other/model",
    siglip2_full="checkpoints/siglip2/wit/model",
    mobilenet_other="checkpoints/mobilenet/other/model",
    mobilenet_full="checkpoints/mobilenet/wit/model",
)

In [5]:
from torch.utils.data import DataLoader
from safetensors.torch import load_file

BATCH_SIZE = 64
# Maps each successfully evaluated key of MODELS to its list of predicted class
# indices, one per test image, in the order the loader yields them.
results = {}

# The loader does not depend on the model, so it is built once for all models.
# shuffle=False keeps predictions in the same order as test_ds.samples.
loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

for model_key, model_path in MODELS.items():
    # Bound before the try so the cleanup in `finally` is valid even when
    # from_pretrained itself raises.
    model = None
    try:
        # The model class is chosen from a substring of the key.
        if "mobilenet" in model_key:
            model = AutoModelForImageClassification.from_pretrained(
                model_path,  # num_labels=2, ignore_mismatched_sizes=True
            )
        elif "siglip2" in model_key:
            model = SiglipForImageClassification.from_pretrained(
                model_path,  # num_labels=2, ignore_mismatched_sizes=True
            )
        else:
            print(f"Unknown model type for {model_key}")
            continue

        weights_path = os.path.join(model_path, "model.safetensors")
        if not os.path.exists(weights_path):
            print(
                f"Error loading model {model_key}: No such file or directory: '{weights_path}'"
            )
            continue

        # Load the checkpoint tensors explicitly. load_state_dict is strict by
        # default, so missing or unexpected keys raise instead of passing silently.
        # NOTE(review): from_pretrained presumably already read these weights from
        # the same directory - confirm whether this second load is still needed.
        state_dict = load_file(weights_path)
        model.load_state_dict(state_dict)

        model = model.to(device)
        model.eval()

        print(f"Evaluating {model_key} on test set...")
        all_preds = []
        with torch.no_grad():
            # The dataset labels are ignored; only the predicted class index is kept.
            for imgs, _ in tqdm(loader, desc=f"Predicting {model_key}"):
                imgs = imgs.to(device)
                logits = model(imgs).logits
                preds = torch.argmax(logits, dim=1).cpu().tolist()
                all_preds.extend(preds)

        # Recorded only after a complete pass, so a failed model leaves no partial entry.
        results[model_key] = all_preds

    except Exception as e:
        # Best effort: report the failure and move on to the next model. This
        # handler covers loading and inference, hence the neutral wording.
        print(f"Error evaluating model {model_key}: {e}")

    finally:
        # Runs on success, on failure and on `continue`, so the model reference
        # is always dropped and cached GPU memory is released before the next model.
        del model
        torch.cuda.empty_cache()

Evaluating siglip2_other on test set...


Predicting siglip2_other:   0%|          | 0/8 [00:00<?, ?it/s]

Evaluating siglip2_full on test set...


Predicting siglip2_full:   0%|          | 0/8 [00:00<?, ?it/s]

Evaluating mobilenet_other on test set...


Predicting mobilenet_other:   0%|          | 0/8 [00:00<?, ?it/s]

Evaluating mobilenet_full on test set...


Predicting mobilenet_full:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# One column per evaluated model, one row per test image.
df = pd.DataFrame(results)

# Print how often each model predicted each class index.
for model_name, predictions in df.items():
    print(f"Value counts for {model_name}:")
    print(predictions.value_counts())
    print()

Value counts for siglip2_other:
siglip2_other
1    360
0    140
Name: count, dtype: int64

Value counts for siglip2_full:
siglip2_full
1    265
0    235
Name: count, dtype: int64

Value counts for mobilenet_other:
mobilenet_other
0    264
1    236
Name: count, dtype: int64

Value counts for mobilenet_full:
mobilenet_full
1    302
0    198
Name: count, dtype: int64



In [7]:
# Keep only the rows where the models do not all predict the same class
# (more than one distinct value across the row).
disagreements = df[df.nunique(axis=1) > 1]
print(
    f"Number of rows with different predictions across models: {len(disagreements)}"
)

# Count how often each combination of conflicting predictions occurs.
pd.DataFrame(disagreements.value_counts().sort_index(ascending=False))

Number of rows with different predictions across models: 250


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
siglip2_other,siglip2_full,mobilenet_other,mobilenet_full,Unnamed: 4_level_1
1,1,1,0,3
1,1,0,1,71
1,1,0,0,11
1,0,1,1,19
1,0,1,0,21
1,0,0,1,27
1,0,0,0,49
0,1,1,1,11
0,1,1,0,2
0,1,0,1,5


In [8]:
# Restrict the comparison to the two "*_full" models.
full_df = df.loc[:, ["siglip2_full", "mobilenet_full"]]

# Rows where those two models disagree, counted per prediction pair.
full_disagreements = full_df.loc[full_df.nunique(axis=1) > 1]
pd.DataFrame(full_disagreements.value_counts().sort_index(ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
siglip2_full,mobilenet_full,Unnamed: 2_level_1
1,0,19
0,1,56


Since mobilenet_full had the best test results, its predictions are exported as the final labels.

In [9]:
# Write the mobilenet_full predictions to results.txt in CSV format; the
# DataFrame index becomes the first column, under the header "ID".
# NOTE(review): df was built without an explicit index, so the IDs presumably
# start at 0 while the test files start at img1 - confirm the expected ID convention.
df["mobilenet_full"].to_csv("results.txt", index_label="ID")