In [1]:
# Imports and settings

import os
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from concurrent.futures import ThreadPoolExecutor

In [2]:
# Load metadata CSV

df = pd.read_csv("balanced_metadata.csv")

# Filter for images that contain an animal
df = df[df["has_animal"] == 1].reset_index(drop=True)

image_path_prefix = "Orinoquia_Carma_Traps/orinoquia_camera_traps_images/public/"

df = df.iloc[:, 1:]

print(f"Dataframe length: {len(df)}")

df.head()

Dataframe length: 19794


Unnamed: 0,id,datetime,file_name,seq_id,frame_num,seq_num_frames,location,category_id,has_animal,species
0,N25_102EK113_04280033.JPG,2020-04-28 02:18:59+00:00,N25/102EK113/04280033.JPG,398e7044-7160-11ec-820f-5cf3706028c2,5,12,N25,3,1,collared_peccary
1,N27_102EK113_06210753.JPG,2020-06-21 15:23:08+00:00,N27/102EK113/06210753.JPG,396b00d8-7160-11ec-b898-5cf3706028c2,2,3,N27,2,1,black_agouti
2,M01_100EK113_02180585.JPG,2020-02-18 18:12:48+00:00,M01/100EK113/02180585.JPG,39731938-7160-11ec-a7c8-5cf3706028c2,8,30,M01,2,1,black_agouti
3,N09_100EK113_01120102.JPG,2020-01-12 12:39:51+00:00,N09/100EK113/01120102.JPG,39b31844-7160-11ec-be06-5cf3706028c2,2,6,N09,1,1,human
4,M04_100EK113_01220825.JPG,2020-01-22 21:43:45+00:00,M04/100EK113/01220825.JPG,39a50b1b-7160-11ec-9853-5cf3706028c2,5,6,M04,3,1,collared_peccary


In [3]:
# load image tensor as (N, C, H, W)
# N: image count = len(df) = 19,794
# C: channels = 3
# H: height = 224
# W: width = 224

TARGET_SIZE = (224, 224)  # For ResNet18
def load_image_for_resnet(full_path):
    with Image.open(full_path) as im:
        im = im.convert("RGB").resize(TARGET_SIZE)
        arr = np.asarray(im, dtype=np.uint8)

        # Convert (H, W, C) to (C, H, W)
        return np.transpose(arr, (2, 0, 1))

paths = (os.path.join(image_path_prefix, p) for p in df["file_name"])

with ThreadPoolExecutor(max_workers=os.cpu_count()) as exe:
    X = np.asarray(list(exe.map(load_image_for_resnet, paths)))

print("Image tensor shape:", X.shape)  # Should be (N, 3, 224, 224)

np.save("artifact/orinoquia_resnet18_imageset.npy", X)

Image tensor shape: (19794, 3, 224, 224)


In [4]:
# if artifact is saved
X = np.load("artifact/orinoquia_resnet18_imageset.npy")

In [5]:
display(X)

array([[[[ 20,  20,  20, ...,  19,  19,  18],
         [ 20,  20,  20, ...,  18,  19,  19],
         [ 20,  20,  20, ...,  20,  19,  20],
         ...,
         [184, 211, 215, ..., 255, 255, 255],
         [184, 211, 215, ..., 255, 255, 255],
         [180, 209, 215, ..., 235, 234, 233]],

        [[ 20,  20,  20, ...,  19,  19,  18],
         [ 20,  20,  20, ...,  18,  19,  19],
         [ 20,  20,  20, ...,  20,  19,  20],
         ...,
         [106,  98,  92, ..., 251, 251, 251],
         [106,  98,  93, ..., 253, 253, 253],
         [101,  97,  93, ..., 231, 230, 230]],

        [[ 20,  20,  20, ...,  19,  19,  18],
         [ 20,  20,  20, ...,  18,  19,  19],
         [ 20,  20,  20, ...,  20,  19,  20],
         ...,
         [ 76,  54,  43, ..., 255, 255, 255],
         [ 77,  54,  43, ..., 255, 255, 255],
         [ 71,  53,  43, ..., 235, 234, 234]]],


       [[[ 48,  87, 126, ..., 104, 148,  76],
         [ 39,  77, 104, ...,  99,  64,  40],
         [ 49,  70,  91, ..., 

In [34]:
from typing import Tuple

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from torchvision import transforms as T

# ---------- helpers ----------------------------------------------------------
# always-present tail of the pipeline
base_transform = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],  # ResNet-18 values
                std=[0.229, 0.224, 0.225]),
])

def merge_transforms(augment: T.Compose | None = None) -> T.Compose:
    """
    Return a Compose that is `augment` followed by the common base_transform.
    If `augment` is None, you just get the base_transform.
    """
    if augment is None:
        return base_transform
    return T.Compose(augment.transforms + base_transform.transforms)


# ---------- dataset ----------------------------------------------------------
class CameraTrapDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        images: np.ndarray,           # shape (N, 3, 224, 224)
        index: pd.Index,
        transform: T.Compose | None = None,
    ):
        self.df = df
        self.images = images
        self.index = index
        self.transform = merge_transforms(transform) if transform is not None else base_transform

    def __len__(self) -> int:
        return len(self.index)

    def __getitem__(self, idx: int):
        row_idx = self.index[idx]
        row = self.df.iloc[row_idx]

        img = self.images[row_idx] # ndarray (3, 224, 224)
        img = T.ToPILImage()(img.transpose((1, 2, 0)))
        img = self.transform(img) # tensor (3, 224, 224)

        label = torch.tensor(row["has_animal"], dtype=torch.float32) # ()

        return img, label

def make_train_val_index(
    df: pd.DataFrame, train_ratio: float = 0.9
) -> Tuple[pd.Index, pd.Index]:
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    split = int(train_ratio * len(shuffled))
    return shuffled.index[:split], shuffled.index[split:]

train_idx, val_idx = make_train_val_index(df)

train_ds = CameraTrapDataset(df, X, train_idx, transform=T.Compose([
    T.RandomHorizontalFlip(),
    T.RandomRotation(10),
]))

val_ds = CameraTrapDataset(df, X, val_idx, transform=None)

print(f"Training size: {len(train_ds)}")
print(f"Validation size: {len(val_ds)}")

Training size: 17814
Validation size: 1980


In [35]:
from time import time

t0 = time()
img, label = train_ds[0]
print("Single image load time:", time() - t0)

Single image load time: 0.002100229263305664


In [36]:
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=0)

imgs, labels = next(iter(train_dl))

# imgs (BATCH_SIZE, 3, 224, 224)
# labels (BATCH_SIZE)
# 0 = no animal, 1 = animal
print(imgs.shape, labels.shape, labels.unique())

torch.Size([32, 3, 224, 224]) torch.Size([32]) tensor([1.])


In [39]:
from torchvision import models
from tqdm import tqdm

LR = 1e-4

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using Device {DEVICE}")

resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# freeze all layers
for p in resnet.parameters():
    p.requires_grad = False

in_features = resnet.fc.in_features
resnet.fc = nn.Linear(in_features, 1)
resnet = resnet.to(DEVICE)

optimizer = optim.Adam(resnet.fc.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()

@torch.no_grad()
def run_eval_epoch(model, dataloader, criterion):
    model.eval()
    running_loss, correct, total = 0.0, 0, 0

    for imgs, labels in tqdm(dataloader, desc="Val  ", leave=False):
        imgs   = imgs.to(DEVICE)
        labels = labels.to(DEVICE).unsqueeze(1)

        logits = model(imgs)
        loss   = criterion(logits, labels)

        running_loss += loss.item() * imgs.size(0)
        preds = (logits.sigmoid() >= 0.5).int()
        correct += (preds == labels.int()).sum().item()
        total   += imgs.size(0)

    return running_loss / total, correct / total

Using Device cuda


In [40]:
EVAL_EVERY = 100          # batches
NUM_EPOCHS = 20

history = {
    "step":        [],
    "epoch":       [],
    "train_loss":  [],
    "val_loss":    [],
    "train_acc":   [],
    "val_acc":     [],
}

global_step = 0
for epoch in range(1, NUM_EPOCHS + 1):
    resnet.train()
    running_loss, correct, seen = 0.0, 0, 0

    pbar = tqdm(train_dl, desc=f"Epoch {epoch}", leave=False)
    for imgs, labels in pbar:
        global_step += 1

        imgs   = imgs.to(DEVICE)
        labels = labels.to(DEVICE).unsqueeze(1)

        optimizer.zero_grad()
        logits = resnet(imgs)
        loss   = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * imgs.size(0)
        preds   = (logits.sigmoid() >= 0.5).int()
        correct += (preds == labels.int()).sum().item()
        seen    += imgs.size(0)

        if global_step % EVAL_EVERY == 0:
            train_loss = running_loss / seen
            train_acc  = correct / seen

            val_loss, val_acc = run_eval_epoch(
                model=resnet,
                dataloader=val_dl,
                criterion=criterion,
            )

            history["step"].append(global_step)
            history["epoch"].append((epoch - 1) + seen / len(train_dl.dataset))
            history["train_loss"].append(train_loss)
            history["val_loss"].append(val_loss)
            history["train_acc"].append(train_acc)
            history["val_acc"].append(val_acc)

            pbar.set_postfix({
                "tr_loss": f"{train_loss:.3f}",
                "val_loss": f"{val_loss:.3f}",
                "val_acc": f"{val_acc:.3f}",
            })

    val_loss, val_acc = run_eval_epoch(
        model=resnet,
        dataloader=val_dl,
        criterion=criterion,
    )
    train_loss = running_loss / seen
    train_acc  = correct / seen

    history["step"].append(global_step)
    history["epoch"].append(epoch)
    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["train_acc"].append(train_acc)
    history["val_acc"].append(val_acc)

    print(f"[{epoch:02}/{NUM_EPOCHS}] "
          f"train loss {train_loss:.4f}  acc {train_acc:.3f} | "
          f"val loss {val_loss:.4f}  acc {val_acc:.3f}")

df_history = pd.DataFrame(history)

plt.figure(figsize=(6,4))
plt.plot(df_history["step"], df_history["train_loss"], label="Train Loss")
plt.plot(df_history["step"], df_history["val_loss"],   label="Val Loss")
plt.xlabel("Global step")
plt.ylabel("Loss")
plt.title("Training / Validation loss vs. step")
plt.legend(); plt.grid(True); plt.show()

                                                                                                        

[01/20] train loss 0.0520  acc 0.996 | val loss 0.0054  acc 1.000


                                                                                                        

[02/20] train loss 0.0030  acc 1.000 | val loss 0.0018  acc 1.000


                                                                                                        

[03/20] train loss 0.0013  acc 1.000 | val loss 0.0009  acc 1.000


                                                                                                        

[04/20] train loss 0.0007  acc 1.000 | val loss 0.0006  acc 1.000


                                                                                                        

[05/20] train loss 0.0005  acc 1.000 | val loss 0.0004  acc 1.000


                                                                                                        

KeyboardInterrupt: 