Classify:

In [None]:
.\venv\Scripts\activate
python -m speciesnet.scripts.run_model --folders ".\test_images\" --predictions_json "results/output_speciesnet_pl.json" --country POL

In [1]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch
from ultralytics import YOLO
from PIL import Image
from torch import tensor
from torchvision.transforms import transforms, InterpolationMode

#### One image checks

In [2]:
DFYOLO_NAME = 'deepfaune-yolov8s_960'
DFYOLO_PATH = '../deepfaune/models/'
DFYOLO_WEIGHTS = DFYOLO_PATH + 'deepfaune-yolov8s_960.pt'
SPECIESNET_PATH = 'models/speciesnet-pytorch-v4.0.1a-v1/'
SPECIESNET_MODEL = SPECIESNET_PATH + 'always_crop_99710272_22x8_v12_epoch_00148.pt'
BACKBONE = 'efficientnetv2_m'  # not neccessary

LABELS_FILE = 'models/speciesnet-pytorch-v4.0.1a-v1/always_crop_99710272_22x8_v12_epoch_00148.labels.txt'

In [3]:
CROP_SIZE = 480 # default 480??
DFYOLO_WIDTH = 960 # image width
DFYOLO_THRES = 0.6
DFYOLOHUMAN_THRES = 0.4 # boxes with human above this threshold are saved
DFYOLOCOUNT_THRES = 0.6


with open(LABELS_FILE, "r", encoding="utf-8") as f:
    txt_animalclasses = [line.strip() for line in f if line.strip()]


class Detector:
    def __init__(self, name=DFYOLO_NAME, threshold=None, countthreshold=None, humanthreshold=None):
        print("Using "+DFYOLO_NAME+" with weights at "+DFYOLO_WEIGHTS+", in resolution 960x960")
        self.yolo = YOLO(DFYOLO_WEIGHTS)
        self.imgsz = DFYOLO_WIDTH
        self.threshold = DFYOLO_THRES if threshold is None else threshold
        self.countthreshold = DFYOLOCOUNT_THRES if countthreshold is None else countthreshold
        self.humanthreshold = DFYOLOHUMAN_THRES if humanthreshold is None else humanthreshold

    def bestBoxDetection(self, filename_or_imagecv):
        try:
            results = self.yolo(filename_or_imagecv, verbose=False, imgsz=self.imgsz)
        except FileNotFoundError:
            print(f"File '{filename_or_imagecv}' not found")
            return None, 0, np.zeros(4), 0, []
        except Exception as err:
            print(err)
            return None, 0, np.zeros(4), 0, []

        # orig_img a numpy array (cv2) in BGR
        imagecv = results[0].cpu().orig_img
        detection = results[0].cpu().numpy().boxes

        # Are there any relevant boxes?
        if not len(detection.cls) or detection.conf[0] < self.threshold:
            # No. Image considered as empty
            return None, 0, np.zeros(4), 0, []
        else:
            # Yes. Non empty image
            pass
        # Is there a relevant animal box? 
        try:
            # Yes. Selecting the best animal box
            kbox = np.where((detection.cls==0) & (detection.conf>self.threshold))[0][0]
        except IndexError:
            # No: Selecting the best box for another category (human, vehicle)
            kbox = 0

        # categories are 1=animal, 2=person, 3=vehicle and the empty category 0=empty
        category = int(detection.cls[kbox]) + 1
        box = detection.xyxy[kbox] # xmin, ymin, xmax, ymax

        # Is this an animal box ?
        if category == 1:
            # Yes: cropped image is required for classification
            croppedimage = cropSquareCVtoPIL(imagecv, box.copy())
        else: 
            # No: return none
            return None, 0, np.zeros(4), 0, []
        
        ## animal count
        if category == 1:
            count = sum((detection.conf>self.countthreshold) & (detection.cls==0)) # only above a threshold
        else:
            count = 0
        ## human boxes
        ishuman = (detection.cls==1) & (detection.conf>=self.humanthreshold)
        if any(ishuman==True):
            humanboxes = detection.xyxy[ishuman,]
        else:
            humanboxes = []

        return croppedimage, category, box, count, humanboxes


def cropSquareCVtoPIL(imagecv, box):
    x1, y1, x2, y2 = box
    xsize = (x2-x1)
    ysize = (y2-y1)
    if xsize>ysize:
        y1 = y1-int((xsize-ysize)/2)
        y2 = y2+int((xsize-ysize)/2)
    if ysize>xsize:
        x1 = x1-int((ysize-xsize)/2)
        x2 = x2+int((ysize-xsize)/2)
    height, width, _ = imagecv.shape
    croppedimagecv = imagecv[max(0,int(y1)):min(int(y2),height),max(0,int(x1)):min(int(x2),width)]
    croppedimage = Image.fromarray(croppedimagecv[:,:,(2,1,0)]) # converted to PIL BGR image
    return croppedimage


class Classifier:

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = torch.load(SPECIESNET_MODEL, map_location=self.device, weights_only=False)
        self.model.eval()
        print(f'Speciesnet loaded onto {self.device}')

        # transform image to form usable by network
        self.transforms = transforms.Compose([
            transforms.Resize(size=(CROP_SIZE, CROP_SIZE), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor()
            # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def predictOnBatch(self, batchtensor, withsoftmax=True):
        batchtensor = batchtensor.to(self.device)
        with torch.no_grad():
            logits = self.model(batchtensor)
            preds = logits.softmax(dim=1) if withsoftmax else logits
        return preds.cpu().numpy()

    def preprocessImage(self, croppedimage):
        return self.transforms(croppedimage).unsqueeze(dim=0)  # batch dimension

In [18]:
images = pd.read_csv('../y_clean.csv', index_col=0)
images.RelativePath = images.RelativePath.str.replace('\\', '/')
image = images.iloc[1000,:]
image_path = '../../pictures/' + image.RelativePath + '/' + image.File

In [19]:
image

File            2023-07-22 19-22-33.JPG
RelativePath     02_WYSZOWATKA/B/Lato/2
species                         roedeer
Name: 1000, dtype: object

In [20]:
detector = Detector()
classifier = Classifier()

cropped_img, category, box, count, humanboxes = detector.bestBoxDetection(image_path)
batch = classifier.preprocessImage(cropped_img)
batch = batch.permute(0, 2, 3, 1).contiguous()
preds = classifier.predictOnBatch(batch)
top_idx = np.argmax(preds)
print(f"Predicted class: {txt_animalclasses[top_idx]} (confidence {preds[0][top_idx]:.3f})")

Using deepfaune-yolov8s_960 with weights at ../deepfaune/models/deepfaune-yolov8s_960.pt, in resolution 960x960
Predicted class: 317171d7-d306-4e71-9a4a-33e62012076b;mammalia;cetartiodactyla;cervidae;capreolus;capreolus;european roe deer (confidence 0.910)


#### PÄ™tla klasyfikacji

##### Parallel

In [4]:
class Classifier:

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = torch.load(SPECIESNET_MODEL, map_location=self.device, weights_only=False)
        self.model.eval()
        self.model.half()
        print(f'Speciesnet loaded onto {self.device}')

        # transform image to form usable by network
        self.transforms = transforms.Compose([
            transforms.Resize(size=(CROP_SIZE, CROP_SIZE), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor()
            # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def predictOnBatch(self, batch_tensor, withsoftmax=True):
        batch_tensor = batch_tensor.to(self.device).half()
        with torch.no_grad():
            logits = self.model(batch_tensor)
            preds = logits.softmax(dim=1) if withsoftmax else logits
        return preds.cpu().numpy()

    def preprocessImage(self, croppedimage):
        return self.transforms(croppedimage).unsqueeze(dim=0)  # batch dimension

In [5]:
from datetime import datetime
detector = Detector()
classifier = Classifier()

Using deepfaune-yolov8s_960 with weights at ../deepfaune/models/deepfaune-yolov8s_960.pt, in resolution 960x960
Speciesnet loaded onto cuda


In [8]:
from torch.utils.data import DataLoader, Dataset


def collate_non_null(batch):
    """Collate function that filters out None entries."""
    return [item for item in batch if item[0] is not None]

class ImageDataset(Dataset):
    def __init__(self, dataframe, transform):
        self.df = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = '../../pictures/' + row['RelativePath'] + '/' + row['File']
        croppedimage, _, _, _, _ = detector.bestBoxDetection(image_path)
        if croppedimage is None:
            return None, image_path
        return self.transform(croppedimage), image_path

images = pd.read_csv('../y_clean.csv', index_col=0)
images.RelativePath = images.RelativePath.str.replace('\\', '/')

# build dataset & dataloader
dataset = ImageDataset(images, classifier.transforms)
loader = DataLoader(dataset, batch_size=200, num_workers=0, pin_memory=True, collate_fn=collate_non_null)

In [None]:
results = pd.DataFrame({'image': [], 'detected_animal': [], 'confidence': []})
for batch in loader:
    if len(batch) == 0:
        continue
    tensors, paths = zip(*batch)
    batch_tensor = torch.stack(tensors).to(classifier.device)
    batch_tensor = batch_tensor.permute(0, 2, 3, 1).contiguous()
    preds = classifier.predictOnBatch(batch_tensor)
    top_idx = np.argmax(preds, axis=1)
    for path, idx, conf in zip(paths, top_idx, preds[range(len(preds)), top_idx]):
        output = txt_animalclasses[idx].rsplit(';')[-1]
        results.loc[len(results)] = [path, output, conf]

In [None]:
results

In 5:30: 1250 images xD

##### Series

In [None]:
CROP_SIZE = 480 # default 480??
DFYOLO_WIDTH = 960 # image width
DFYOLO_THRES = 0.6
DFYOLOHUMAN_THRES = 0.4 # boxes with human above this threshold are saved
DFYOLOCOUNT_THRES = 0.6


with open(LABELS_FILE, "r", encoding="utf-8") as f:
    txt_animalclasses = [line.strip() for line in f if line.strip()]


class Detector:
    def __init__(self, name=DFYOLO_NAME, threshold=None, countthreshold=None, humanthreshold=None):
        print("Using "+DFYOLO_NAME+" with weights at "+DFYOLO_WEIGHTS+", in resolution 960x960")
        self.yolo = YOLO(DFYOLO_WEIGHTS)
        self.imgsz = DFYOLO_WIDTH
        self.threshold = DFYOLO_THRES if threshold is None else threshold
        self.countthreshold = DFYOLOCOUNT_THRES if countthreshold is None else countthreshold
        self.humanthreshold = DFYOLOHUMAN_THRES if humanthreshold is None else humanthreshold

    def bestBoxDetection(self, filename_or_imagecv):
        try:
            results = self.yolo(filename_or_imagecv, verbose=False, imgsz=self.imgsz)
        except FileNotFoundError:
            print(f"File '{filename_or_imagecv}' not found")
            return None, 0, np.zeros(4), 0, []
        except Exception as err:
            print(err)
            return None, 0, np.zeros(4), 0, []

        # orig_img a numpy array (cv2) in BGR
        imagecv = results[0].cpu().orig_img
        detection = results[0].cpu().numpy().boxes

        # Are there any relevant boxes?
        if not len(detection.cls) or detection.conf[0] < self.threshold:
            # No. Image considered as empty
            return None, 0, np.zeros(4), 0, []
        else:
            # Yes. Non empty image
            pass
        # Is there a relevant animal box? 
        try:
            # Yes. Selecting the best animal box
            kbox = np.where((detection.cls==0) & (detection.conf>self.threshold))[0][0]
        except IndexError:
            # No: Selecting the best box for another category (human, vehicle)
            kbox = 0

        # categories are 1=animal, 2=person, 3=vehicle and the empty category 0=empty
        category = int(detection.cls[kbox]) + 1
        box = detection.xyxy[kbox] # xmin, ymin, xmax, ymax

        # Is this an animal box ?
        if category == 1:
            # Yes: cropped image is required for classification
            croppedimage = cropSquareCVtoPIL(imagecv, box.copy())
        else: 
            # No: return none
            return None, 0, np.zeros(4), 0, []
        
        ## animal count
        if category == 1:
            count = sum((detection.conf>self.countthreshold) & (detection.cls==0)) # only above a threshold
        else:
            count = 0
        ## human boxes
        ishuman = (detection.cls==1) & (detection.conf>=self.humanthreshold)
        if any(ishuman==True):
            humanboxes = detection.xyxy[ishuman,]
        else:
            humanboxes = []

        return croppedimage, category, box, count, humanboxes


def cropSquareCVtoPIL(imagecv, box):
    x1, y1, x2, y2 = box
    xsize = (x2-x1)
    ysize = (y2-y1)
    if xsize>ysize:
        y1 = y1-int((xsize-ysize)/2)
        y2 = y2+int((xsize-ysize)/2)
    if ysize>xsize:
        x1 = x1-int((ysize-xsize)/2)
        x2 = x2+int((ysize-xsize)/2)
    height, width, _ = imagecv.shape
    croppedimagecv = imagecv[max(0,int(y1)):min(int(y2),height),max(0,int(x1)):min(int(x2),width)]
    croppedimage = Image.fromarray(croppedimagecv[:,:,(2,1,0)]) # converted to PIL BGR image
    return croppedimage


class Classifier:

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = torch.load(SPECIESNET_MODEL, map_location=self.device, weights_only=False)
        self.model.eval()
        print(f'Speciesnet loaded onto {self.device}')

        # transform image to form usable by network
        self.transforms = transforms.Compose([
            transforms.Resize(size=(CROP_SIZE, CROP_SIZE), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor()
            # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def predictOnBatch(self, batchtensor, withsoftmax=True):
        batchtensor = batchtensor.to(self.device)
        with torch.no_grad():
            logits = self.model(batchtensor)
            preds = logits.softmax(dim=1) if withsoftmax else logits
        return preds.cpu().numpy()

    def preprocessImage(self, croppedimage):
        return self.transforms(croppedimage).unsqueeze(dim=0)  # batch dimension


detector = Detector()
classifier = Classifier()

Using deepfaune-yolov8s_960 with weights at ../deepfaune/models/deepfaune-yolov8s_960.pt, in resolution 960x960
Speciesnet loaded onto cuda


In [6]:
images = pd.read_csv('../y_clean.csv', index_col=0)
images.RelativePath = images.RelativePath.str.replace('\\', '/')

In [None]:
results = pd.DataFrame({'image': [], 'detected_animal': [], 'confidence': []})
currently_classifying = images.iloc[0,1].rsplit('/')[0]
print('Currently: ' + currently_classifying)

for _, row in images.iterrows():
    if currently_classifying != row['RelativePath'].rsplit('/')[0]:
        currently_classifying = row['RelativePath'].rsplit('/')[0]
        print('Currently: ' + currently_classifying)
    image_path = '../../pictures/' + row['RelativePath'] + '/' + row['File']
    cropped_img, category, box, count, humanboxes = detector.bestBoxDetection(image_path)

    if cropped_img is not None:
        batch = classifier.preprocessImage(cropped_img)
        batch = batch.permute(0, 2, 3, 1).contiguous()
        preds = classifier.predictOnBatch(batch)
        top_idx = np.argmax(preds)
        output = txt_animalclasses[top_idx].rsplit(';')[-1]

        results.loc[len(results)] = [image_path, output, preds[0][top_idx]]
    else:
        results.loc[len(results)] = [image_path, 'empty', 0]

now = datetime.now().strftime('%Y_%m_%d_%H_%M')
results.to_csv('results/results_speciesnet_' + now + '.csv')

In 5:30 minutes: 1925

Total: 175 minutes

In [None]:
now = datetime.now().strftime('%Y_%m_%d_%H_%M')
results.to_csv('results/results_speciesnet_' + now + '.csv')

##### Series, no YOLO

(doesn't work, so probably the detector should be before the classifier)

In [10]:
DFYOLO_NAME = 'deepfaune-yolov8s_960'
DFYOLO_PATH = '../deepfaune/models/'
DFYOLO_WEIGHTS = DFYOLO_PATH + 'deepfaune-yolov8s_960.pt'
SPECIESNET_PATH = 'models/speciesnet-pytorch-v4.0.1a-v1/'
SPECIESNET_MODEL = SPECIESNET_PATH + 'always_crop_99710272_22x8_v12_epoch_00148.pt'
BACKBONE = 'efficientnetv2_m'  # not neccessary

LABELS_FILE = 'models/speciesnet-pytorch-v4.0.1a-v1/always_crop_99710272_22x8_v12_epoch_00148.labels.txt'

CROP_SIZE = 480 # default 480??
DFYOLO_WIDTH = 960 # image width
DFYOLO_THRES = 0.6
DFYOLOHUMAN_THRES = 0.4 # boxes with human above this threshold are saved
DFYOLOCOUNT_THRES = 0.6


with open(LABELS_FILE, "r", encoding="utf-8") as f:
    txt_animalclasses = [line.strip() for line in f if line.strip()]


class Classifier:

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = torch.load(SPECIESNET_MODEL, map_location=self.device, weights_only=False)
        self.model.eval()
        print(f'Speciesnet loaded onto {self.device}')

        # transform image to form usable by network
        self.transforms = transforms.Compose([
            transforms.Resize(size=(CROP_SIZE, CROP_SIZE), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor()
            # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def predictOnBatch(self, batchtensor, withsoftmax=True):
        batchtensor = batchtensor.to(self.device)
        with torch.no_grad():
            logits = self.model(batchtensor)
            preds = logits.softmax(dim=1) if withsoftmax else logits
        return preds.cpu().numpy()

    def preprocessImage(self, image):
        return self.transforms(image).unsqueeze(dim=0)  # batch dimension


classifier = Classifier()

Speciesnet loaded onto cuda


In [11]:
images = pd.read_csv('../y_clean.csv', index_col=0)
images.RelativePath = images.RelativePath.str.replace('\\', '/')

In [12]:
results = pd.DataFrame({'image': [], 'detected_animal': [], 'confidence': []})
currently_classifying = images.iloc[0,1].rsplit('/')[0]
print('Currently: ' + currently_classifying)

for _, row in images.iterrows():
    if currently_classifying != row['RelativePath'].rsplit('/')[0]:
        currently_classifying = row['RelativePath'].rsplit('/')[0]
        print('Currently: ' + currently_classifying)
    image_path = '../../pictures/' + row['RelativePath'] + '/' + row['File']
    img = Image.open(image_path).convert("RGB")

    batch = classifier.preprocessImage(img)
    # batch = batch.permute(0, 2, 3, 1).contiguous()
    preds = classifier.predictOnBatch(batch)
    top_idx = np.argmax(preds)
    output = txt_animalclasses[top_idx].rsplit(';')[-1]

    results.loc[len(results)] = [image_path, output, preds[0][top_idx]]

now = datetime.now().strftime('%Y_%m_%d_%H_%M')
results.to_csv('results/results_speciesnet_' + now + '.csv')

Currently: 01_CZARNE


RuntimeError: Given groups=1, weight of size [24, 3, 3, 3], expected input[1, 480, 4, 481] to have 3 channels, but got 480 channels instead

#### Concat

In [18]:
d1 = pd.read_csv('results/results_speciesnet_2025_11_05_20_10.csv', index_col=0)[:29000]
d2 = pd.read_csv('results/results_speciesnet_2025_11_05_22_27.csv', index_col=0)

In [23]:
dc = pd.concat([d1, d2], ignore_index=True)
dc.to_csv('results/results_speciesnet_2025_11_06_7_52.csv')