<a href="https://colab.research.google.com/github/Stdunson/InsectDetectionProject/blob/main/InsectThings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steps for Bioscan Script:
1. Download data
2. Remove the DNA and BIN data, remove all where family is not present
3. Load ResNet
4. Split Sets
5. Train, Validate
6. Test and Gather Data

# Steps for Insect Foundation Script:
1. Download data
2. Parse JSON File, remove all where family is not present
3. Load ResNet
4. Split Sets
5. Train, Validate
6. Test and Gather Data

# Inconsistencies
1. Not every image has every taxonomic level, so missing levels have to be filtered out while gathering the data. The lowest level that every image consistently has is order, which is very coarse, so we are going to focus on family-level classification instead.
2. One dataset is JSON and the other is raw images. This shouldn't matter too much because I'm going to have two different training algorithms.



In [4]:
#Installs
# Third-party packages fetched into the runtime with pip before the import cells run.
# gdown is imported in the basic-imports cell.
!pip3 install gdown
# NOTE(review): nothing in the visible cells imports `wget` or `utils` — confirm they are still needed.
!pip3 install wget
!pip3 install utils
# Provides the `bioscan_dataset` module imported in the data-downloading cell.
!pip3 install bioscan-dataset



In [5]:
#Basic Imports
import os
import itertools
import gdown

#Pytorch
import torch
import torchvision
from torchvision import models
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim

#Resnet
from torchvision.models import resnet50, ResNet50_Weights


# Data Downloading

In [6]:
#Bioscan
from bioscan_dataset import BIOSCAN5M

# Preprocessing pipeline that ships with the pre-trained ResNet50 weights
# (ResNet50_Weights.IMAGENET1K_V2.transforms()).
transform = ResNet50_Weights.IMAGENET1K_V2.transforms()

# Options common to all three splits: image modality only, family-level targets
# returned as integer indices, and the ResNet50 preprocessing applied to each image.
_bioscan_common = dict(
    download=True,
    modality="image",
    target_type="family",
    target_format="index",
    transform=transform,
)

# The splits are created in the order train -> test -> val.
bioscan_dataset_train = BIOSCAN5M("~/Datasets/bioscan-5m", split="train", **_bioscan_common)
bioscan_dataset_test = BIOSCAN5M("~/Datasets/bioscan-5m", split="test", **_bioscan_common)
bioscan_dataset_val = BIOSCAN5M("~/Datasets/bioscan-5m", split="val", **_bioscan_common)
# NOTE(review): the original note says the class name for an integer target can be
# looked up via `idx_to_class` — confirm the exact attribute against the package docs.

#Information about bioscan_dataset: https://github.com/bioscan-ml/dataset

File missing: /root/Datasets/bioscan-5m/bioscan5m/metadata/csv/BIOSCAN_5M_Insect_Dataset_metadata.csv


100%|██████████| 2.07G/2.07G [00:21<00:00, 98.1MB/s]


Image directory missing: /root/Datasets/bioscan-5m/bioscan5m/images/cropped_256


100%|██████████| 2.22G/2.22G [00:23<00:00, 95.1MB/s]


Metadata CSV file already downloaded and verified
Directory missing: /root/Datasets/bioscan-5m/bioscan5m/images/cropped_256/test


100%|██████████| 1.47G/1.47G [00:20<00:00, 72.1MB/s]


Metadata CSV file already downloaded and verified
Images already downloaded and verified


# Bioscan Things

In [7]:
#Initialize Resnet
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load ResNet50 with the IMAGENET1K_V2 pre-trained weights, then move it to the chosen device.
bioscan_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
bioscan_model = bioscan_model.to(device)
print(f"ResNet50 model loaded on device: {device}")

Using device: cuda
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 225MB/s]


ResNet50 model loaded on device: cuda


In [8]:
#Data loaders

BATCH_SIZE = 32
NUM_WORKERS = 2

# Settings shared by every loader.
_loader_options = {"batch_size": BATCH_SIZE, "num_workers": NUM_WORKERS}

# Training loader: the only one that shuffles its samples.
train_dataloader = DataLoader(bioscan_dataset_train, shuffle=True, **_loader_options)

# Validation loader: fixed order.
val_dataloader = DataLoader(bioscan_dataset_val, shuffle=False, **_loader_options)

# Test loader: fixed order.
test_dataloader = DataLoader(bioscan_dataset_test, shuffle=False, **_loader_options)

print(f"Created DataLoader with batch size: {BATCH_SIZE}")

# Sanity check: pull a single training batch and report its sizes, then stop.
for images, labels in train_dataloader:
    print(f"Batch image shape: {images.shape}")
    print(f"Batch label shape: {len(labels)}")
    break

Created DataLoader with batch size: 32
Batch image shape: torch.Size([32, 3, 224, 224])
Batch label shape: 32


In [9]:
#Replace the classification head and freeze the ResNet50 backbone

# Number of output classes for the new head.
# NOTE(review): hard-coded; presumably the number of distinct family labels in the
# dataset — confirm against the dataset rather than trusting this constant.
num_classes = 934

#Get the number of input features of the last (fully connected) layer
in_features = bioscan_model.fc.in_features
print(f"Original ResNet50 FC layer in_features: {in_features}")

#Replace the last layer. The fresh layer is placed on `device` immediately: the rest of
#the model was moved there when it was loaded, and without this the head would sit on
#the CPU (mixed-device model) until some later cell happened to call .to(device).
bioscan_model.fc = nn.Linear(in_features, num_classes).to(device)
print(f"Replaced bioscan_model.fc with new nn.Linear layer (in_features={in_features}, out_features={num_classes})")

#Freeze every parameter of the model, then unfreeze only the new classification head
for param in bioscan_model.parameters():
    param.requires_grad = False
print("Froze all parameters of the base ResNet50 model.")
for param in bioscan_model.fc.parameters():
    param.requires_grad = True
print("Unfroze parameters of the new classification head (bioscan_model.fc).")

#Verify: only the head's weight and bias should be listed as trainable
trainable_params = [name for name, param in bioscan_model.named_parameters() if param.requires_grad]
print(f"Number of trainable parameters: {len(trainable_params)}")
print(f"Trainable layers: {trainable_params}")

Original ResNet50 FC layer in_features: 2048
Replaced bioscan_model.fc with new nn.Linear layer (in_features=2048, out_features=934)
Froze all parameters of the base ResNet50 model.
Unfroze parameters of the new classification head (bioscan_model.fc).
Number of trainable parameters: 2
Trainable layers: ['fc.weight', 'fc.bias']


In [10]:
#Define loss function & optimizer

# Cross-entropy loss applied to the model's class scores and the integer targets.
criterion = nn.CrossEntropyLoss()
print(f"Loss function (criterion) set to: {criterion}")

# Adam only receives the parameters of the classification head (bioscan_model.fc).
head_parameters = bioscan_model.fc.parameters()
optimizer = optim.Adam(head_parameters, lr=0.001)
print(f"Optimizer set to: {optimizer}")

Loss function (criterion) set to: CrossEntropyLoss()
Optimizer set to: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


In [14]:
#Define training loop

# Phase name -> DataLoader lookup; train_model reads the 'train' and 'validation' entries.
dataloaders = dict(
    train=train_dataloader,
    validation=val_dataloader,
    test=test_dataloader,
)

def train_model(model, criterion, optimizer, num_epochs=10, loaders=None, target_device=None):
    """Run a train/validation loop over `model` and return it.

    Each epoch consists of a 'train' phase (model in train mode, weights
    updated) followed by a 'validation' phase (model in eval mode, autograd
    disabled, no weight updates). The sample-weighted mean loss and the
    accuracy of each phase are printed.

    Args:
        model: the module to train; it is modified in place.
        criterion: callable taking (outputs, labels) and returning a scalar loss.
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of train+validation passes.
        loaders: mapping with 'train' and 'validation' entries, each an
            iterable of (inputs, labels) batches exposing a `.dataset` with a
            length. Defaults to the module-level `dataloaders`.
        target_device: device the batches are moved to. Defaults to the
            module-level `device`.

    Returns:
        The same `model` object, left in eval mode by the final validation phase.
    """
    if loaders is None:
        loaders = dataloaders
    if target_device is None:
        target_device = device

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        for phase in ['train', 'validation']:
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            # Plain Python int so the accuracy computation below also works
            # when a loader yields no batches.
            running_corrects = 0

            for inputs, labels in loaders[phase]:
                inputs = inputs.to(target_device)
                labels = labels.to(target_device).long() # Cast labels to torch.long for the loss

                # Only record an autograd graph while training; validation
                # needs the forward pass alone.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                if is_training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                # The loss is weighted by batch size so the epoch figure is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            num_samples = len(loaders[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            print('{} loss: {:.4f}, acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

    return model

In [15]:
#Train Model
# Ensure the whole model is on `device`, then run 10 epochs of training + validation.
bioscan_model = train_model(bioscan_model.to(device), criterion, optimizer, num_epochs=10)

Epoch 1/10
----------
train loss: 1.3305, acc: 0.6941
validation loss: 0.8976, acc: 0.7747
Epoch 2/10
----------
train loss: 0.8288, acc: 0.7840
validation loss: 0.8230, acc: 0.7920
Epoch 3/10
----------
train loss: 0.7095, acc: 0.8090
validation loss: 0.8149, acc: 0.7928
Epoch 4/10
----------
train loss: 0.6485, acc: 0.8225
validation loss: 0.8127, acc: 0.7975
Epoch 5/10
----------
train loss: 0.6086, acc: 0.8311
validation loss: 0.8342, acc: 0.7920
Epoch 6/10
----------
train loss: 0.5818, acc: 0.8371
validation loss: 0.8282, acc: 0.7909
Epoch 7/10
----------
train loss: 0.5603, acc: 0.8416
validation loss: 0.8274, acc: 0.7983
Epoch 8/10
----------
train loss: 0.5470, acc: 0.8439
validation loss: 0.8668, acc: 0.7922
Epoch 9/10
----------
train loss: 0.5332, acc: 0.8477
validation loss: 0.8506, acc: 0.7941
Epoch 10/10
----------
train loss: 0.5271, acc: 0.8486
validation loss: 0.8616, acc: 0.7958


In [18]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the save cell visible in this notebook writes to a relative 'models/'
# path rather than under /content/drive — confirm how the mounted Drive is used.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [20]:
#Save Model
# NOTE(review): this path is relative to the notebook's working directory, not the
# mounted Drive — confirm that is the intended location for the weights.
model_path = 'models/bioscan_model.pth'
# Create the target directory if needed so saving does not depend on a separate mkdir cell.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Only the state dict (parameters and buffers) is written, not the full module object.
torch.save(bioscan_model.state_dict(), model_path)

In [21]:
#Define evaluation/testing loop
def evaluate_model(model, dataloader):
    """Evaluate `model` on `dataloader` and print the mean loss and accuracy.

    The model is switched to eval mode and the whole pass runs with autograd
    disabled. The loss function and the device are taken from the
    module-level `criterion` and `device`. Nothing is returned; the result is
    printed as 'Test loss: ..., acc: ...'.

    Args:
        model: the module to evaluate.
        dataloader: iterable of (inputs, labels) batches exposing a `.dataset`
            with a length, used as the denominator for both metrics.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            # The labels are cast to torch.long before being handed to the loss.
            batch_labels = batch_labels.to(device).long()

            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels)
            predicted = torch.max(logits, 1)[1]

            # Weight the batch loss by its size so the final figure is a per-sample mean.
            total_loss += batch_loss.item() * batch_inputs.size(0)
            total_correct += torch.sum(predicted == batch_labels)

    dataset_size = len(dataloader.dataset)
    epoch_loss = total_loss / dataset_size
    epoch_acc = total_correct.double() / dataset_size

    print('Test loss: {:.4f}, acc: {:.4f}'.format(epoch_loss, epoch_acc))




In [22]:
#Evaluate Model

# Re-detect the device in this cell: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Evaluate the trained model on the held-out test loader.
model = bioscan_model.to(device)
dataloader = test_dataloader

evaluate_model(model, dataloader)

Test loss: 1.3684, acc: 0.6922
