<a href="https://colab.research.google.com/github/Stdunson/InsectDetectionProject/blob/main/InsectThings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steps for Bioscan Script:
1. Download data
2. Remove the DNA and BIN data, remove all where family is not present
3. Load ResNet
4. Split Sets
5. Train, Validate
6. Test and Gather Data

# Steps for Insect Foundation Script:
1. Download data
2. Parse JSON File, remove all where family is not present
3. Load ResNet
4. Split Sets
5. Train, Validate
6. Test and Gather Data

# Inconsistencies
1. Not every image is labelled at every taxonomic level, so unlabelled images have to be filtered out while the data is gathered. The lowest level that every image consistently has is order, which is really high, so we're going to focus on family-level classification.
2. One dataset is JSON and the other is raw images. This shouldn't matter too much because I'm going to have two different training algorithms.



In [7]:
#Installs
!pip3 install gdown
!pip3 install wget
!pip3 install utils
!pip3 install bioscan-dataset



In [8]:
#Basic Imports
import os
import itertools
import gdown

#Pytorch
import torch
import torchvision
from torchvision import models
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim

#Resnet
from torchvision.models import resnet50, ResNet50_Weights


# Data Downloading

In [9]:
# Bioscan: build the train / test / val image datasets labelled at family level.
# Information about bioscan_dataset: https://github.com/bioscan-ml/dataset
from bioscan_dataset import BIOSCAN5M

# Preprocessing recommended for the pre-trained ResNet50 (shipped with its weights)
transform = ResNet50_Weights.IMAGENET1K_V2.transforms()

# Arguments that are identical for every split.
# NOTE(review): target_format="text" presumably yields string labels; they would need
# mapping to integer class indices before being passed to nn.CrossEntropyLoss — confirm.
bioscan_kwargs = dict(
    download=True,
    modality="image",
    target_type="family",
    target_format="text",
    transform=transform,
)

# One dataset object per split, created in the order train, test, val
bioscan_dataset_train, bioscan_dataset_test, bioscan_dataset_val = (
    BIOSCAN5M("~/Datasets/bioscan-5m", split=split_name, **bioscan_kwargs)
    for split_name in ("train", "test", "val")
)

Metadata CSV file already downloaded and verified
Images already downloaded and verified
Metadata CSV file already downloaded and verified
Images already downloaded and verified
Metadata CSV file already downloaded and verified
Images already downloaded and verified


# Bioscan Things

In [10]:
# Create the ResNet50 backbone, starting from the IMAGENET1K_V2 pre-trained weights
bioscan_model = models.resnet50(
    weights=ResNet50_Weights.IMAGENET1K_V2,
)

In [39]:
# Data loaders: one per split, all sharing the same batch size and worker count

BATCH_SIZE = 32
NUM_WORKERS = 2

# Settings common to all three loaders
loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": NUM_WORKERS}

# Only the training loader shuffles; validation and test keep the dataset order
train_dataloader = DataLoader(bioscan_dataset_train, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(bioscan_dataset_val, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(bioscan_dataset_test, shuffle=False, **loader_kwargs)

print(f"Created DataLoader with batch size: {BATCH_SIZE}")

# Sanity check: pull exactly one batch from the training loader and report its size
for images, labels in itertools.islice(train_dataloader, 1):
    print(f"Batch image shape: {images.shape}")
    print(f"Batch label shape: {len(labels)}")

Created DataLoader with batch size: 32
Batch image shape: torch.Size([32, 3, 224, 224])
Batch label shape: 32


In [38]:
# Adapt ResNet50 for this task: install a new output layer and leave only it trainable

# Number of output classes for the new head
# NOTE(review): 934 is presumably the number of families in the dataset — confirm.
num_classes = 934

# Input width of the existing final (fc) layer
in_features = bioscan_model.fc.in_features
print(f"Original ResNet50 FC layer in_features: {in_features}")

# Swap the final layer for a fresh linear layer with num_classes outputs
bioscan_model.fc = nn.Linear(in_features, num_classes)
print(f"Replaced bioscan_model.fc with new nn.Linear layer (in_features={in_features}, out_features={num_classes})")

# Disable gradients for every parameter of the model ...
bioscan_model.requires_grad_(False)
print("Froze all parameters of the base ResNet50 model.")
# ... then re-enable them for the classification head only
bioscan_model.fc.requires_grad_(True)
print("Unfroze parameters of the new classification head (bioscan_model.fc).")

# Verify: collect the names of the parameters that still require gradients
trainable_params = []
for name, param in bioscan_model.named_parameters():
    if param.requires_grad:
        trainable_params.append(name)
print(f"Number of trainable parameters: {len(trainable_params)}")
print(f"Trainable layers: {trainable_params}")

Original ResNet50 FC layer in_features: 2048
Replaced bioscan_model.fc with new nn.Linear layer (in_features=2048, out_features=934)
Froze all parameters of the base ResNet50 model.
Unfroze parameters of the new classification head (bioscan_model.fc).
Number of trainable parameters: 2
Trainable layers: ['fc.weight', 'fc.bias']


In [None]:
# Define loss function & optimizer

# Cross-entropy loss for the classification objective
criterion = nn.CrossEntropyLoss()
print(f"Loss function (criterion) set to: {criterion}")

# Adam is handed only the parameters of the classification head (bioscan_model.fc)
optimizer = optim.Adam(
    params=bioscan_model.fc.parameters(),
    lr=1e-3,
)
print(f"Optimizer set to: {optimizer}")

In [12]:
#Define training loop



In [None]:
#Define evaluation loop

