EfficientNet training, using the `efficientnet_b0` model.
EfficientNet-B0 is a good starting point for image classification tasks:
it offers a good balance between accuracy and computational efficiency,
which also makes it a good starting point for transfer learning.

I will load the pretrained `efficientnet_b0` model and then perform transfer learning on it for the task of hand gesture detection.



In [2]:
# Importing the necessary libraries, using pytorch.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchvision.models as models
import torchvision.transforms.functional as F

  Referenced from: <FB2FD416-6C4D-3621-B677-61F07C02A3C5> /Users/fergusproctor/miniforge3/envs/hand_gesture_detection/lib/python3.9/site-packages/torchvision/image.so
  warn(


In [3]:
# load model, and set it to work on the GPU if available.
# `weights=` is the current torchvision API; it replaces the deprecated
# `pretrained=True` flag. IMAGENET1K_V1 is the ImageNet checkpoint that
# `pretrained=True` used to select for efficientnet_b0.
model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# print the model architecture
print(model)

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /Users/fergusproctor/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100.0%


EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [13]:
# test the model out on some test images

# Load and preprocess some test images
from PIL import Image
import os
import numpy as np

# Switch the model to evaluation mode before running any predictions
model.eval()

# Image preprocessing pipeline applied to every test image:
# resize to 256, center-crop to 224, convert to a tensor, then
# normalize each of the three channels with the mean/std below.
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Load ImageNet class labels
import json
import urllib.request

# Where the ImageNet class names come from, and where they are cached locally
labels_url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
labels_file = "imagenet_classes.txt"

# Fetch the label file only on the first run; later runs reuse the local copy
if not os.path.exists(labels_file):
    urllib.request.urlretrieve(labels_url, labels_file)

# One class name per line; the list index is used as the class id
with open(labels_file, "r") as f:
    categories = [line.strip() for line in f]

# Function to predict on a single image
def predict_image(image_path, top_k=5):
    """Classify one image file with the global `model` and print the top predictions.

    The image is run through the global `preprocess` pipeline, passed through
    `model` without gradient tracking, and the softmax probabilities of the
    highest-scoring classes are printed using the labels in `categories`.

    Args:
        image_path: Path to an image file that PIL can open.
        top_k: How many of the highest-probability classes to print
            (default 5). Clamped to the number of classes the model outputs.

    Returns:
        None. Results are printed as a header line with the file's base name
        followed by one "label: probability%" line per prediction.
    """
    # Force 3-channel RGB: grayscale or RGBA files would otherwise not match
    # the 3-channel Normalize step in `preprocess`. The context manager closes
    # the underlying file handle once the pixels have been read.
    with Image.open(image_path) as input_image:
        input_tensor = preprocess(input_image.convert("RGB"))

    # Add the batch dimension the model expects
    input_batch = input_tensor.unsqueeze(0)

    # Move the input to whichever device the model's weights actually live on
    input_batch = input_batch.to(next(model.parameters()).device)

    with torch.no_grad():
        output = model(input_batch)

    # Convert the logits of the single batch item into probabilities
    probabilities = torch.nn.functional.softmax(output[0], dim=0)

    # Never request more entries than there are classes
    k = min(top_k, probabilities.numel())
    top_prob, top_catid = torch.topk(probabilities, k)

    # Print results
    print(f"\nPredictions for {os.path.basename(image_path)}:")
    for prob, catid in zip(top_prob.tolist(), top_catid.tolist()):
        print(f"{categories[catid]:>20}: {prob*100:.2f}%")

# Test on some sample images if they exist, get sample images from root of repo



# Candidate test images, looked up relative to the current working directory
sample_images = [
    "test_dog.jpeg",
    "test_cat.jpeg",
    "test_bird.jpg",
    "95.jpg",
]

print("Note: Please ensure test images exist in the current directory")
print("or modify the image paths accordingly.")

# Classify every image that is present; warn about the ones that are missing
for img_path in sample_images:
    if not os.path.exists(img_path):
        print(f"\nWarning: {img_path} not found")
        continue
    predict_image(img_path)


Note: Please ensure test images exist in the current directory
or modify the image paths accordingly.

Predictions for test_dog.jpeg:
       Border collie: 16.99%
              collie: 6.48%
     Tibetan mastiff: 4.97%
            Pembroke: 3.58%
    golden retriever: 3.07%

Predictions for test_cat.jpeg:
               tabby: 83.14%
           tiger cat: 7.99%
        Egyptian cat: 4.30%
         Persian cat: 0.37%
                lynx: 0.10%

Predictions for test_bird.jpg:
              bulbul: 31.83%
           goldfinch: 24.12%
           chickadee: 3.04%
               junco: 1.75%
                 jay: 1.44%

Predictions for 95.jpg:
            Band Aid: 55.86%
           harmonica: 2.78%
          sunglasses: 2.09%
  cellular telephone: 1.97%
              hotdog: 1.51%


In [15]:
# define number of classes for ASL
# NOTE(review): presumably the 24 static ASL letters — confirm against the dataset
num_classes = 24

# modify the final layer of the model
# Replace the classifier's last Linear layer with a fresh head for our classes.
# - in_features is read from the layer being replaced instead of hard-coding 1280
# - the new layer is moved to the model's device; a freshly constructed
#   nn.Linear lives on the CPU, which would not match a model already on the GPU
model.classifier[1] = nn.Linear(
    in_features=model.classifier[1].in_features,
    out_features=num_classes,
).to(next(model.parameters()).device)

# print the model architecture to verify
print(model)

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat