<a href="https://colab.research.google.com/github/Rumeysakeskin/Custom-Object-Detection-PyTorch/blob/main/object_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install and unzip dataset
import zipfile, urllib.request, shutil
url = "https://github.com/Stroma-Vision/machine-learning-challenge/releases/download/v0.1/challenge.zip" 
file_name = 'challenge.zip'

with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
    with zipfile.ZipFile(file_name) as zf:
        zf.extractall()

In [3]:
import json
import cv2
import os
from torch.utils.data import Dataset, DataLoader
import torch
import torchvision.transforms as T
import torch.nn as nn
import glob as glob
import numpy as np
# from PIL import Image

from tqdm import tqdm
import matplotlib.pyplot as plt
plt.style.use('ggplot')

CREATE IMAGE DATASET FROM VIDEO

In [4]:
def create_image_dataset(data_dir, data_path, annotations):

    frame_counter = 0
    ret = True
    
    # Load the video and extract the frames
    capture = cv2.VideoCapture(data_dir)
    
    # Get the total number of frames in the video
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) #test: 1800 frames

    # Calculate the interval between frames to keep
    # Determine which frames to keep and which to discard based on the interval value
    # interval = int(capture.get(cv2.CAP_PROP_FPS) / FPS) # FPS = 30

    while ret: # Break the loop if the video has ended
        ret, frame = capture.read() # frame shape 640x640x3
        if not ret:
          break
        frame_name = annotations["images"][frame_counter]['file_name']
        cv2.imwrite(f'{data_path}/{frame_name}', frame)
        frame_counter +=1

    print(f"{total_frames} frames saved to {data_path}")

        # plt.imshow(frame[:,:,::-1])  # Plot frames
        # plt.show()        

In [5]:
# Load video dataset and annotation files
def load_annotations(file_path):
  with open(file_path, 'r') as f:
        annotations = json.load(f)
  return annotations
  
annotations = [("test", "/content/challenge/annotations/instances_test.json"),
               ("val", "/content/challenge/annotations/instances_val.json"),
               ("train", "/content/challenge/annotations/instances_train.json")]

videos = [("test", "/content/challenge/images/test/test.mp4"),
               ("val", "/content/challenge/images/val/val.mp4"),
               ("train", "/content/challenge/images/train/train.mp4")]

for annotation in annotations:
    name, path = annotation
    locals()[f"{name}_annotations"] = load_annotations(path)

for video in videos:
    name, path = video
    locals()[f"{name}_data"] = path

In [6]:
# Create image data folders from videos
data_dirs = ["train_data","test_data","val_data"]
if not os.path.exists('./frames'):
    os.mkdir('./frames/')
    for data_dir in data_dirs:
      os.mkdir(f'./frames/{data_dir}/')  

# Create images from video frames
create_image_dataset(train_data, "./frames/train_data", train_annotations) # data, data_path, annotations
create_image_dataset(val_data, "./frames/val_data", val_annotations) 
create_image_dataset(test_data, "./frames/test_data", test_annotations) 

7200 frames saved to ./frames/train_data
1800 frames saved to ./frames/val_data
1800 frames saved to ./frames/test_data


PREPARE DATASET

In [7]:
class VideoDataset(Dataset):
    def __init__(self, data_dir, annotations, transform=None):
        self.data_dir = data_dir
        self.annotations = annotations
        self.transform = transform
        
        # get all the image paths in sorted order
        self.image_paths = glob.glob(f"{self.data_dir}/*.jpg")
        self.all_images = [image_path.split('/')[-1] for image_path in self.image_paths]
        self.all_images = sorted(self.all_images)
        
    def __len__(self):
        # return len(self.annotations)
        return len(self.all_images)
        
# loads a video and extracts frames from it
    def __getitem__(self, idx):

        image_name = self.all_images[idx]
        image_path = os.path.join(self.data_dir, image_name)

        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) 
        image /= 255.0
        image = torch.as_tensor(image)
        image = np.transpose(image, (2, 0, 1)) #Image shape: tensor[(3x640x640)]

        # Get the annotations for each frame
        target = {}

        x, y, w, h = self.annotations["annotations"][idx]['bbox'] # [x, y, width, height] --> [x_min, y_min, x_max, y_max]

        target["boxes"] = torch.as_tensor([x, y, x + w, y + h], dtype=torch.float32).unsqueeze(0)  # Add a batch dimension
        target["labels"] = torch.as_tensor([self.annotations["annotations"][idx]['category_id'] - 1], dtype=torch.int64) # Change the labels to 0 and 1 as there are 2 classes
        target["image_id"] = torch.as_tensor([self.annotations["annotations"][idx]['image_id']])
        target["area"] = torch.as_tensor([self.annotations["annotations"][idx]['area']], dtype=torch.float32)
        target["iscrowd"] = torch.as_tensor([self.annotations["annotations"][idx]['iscrowd']], dtype=torch.int64)
        # target["track_id"] = torch.as_tensor([self.annotations["annotations"][idx]['track_id']])
       
        # Apply the transformations if provided
        # if self.transform is not None:
        #   image = self.transform(image)
        #   target = self.transform(target)
       
        return image, target

In [8]:
def get_transform(train):
    """
    The transform pipeline makes it possible to perform operations such as 
    normalization, data augmentation, and other preprocessing steps that can 
    help to improve the performance of a neural network during training.
    """
    transforms = []
    transforms.append(T.ToTensor())
    return T.Compose(transforms)

def collate_fn(batch):
    """
    To handle the data loading as different images may have different number 
    of objects and to handle varying size tensors as well.
    """
    return tuple(zip(*batch))

DATALOADERS

In [9]:
# Create the dataset object for videos and their corresponding annotations

train_dataset = VideoDataset("./frames/train_data", train_annotations, get_transform(True))
val_dataset = VideoDataset("./frames/val_data", val_annotations, get_transform(False))

# Define the dataloader load the data in batches during training and validation
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True, num_workers=0, collate_fn=collate_fn)

FASTER RCNN MODEL

In [10]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def fasterrcnn_model(num_classes):
  # load a model pre-trained on COCO
  model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
  # get number of input features for the classifier
  in_features = model.roi_heads.box_predictor.cls_score.in_features
  # replace the pre-trained head with a new one
  model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
  return model

TRAINING AND VALIDATING

In [11]:
def train(train_data_loader, model):
    global train_itr
    global train_loss_list
    
     # initialize tqdm progress bar
    prog_bar = tqdm(train_data_loader, total=len(train_data_loader))
    
    for i, data in enumerate(prog_bar):
        optimizer.zero_grad()
        images, targets = data
        images = list(image.to(DEVICE) for image in images)
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        loss_value = losses.item()
        train_loss_list.append(loss_value)
        losses.backward()
        optimizer.step()
        train_itr += 1
    
        # update the loss value beside the progress bar for each iteration
        prog_bar.set_description(desc=f"Loss: {loss_value:.4f}")
    return train_loss_list

def validate(valid_data_loader, model):
    
    global val_itr
    global val_loss_list
    
    # initialize tqdm progress bar
    prog_bar = tqdm(valid_data_loader, total=len(valid_data_loader))
    
    for i, data in enumerate(prog_bar):
        images, targets = data
        
        images = list(image.to(DEVICE) for image in images)
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
        
        with torch.no_grad():
            loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        loss_value = losses.item()
        val_loss_list.append(loss_value)
        val_itr += 1
        # update the loss value beside the progress bar for each iteration
        prog_bar.set_description(desc=f"Loss: {loss_value:.4f}")
    return val_loss_list

In [None]:
NUM_CLASSES = 2
NUM_EPOCHS = 2
SAVE_MODEL_EPOCH = 2
OUT_DIR = 'outputs'

if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("DEVICE:", DEVICE)

model = fasterrcnn_model(num_classes=NUM_CLASSES)

model = model.to(DEVICE)

params = [p for p in model.parameters() if p.requires_grad]
# Define criterion, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()  # standard crossentropy loss for classification
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) # the scheduler divides the lr by 10 every 10 epochs

train_itr = 1
val_itr = 1
train_loss_list = []
val_loss_list = []


for epoch in range(NUM_EPOCHS):
    print(f"\nEPOCH {epoch+1} of {NUM_EPOCHS}")

    train_loss = train(train_dataloader, model)
    val_loss = validate(val_dataloader, model)

    if (epoch+1) % SAVE_MODEL_EPOCH == 0: # save model after every n epochs
        torch.save(model.state_dict(), f"{OUT_DIR}/model{epoch+1}.pth")
        print(f"MODEL SAVED IN {OUT_DIR}/model{epoch+1}.pth ... \n")

    # create two subplots, one for each, training and validation
    figure_1, train_ax = plt.subplots()
    figure_2, valid_ax = plt.subplots()

    if (epoch+1) == NUM_EPOCHS: # save loss plots and model once at the end
        train_ax.plot(train_loss, color='blue')
        train_ax.set_xlabel('iterations')
        train_ax.set_ylabel('train loss')
        valid_ax.plot(val_loss, color='red')
        valid_ax.set_xlabel('iterations')
        valid_ax.set_ylabel('validation loss')
        figure_1.savefig(f"{OUT_DIR}/train_loss_{epoch+1}.png")
        figure_2.savefig(f"{OUT_DIR}/valid_loss_{epoch+1}.png")


DEVICE: cpu


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:00<00:00, 178MB/s]



EPOCH 1 of 2


Loss: 0.2059:   1%|          | 3/450 [15:25<38:26:07, 309.55s/it]

LOAD TRAINED MODEL

In [19]:
model = fasterrcnn_model(num_classes=NUM_CLASSES).to(DEVICE)
model.load_state_dict(torch.load(f"{OUT_DIR}/model{epoch+1}.pth", map_location=DEVICE))
model.eval()

Test instances: 1800


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

OBJECT DETECTION INFERENCE ON VIDEO

In [1]:
CLASSES = ["bolt", "nut"]

# define the detection threshold...
# ... any detection having score below this will be discarded
detection_threshold = 0.8

capture = cv2.VideoCapture("/content/challenge/images/test/test.mp4") 
ret = True
# Preparing variable for writer that we will use to write processed frames
writer= None
# Preparing variables for spatial dimensions of the frames
h, w = None, None

while ret: # Break the loop if the video has ended
    ret, frame = capture.read()
    if not ret:
      break
    if w is None or h is None:  
      h, w = frame.shape[:2]

    image = frame
    orig_image = image.copy()

    image = torch.as_tensor(image)

    image = np.transpose(image, (2, 0, 1)) # torch.Size([3, 640, 640])
    image = torch.tensor(image, dtype=torch.float).cuda()
    image = torch.unsqueeze(image, 0) # torch.Size([1, 3, 640, 640]) add batch size

    with torch.no_grad():
        outputs = model(image)
    
    # load all detection to CPU for further operations
    outputs = [{k: v.to('cpu') for k, v in t.items()} for t in outputs]
    # carry further only if there are detected boxes
    if len(outputs[0]['boxes']) != 0:
        print("tt")
        boxes = outputs[0]['boxes'].data.numpy()
        scores = outputs[0]['scores'].data.numpy()
        # filter out boxes according to `detection_threshold`
        boxes = boxes[scores >= detection_threshold].astype(np.int32)
        draw_boxes = boxes.copy()
        # get all the predicited class names
        pred_classes = [CLASSES[i] for i in outputs[0]['labels'].cpu().numpy()]
        
        # draw the bounding boxes and write the class name on top of it
        for j, box in enumerate(draw_boxes):
            cv2.rectangle(orig_image,
                        (int(box[0]), int(box[1])),
                        (int(box[2]), int(box[3])),
                        (0, 0, 255), 2)
            cv2.putText(orig_image, pred_classes[j], 
                        (int(box[0]), int(box[1]-5)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 
                        2, lineType=cv2.LINE_AA)

    # Writing processed frame into the file
    # Initializing writer
    # we do it only once from the very beginning when we get spatial dimensions of the frames
    if writer is None:
        # Constructing code of the codec to be used in the function VideoWriter
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')

        # Writing current processed frame into the video file
        writer = cv2.VideoWriter('output_test_video.mp4', fourcc, 30,
                                 (orig_image.shape[1], orig_image.shape[0]), True)

    # Write processed current frame to the file
    writer.write(orig_image)

# Releasing video reader and writer
capture.release()
writer.release()

NameError: ignored