In [1]:
import torch

from utils.dataloader import VOCDataLoaderPerson
# Person-only VOC loader with train=False (the non-training split), one image per batch.
loader = VOCDataLoaderPerson(train=False, batch_size=1)

In [None]:
import torch
from tinyyolov2 import TinyYoloV2
from utils.yolo import nms, filter_boxes
from utils.viz import display_result

# make an instance with 20 classes as output
# The checkpoint is loaded strictly below, so the architecture has to match the
# weights stored in "voc_pretrained.pt" (the 20-class VOC model).
net = TinyYoloV2(num_classes=20)

# load pretrained weights
sd = torch.load("voc_pretrained.pt")
net.load_state_dict(sd)

# put network in evaluation mode
net.eval()

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
def fuse_conv_and_bn(conv, bn):
    """Fold a BatchNorm2d layer into the Conv2d layer that feeds it.

    For inference, ``bn(conv(x))`` equals a single convolution whose weights
    are scaled per output channel and whose bias absorbs the BatchNorm shift.

    Args:
        conv: ``torch.nn.Conv2d`` whose output is normalised by ``bn``.
        bn: ``torch.nn.BatchNorm2d`` holding running statistics.

    Returns:
        A new ``torch.nn.Conv2d`` (always with a bias) on the same device and
        with the same dtype as ``conv.weight``.

    Raises:
        ValueError: if ``bn`` does not track running statistics.
    """
    if bn.running_mean is None or bn.running_var is None:
        raise ValueError("BatchNorm layer has no running statistics to fuse")

    with torch.no_grad():
        # Work on the conv layer's own device/dtype rather than on a
        # notebook-level global, so every operand lives in the same place.
        target_device = conv.weight.device
        target_dtype = conv.weight.dtype
        out_channels = conv.out_channels

        # Mirror the full convolution geometry (including dilation and groups).
        fusedconv = torch.nn.Conv2d(conv.in_channels,
                                    out_channels,
                                    kernel_size=conv.kernel_size,
                                    stride=conv.stride,
                                    padding=conv.padding,
                                    dilation=conv.dilation,
                                    groups=conv.groups,
                                    padding_mode=conv.padding_mode,
                                    bias=True).to(device=target_device, dtype=target_dtype)

        # affine=False BatchNorm has no weight/bias: behave like gamma=1, beta=0.
        if bn.weight is None:
            gamma = torch.ones(out_channels, device=target_device, dtype=target_dtype)
        else:
            gamma = bn.weight.to(device=target_device, dtype=target_dtype)
        if bn.bias is None:
            beta = torch.zeros(out_channels, device=target_device, dtype=target_dtype)
        else:
            beta = bn.bias.to(device=target_device, dtype=target_dtype)
        running_mean = bn.running_mean.to(device=target_device, dtype=target_dtype)
        running_var = bn.running_var.to(device=target_device, dtype=target_dtype)

        # Per-output-channel scale: gamma / sqrt(var + eps)
        scale = gamma / torch.sqrt(running_var + bn.eps)

        # Scaling each output filter is the same as left-multiplying the
        # flattened filters by diag(scale).
        fusedconv.weight.copy_(conv.weight * scale.view(-1, 1, 1, 1))

        # Prepare spatial bias
        if conv.bias is None:
            b_conv = torch.zeros(out_channels, device=target_device, dtype=target_dtype)
        else:
            b_conv = conv.bias

        # scale * b_conv + (beta - gamma * mean / sqrt(var + eps))
        fusedconv.bias.copy_((b_conv - running_mean) * scale + beta)

        return fusedconv

In [None]:
from tinyyolov2NoBN import TinyYoloV2NoBN

# Initialize the original model and load the pretrained weights *into* it.
# "voc_pretrained.pt" holds a state dict, so it must go through
# load_state_dict(); assigning the result of torch.load() to the model variable
# would replace the network with a plain dict of tensors.
# The class count matches the 20-class checkpoint so the strict load succeeds.
original_model = TinyYoloV2(num_classes=20).to(device)
original_model.load_state_dict(torch.load("voc_pretrained.pt", map_location=device))

# Create a new model without BatchNorm layers
fused_model = TinyYoloV2NoBN(num_classes=20).to(device)

# Fuse each conv/bn pair (conv1+bn1 ... conv8+bn8) into a single convolution
for layer_idx in range(1, 9):
    fused_layer = fuse_conv_and_bn(getattr(original_model, f"conv{layer_idx}"),
                                   getattr(original_model, f"bn{layer_idx}"))
    setattr(fused_model, f"conv{layer_idx}", fused_layer)

# Copy the final conv layer directly (since it doesn't have BN)
fused_model.conv9 = original_model.conv9

# Save the fused model state dict
torch.save(fused_model.state_dict(), 'fusedyolov2.pt')



In [None]:
# Summarise the fused weights by name and shape instead of dumping every tensor value.
for name, tensor in fused_model.state_dict().items():
    print(f"{name}: {tuple(tensor.shape)}")

In [None]:
import tqdm
# Inference only: no_grad() avoids recording an autograd graph for every image.
with torch.no_grad():
    for image, target in tqdm.tqdm(loader, total=len(loader)):

        # image is a 1 x 3 x 320 x 320 tensor (the loader uses batch_size=1)
        output = net(image)

        # filter boxes based on confidence score (class_score*confidence)
        output = filter_boxes(output, 0.1)

        # filter boxes based on overlap
        output = nms(output, 0.25)

        # Every iteration is given the same file_path.
        display_result(image, output, target, file_path='yolo_prediction.png')

In [None]:
# ========== Fine-tuning ==========

In [None]:
import torch

# A subset of VOCDataLoader just for one class (person) (0)
from utils.dataloader import VOCDataLoaderPerson

# Training batches: person-only subset, one shuffled image per step.
loader = VOCDataLoaderPerson(
    train=True,
    batch_size=1,
    shuffle=True,
)
# Evaluation batches: same dataset class with train=False, one image per step.
loader_test = VOCDataLoaderPerson(
    train=False,
    batch_size=1,
)

from tinyyolov2 import TinyYoloV2
from utils.loss import YoloLoss
import tqdm

In [None]:
# Single-class TinyYoloV2, initialised from the pretrained checkpoint.
net = TinyYoloV2(num_classes=1)
sd = torch.load("voc_pretrained.pt")

# Reuse every pretrained tensor except those whose key contains '9' (the last
# layer); strict=False tolerates the keys that are left out.
pretrained_subset = {k: v for k, v in sd.items() if '9' not in k}
net.load_state_dict(pretrained_subset, strict=False)
net.eval()

# Definition of the loss
criterion = YoloLoss(anchors=net.anchors)

# Freeze every parameter whose name contains one of the digits 1-7.
# NOTE(review): names containing '8' are not matched, so presumably layer 8
# stays trainable together with conv9 -- confirm this is intended.
frozen_digits = {'1', '2', '3', '4', '5', '6', '7'}
for key, param in net.named_parameters():
    if not frozen_digits.isdisjoint(key):
        param.requires_grad = False

# Only parameters that still require gradients are handed to the optimizer.
trainable_params = [p for p in net.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=0.001)

In [None]:
from utils.ap import precision_recall_levels, ap, display_roc
from utils.yolo import nms, filter_boxes

NUM_TEST_SAMPLES = 5
NUM_EPOCHS = 2
test_AP = []

for epoch in range(NUM_EPOCHS):
    # Epoch 0 skips training, so the first AP entry is measured before any
    # weight update.
    if epoch != 0:
        for image, target in tqdm.tqdm(loader, total=len(loader)):

            optimizer.zero_grad()

            #Yolo head is implemented in the loss for training, therefore yolo=False
            output = net(image, yolo=False)
            loss, _ = criterion(output, target)
            loss.backward()
            optimizer.step()

    test_precision = []
    test_recall = []
    with torch.no_grad():
        # zip() with range() yields exactly NUM_TEST_SAMPLES batches (fewer if
        # the loader is shorter); breaking on `idx == NUM_TEST_SAMPLES` after
        # the work was done evaluated one sample too many.
        eval_batches = zip(range(NUM_TEST_SAMPLES), loader_test)
        for _, (image, target) in tqdm.tqdm(eval_batches, total=NUM_TEST_SAMPLES):
            output = net(image, yolo=True)

            #The right threshold values can be adjusted for the target application
            output = filter_boxes(output, 0.0)
            output = nms(output, 0.5)

            precision, recall = precision_recall_levels(target[0], output[0])
            test_precision.append(precision)
            test_recall.append(recall)

    #Calculation of average precision with collected samples
    test_AP.append(ap(test_precision, test_recall))
    print('average precision', test_AP)

    #plot ROC
    display_roc(test_precision, test_recall)

    # Checkpoint after every epoch; the same file is overwritten each time.
    state_dict = net.state_dict()
    torch.save(state_dict, 'zhz_sr.pt')
            
            

In [1]:
# ========== Open the camera ==========

In [None]:
from utils.camera import CameraDisplay
import time
import cv2
# Timestamp of the previous frame; callback() reads and updates it to derive the frame rate.
now = time.time()

In [None]:
# Define a callback function (your detection pipeline)
# Make sure to first load all your pipeline code and only at the end init the camera

def callback(image):
    """Crop a camera frame to its top-left 320x320 region and overlay the FPS.

    Reads and updates the module-level ``now`` timestamp to measure the time
    since the previous call.

    Args:
        image: frame indexed as (rows, columns, channels).

    Returns:
        The cropped frame with the "fps=<n>" text drawn onto it.
    """
    global now

    current = time.time()
    elapsed = current - now
    now = current
    # Two calls can land on the same clock tick; report 0 instead of raising
    # ZeroDivisionError inside the camera callback.
    fps = f"{int(1 / elapsed)}" if elapsed > 0 else "0"

    image = image[0:320, 0:320, :]
    cv2.putText(image, "fps=" + fps, (2, 25), cv2.FONT_HERSHEY_SIMPLEX, 1,
                (100, 255, 0), 2, cv2.LINE_AA)
    return image

In [None]:
# Initialize the camera with the callback defined above.
# Do this only after the whole detection pipeline has been loaded.
cam = CameraDisplay(callback)

In [None]:
# Start the camera stream with cam.start().
# From then on the callback is called asynchronously; cam.stop() stops it.
cam.start()

In [None]:
# The camera should always be stopped and released before a new camera is
# instantiated (i.e. before calling CameraDisplay(callback) again).
cam.stop()
cam.release()

In [None]:
import torch
import cv2
import time
from utils.camera import CameraDisplay
import numpy as np

In [None]:
from tinyyolov2 import TinyYoloV2
from utils.yolo import nms, filter_boxes
from utils.viz import display_result

# Read the pretrained checkpoint first, then build a detector with 20 output
# classes and restore the weights into it.
sd = torch.load("voc_pretrained.pt")
model = TinyYoloV2(num_classes=20)
model.load_state_dict(sd)

# Evaluation mode for inference (kept as the cell's last expression).
model.eval()

In [None]:
from typing import List 
# Function to preprocess image for YOLOv2
def preprocess(image, input_size=416):
    """Turn an HxWxC image array into a 1xCxSxS float32 tensor scaled to [0, 1].

    Args:
        image: image array in height x width x channel layout, with values in
            the 0-255 range.
        input_size: side length of the square the image is resized to.
            Defaults to 416, the value that was previously hard-coded.

    Returns:
        ``torch.float32`` tensor of shape (1, C, input_size, input_size).
    """
    resized = cv2.resize(image, (input_size, input_size))
    scaled = resized / 255.0  # Normalize to [0, 1]
    channels_first = np.transpose(scaled, (2, 0, 1))  # HWC -> CHW
    # Leading batch dimension of size 1
    return torch.from_numpy(channels_first).float().unsqueeze(0)

# Function to postprocess YOLOv2 output
def postprocess(output, conf_thresh=0.5, iou_thresh=0.4):
    """Convert raw detections of the first batch entry into pixel-space boxes.

    Simplified placeholder: each detection row is read as
    ``[x, y, w, h, <unused>, class scores...]`` with centre/size given as
    fractions of a 416-pixel square. ``iou_thresh`` is accepted but unused.

    Returns:
        List of ``(x1, y1, x2, y2, confidence, class_id)`` tuples for rows
        whose best class score is greater than ``conf_thresh``.
    """
    kept = []
    first_sample = output[0]  # drop the batch dimension
    for row in first_sample:
        class_scores = row[5:]
        best_class = np.argmax(class_scores)
        best_score = class_scores[best_class]
        if not best_score > conf_thresh:
            continue
        x, y, w, h = row[:4]
        # Centre/size -> corner coordinates in a 416x416 frame
        left = int((x - w / 2) * 416)
        top = int((y - h / 2) * 416)
        right = int((x + w / 2) * 416)
        bottom = int((y + h / 2) * 416)
        kept.append((left, top, right, bottom, best_score, best_class))
    return kept

def display_result_img(image: np.ndarray, output: List[torch.Tensor]) -> np.ndarray:
    """Draw the detections of the first batch entry onto ``image`` in place.

    Each detection row starts with ``cx, cy, w, h`` as fractions of the image
    size. Rows whose last column is negative are skipped.

    Args:
        image: frame to draw on (height x width x channels).
        output: per-batch-entry detection tensors; only ``output[0]`` is drawn.

    Returns:
        The same ``image`` array, annotated with boxes and "Class <id>" labels.
    """
    if not output:
        return image

    img_h, img_w = image.shape[:2]

    # Only the first entry is ever drawn, so read it directly instead of
    # stacking the whole list (which requires equal shapes across entries).
    for det in output[0]:
        # The last column doubles as validity marker and class id.
        # NOTE(review): the label previously read column 4 while validity was
        # checked on the last column; both now use the last column -- confirm
        # against the layout returned by nms().
        if not det[-1] >= 0:
            continue

        cx = int(det[0] * img_w)
        cy = int(det[1] * img_h)
        w = int(det[2] * img_w)
        h = int(det[3] * img_h)

        top_left = (cx - w // 2, cy - h // 2)
        bottom_right = (cx + w // 2, cy + h // 2)

        cv2.rectangle(image, top_left, bottom_right, (0, 0, 255), 2)
        cv2.putText(image, f"Class {int(det[-1])}", (top_left[0], top_left[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    return image

In [None]:
# Reset the previous-frame timestamp that callback() uses for its FPS estimate.
now = time.time()

In [None]:
def callback(image):
    """Run the detector on one camera frame and return the annotated frame.

    Pipeline: ``preprocess`` -> ``model`` (under ``no_grad``) ->
    ``filter_boxes`` (threshold 0.1) -> ``nms`` (threshold 0.25) ->
    ``display_result_img``, followed by an FPS overlay. Reads and updates the
    module-level ``now`` timestamp to measure the time since the previous call.

    Args:
        image: frame delivered by the camera.

    Returns:
        The frame with detection boxes and the "fps=<n>" text drawn onto it.
    """
    global now

    current = time.time()
    elapsed = current - now
    now = current
    # Two calls can land on the same clock tick; report 0 instead of raising
    # ZeroDivisionError inside the camera callback.
    fps = f"{int(1 / elapsed)}" if elapsed > 0 else "0"

    # Preprocess the image
    input_image = preprocess(image)

    # Run the model without tracking gradients
    with torch.no_grad():
        output = model(input_image)
    output = filter_boxes(output, 0.1)
    output = nms(output, 0.25)

    # Draw bounding boxes on the original frame
    image = display_result_img(image, output)

    # Draw FPS on the image
    cv2.putText(image, "fps=" + fps, (2, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 255, 0), 2, cv2.LINE_AA)
    return image

In [None]:
# Create the camera display with the detection callback defined above.
cam = CameraDisplay(callback)

In [None]:
# Start the camera stream.
cam.start()

In [None]:
# Stop the stream and release the camera.
cam.stop()
cam.release()
