Parse xml files to create positive and negative datasets with labels and boundary box information to train model on

In [80]:
import pathlib
import xml.etree.ElementTree as ET
 
import numpy as np
 
def read_voc_xml(xmlfile: str) -> dict:
    """read the Pascal VOC XML and return (filename, object name, bounding box)
    where bounding box is a vector of (xmin, ymin, xmax, ymax). The pixel
    coordinates are 1-based.
    """
    root = ET.parse(xmlfile).getroot()
    boxes = {"filename": root.find("filename").text,
             "objects": []
            }
    for box in root.iter('object'):
        bb = box.find('bndbox')
        obj = {
            "name": box.find('name').text,
            "xmin": int(bb.find("xmin").text),
            "ymin": int(bb.find("ymin").text),
            "xmax": int(bb.find("xmax").text),
            "ymax": int(bb.find("ymax").text),
        }
        boxes["objects"].append(obj)
 
    return boxes
 
# Read Pascal VOC and write data
base_path = pathlib.Path("dataset-iiit-pet-master")
img_src = base_path / "images"
ann_src = base_path / "annotations" / "xmls"
 
negative = []
positive = []
for xmlfile in ann_src.glob("*.xml"):
    # load xml
    ann = read_voc_xml(str(xmlfile))
    if ann['objects'][0]['name'] == 'cat':
        # negative sample (cat)
        negative.append(str(img_src / ann['filename']))
    else:
        # positive sample (dog)
        bbox = []
        for obj in ann['objects']:
            x = obj['xmin']
            y = obj['ymin']
            w = obj['xmax'] - obj['xmin']
            h = obj['ymax'] - obj['ymin']
            bbox.append(f"{x} {y} {w} {h}")
        line = f"{str(img_src/ann['filename'])} {len(bbox)} {' '.join(bbox)}"
        positive.append(line)
 
# write the output to `negative.dat` and `postiive.dat`
with open("negative.dat", "w") as fp:
    fp.write("\n".join(negative))
 
with open("positive.dat", "w") as fp:
    fp.write("\n".join(positive))

Video Feed with Dog Detection

In [5]:
import cv2
import numpy as np

cap = cv2.VideoCapture(0)

model = 'dog_detect/cascade.xml'
classifier = cv2.CascadeClassifier(model)

# need a VideoWriter object and codec for its parameter
# codec specifies video format
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('output.avi', fourcc, 20.0, (640, 480))
                      # out file, code,  frame rate, resolution

while True:
    ret, frame = cap.read()

    if not ret:
        print("ERROR: image not captured")
        break
        
    # Convert the image to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    
    # Perform object detection
    objects, reject_levels, level_weights = classifier.detectMultiScale3(gray,
                                            scaleFactor=1.1, minNeighbors=50,
                                            minSize=(50, 50), outputRejectLevels=True)
    # for i in level_weights:
    level_weights = np.array(level_weights)
    best_guess = np.argmax(level_weights)
    # Draw rectangles around detected objects
    for confidence, (x, y, w, h) in zip(level_weights, objects):
        if confidence >= 2.75 and confidence != np.max(level_weights):
            cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
        if confidence >= 2.75 and confidence == np.max(level_weights):
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 0, 255), 2)

    cv2.imshow('Cam Stream', frame)
    out.write(frame)

    # cv2.waitKey(delay) returns ascii value of key pressed
    # & 0xFF is bitwise operation to only get lowest 8 bits of key pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# print(level_weights[best_guess])
cap.release()
out.release()
cv2.destroyAllWindows()

Function to parse xml files

In [16]:
import xml.etree.ElementTree as ET

def get_bbox_info(xmlfile):
    # parse the XML file
    root = ET.parse(xmlfile).getroot()

    # iterate over <object> tag
    obj = root.find('object')
    # get animal tag
    tag = obj.find('name').text
    # ann['objects'][0]['name'] == 'cat'
    # print(tag)
    bndbox = obj.find('bndbox') # get the bounding box element

    # extract bounding box coordinates
    xmin = int(bndbox.find('xmin').text)
    ymin = int(bndbox.find('ymin').text)
    xmax = int(bndbox.find('xmax').text)
    ymax = int(bndbox.find('ymax').text)
        
    return xmin, ymin, xmax, ymax, tag

Function to calculate how much a detection box overlaps a boundary box

In [28]:
# this function takes 2 arrays of values that 2 boundary boxes have in common. 
# They will calculate the area of the 2 boxes overlap and return the percentage of the bbox that is overlapped
def percentage_overlap(xml_object, common_x, common_y):
    # unpack bbox
    xmin, ymin, xmax, ymax, tag = xml_object
    # compute its area
    bbox = (xmax - xmin) * (ymax - ymin)
    # compute area that boxes have in common
    detection_box = common_x.size * common_y.size

    overlap = detection_box / bbox
    # print(overlap)

    return overlap

Function that makes decisions on instances (images) and fills a confusion matrix with the results

In [38]:
def confusion_matrix(matrix, xml, level_weights, objects, filename): 
    '''
    matrix format is [TN, FP]
                     [FN, TP]
    '''
    xmin, ymin, xmax, ymax, tag = get_bbox_info(xml)
    xml_object = (xmin, ymin, xmax, ymax, tag)
        
    # converts into np array
    level_weights = np.array(level_weights)
    # only want boxes that are at least as confident as value
    filtered_objects = []

    # find width and length of boundary box
    x_bbox_array = np.arange(xmin, xmax + 1)
    y_bbox_array = np.arange(ymin, ymax + 1)

    for confidence, (x, y, w, h) in zip(level_weights, objects):
        if confidence >= 2.75:
            filtered_objects.append((x, y, w, h))

    # If filtered_objects is empty then can leave early
    # False Negative
    if len(filtered_objects) == 0 and tag == 'dog':
        matrix[1][0] += 1
        return
    # True Negative
    if len(filtered_objects) == 0 and tag == 'cat':
        matrix[0][0] += 1
        return
            
    # See if boxes overlap enough to count as a positive, filtered_objects are all objects model is confident is a dog      
    for obj in filtered_objects:
        # arrays whos lengths will be used to compare the area of 2 squares
        # pixels are integers so can make array from x1 to x2 incrementing by one
        x_detected_array = np.arange(x, (x + w + 1))
        y_detected_array = np.arange(y, (y + h + 1))

        common_x = np.intersect1d(x_bbox_array, x_detected_array)
        common_y = np.intersect1d(y_bbox_array, y_detected_array)
        overlap = percentage_overlap(xml_object, common_x, common_y)
        
        # True Positive
        if overlap >= 0.25 and tag == 'dog':
            matrix[1][1] += 1
        # False Positive (might be part of dog but not dogs face)
        elif (overlap < 0.25 and tag == 'dog') or (tag == 'cat'):
            matrix[0][1] += 1
            print(filename)

Function that displays an image with the detection boxes and boundary box

In [40]:
def draw_boxes(img, xml, level_weights, objects): 

    xmin, ymin, xmax, ymax, tag = get_bbox_info(xml)
    level_weights = np.array(level_weights)

    # Draw boundary box
    cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)
    x_bbox_array = np.arange(xmin, xmax + 1)
    y_bbox_array = np.arange(ymin, ymax + 1)
    
    # Draw rectangles around detected objects
    for confidence, (x, y, w, h) in zip(level_weights, objects):
        # arrays whos lengths will be used to compare the area of 2 squares
        x_detected_array = np.arange(x, (x + w + 1))
        y_detected_array = np.arange(y, (y + h + 1))
        
        if confidence >= 2.75:
            cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)

Function to calculate relavent performance metrics from confusion matrix

In [30]:
def get_metrics(matrix):
    TN = matrix[0][0]
    FP = matrix[0][1]
    FN = matrix[1][0]
    TP = matrix[1][1]

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN) / (TN + FP + FN + TP)
    specificity = TN / (TN + FP) # True Negative Rate, ability to ID true negatives
    f1_score =  2 * (precision * recall) / (precision + recall)

    return precision, recall, accuracy, specificity, f1_score

Use trained model on entire dataset

In [42]:
import cv2
import numpy as np
import os
import xml.etree.ElementTree as ET
import time

# initialize confusion matrix
c_matrix = np.zeros((2, 2))

image_folder = 'dataset-iiit-pet-master/images'
xml_folder = 'dataset-iiit-pet-master/annotations/xmls'
model = 'dog_detect/cascade.xml'

img_files = os.listdir(image_folder)
 
classifier = cv2.CascadeClassifier(model) 

# time loop
start_time = time.time()
for file in os.listdir(xml_folder):
    
    xml_file_path = os.path.join(xml_folder, file)
    root = ET.parse(xml_file_path).getroot()
    filename = root.find('filename').text
    
    if filename in os.listdir(image_folder):
        img_file_path = os.path.join(image_folder, filename)
        # confusion_matrix(c_matrix, xml, level_weights, objects)
        img = cv2.imread(img_file_path)
        
        # Convert the image to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Perform object detection
        objects, reject_levels, level_weights = classifier.detectMultiScale3(gray,
                                                scaleFactor=1.1, minNeighbors=50,
                                                minSize=(75, 75), outputRejectLevels=True)
        
        confusion_matrix(c_matrix, xml_file_path, level_weights, objects, filename)
    else:
        print(f"ERROR: image {img_file_path} not found.")
        
# stop timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time/60:.4f} minutes")

# print resulting confusion matrix
print(f"Confusion Matrix: \n")
print(f'[TN: {c_matrix[0][0]}, FP: {c_matrix[0][1]}]')
print(f'[FN: {c_matrix[1][0]}, TP: {c_matrix[1][1]}]')

Abyssinian_118.jpg
Abyssinian_16.jpg
Abyssinian_165.jpg
Abyssinian_170.jpg
Abyssinian_170.jpg
Abyssinian_182.jpg
Abyssinian_190.jpg
american_bulldog_106.jpg
american_bulldog_116.jpg
american_bulldog_119.jpg
american_bulldog_125.jpg
american_bulldog_126.jpg
american_bulldog_140.jpg
american_bulldog_176.jpg
american_bulldog_179.jpg
american_bulldog_196.jpg
american_bulldog_201.jpg
american_bulldog_202.jpg
american_pit_bull_terrier_10.jpg
american_pit_bull_terrier_105.jpg
basset_hound_129.jpg
basset_hound_13.jpg
basset_hound_134.jpg
basset_hound_138.jpg
basset_hound_142.jpg
basset_hound_145.jpg
basset_hound_146.jpg
basset_hound_148.jpg
basset_hound_149.jpg
basset_hound_154.jpg
basset_hound_161.jpg
basset_hound_172.jpg
basset_hound_173.jpg
basset_hound_180.jpg
basset_hound_180.jpg
basset_hound_183.jpg
beagle_103.jpg
beagle_107.jpg
beagle_115.jpg
beagle_12.jpg
beagle_131.jpg
beagle_133.jpg
beagle_14.jpg
beagle_181.jpg
beagle_182.jpg
beagle_190.jpg
beagle_192.jpg
beagle_193.jpg
Bengal_102.jp

Display Image with Boundary Box and Detection Boxes

In [20]:
img_name = 'yorkshire_terrier_184'
image = f'dataset-iiit-pet-master/images/{img_name}.jpg'
xml_path = f'dataset-iiit-pet-master/annotations/xmls/{img_name}.xml'
model = 'dog_detect/cascade.xml'

# read image
img = cv2.imread(image)

# Convert the image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Perform object detection
objects, reject_levels, level_weights = classifier.detectMultiScale3(gray,
                                        scaleFactor=1.1, minNeighbors=50,
                                        minSize=(75, 75), outputRejectLevels=True)

draw_boxes(img, xml_path, level_weights, objects)
cv2.imshow('Object Detection', img)
cv2.imwrite('parameters_check.jpg', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

Display Relavent Performance Metrics

In [44]:
import matplotlib.pyplot as plt

precision, recall, accuracy, specificity, f1_score = get_metrics(c_matrix)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Accuracy: {accuracy}')
print(f'Specificity: {specificity}')
print(f'F1_score: {f1_score}')

Precision: 0.43140495867768597
Recall: 0.11377506538796861
Accuracy: 0.3558265582655827
Specificity: 0.7535816618911175
F1_score: 0.18006209037599172
