In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from PIL import Image
import json
from torch.utils.data import DataLoader, Dataset
import os

class ObjectDetectionCNN(nn.Module):
    """Small CNN that regresses a fixed number of box slots per image.

    Two ``Conv2d(kernel_size=5) -> ReLU -> MaxPool2d(2)`` stages feed one linear
    layer that emits ``num_boxes * (4 + num_classes)`` values per image, reshaped
    to ``[batch_size, num_boxes, 4 + num_classes]``. The network itself does not
    fix the meaning of the entries on the last axis; that is determined by the
    targets it is trained against.

    Args:
        num_classes: Number of class-score values per box slot.
        num_boxes: Number of box slots predicted per image.
        input_size: Spatial size of the input images, either a single int for
            square images or a ``(height, width)`` pair. The default of 256
            reproduces the previously hard-coded ``16 * 61 * 61`` feature size.

    Raises:
        ValueError: If ``input_size`` is too small to survive both
            convolution/pooling stages.
    """

    def __init__(self, num_classes, num_boxes=20, input_size=256):
        super(ObjectDetectionCNN, self).__init__()
        self.num_classes = num_classes
        self.num_boxes = num_boxes
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 8, kernel_size=5)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        # Accept either one int (square input) or a (height, width) pair.
        try:
            height, width = input_size
        except TypeError:
            height = width = input_size

        out_height = self._feature_extent(height)
        out_width = self._feature_extent(width)
        if out_height < 1 or out_width < 1:
            raise ValueError(
                f"input_size {input_size!r} is too small for two conv(5)/pool(2) stages"
            )

        # Flattened feature count after the convolutions (16 output channels)
        self.final_conv_size = 16 * out_height * out_width

        # Detection layer: class scores and bounding boxes
        self.detector = nn.Linear(self.final_conv_size, num_boxes * (4 + num_classes))

    @staticmethod
    def _feature_extent(extent):
        """Return the length of one spatial axis after both conv/pool stages."""
        for _ in range(2):
            # An unpadded 5x5 convolution removes 4 pixels; 2x2 max pooling halves (floor).
            extent = (extent - 4) // 2
        return extent

    def forward(self, x):
        """Map images ``[batch, 3, H, W]`` to ``[batch, num_boxes, 4 + num_classes]``.

        ``H`` and ``W`` must match the ``input_size`` given at construction,
        otherwise the linear layer rejects the flattened feature vector.
        """
        x = self.pool1(torch.relu(self.conv1(x)))
        x = self.pool2(torch.relu(self.conv2(x)))

        # Flatten everything except the batch dimension
        x = torch.flatten(x, 1)

        # Detector layer
        x = self.detector(x)
        # Reshape to [batch_size, num_boxes, 4 + num_classes]
        x = x.view(-1, self.num_boxes, 4 + self.num_classes)
        return x

def load_image(image_path, resize=True):
    """Load an image file as a float32, channel-first tensor scaled to [0, 1].

    Args:
        image_path: Path of the image file to open with PIL.
        resize: When true, resize the image to 256x256 pixels before conversion.

    Returns:
        A ``torch.float32`` tensor of shape ``[3, height, width]``.
    """
    with Image.open(image_path) as img:
        # Force three channels. Without this a grayscale file yields a 2-D array
        # that the (2, 0, 1) transpose rejects, and an RGBA file yields four
        # channels, which a 3-channel first convolution cannot consume.
        img = img.convert('RGB')
        if resize:
            img = img.resize((256, 256))
        # np.array copies the pixel data, so it stays valid after the file closes.
        pixels = np.array(img, dtype=np.float32) / 255.0  # Normalize to [0, 1] as float32
    pixels = pixels.transpose((2, 0, 1))  # HWC -> CHW
    return torch.from_numpy(pixels).float()  # Ensure the tensor is float


def load_labels(json_path, img_width=256, img_height=256, num_boxes=20):
    """Build a fixed-size detection target array from a JSON annotation file.

    The file must contain a ``'boxes'`` sequence whose entries provide ``'x'``,
    ``'y'``, ``'width'`` and ``'height'``. ``x``/``y`` are treated as the box
    center; each box is converted to corner form and divided by
    ``img_width`` / ``img_height``.

    Args:
        json_path: Path of the JSON annotation file.
        img_width: Divisor applied to the x coordinates.
        img_height: Divisor applied to the y coordinates.
        num_boxes: Number of rows in the returned array; extra boxes in the
            file are ignored.

    Returns:
        A NumPy array of shape ``(num_boxes, 5)``. Each filled row is
        ``[1, x_min, y_min, x_max, y_max]``; rows without a box stay all zero.
    """
    # Single-class layout: one class column plus four box coordinates.
    targets = np.zeros((num_boxes, 5))
    with open(json_path, 'r') as handle:
        annotation = json.load(handle)
        # zip with range() stops after num_boxes entries, dropping any surplus boxes.
        for slot, entry in zip(range(num_boxes), annotation['boxes']):
            center_x = float(entry['x'])
            center_y = float(entry['y'])
            half_w = float(entry['width']) / 2
            half_h = float(entry['height']) / 2
            targets[slot, 0] = 1  # the 'ball' class is encoded as 1
            # Corner coordinates, scaled by the given image dimensions.
            targets[slot, 1:] = (
                (center_x - half_w) / img_width,
                (center_y - half_h) / img_height,
                (center_x + half_w) / img_width,
                (center_y + half_h) / img_height,
            )
    return targets

# Example usage:
# model = ObjectDetectionCNN(num_classes=1)  # Adjust num_classes based on your dataset
# image = load_image('../images/image_1.png')
# labels = load_labels('../json_labeling/label_1.json')
# output = model(image.unsqueeze(0))  # Add batch dimension


# Define a simple dataset class
class DetectionDataset(Dataset):
    """Dataset that pairs image files with label files by sorted position.

    The i-th file (in sorted name order) of ``image_dir`` is paired with the
    i-th file of ``label_dir``, so both directories must use file names that
    sort in the same order.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing the JSON label files.

    Raises:
        ValueError: If the two directories do not contain the same number of
            usable files, because index-based pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir):
        self.image_paths = self._list_files(image_dir)
        self.label_paths = self._list_files(label_dir)
        if len(self.image_paths) != len(self.label_paths):
            raise ValueError(
                f"Found {len(self.image_paths)} images in {image_dir!r} but "
                f"{len(self.label_paths)} label files in {label_dir!r}; "
                "cannot pair them by position"
            )

    @staticmethod
    def _list_files(directory):
        """Return sorted paths of the regular, non-hidden files directly in ``directory``."""
        # Hidden entries (.DS_Store, .ipynb_checkpoints, ...) and sub-directories
        # are skipped: a single stray entry would shift every image/label pair.
        candidates = (
            os.path.join(directory, name)
            for name in sorted(os.listdir(directory))
            if not name.startswith('.')
        )
        return [path for path in candidates if os.path.isfile(path)]

    def __len__(self):
        """Number of image/label pairs."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for the pair at position ``idx``."""
        image = load_image(self.image_paths[idx])
        labels = load_labels(self.label_paths[idx])
        return image, labels


# Initialize dataset and dataloader
# Images and labels are read from directories relative to the working directory.
dataset = DetectionDataset('../images', '../json_labeling')
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Initialize the model
# num_classes=1 gives outputs of shape [batch, 20, 5], matching the (20, 5)
# label arrays that load_labels produces with its defaults.
model = ObjectDetectionCNN(num_classes=1)
model.train()

# Define optimizer and loss
# MSE is taken over every element of the output, so the all-zero padding rows
# of the label array are regression targets too.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop
# NOTE(review): no random seed is set in this cell, so weight initialization and
# shuffle order (and therefore the printed losses) are not reproducible run to run.
num_epochs = 5
for epoch in range(num_epochs):
    running_loss = 0.0  # sum of per-batch losses for this epoch
    for images, labels in dataloader:
        images, labels = images.float(), labels.float()  # Ensure both are float32

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(images)
        
        # Loss calculation
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Reports the mean of the per-batch losses; raises ZeroDivisionError if the
    # dataloader yields no batches (empty dataset).
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader)}')


print("Training complete.")


Epoch [1/5], Loss: 0.2888109941780567
Epoch [2/5], Loss: 0.24019828528165818
Epoch [3/5], Loss: 0.23112022310495375
Epoch [4/5], Loss: 0.2127714893221855
Epoch [5/5], Loss: 0.19394513189792634
Training complete.


In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def visualize_predictions(image, predictions, threshold=0.5):
    """
    Visualizes predictions by drawing bounding boxes on the image.

    Args:
        image: Channel-first (CHW) image tensor; its height and width are used
            to scale the normalized box coordinates to pixels.
        predictions: Iterable of rows
            ``(class_score, x_min, y_min, x_max, y_max)``, e.g. a tensor of shape
            ``[num_boxes, 5]``, with corner coordinates normalized by the image
            width/height.
        threshold: Only rows whose score is strictly greater than this value
            are drawn.
    """
    # Drop autograd tracking and move to CPU before handing data to matplotlib:
    # tensors that require grad cannot be converted to NumPy arrays.
    image = image.detach().cpu()
    if hasattr(predictions, 'detach'):
        predictions = predictions.detach().cpu()

    # Scale by the real image size instead of assuming 256x256.
    height, width = image.shape[-2], image.shape[-1]

    fig, ax = plt.subplots(1)
    # Display the image
    ax.imshow(image.permute(1, 2, 0).numpy())  # Convert from CHW to HWC for matplotlib

    for pred in predictions:
        # Plain Python floats keep tensor objects out of the matplotlib patch.
        score, x_min, y_min, x_max, y_max = (float(value) for value in pred)
        if score > threshold:  # Only display predictions above a certain threshold
            rect = patches.Rectangle(
                (x_min * width, y_min * height),
                (x_max - x_min) * width,
                (y_max - y_min) * height,
                linewidth=1,
                edgecolor='r',
                facecolor='none',
            )
            ax.add_patch(rect)

    plt.show()
# Run the trained model on one dataset sample and draw its predicted boxes.
image, _ = dataset[0]  # Get the first image and its label (ignored here)
image = image.unsqueeze(0)  # Add batch dimension
# Inference mode. This leaves the model in eval mode; call model.train() before training again.
model.eval()
with torch.no_grad():  # No gradients needed; the output can then be converted for plotting
    output = model(image)  # Shape [1, num_boxes, 4 + num_classes]
visualize_predictions(image.squeeze(0), output.squeeze(0))  # Remove batch dimension for visualization


: 