### YOLO data format to RCNN format.

In [None]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader, Dataset
import os
from PIL import Image

# Custom collate function
def collate_fn(batch):
    """Transpose a batch of samples into one tuple per sample field.

    ``batch`` is a sequence of equally long samples (here ``(image, target)``
    pairs); the result groups the first elements together, the second
    elements together, and so on, without stacking them into tensors.
    """
    columns = zip(*batch)
    return tuple(columns)

# Custom dataset class
class CustomDataset(Dataset):
    """Detection dataset that reads YOLO-style label files.

    Expects ``<root>/images/*.jpg`` and ``<root>/labels/*.txt``. An image is
    used only when a label file with the same file stem exists. Every label
    line has the form ``class x_center y_center width height`` with the box
    values normalized by the image size; they are converted to pixel
    ``[x_min, y_min, x_max, y_max]`` boxes.

    Class ids are passed through unchanged.
    NOTE(review): torchvision detection models reserve label 0 for the
    background; confirm the label files do not use 0 for a real class.

    Args:
        root: Directory containing the ``images`` and ``labels`` folders.
        transforms: Optional callable applied to the PIL image only (the
            target is not transformed).
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        self.image_folder = os.path.join(root, "images")
        self.label_folder = os.path.join(root, "labels")

        # Sorted so that sample indices do not depend on the arbitrary
        # ordering returned by os.listdir.
        self.images = sorted(
            f for f in os.listdir(self.image_folder) if f.endswith('.jpg')
        )
        self.labels = sorted(
            f for f in os.listdir(self.label_folder) if f.endswith('.txt')
        )

        # Match images to labels through a stem lookup instead of comparing
        # every image against every label.
        labels_by_stem = {os.path.splitext(lbl)[0]: lbl for lbl in self.labels}
        self.image_label_pairs = [
            (img, labels_by_stem[os.path.splitext(img)[0]])
            for img in self.images
            if os.path.splitext(img)[0] in labels_by_stem
        ]

    @staticmethod
    def _load_annotations(label_path, img_width, img_height):
        """Parse one label file into pixel boxes and class ids.

        Returns:
            ``(boxes, labels)`` where ``boxes`` is a float32 tensor of shape
            ``(N, 4)`` and ``labels`` an int64 tensor of shape ``(N,)``.
            ``N`` is 0 for a file without annotations.

        Raises:
            ValueError: If a non-blank line does not consist of an integer
                class id followed by exactly four floats.
        """
        boxes = []
        labels = []
        with open(label_path, "r") as f:
            for line in f:
                elements = line.split()
                if not elements:
                    # Blank or trailing empty lines carry no annotation.
                    continue
                label = int(elements[0])
                x_center, y_center, width, height = map(float, elements[1:])

                # Convert normalized center/size values to pixel corners.
                x_min = (x_center - width / 2) * img_width
                y_min = (y_center - height / 2) * img_height
                x_max = (x_center + width / 2) * img_width
                y_max = (y_center + height / 2) * img_height

                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(label)

        # reshape keeps the (N, 4) layout even when there are no boxes;
        # a bare empty list would otherwise become a tensor of shape (0,).
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        return boxes, labels

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``target`` is a dict with ``"boxes"`` and ``"labels"`` tensors.
        """
        img_name, label_name = self.image_label_pairs[idx]
        img_path = os.path.join(self.image_folder, img_name)
        label_path = os.path.join(self.label_folder, label_name)

        # convert() returns an independent copy, so the file handle can be
        # released right away.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        img_width, img_height = img.size

        boxes, labels = self._load_annotations(label_path, img_width, img_height)
        target = {"boxes": boxes, "labels": labels}

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Return the number of image/label pairs."""
        return len(self.image_label_pairs)

In [None]:
def main():
    """Fine-tune a COCO-pretrained Faster R-CNN on the YOLO-labelled dataset.

    Builds the dataset and loader, swaps the box predictor for one with
    ``num_classes`` outputs, then runs SGD for a fixed number of epochs while
    printing the summed loss after every iteration.

    NOTE(review): the trained weights are not saved anywhere in this
    function.
    """
    # Data pipeline: PIL images are converted to tensors; samples stay as
    # tuples because images and targets differ in size.
    dataset = CustomDataset(root="dataset_zod/test", transforms=F.to_tensor)
    data_loader = DataLoader(
        dataset,
        batch_size=2,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn,
    )

    # Pre-trained detector with a fresh classification/regression head.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')
    num_classes = 11  # Update this with your actual number of classes (background included)
    predictor_inputs = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
        predictor_inputs, num_classes
    )

    # Optimize only the parameters that require gradients.
    trainable = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=0.005, momentum=0.9, weight_decay=0.0005)

    # Prefer the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        # The iteration counter restarts at 1 for every epoch.
        for i, (images, targets) in enumerate(data_loader, start=1):
            images = [image.to(device) for image in images]
            targets = [
                {key: value.to(device) for key, value in target.items()}
                for target in targets
            ]

            # In training mode the model returns a dict of loss terms.
            loss_dict = model(images, targets)
            losses = sum(loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            print(f"Epoch {epoch+1}/{num_epochs}, Iteration {i}, Loss: {losses.item()}")

    print("Training completed.")

if __name__ == "__main__":
    main()

In [29]:
import torch
import torchvision
from torchvision.transforms import functional as F
from PIL import Image, ImageDraw, ImageFont
import os

# Path to the folder containing the images
image_folder = "/Users/varunravi/Desktop/OD/dataset_zod/valid/images"
output_folder = "/Users/varunravi/Desktop/OD/dataset_zod/valid/inference_results"  # Save results in a new folder
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Upper bound on how many images (in sorted filename order) are processed.
max_images = 10

# Detections scoring at or below this value are not drawn.
threshold = 0.01

# Build the Faster R-CNN architecture with the replaced box predictor
num_classes = 91  # Your number of classes (including background)
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

# Load the trained weights. Inference below runs on the CPU, so the
# checkpoint is mapped to the CPU as well; this also lets a checkpoint saved
# on a GPU machine load here.
# NOTE(review): num_classes must match the value the checkpoint was trained
# with, otherwise load_state_dict raises a size-mismatch error.
state_dict = torch.load('/Users/varunravi/Desktop/OD/fastr-rcnn-trained.pth', map_location='cpu')
model.load_state_dict(state_dict)
model.eval()  # Set model to evaluation mode

# To run on a GPU instead, move BOTH the model and every image tensor to the
# same device; the model and the inputs are deliberately left on the CPU here.

# Get the sorted list of images, limited to the first max_images entries
images = sorted(img for img in os.listdir(image_folder) if img.endswith('.jpg'))[:max_images]

# The default font does not change between images, so load it once.
font = ImageFont.load_default()

# Perform inference on the selected images
for img_name in images:
    img_path = os.path.join(image_folder, img_name)
    # convert() returns an independent copy, so the file can be closed at once.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert("RGB")
    image_tensor = F.to_tensor(image).unsqueeze(0)  # Add batch dimension

    # Perform inference
    with torch.no_grad():
        predictions = model(image_tensor)

    # Extract predictions for the single image in the batch
    pred_boxes = predictions[0]['boxes'].cpu().numpy()  # Bounding boxes
    pred_scores = predictions[0]['scores'].cpu().numpy()  # Confidence scores
    pred_labels = predictions[0]['labels'].cpu().numpy()  # Class labels

    # Keep only detections above the confidence threshold
    selected_indices = [i for i, score in enumerate(pred_scores) if score > threshold]

    # Draw bounding boxes and labels on the image
    draw = ImageDraw.Draw(image)

    for i in selected_indices:
        box = pred_boxes[i]
        label = pred_labels[i]
        score = pred_scores[i]
        draw.rectangle(((box[0], box[1]), (box[2], box[3])), outline="red", width=3)
        draw.text((box[0], box[1]), f'{label}:{score:.2f}', fill="white", font=font)

    # Save the result image
    output_path = os.path.join(output_folder, f"inference_{img_name}")
    image.save(output_path)

    print(f"Processed {img_name}, saved to {output_path}")

# Report the number of images actually processed rather than a fixed figure.
print(f"Inference completed for the first {len(images)} images.")

Processed 000021_india_2021-04-15T12:59:51.662875Z.jpg, saved to /Users/varunravi/Desktop/OD/dataset_zod/valid/inference_results/inference_000021_india_2021-04-15T12:59:51.662875Z.jpg
Processed 000051_golf_2021-04-29T05:38:42.111236Z.jpg, saved to /Users/varunravi/Desktop/OD/dataset_zod/valid/inference_results/inference_000051_golf_2021-04-29T05:38:42.111236Z.jpg
Processed 000055_india_2021-04-14T11:35:43.225463Z.jpg, saved to /Users/varunravi/Desktop/OD/dataset_zod/valid/inference_results/inference_000055_india_2021-04-14T11:35:43.225463Z.jpg
Processed 000089_india_2021-04-18T15:55:46.776483Z.jpg, saved to /Users/varunravi/Desktop/OD/dataset_zod/valid/inference_results/inference_000089_india_2021-04-18T15:55:46.776483Z.jpg
Processed 000096_india_2021-04-19T10:48:43.670665Z.jpg, saved to /Users/varunravi/Desktop/OD/dataset_zod/valid/inference_results/inference_000096_india_2021-04-19T10:48:43.670665Z.jpg
Processed 000228_india_2021-04-19T08:36:12.774650Z.jpg, saved to /Users/varunravi/