In [None]:
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

--2024-12-10 03:07:08--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.28.46, 52.217.114.57, 52.217.124.217, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.28.46|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip.1’


2024-12-10 03:07:28 (12.4 MB/s) - ‘annotations_trainval2017.zip.1’ saved [252907541/252907541]



In [None]:
!unzip annotations_trainval2017.zip

Archive:  annotations_trainval2017.zip
replace annotations/instances_train2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace annotations/instances_val2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace annotations/captions_train2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: m
error:  invalid response [m]
replace annotations/captions_train2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace annotations/captions_val2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace annotations/person_keypoints_train2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace annotations/person_keypoints_val2017.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
!wget http://images.cocodataset.org/zips/val2017.zip

--2024-12-10 03:09:28--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 16.182.108.97, 3.5.28.193, 52.217.133.25, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|16.182.108.97|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘val2017.zip.1’


2024-12-10 03:10:20 (15.1 MB/s) - ‘val2017.zip.1’ saved [815585330/815585330]



In [None]:
!unzip val2017.zip

Archive:  val2017.zip
replace val2017/000000212226.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace val2017/000000231527.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace val2017/000000578922.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace val2017/000000062808.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace val2017/000000119038.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace val2017/000000114871.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
!pip install pycocotools ultralytics




In [None]:
import os
import torch
import torchvision
from torchvision import transforms as T
from torch.utils.data import DataLoader, random_split
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from pycocotools.coco import COCO
from PIL import Image
import json
import random

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define COCO dataset class for custom dataset (detect "person" only)
class CustomCocoDataset(torch.utils.data.Dataset):
    """COCO detection dataset restricted to the "person" category.

    Each item is ``(image, target)`` where ``target`` holds the keys
    ``boxes`` (float32, ``[x_min, y_min, x_max, y_max]``), ``labels``,
    ``image_id``, ``area`` and ``iscrowd``; every per-box entry has one
    element per kept person box.

    Images without any usable person box yield ``None`` so that the caller
    can skip them.

    Args:
        root: Directory that contains the image files.
        annotation_file: Path to the COCO instances JSON file.
        transform: Optional callable applied to the PIL image only. The boxes
            are NOT transformed, so a transform that changes the image
            geometry (flip, rotation, resize) makes the boxes inconsistent
            with the pixels.
    """

    # Category id of "person" in the COCO annotations
    PERSON_CATEGORY_ID = 1

    def __init__(self, root, annotation_file, transform=None):
        self.root = root
        self.coco = COCO(annotation_file)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform

    def __getitem__(self, idx):
        """Load image ``idx`` and its person boxes, or return ``None`` if it has none."""
        img_id = self.ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        path = os.path.join(self.root, img_info['file_name'])
        image = Image.open(path).convert("RGB")

        # Get annotations for this image
        annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))

        boxes = []
        labels = []
        iscrowd = []
        for ann in annotations:
            # Keep only the "person" category
            if ann['category_id'] != self.PERSON_CATEGORY_ID:
                continue
            # COCO stores boxes as [x, y, width, height]
            x, y, width, height = ann['bbox']
            # Zero-area boxes are rejected by torchvision detection models in training mode
            if width <= 0 or height <= 0:
                continue
            boxes.append([x, y, x + width, y + height])  # converted to [x_min, y_min, x_max, y_max]
            labels.append(ann['category_id'])
            iscrowd.append(ann.get('iscrowd', 0))

        # If there are no usable "person" annotations, signal the caller to skip this image
        if len(boxes) == 0:
            return None

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        target = {
            'boxes': boxes,
            'labels': labels,
            'image_id': torch.tensor([img_id]),
            'area': (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]),  # area of bounding boxes
            # One flag per kept box (previously sized by ALL annotations of the image)
            'iscrowd': torch.tensor(iscrowd, dtype=torch.int64),
        }

        if self.transform:
            image = self.transform(image)

        return image, target

    def __len__(self):
        """Number of images listed in the annotation file (including images without persons)."""
        return len(self.ids)

# Image preprocessing.
# CustomCocoDataset applies the transform to the image only, so geometric
# augmentations (flip, rotation, resize) would move the pixels without moving
# the bounding boxes. Only the tensor conversion is kept; torchvision's
# Faster R-CNN resizes and normalizes its inputs internally.
transform = T.Compose([
    T.ToTensor(),
])

# Paths to the dataset and annotations
# NOTE(review): training uses the val2017 split, which is also what the evaluation cell reads — confirm this is intended
train_images_path = 'val2017'
train_annotations_path = 'annotations/instances_val2017.json'

# Load the custom dataset
train_dataset = CustomCocoDataset(root=train_images_path, annotation_file=train_annotations_path, transform=transform)

# Number of images per epoch
images_per_epoch = 1000


def _collate_detection_batch(batch):
    """Drop samples the dataset skipped (``None``) and regroup into ``(images, targets)``."""
    kept = [sample for sample in batch if sample is not None]
    return tuple(zip(*kept)) if kept else ((), ())


# Create a DataLoader with a batch size of 4 (or adjust to your preference)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=_collate_detection_batch)

# Load a pre-trained Faster R-CNN model
# NOTE(review): newer torchvision releases prefer the `weights=` argument over `pretrained=True` — confirm against the installed version
model_frcnn = fasterrcnn_resnet50_fpn(pretrained=True).to(device)
model_frcnn.eval()  # Inference mode for now; this does not freeze weights, and train_model() switches back to train mode

# Replace the pre-trained head with a new one for custom dataset (1 class: "person")
in_features = model_frcnn.roi_heads.box_predictor.cls_score.in_features
model_frcnn.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes=2)  # 2 classes: background and person

# Move the model (including the freshly created head) to the selected device
model_frcnn.to(device)

# Optimizer and learning rate
params = [p for p in model_frcnn.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr=1e-4)

# Learning rate scheduler: multiply the learning rate by 0.1 every 3 scheduler steps
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Training loop
def train_model(num_epochs=10, batch_size=4):
    """Fine-tune ``model_frcnn`` on the person-only dataset.

    Every epoch shuffles ``train_dataset.ids`` in place and trains on the first
    ``images_per_epoch`` entries (capped at the dataset size). Images for which
    the dataset returns ``None`` are skipped. After each epoch the mean loss per
    optimized batch is printed, the learning-rate scheduler is stepped and the
    model weights are saved to ``fasterrcnn_person_epoch_<n>.pth``.

    Args:
        num_epochs: Number of passes to run.
        batch_size: Maximum number of images per optimization step.
    """
    model_frcnn.train()  # Set model to training mode

    # Never index past the end of the dataset
    epoch_size = min(images_per_epoch, len(train_dataset))

    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        random.shuffle(train_dataset.ids)  # Shuffle so each epoch sees a different subset

        for start in range(0, epoch_size, batch_size):
            images, targets = [], []
            for j in range(start, min(start + batch_size, epoch_size)):
                data = train_dataset[j]
                if data:  # Skip images with no "person" annotation
                    image, target = data
                    images.append(image)
                    targets.append(target)

            if not images:
                continue  # Every image of this batch was skipped

            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward pass: in training mode the model returns a dict of losses
            loss_dict = model_frcnn(images, targets)

            losses = sum(loss for loss in loss_dict.values())  # Total loss
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            running_loss += losses.item()
            num_batches += 1

        # Report the mean loss per optimized batch (the sum is over batches, not images)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / max(num_batches, 1)}")

        # Update learning rate
        lr_scheduler.step()

        # Save the model after each epoch
        torch.save(model_frcnn.state_dict(), f'fasterrcnn_person_epoch_{epoch + 1}.pth')

# Run the training loop
# The recorded cell output shows this branch executing when the cell is run in the notebook.
if __name__ == "__main__":
    print("Training Faster R-CNN for 'Person' class only...")
    train_model()

loading annotations into memory...
Done (t=1.45s)
creating index...
index created!
Training Faster R-CNN for 'Person' class only...
Epoch 1/10, Loss: 0.14813434050232172
Epoch 2/10, Loss: 0.13650826124846935
Epoch 3/10, Loss: 0.12810188481211662
Epoch 4/10, Loss: 0.12305251525342464
Epoch 5/10, Loss: 0.11842656765133143
Epoch 6/10, Loss: 0.11965451072901487
Epoch 7/10, Loss: 0.12895254723727703
Epoch 8/10, Loss: 0.1299205744192004
Epoch 9/10, Loss: 0.12156219039857387
Epoch 10/10, Loss: 0.11982213146984577


In [None]:
import os
import time
import json
import torch
from PIL import Image
import torchvision.transforms.functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.ops.boxes import box_iou
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import torch
from torchvision.ops.boxes import box_iou
from sklearn.metrics import average_precision_score

# Select the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): this rebinds `model_frcnn` to a freshly loaded pre-trained model, so the
# weights fine-tuned by train_model() in the previous cell are not what gets evaluated below — confirm this is intended
model_frcnn = fasterrcnn_resnet50_fpn(pretrained=True).to(device)
model_frcnn.eval()  # Switch the model to evaluation (inference) mode


# COCO class list
# NOTE(review): not referenced anywhere in the visible code; the list has no placeholder entries,
# so verify the index-to-name mapping against the model's label ids before using it for lookups
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Detect only "person" objects in a single image
def detect_person_frcnn(image_path, score_threshold=0.5):
    """Run ``model_frcnn`` on one image and keep the confident "person" detections.

    Args:
        image_path: Path of the image file to open.
        score_threshold: Detections must score strictly above this value.

    Returns:
        ``(person_predictions, image)`` where ``person_predictions`` is a dict
        with tensors ``boxes``, ``scores`` and ``labels`` restricted to
        label 1 ("person"), and ``image`` is the RGB PIL image that was loaded.
        When nothing is kept, the tensors are empty but keep the shape and
        dtype of the model output.
    """
    # The context manager closes the file handle; convert() returns an independent RGB copy
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Preprocessing: convert to a tensor on the target device
    image_tensor = F.to_tensor(image).to(device)

    # Inference
    with torch.no_grad():
        output = model_frcnn([image_tensor])[0]

    # Keep only label 1 ("person") with a score above the threshold, in one vectorized mask
    keep = (output['labels'] == 1) & (output['scores'] > score_threshold)
    person_predictions = {
        "boxes": output['boxes'][keep],
        "scores": output['scores'][keep],
        "labels": output['labels'][keep],
    }

    return person_predictions, image  # predictions plus the original image

# Display the prediction results
def plot_predictions(image, predictions):
    """Show the image with a labelled blue rectangle for every predicted box."""
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    axes = plt.gca()

    # One rectangle and one score caption per prediction (nothing is drawn when there are none)
    for box, score in zip(predictions["boxes"], predictions["scores"]):
        left, top, right, bottom = box.cpu().numpy()
        outline = patches.Rectangle(
            (left, top),
            right - left,
            bottom - top,
            linewidth=2,
            edgecolor='blue',
            facecolor='none',
        )
        axes.add_patch(outline)
        axes.text(
            left,
            top - 5,
            f'Person: {score:.2f}',
            color='blue',
            fontsize=12,
            bbox=dict(facecolor='white', alpha=0.5),
        )

    axes.axis('off')
    plt.show()

# Path to the COCO val2017 image folder
val2017_path = "val2017"  # Change to the path of your val2017 folder
annotations_path = "annotations/instances_val2017.json"  # COCO ground-truth JSON file

# Load the COCO ground truth
with open(annotations_path, "r") as f:
    coco_data = json.load(f)

# Extract the ground truth of one image from the COCO JSON data
def get_ground_truth(coco_data, image_id):
    """Return the "person" ground-truth boxes of one image.

    Args:
        coco_data: Parsed COCO annotation JSON; its ``"annotations"`` list is scanned.
        image_id: Id of the image whose annotations are wanted.

    Returns:
        Dict with ``boxes`` (float32, shape ``(N, 4)`` in ``[x1, y1, x2, y2]``
        order) and ``labels`` (int64, shape ``(N,)``). ``N`` is 0 when the
        image has no person annotation; the shapes and dtypes stay the same.
    """
    boxes = []
    labels = []
    for ann in coco_data["annotations"]:
        # Keep only "person" (category 1) annotations of the requested image
        if ann["image_id"] != image_id or ann["category_id"] != 1:
            continue
        # COCO bbox format [x, y, width, height] -> [x1, y1, x2, y2]
        x1, y1, w, h = ann["bbox"]
        boxes.append([x1, y1, x1 + w, y1 + h])
        labels.append(ann["category_id"])
    return {
        # The explicit reshape keeps the (N, 4) layout even when N == 0
        "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4),
        "labels": torch.tensor(labels, dtype=torch.int64),
    }

# How many validation images to evaluate, and how many of those to visualize
NUM_EVAL_IMAGES = 50
NUM_VISUALIZED = 10

# Collect the image files; anything that is not a JPEG is ignored so the id parsing below cannot fail on stray files
image_files = sorted(
    name for name in os.listdir(val2017_path) if name.lower().endswith(".jpg")
)[:NUM_EVAL_IMAGES]
predictions = []
ground_truths = []

# Predict on the selected images
for idx, image_file in enumerate(image_files):
    image_path = os.path.join(val2017_path, image_file)
    image_id = int(os.path.splitext(image_file)[0])  # The file name (without extension) is the image id

    # Detect objects
    pred, image = detect_person_frcnn(image_path)
    predictions.append(pred)
    # Fetch the ground truth
    gt = get_ground_truth(coco_data, image_id)
    ground_truths.append(gt)

    # Visualize only the first NUM_VISUALIZED images
    if idx < NUM_VISUALIZED:
        print(f"Visualizing prediction for {image_file}...")
        plot_predictions(image, pred)

    print(f"Processed {idx + 1}/{len(image_files)}: {image_file}")

def evaluate_person_predictions(predictions, ground_truths, iou_threshold=0.5, score_threshold=0.5):
    """Compute dataset-level precision and recall for the "person" label (label 1).

    Each prediction is compared with the ground-truth box it overlaps most. It
    counts as a true positive when that IoU reaches ``iou_threshold``, the box
    has not been matched by an earlier prediction of the same image, and the
    predicted label is 1; otherwise it counts as a false positive.

    Args:
        predictions: List of dicts with ``boxes``, ``scores`` and ``labels``.
        ground_truths: List of dicts with ``boxes`` (and ``labels``), aligned
            with ``predictions``.
        iou_threshold: Minimum IoU for a match.
        score_threshold: Predictions scoring below this value are ignored.

    Returns:
        ``(precision, recall)``; each is 0.0 when its denominator is zero.
    """
    total_true_positives = 0
    total_false_positives = 0
    total_ground_truths = 0

    for pred, gt in zip(predictions, ground_truths):
        pred_boxes = pred["boxes"]
        pred_scores = pred["scores"]
        pred_labels = pred["labels"]

        # Put the ground truth on the same device as the predictions (no reliance on a global)
        gt_boxes = gt["boxes"]
        if isinstance(pred_boxes, torch.Tensor):
            gt_boxes = gt_boxes.to(pred_boxes.device)

        # IoU matrix (predictions x ground truths); only needed when both sides are non-empty
        has_gt = len(gt_boxes) > 0
        iou = box_iou(pred_boxes, gt_boxes) if has_gt and len(pred_boxes) > 0 else None

        matched_gt = set()
        for i, (box, score, label) in enumerate(zip(pred_boxes, pred_scores, pred_labels)):
            if score < score_threshold:  # Ignore low-confidence predictions
                continue

            if not has_gt:
                total_false_positives += 1
                continue

            # Ground-truth box with the highest IoU for this prediction
            max_iou, max_iou_idx = iou[i].max(0)
            gt_index = max_iou_idx.item()

            if max_iou >= iou_threshold and gt_index not in matched_gt and label == 1:
                total_true_positives += 1
                matched_gt.add(gt_index)
            else:
                total_false_positives += 1

        total_ground_truths += len(gt_boxes)

    num_counted = total_true_positives + total_false_positives
    precision = total_true_positives / num_counted if num_counted > 0 else 0.0
    recall = total_true_positives / total_ground_truths if total_ground_truths > 0 else 0.0
    return precision, recall

# Evaluate the results
# If ground truth is available, add mAP or precision-recall evaluation
precision, recall = evaluate_person_predictions(predictions, ground_truths)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")

# Collect the per-image inputs needed for the Average Precision computation
def compute_average_precision(predictions, ground_truths, iou_threshold=0.5):
    """
    Gather, image by image, the data used to score "person" (label = 1) detections.

    Despite its name this function computes no metric: it returns four parallel
    lists (scores, labels, predicted boxes, person ground-truth boxes), one entry
    per image. ``iou_threshold`` is accepted but not used here.

    predictions: list of dicts with 'boxes', 'scores' and 'labels'
    ground_truths: list of dicts with 'boxes' and 'labels' (tensors or plain lists)
    """
    score_batches, label_batches, box_batches, gt_box_batches = [], [], [], []

    for pred, gt in zip(predictions, ground_truths):
        truth_boxes, truth_labels = gt["boxes"], gt["labels"]

        # Plain lists are promoted to tensors first
        if isinstance(truth_boxes, list):
            truth_boxes = torch.tensor(truth_boxes, dtype=torch.float32)
        if isinstance(truth_labels, list):
            truth_labels = torch.tensor(truth_labels, dtype=torch.int64)

        # Keep only the "person" boxes; images without labels get an empty (0, 4) tensor
        if len(truth_labels) == 0:
            person_boxes = torch.empty((0, 4))
        else:
            is_person = truth_labels == 1
            person_boxes = truth_boxes[is_person] if len(truth_boxes) > 0 else torch.empty((0, 4))

        # Predictions are passed through untouched
        detected_boxes = pred["boxes"]
        detected_scores = pred["scores"]
        detected_labels = pred["labels"]

        gt_box_batches.append(person_boxes)
        box_batches.append(detected_boxes)
        score_batches.append(detected_scores)
        label_batches.append(detected_labels)

    return score_batches, label_batches, box_batches, gt_box_batches


# Compute the mean Average Precision (mAP)
def _match_detections(scores, detections, gt_boxes, iou_threshold):
    """Return a 0/1 true-positive flag per detection, in descending score order."""
    order = torch.argsort(scores, descending=True).tolist()
    tp = np.zeros(len(order))
    matched_gt = set()  # each ground-truth box may validate at most one detection

    for rank, det_idx in enumerate(order):
        ious = box_iou(detections[det_idx].unsqueeze(0), gt_boxes)[0]
        best_iou, best_gt = ious.max(0)
        gt_index = best_gt.item()
        if best_iou.item() >= iou_threshold and gt_index not in matched_gt:
            tp[rank] = 1
            matched_gt.add(gt_index)
    return tp


def _eleven_point_ap(tp, total_gt):
    """11-point interpolated AP from ranked true-positive flags and the ground-truth count."""
    tp_cumsum = np.cumsum(tp)
    fp_cumsum = np.cumsum(1 - tp)
    precision = tp_cumsum / (tp_cumsum + fp_cumsum)  # denominator is the rank (1..n), never zero
    recall = tp_cumsum / total_gt

    ap = 0.0
    for level in np.linspace(0, 1, num=11):
        # Small tolerance so that e.g. recall 0.3 still counts for the 0.3 level
        reached = recall >= level - 1e-9
        if reached.any():
            ap += precision[reached].max()
        # Recall levels that are never reached contribute a precision of 0
    return ap / 11


def compute_map(all_scores, all_labels, all_detections, all_gt_boxes, iou_threshold=0.5):
    """
    Compute the mean of the per-image Average Precision values.

    For every image the detections are ranked by score and matched to the
    ground-truth box they overlap most; a ground-truth box can be matched only
    once. The per-image AP uses 11-point interpolation (recall levels
    0, 0.1, ..., 1).

    Images with neither ground truth nor detections are left out of the mean.
    Images with only ground truth, or only detections, contribute an AP of 0.

    Note: this averages AP over images; it is not the dataset-level AP obtained
    by pooling the detections of all images.

    Args:
        all_scores: List of per-image score tensors.
        all_labels: List of per-image label tensors (kept for interface
            compatibility; the values are not used).
        all_detections: List of per-image predicted boxes ``[x1, y1, x2, y2]``.
        all_gt_boxes: List of per-image ground-truth boxes ``[x1, y1, x2, y2]``.
        iou_threshold: Minimum IoU for a detection to match a ground-truth box.

    Returns:
        The mean AP as a float, or 0.0 when no image could be scored.
    """
    average_precision_list = []

    for scores, labels, detections, gt_boxes in zip(all_scores, all_labels, all_detections, all_gt_boxes):
        num_detections = len(scores)
        total_gt = len(gt_boxes) if gt_boxes.numel() > 0 else 0

        if num_detections == 0 and total_gt == 0:
            continue  # nothing to find and nothing reported: AP is undefined
        if num_detections == 0 or total_gt == 0:
            average_precision_list.append(0.0)  # only misses, or only false positives
            continue

        # box_iou needs both operands on the same device
        gt_boxes = gt_boxes.to(detections.device)
        tp = _match_detections(scores, detections, gt_boxes, iou_threshold)
        average_precision_list.append(_eleven_point_ap(tp, total_gt))

    # Mean Average Precision (mAP)
    if not average_precision_list:
        return 0.0
    return float(np.mean(average_precision_list))

# Gather the scores, labels, predicted boxes and ground-truth boxes for the mAP evaluation
all_scores, all_labels, all_detections, all_gt_boxes = compute_average_precision(predictions, ground_truths)
# Select the device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Make sure every element is a tensor on `device`.
# torch.as_tensor accepts both lists and tensors, and avoids the copy-construct
# warning that torch.tensor(existing_tensor) raises.
all_scores = [torch.as_tensor(score).to(device) for score in all_scores]
all_labels = [torch.as_tensor(label).to(device) for label in all_labels]
all_detections = [torch.as_tensor(detection).to(device) for detection in all_detections]
all_gt_boxes = [torch.as_tensor(gt_box).to(device) for gt_box in all_gt_boxes]

# Now the mAP can be computed
map_score = compute_map(all_scores, all_labels, all_detections, all_gt_boxes)
print(f"mAP: {map_score:.4f}")



