<a href="https://colab.research.google.com/github/RKDash7/XAI-Enhanced-YOLOv8-for-Transparent-and-Interpretable-Object-Detection-in-Critical-Applications/blob/main/Faster_R_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchvision

import os
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# ----------------------
# 1. Custom Dataset Class for YOLO TXT
# ----------------------
class YOLODataset(Dataset):
    """Detection dataset pairing images with YOLO-format ``.txt`` label files.

    Every label line is ``class_id x_center y_center width height`` where the
    four coordinates are normalised by the image size.  Items are returned as
    ``(image, target)``; ``target`` holds absolute ``[xmin, ymin, xmax, ymax]``
    boxes and class ids shifted by +1 so that 0 stays reserved for background.

    Args:
        img_dir: Directory containing the images.
        label_dir: Directory containing one ``<image stem>.txt`` per image.
        transforms: Optional callable applied to the PIL image only.

    Raises:
        FileNotFoundError: From ``__getitem__`` when an image has no label file.
        ValueError: From ``__getitem__`` when a non-blank label line does not
            hold exactly five numeric fields.
    """

    # File extensions (lower-case) treated as images when scanning ``img_dir``.
    IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp")

    def __init__(self, img_dir, label_dir, transforms=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transforms = transforms
        # Keep image files only: notebook checkpoint folders, READMEs, etc.
        # would otherwise be opened as images and crash __getitem__.
        self.images = sorted(
            name for name in os.listdir(img_dir)
            if name.lower().endswith(self.IMG_EXTENSIONS)
        )

    def _label_path(self, image_name):
        """Return the label file path that belongs to ``image_name``."""
        # splitext swaps only the final extension, whatever it is, instead of
        # replacing every ".jpg" substring in the file name.
        stem, _ext = os.path.splitext(image_name)
        return os.path.join(self.label_dir, stem + ".txt")

    @staticmethod
    def _read_targets(label_path, img_w, img_h):
        """Parse one YOLO label file into ``(boxes, labels)`` tensors.

        ``boxes`` is float32 with shape (N, 4) in absolute pixel coordinates;
        ``labels`` is int64 with shape (N,).  N may be 0.
        """
        boxes = []
        labels = []
        with open(label_path, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # blank or trailing empty line
                class_id, x_center, y_center, width, height = map(float, fields)

                # Convert normalised centre/size to absolute corner coordinates
                boxes.append([
                    (x_center - width / 2) * img_w,
                    (y_center - height / 2) * img_h,
                    (x_center + width / 2) * img_w,
                    (y_center + height / 2) * img_h,
                ])
                labels.append(int(class_id) + 1)  # +1 because 0 is background in Faster R-CNN

        # reshape keeps the (N, 4) layout even when the file has no boxes, so
        # the column indexing used for the area below stays valid.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        return boxes, labels

    def __getitem__(self, idx):
        image_name = self.images[idx]
        img = Image.open(os.path.join(self.img_dir, image_name)).convert("RGB")
        w, h = img.size

        boxes, labels = self._read_targets(self._label_path(image_name), w, h)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros((len(labels),), dtype=torch.int64),
        }

        if self.transforms is not None:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.images)

# ----------------------
# 2. Transform Function
# ----------------------
def get_transform():
    """Return the image preprocessing pipeline: a single ``ToTensor`` step."""
    steps = [torchvision.transforms.ToTensor()]
    return torchvision.transforms.Compose(steps)

# ----------------------
# 3. Load Train & Validation Data
# ----------------------
def _collate_detection_batch(batch):
    """Regroup ``[(img, target), ...]`` into ``((img, ...), (target, ...))``."""
    return tuple(zip(*batch))


# Train split: images plus their YOLO label files
train_dataset = YOLODataset(
    "/content/Drowsiness-Detectin-Using-Yolov8-1/train/images/",
    "/content/Drowsiness-Detectin-Using-Yolov8-1/train/labels/",
    transforms=get_transform(),
)
# Validation split
val_dataset = YOLODataset(
    "/content/Drowsiness-Detectin-Using-Yolov8-1/valid/images/",
    "/content/Drowsiness-Detectin-Using-Yolov8-1/valid/labels/",
    transforms=get_transform(),
)

# Only the training loader shuffles; both use the tuple-regrouping collate
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=_collate_detection_batch,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=_collate_detection_batch,
)

# ----------------------
# 4. Model Setup
# ----------------------
def get_model(num_classes):
    """Build a pretrained Faster R-CNN (ResNet-50 FPN) with a fresh box head.

    Args:
        num_classes: Size of the classification output of the new predictor.
            The dataset in this file reserves label 0 for background, so the
            count passed here includes it.

    Returns:
        The torchvision detection model with ``roi_heads.box_predictor``
        replaced by a ``FastRCNNPredictor`` sized for ``num_classes``.
    """
    # ``pretrained=True`` is deprecated since torchvision 0.13; the "DEFAULT"
    # weights entry selects the same COCO-trained checkpoint.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
    # Reuse the existing head's input width so only the output size changes
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# num_classes=4: the predictor gets four outputs (label 0 is background)
model = get_model(num_classes=4).to(device)

# ----------------------
# 5. Optimizer
# ----------------------
# Hand the optimizer only the parameters that still require gradients
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

# ----------------------
# 6. Training Loop
# ----------------------
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of per-batch losses for this epoch
    for images, targets in train_loader:
        # Move the whole batch (images and every target tensor) to the device
        images = [img.to(device) for img in images]
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        # Called with targets, the model returns a dict of loss tensors
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())
        total_loss += losses.item()

        # Standard step: clear old gradients, backpropagate, update weights
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    # Report the mean loss per batch
    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {total_loss/len(train_loader):.4f}")

# ----------------------
# 7. Validation (Inference)
# ----------------------
# Switch to inference behaviour and skip gradient tracking
model.eval()
with torch.no_grad():
    for images, _ in val_loader:
        # Without targets the model returns one prediction dict per image
        outputs = model([img.to(device) for img in images])
        print(outputs)  # Contains 'boxes', 'labels', 'scores'

In [None]:
import torch
import cv2
import numpy as np
from PIL import Image
import torchvision.transforms as T
import os

# Image paths for the inference demo in this cell; both point into the
# dataset's test split (.../test/images/).
# NOTE(review): the paths are hard-coded to /content — confirm the dataset has
# been downloaded there before running this cell.
fig1 = [
    "/content/Drowsiness-Detectin-Using-Yolov8-1/test/images/2623_jpg.rf.47b6274cf4abfc681aa03d278227689a.jpg",
    "/content/Drowsiness-Detectin-Using-Yolov8-1/test/images/P1042797_720_mp4-202_jpg.rf.ad48eb832c35734f7c8427aba3a353d8.jpg"
]

# Define the box_label function
def box_label(image, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
    """Draw one box, and optionally a filled caption, onto ``image`` in place.

    Args:
        image: Array OpenCV can draw on; its ``shape`` determines line width.
        box: Indexable whose first four entries are the two corner points
            (x1, y1, x2, y2); each is truncated to ``int``.
        label: Caption text.  Nothing is written when it is empty.
        color: Colour tuple for the outline and the caption background.
        txt_color: Colour tuple for the caption text.
    """
    # Line width scales with the image size but never drops below 2
    line_width = max(round(sum(image.shape) / 2 * 0.003), 2)
    top_left = (int(box[0]), int(box[1]))
    bottom_right = (int(box[2]), int(box[3]))
    cv2.rectangle(image, top_left, bottom_right, color, thickness=line_width, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(line_width - 1, 1)
    font_scale = line_width / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]

    x0, y0 = top_left
    # The caption sits above the box when it fits, otherwise just inside it
    fits_above = y0 - text_h >= 3
    if fits_above:
        caption_corner = (x0 + text_w, y0 - text_h - 3)
        text_origin = (x0, y0 - 2)
    else:
        caption_corner = (x0 + text_w, y0 + text_h + 3)
        text_origin = (x0, y0 + text_h + 2)

    cv2.rectangle(image, top_left, caption_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        image,
        label,
        text_origin,
        0,
        font_scale,
        txt_color,
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )

# Define the plot_bboxes function
def plot_bboxes(image, boxes, labels=None, colors=None, score=True, conf=None):
    """Draw detections on ``image`` and display the annotated result.

    Args:
        image: Array to annotate in place.  Its first and third channels are
            swapped once right before display, and the OpenCV / Colab viewers
            interpret arrays as BGR, so pass an RGB image for the picture to
            appear with true colours.
        boxes: Iterable of rows laid out as
            ``[x1, y1, x2, y2, confidence, class_id]``.
        labels: Mapping from class id to display name.  ``None`` or ``[]``
            selects the built-in Background / Alert / Microsleep / Yawn map.
        colors: Sequence of colour tuples, indexed by ``class_id`` modulo its
            length.  ``None`` or ``[]`` selects a built-in palette.
        score: When true, append the confidence as a percentage to the caption.
        conf: Optional threshold; rows whose confidence is not strictly
            greater than it are skipped.
    """
    # Defaults are created per call rather than shared mutable default objects
    if labels is None or labels == []:
        labels = {0: u'Background', 1: u'Alert', 2: u'Microsleep', 3: u'Yawn'}
    if colors is None or colors == []:
        colors = [(89, 161, 197), (67, 161, 255), (19, 222, 24), (186, 55, 2)]

    for box in boxes:
        # Skip rows that do not clear the confidence threshold, if one is set
        if conf is not None and not box[-2] > conf:
            continue

        class_index = int(box[-1])
        if class_index in labels:
            label = labels[class_index]
        else:
            label = f"Unknown Class {class_index}"
        if score:
            label = label + " " + str(round(100 * float(box[-2]), 1)) + "%"

        color = colors[class_index % len(colors)]  # modulo keeps any class id in range
        box_label(image, box, label, color)

    # Swap the R and B channels (the swap is symmetric, so RGB input becomes
    # the BGR layout the viewers below expect).
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Import the Colab viewer here so the function does not depend on a
    # module-level ``cv2_imshow`` happening to exist before it is called.
    try:
        from google.colab.patches import cv2_imshow
    except ImportError:
        cv2.imshow("Inference Result", image)  # plain Python: blocking window
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    else:
        cv2_imshow(image)  # Colab: inline display


# 1. Load the fine-tuned model weights onto the model architecture.
# Ensure the model architecture is defined (it was defined in the previous subtasks)
# Assuming 'model' object from previous steps is available.
# If not, you would need to re-define it here with the correct number of classes.
# For this example, we assume the model architecture is still in memory.

# Load the state dictionary
# Use the path where you saved your best model weights
# If 'fasterrcnn_finetuned_epoch_10.pth' is not the correct path, update it.
#model.load_state_dict(torch.load("fasterrcnn_finetuned_epoch_10.pth"))


# 2. Set the model to evaluation mode.
# Done explicitly instead of relying on state left behind by an earlier cell:
# called without targets, the detection model only returns predictions when it
# is in eval mode.
model.eval()

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# 3. Select an image path from the fig1 list.
image_path = fig1[0]

# 4. Load and preprocess the selected image.
img_pil = Image.open(image_path).convert("RGB")
transform = T.Compose([T.ToTensor()]) # Simple ToTensor transform
# unsqueeze(0) adds the batch dimension in front of the single image
input_tensor = transform(img_pil).unsqueeze(0).to(device)

# 5. Run the model on the preprocessed image without tracking gradients.
with torch.no_grad():
    prediction = model(input_tensor)

# 6. Extract boxes, labels and scores from the raw model output.
# 'prediction' holds one dict per input image; a single image was passed in,
# so only the first entry is used.
detection = prediction[0]

# Minimum score a detection needs in order to be kept (adjust as needed)
conf_threshold = 0.5

# One boolean mask, applied to all three tensors so they stay aligned
keep = detection['scores'] > conf_threshold
boxes = detection['boxes'][keep]
labels = detection['labels'][keep]
scores = detection['scores'][keep]

# NumPy versions of the image and the kept detections for visualization
img_np = np.array(img_pil)
boxes_np, labels_np, scores_np = (
    tensor.cpu().numpy() for tensor in (boxes, labels, scores)
)


# 7. Prepare the detections for plot_bboxes.
# Display names for the class ids the plot should label; ids missing from this
# map are captioned "Unknown Class <id>" by plot_bboxes.
class_names = {1: 'Alert', 2: 'Microsleep', 3: 'Yawn'}

# plot_bboxes reads each row as [xmin, ymin, xmax, ymax, confidence, class_id],
# so append the score and the label to every box.
detections_for_plotting = [
    np.concatenate((box, [score], [label]))
    for box, score, label in zip(boxes_np, scores_np, labels_np)
]

# Make a cv2_imshow helper available: Colab's inline viewer when the package
# can be imported, otherwise a small wrapper around a blocking OpenCV window.
# Only ImportError is caught so unrelated failures are not silently hidden.
try:
    from google.colab.patches import cv2_imshow
except ImportError:
    IN_COLAB = False

    def cv2_imshow(img):
        """Show ``img`` in an OpenCV window and wait for a key press."""
        cv2.imshow("Inference Result", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
else:
    IN_COLAB = True


# 8. Draw the detections and display the image.
# plot_bboxes swaps the R and B channels itself right before handing the image
# to a viewer that expects BGR.  Passing an array that was already converted to
# BGR made the photo show up with red and blue exchanged, so pass an RGB copy.
img_rgb_for_plotting = img_np.copy()

# Printed first because the display call blocks until a key press outside Colab
print("Inference complete. Displaying image with detections:")
plot_bboxes(img_rgb_for_plotting, detections_for_plotting, labels=class_names, score=True, conf=conf_threshold)

# The boxes were drawn in place on the RGB copy; keep a BGR version under the
# original name so it stays available for OpenCV routines.
img_bgr_for_plotting = cv2.cvtColor(img_rgb_for_plotting, cv2.COLOR_RGB2BGR)