## Read YOLO format laebl, and define function to train and evaluate model

In [1]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# Dataset class for YOLO format
class YOLODetectionDataset(Dataset):
    def __init__(self, image_dirs, label_dirs, transform=None):
        self.images = []
        self.labels = []
        for img_dir, lbl_dir in zip(image_dirs, label_dirs):
            for fname in os.listdir(img_dir):
                if not fname.lower().endswith(('.jpg','jpeg','.png')):
                    continue
                img_path = os.path.join(img_dir, fname)
                lbl_path = os.path.join(lbl_dir, os.path.splitext(fname)[0] + '.txt')
                
                # only keep if .txt exists _and_ is non‑empty
                if os.path.exists(lbl_path) and os.path.getsize(lbl_path) > 0:
                    self.images.append(img_path)
                    self.labels.append(lbl_path)
        self.transform = transform or transforms.ToTensor()
        
    def __len__(self):
        return len(self.images)
        
    def __getitem__(self, idx):
        img = Image.open(self.images[idx]).convert("RGB")
        img = self.transform(img)
        boxes, labels = [], []
        with open(self.labels[idx]) as f:
            for line in f:
                cls, x_c, y_c, w, h = map(float, line.split())
                ih, iw = img.shape[1], img.shape[2]
                x_c, y_c, w, h = x_c*iw, y_c*ih, w*iw, h*ih
                x_min = x_c - w/2; y_min = y_c - h/2
                x_max = x_c + w/2; y_max = y_c + h/2
                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(1)  # Only one class
        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32),
            "labels": torch.tensor(labels, dtype=torch.int64)
        }
        return img, target

# Utility functions
def collate_fn(batch): return tuple(zip(*batch))

def train_model(loader, model, optimizer, device, epochs=10):
    model.to(device).train()
    for e in range(epochs):
        total_loss = 0
        for images, targets in loader:
            images = [img.to(device) for img in images]
            targs = [{k: v.to(device) for k,v in t.items()} for t in targets]
            loss_dict = model(images, targs)
            loss = sum(loss_dict.values())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {e+1}/{epochs}, Loss: {total_loss:.4f}")

# Evaluation (built it same as the YOLO)
def evaluate_model(loader, model, device, conf_thr=0.001):
    model.to(device).eval()
    # IoU thresholds 0.50:0.95
    mAP_default = MeanAveragePrecision()
    # Only IoU = 0.50
    mAP50 = MeanAveragePrecision(iou_thresholds=[0.5])

    n_images = 0
    n_instances = 0

    with torch.no_grad():
        for images, targets in loader:
            # count images & GT boxes
            n_images    += len(images)
            n_instances += sum(t["boxes"].shape[0] for t in targets)

            images = [img.to(device) for img in images]
            outputs = model(images)

            preds_default, preds_50, gts = [], [], []
            for out, tgt in zip(outputs, targets):
                # apply a confidence threshold
                keep = out["scores"] >= conf_thr

                boxes  = out["boxes"][keep].cpu()
                scores = out["scores"][keep].cpu()
                labels = out["labels"][keep].cpu()

                pred = {"boxes": boxes, "scores": scores, "labels": labels}
                preds_default.append(pred)
                preds_50.append(pred)

                gts.append({"boxes": tgt["boxes"], "labels": tgt["labels"]})

            # update both metrics
            mAP_default.update(preds_default, gts)
            mAP50.update(preds_50, gts)

    res_def = mAP_default.compute()
    res_50  = mAP50.compute()
    P = res_50["map"].item()
    R = res_50["mar_100"].item()
    mAP50v = res_50["map"].item()
    mAP5095v = res_def["map"].item()

    header = f"{'Class':<10}{'Images':>8}{'Instances':>12}{'Box(P)':>8}{'R':>8}{'mAP50':>8}{'mAP50-95':>12}"
    row = f"{'all':<10}{n_images:8d}{n_instances:12d}{P:8.3f}{R:8.3f}{mAP50v:8.3f}{mAP5095v:12.3f}"
    print(header)
    print(row)

    return {
        "Class": "all",
        "Images": n_images,
        "Instances": n_instances,
        "P": P,
        "R": R,
        "mAP50": mAP50v,
        "mAP50-95": mAP5095v
    }

# Paths
orig_img_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\roboflow_trash_dataset\train\images"
orig_lbl_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\roboflow_trash_dataset\train\labels"
val_img_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\roboflow_trash_dataset\valid\images"
val_lbl_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\roboflow_trash_dataset\valid\labels"
ai_img_dir = r"C:\Users\peggy\generated_trash_images\vott-json-export\images"
ai_lbl_dir = r"C:\Users\peggy\generated_trash_images\vott-json-export\labels"

save_dir = r"C:\Users\peggy\OneDrive\Desktop\Trash_detection\models"
os.makedirs(save_dir, exist_ok=True)

## Train the model

In [22]:
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_ds1 = YOLODetectionDataset([orig_img_dir],[orig_lbl_dir])
train_loader1 = DataLoader(train_ds1, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_ds    = YOLODetectionDataset([val_img_dir],[val_lbl_dir])
val_loader= DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=collate_fn)

model1 = fasterrcnn_resnet50_fpn(weights = None, num_classes=2)
opt1   = torch.optim.SGD(model1.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
torch.save(model1.state_dict(), os.path.join(save_dir, "fasterrcnn_original.pth"))


print("=== Training on original data ===")
start_time = time.time()
train_model(train_loader1, model1, opt1, device, epochs=30)
end_time = time.time()
print("=== Evaluating original model ===")
evaluate_model(val_loader, model1, device)

orig_duration = end_time - start_time
print(f"Original data training took {orig_duration:.2f} seconds ({orig_duration/60:.2f} minutes)")

=== Training on original data ===
Epoch 1/30, Loss: 24.7681
Epoch 2/30, Loss: 19.5854
Epoch 3/30, Loss: 18.0669
Epoch 4/30, Loss: 16.7234
Epoch 5/30, Loss: 16.1960
Epoch 6/30, Loss: 15.4970
Epoch 7/30, Loss: 14.8551
Epoch 8/30, Loss: 14.7996
Epoch 9/30, Loss: 14.1542
Epoch 10/30, Loss: 13.7943
Epoch 11/30, Loss: 13.0350
Epoch 12/30, Loss: 12.5883
Epoch 13/30, Loss: 11.9334
Epoch 14/30, Loss: 11.8084
Epoch 15/30, Loss: 10.9434
Epoch 16/30, Loss: 10.5290
Epoch 17/30, Loss: 9.7844
Epoch 18/30, Loss: 11.0196
Epoch 19/30, Loss: 9.7792
Epoch 20/30, Loss: 9.1784
Epoch 21/30, Loss: 8.8238
Epoch 22/30, Loss: 8.6329
Epoch 23/30, Loss: 8.3715
Epoch 24/30, Loss: 7.9129
Epoch 25/30, Loss: 7.9295
Epoch 26/30, Loss: 7.2794
Epoch 27/30, Loss: 7.3696
Epoch 28/30, Loss: 6.7385
Epoch 29/30, Loss: 6.6272
Epoch 30/30, Loss: 6.6839
=== Evaluating original model ===
Class       Images   Instances  Box(P)       R   mAP50    mAP50-95
all            244         505   0.603   0.733   0.603       0.344
Original d

## Conclusion on the original dataset
Parameter: epochs=30, batch=16

## Result
number of image: 244

Precision: 0.603

Recall: 0.733

mAP@50: 0.603 

mAP@50-95: 0.344

## Time used
Training: 1158.29 s

### Save model

In [27]:
save_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\models"
os.makedirs(save_dir, exist_ok=True)
checkpoint_path = os.path.join(save_dir, "fasterrcnn_trained.pth")
torch.save(model1.state_dict(), checkpoint_path)

### Test using my dataset

In [11]:
import time
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
def evaluate_fasterrcnn(img_dir,lbl_dir, checkpoint_path, num_classes = 2, batch_size = 1, conf_thr = 0.001, device = "cuda"):
    # Build & load model
    model = fasterrcnn_resnet50_fpn(weights=None, num_classes=num_classes)
    in_feats = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_feats, num_classes)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)

    # Prepare dataLoader
    test_ds = YOLODetectionDataset([img_dir], [lbl_dir])
    test_loader = DataLoader(test_ds, batch_size=batch_size,
                             shuffle=False, collate_fn=collate_fn)

    start = time.time()
    results = evaluate_model(test_loader, model, device, conf_thr)
    elapsed = time.time() - start
    
    num_images = results["Images"]
    avg_per_image = elapsed / num_images
    
    print(f"Total evaluation time: {elapsed:.2f}s")
    print(f"Avg time per image: {avg_per_image:.4f}s ({avg_per_image*1000:.1f} ms)")

    return results

In [13]:
metrics2 = evaluate_fasterrcnn(
img_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\my_trash_dataset\images",
lbl_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\my_trash_dataset\labels",
checkpoint_path = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\models\fasterrcnn_trained.pth",
)

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Class       Images   Instances  Box(P)       R   mAP50    mAP50-95
all            185         253   0.699   0.862   0.699       0.392
Total evaluation time: 57.56s
Avg time per image: 0.3111s (311.1 ms)


## Result on my data
Precision: 0.699

Recall: 0.862

mAP50: 0.699

mAP50-95: 0.392

Speed: Avg inference time/image: 311.1 ms

## Train with AI generated Data

In [11]:
train_ds2 = YOLODetectionDataset([orig_img_dir, ai_img_dir],
                                 [orig_lbl_dir, ai_lbl_dir])
train_loader2 = DataLoader(train_ds2, batch_size=16, shuffle=True, collate_fn=collate_fn)

model2 = fasterrcnn_resnet50_fpn(weights=None, num_classes=2)
opt2   = torch.optim.SGD(model2.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
torch.save(model2.state_dict(), os.path.join(save_dir, "fasterrcnn_merged.pth"))

print("=== Training on merged data ===")
start_time = time.time()
train_model(train_loader2, model2, opt2, device, epochs=30)
end_time = time.time()
print("=== Evaluating merged model ===")
evaluate_model(val_loader, model2, device)

merged_duration = end_time - start_time
print(f"Merged data training took {merged_duration:.2f} seconds ({merged_duration/60:.2f} minutes)")

=== Training on merged data ===
Epoch 1/30, Loss: 30.5740
Epoch 2/30, Loss: 24.9909
Epoch 3/30, Loss: 22.2866
Epoch 4/30, Loss: 20.2479
Epoch 5/30, Loss: 19.5244
Epoch 6/30, Loss: 18.8146
Epoch 7/30, Loss: 18.0510
Epoch 8/30, Loss: 17.1518
Epoch 9/30, Loss: 16.7073
Epoch 10/30, Loss: 15.6495
Epoch 11/30, Loss: 14.7425
Epoch 12/30, Loss: 14.1078
Epoch 13/30, Loss: 13.7980
Epoch 14/30, Loss: 12.8606
Epoch 15/30, Loss: 12.2356
Epoch 16/30, Loss: 11.3058
Epoch 17/30, Loss: 11.1532
Epoch 18/30, Loss: 10.7390
Epoch 19/30, Loss: 10.1994
Epoch 20/30, Loss: 9.9895
Epoch 21/30, Loss: 9.1777
Epoch 22/30, Loss: 8.9881
Epoch 23/30, Loss: 8.9582
Epoch 24/30, Loss: 8.1195
Epoch 25/30, Loss: 7.8011
Epoch 26/30, Loss: 7.7971
Epoch 27/30, Loss: 7.7635
Epoch 28/30, Loss: 7.2568
Epoch 29/30, Loss: 7.2879
Epoch 30/30, Loss: 7.1140
=== Evaluating merged model ===
Class       Images   Instances  Box(P)       R   mAP50    mAP50-95
all            244         505   0.602   0.689   0.602       0.346
Merged data 

## Conclusion on the merged dataset
Parameter: epochs=200, imgsz=640, batch=16

## Result
number of image: 244

Precision: 0.602

Recall: 0.689

mAP@50: 0.602 

mAP@50-95: 0.346

## Time used
Training: 1869.89 s

### Save Model

In [23]:
checkpoint_path = os.path.join(save_dir, "fasterrcnn_trained_AI.pth")
torch.save(model2.state_dict(), checkpoint_path)

### Test with my dataset

In [17]:
metrics2 = evaluate_fasterrcnn(
img_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\my_trash_dataset\images",
lbl_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\my_trash_dataset\labels",
checkpoint_path = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\models\fasterrcnn_trained_AI.pth",
)

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Class       Images   Instances  Box(P)       R   mAP50    mAP50-95
all            185         253   0.700   0.850   0.700       0.402
Total evaluation time: 58.79s
Avg time per image: 0.3178s (317.8 ms)


## Result on my data
Precision: 0.700

Recall: 0.850

mAP50: 0.700

mAP50-95: 0.402

Speed: Avg inference time/image: 317.8 ms

## Data argumentation

In [21]:
import os
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from albumentations import (
    Compose, HorizontalFlip, RandomBrightnessContrast, Resize, Normalize,
    HueSaturationValue, ShiftScaleRotate, BboxParams, RandomScale, RandomCrop
)
from albumentations.pytorch import ToTensorV2

# Define transforms
train_transforms = Compose([
    RandomScale(scale_limit=(0.5, 2.0), p=0.7),
    RandomCrop(600, 600, p=0.5),
    Resize(800, 800, p=1.0),
    HorizontalFlip(p=0.5),
    RandomBrightnessContrast(p=0.5),
    HueSaturationValue(p=0.5),
    Normalize(mean=[0.0, 0.0, 0.0], std=[255.0, 255.0, 255.0], max_pixel_value=255.0),
    ToTensorV2()
], 
bbox_params=BboxParams(format='pascal_voc', label_fields=['labels'], min_area=0.0, min_visibility=0.0))

val_transforms = Compose([
    Normalize(mean=[0.0, 0.0, 0.0], std=[255.0, 255.0, 255.0], max_pixel_value=255.0),
    ToTensorV2()
], 
bbox_params=BboxParams(format='pascal_voc', label_fields=['labels'], min_area=0.0, min_visibility=0.0))

# New dataset class for augmented data
class AugmentedRCNNDataset(Dataset):
    def __init__(self, img_dir, lbl_dir, transforms=None):
        self.img_dir = img_dir
        self.lbl_dir = lbl_dir
        self.transforms = transforms
        self.images = []
        # Collect valid images with non-empty labels
        for fname in os.listdir(img_dir):
            if not fname.lower().endswith(('.jpg','.jpeg','.png')):
                continue
            lbl_path = os.path.join(lbl_dir, os.path.splitext(fname)[0] + '.txt')
            if os.path.exists(lbl_path) and os.path.getsize(lbl_path) > 0:
                self.images.append(fname)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        fname = self.images[idx]
        img_path = os.path.join(self.img_dir, fname)
        lbl_path = os.path.join(self.lbl_dir, os.path.splitext(fname)[0] + '.txt')

        # Load original image and boxes
        img_np = np.array(Image.open(img_path).convert("RGB"))
        H_orig, W_orig = img_np.shape[:2]
        boxes_orig, labels_orig = [], []
        with open(lbl_path) as f:
            for line in f:
                cls, x_c, y_c, w, h = map(float, line.strip().split())
                # Convert YOLO format to Pascal VOC
                x_c *= W_orig
                y_c *= H_orig
                w *= W_orig
                h *= H_orig
                x0 = x_c - w/2
                y0 = y_c - h/2
                x1 = x_c + w/2
                y1 = y_c + h/2
                boxes_orig.append([x0, y0, x1, y1])
                labels_orig.append(1)  # Single class

        boxes, labels = boxes_orig.copy(), labels_orig.copy()
        img_transformed = img_np.copy()
        applied_augmentation = False

        if self.transforms:
            for _ in range(3):  # Retry augmentation up to 3 times
                # Apply augmentation
                try:
                    augmented = self.transforms(
                        image=img_np,
                        bboxes=boxes_orig,
                        labels=labels_orig
                    )
                    img_transformed = augmented['image']
                    boxes_aug = augmented['bboxes']
                    labels_aug = augmented['labels']
                    H, W = img_transformed.shape[:2]

                    # Process boxes: clamp and validate
                    valid_boxes = []
                    valid_labels = []
                    for box, lbl in zip(boxes_aug, labels_aug):
                        x0, y0, x1, y1 = box
                        # Clamp to image dimensions
                        x0 = max(0, min(x0, W))
                        x1 = max(0, min(x1, W))
                        y0 = max(0, min(y0, H))
                        y1 = max(0, min(y1, H))
                        # Check area
                        if x1 > x0 and y1 > y0:
                            valid_boxes.append([x0, y0, x1, y1])
                            valid_labels.append(lbl)
                    
                    if len(valid_boxes) > 0:
                        boxes = valid_boxes
                        labels = valid_labels
                        applied_augmentation = True
                        break
                except:
                    continue  # Retry on augmentation error

            if not applied_augmentation:
                # Fallback to original image with valid boxes
                H, W = H_orig, W_orig
                valid_boxes = []
                valid_labels = []
                for box, lbl in zip(boxes_orig, labels_orig):
                    x0, y0, x1, y1 = box
                    x0 = max(0, min(x0, W))
                    x1 = max(0, min(x1, W))
                    y0 = max(0, min(y0, H))
                    y1 = max(0, min(y1, H))
                    if x1 > x0 and y1 > y0:
                        valid_boxes.append([x0, y0, x1, y1])
                        valid_labels.append(lbl)
                boxes = valid_boxes
                labels = valid_labels
                # Apply normalization and ToTensor manually
                img_transformed = img_np.astype(np.float32) / 255.0
                img_transformed = torch.from_numpy(img_transformed).permute(2, 0, 1)
        else:
            # No transforms applied
            H, W = H_orig, W_orig
            img_transformed = torch.from_numpy(img_np).permute(2, 0, 1).float() / 255.0

        # Final check for valid boxes
        if len(boxes) == 0:
            raise ValueError(f"No valid boxes for {fname} after processing.")

        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32),
            "labels": torch.tensor(labels, dtype=torch.int64)
        }
        return img_transformed, target


# Instantiate model (same as before)
model = fasterrcnn_resnet50_fpn(weights=None, num_classes=2)
in_feats = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_feats, num_classes=2)

# Build dataLoaders 
train_ds = AugmentedRCNNDataset(
    img_dir=orig_img_dir,
    lbl_dir=orig_lbl_dir,
    transforms=train_transforms
)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, collate_fn=collate_fn)

val_ds = AugmentedRCNNDataset(
    img_dir=val_img_dir,
    lbl_dir=val_lbl_dir,
    transforms=val_transforms
)
val_loader = DataLoader(val_ds, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Train & evaluate
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
print("=== Training on improved model ===")
start_time = time.time()
train_model(train_loader, model, optimizer, device, epochs=30)
end_time = time.time()
print("=== Evaluating on improved model ===")
evaluate_model(val_loader, model, device)
print(f"Improved data training took {orig_duration:.2f} seconds ({orig_duration/60:.2f} minutes)")

=== Training on improved model ===
Epoch 1/30, Loss: 129.5551
Epoch 2/30, Loss: 100.9766
Epoch 3/30, Loss: 92.9542
Epoch 4/30, Loss: 86.0217
Epoch 5/30, Loss: 92.4837
Epoch 6/30, Loss: 86.6239
Epoch 7/30, Loss: 84.6484
Epoch 8/30, Loss: 85.3347
Epoch 9/30, Loss: 84.8122
Epoch 10/30, Loss: 76.0549
Epoch 11/30, Loss: 74.8603
Epoch 12/30, Loss: 75.7057
Epoch 13/30, Loss: 71.6443
Epoch 14/30, Loss: 70.4226
Epoch 15/30, Loss: 68.1486
Epoch 16/30, Loss: 73.6413
Epoch 17/30, Loss: 65.0139
Epoch 18/30, Loss: 62.2270
Epoch 19/30, Loss: 62.9046
Epoch 20/30, Loss: 62.0147
Epoch 21/30, Loss: 65.1031
Epoch 22/30, Loss: 67.6373
Epoch 23/30, Loss: 61.1340
Epoch 24/30, Loss: 61.9004
Epoch 25/30, Loss: 57.8253
Epoch 26/30, Loss: 55.4636
Epoch 27/30, Loss: 53.1573
Epoch 28/30, Loss: 59.9789
Epoch 29/30, Loss: 61.0484
Epoch 30/30, Loss: 57.5018
=== Evaluating on improved model ===
Class       Images   Instances  Box(P)       R   mAP50    mAP50-95
all            244         472   0.620   0.739   0.620    

## Conclusion on the merged dataset
Parameter: epochs=30, batch=16

## Result
number of image: 244

Precision: 0.620

Recall: 0.739 (Finds just over half od all real trash object)

mAP@50: 0.620 

mAP@50-95: 0.379

## Time used
Training: 1398.30 s

### Save Model

In [25]:
checkpoint_path = os.path.join(save_dir, "fasterrcnn_trained_argumented.pth")
torch.save(model.state_dict(), checkpoint_path)

### Test on my dataset

In [18]:
metrics = evaluate_fasterrcnn(
img_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\my_trash_dataset\images",
lbl_dir = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\my_trash_dataset\labels",
checkpoint_path = r"C:\Users\peggy\Desktop\Trash_detection\Dataset\models\fasterrcnn_trained_argumented.pth",
)

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Class       Images   Instances  Box(P)       R   mAP50    mAP50-95
all            185         253   0.697   0.885   0.697       0.351
Total evaluation time: 58.54s
Avg time per image: 0.3164s (316.4 ms)


## Result on my data
Precision: 0.697

Recall: 0.885

mAP50: 0.697

mAP50-95: 0.351

Speed: Avg inference time/image: 316.4 ms