In [1]:
pip install torch torchvision pycocotools matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset
from pycocotools.coco import COCO
import torchvision.transforms as T

class DocLayNetDataset(Dataset):
    def __init__(self, root, annotation, transforms=None):
        self.root = root
        self.coco = COCO(annotation)
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.transforms = transforms

    def __getitem__(self, index):
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        img_info = self.coco.loadImgs(img_id)[0]
        path = img_info['file_name']

        img = Image.open(os.path.join(self.root, path)).convert("RGB")

        boxes = []
        labels = []
        for ann in anns:
            bbox = ann['bbox']
            if bbox[2] <= 0 or bbox[3] <= 0:
                continue  # Skip invalid boxes
            boxes.append([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]])
            labels.append(ann['category_id'])

        # Skip images with no valid boxes
        if len(boxes) == 0:
            return self.__getitem__((index + 1) % len(self))  # move to next image


        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = torch.tensor([img_id])

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.ids)

In [3]:
def get_transform():
    return T.Compose([
        T.ToTensor()
    ])

In [15]:
from torch.utils.data import DataLoader

# Paths to your dataset
image_dir = '../Dataset/DocLayNet/DocLayNet_core/PNG'
train_annotation_file = '../Dataset/DocLayNet/DocLayNet_core/COCO/train.json'
val_annotation_file = '../Dataset/DocLayNet/DocLayNet_core/COCO/val.json'
test_annotation_file = '../Dataset/DocLayNet/DocLayNet_core/COCO/test.json'

dataset = DocLayNetDataset(root=image_dir, annotation=train_annotation_file, transforms=get_transform())
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

loading annotations into memory...
Done (t=3.98s)
creating index...
index created!


In [10]:
from torch.utils.data import Subset

subset_size = 200  # You can even try 20 for faster debugging
small_dataset = Subset(dataset, list(range(subset_size)))

data_loader = DataLoader(
    small_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda x: tuple(zip(*x))
)

In [8]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the classifier with a new one, that has num_classes which is user-defined
num_classes = len(dataset.coco.getCatIds()) + 1  # +1 for background
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)



In [11]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    i = 0
    for images, targets in data_loader:
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f"Epoch [{epoch}], Step [{i}], Loss: {losses.item():.4f}")
        i += 1

Epoch [0], Step [0], Loss: 1.1508
Epoch [0], Step [10], Loss: 1.2008
Epoch [0], Step [20], Loss: 1.6238
Epoch [0], Step [30], Loss: 1.2171
Epoch [0], Step [40], Loss: 1.8042
Epoch [0], Step [50], Loss: 1.2839
Epoch [0], Step [60], Loss: 1.2874
Epoch [0], Step [70], Loss: 1.2310
Epoch [0], Step [80], Loss: 1.3439
Epoch [0], Step [90], Loss: 1.1340
Epoch [1], Step [0], Loss: 0.7197
Epoch [1], Step [10], Loss: 1.4401
Epoch [1], Step [20], Loss: 0.8241
Epoch [1], Step [30], Loss: 1.3185
Epoch [1], Step [40], Loss: 0.7903
Epoch [1], Step [50], Loss: 1.0069
Epoch [1], Step [60], Loss: 1.0932
Epoch [1], Step [70], Loss: 0.9970
Epoch [1], Step [80], Loss: 1.1217
Epoch [1], Step [90], Loss: 1.1764
Epoch [2], Step [0], Loss: 0.7656
Epoch [2], Step [10], Loss: 0.7360
Epoch [2], Step [20], Loss: 1.0090
Epoch [2], Step [30], Loss: 0.5459
Epoch [2], Step [40], Loss: 0.5559
Epoch [2], Step [50], Loss: 0.8344
Epoch [2], Step [60], Loss: 1.4666
Epoch [2], Step [70], Loss: 1.3315
Epoch [2], Step [80], L

In [12]:
torch.save(model.state_dict(), "fasterrcnn_doclaynet.pth")

In [16]:
val_dataset = DocLayNetDataset(
    root=image_dir,
    annotation=val_annotation_file,
    transforms=get_transform()
)

val_loader = DataLoader(
    val_dataset,
    batch_size=1,  # inference usually done 1 image at a time
    shuffle=False,
    collate_fn=lambda x: tuple(zip(*x))
)

loading annotations into memory...
Done (t=0.44s)
creating index...
index created!


In [17]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Create model architecture
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)

num_classes = len(val_dataset.coco.getCatIds()) + 1  # +1 for background
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Load your trained weights
model.load_state_dict(torch.load("fasterrcnn_doclaynet.pth", map_location="cpu"))  # or "cuda"
model.eval()



Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/praharmodi/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:03<00:00, 32.2MB/s]


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

In [18]:
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

from pycocotools.cocoeval import COCOeval
import numpy as np
import json

all_predictions = []

for images, targets in tqdm(val_loader):
    images = [img.to(device) for img in images]
    outputs = model(images)

    for i in range(len(outputs)):
        boxes = outputs[i]['boxes'].detach().cpu().numpy()
        scores = outputs[i]['scores'].detach().cpu().numpy()
        labels = outputs[i]['labels'].detach().cpu().numpy()
        image_id = int(targets[i]['image_id'].item())

        for box, score, label in zip(boxes, scores, labels):
            all_predictions.append({
                'image_id': image_id,
                'category_id': int(label),
                'bbox': [
                    float(box[0]), float(box[1]),
                    float(box[2] - box[0]), float(box[3] - box[1])
                ],
                'score': float(score)
            })

100%|██████████| 6489/6489 [58:23<00:00,  1.85it/s] 


In [19]:
with open("predictions.json", "w") as f:
    json.dump(all_predictions, f)

from pycocotools.coco import COCO

coco_gt = COCO(val_annotation_file)
coco_dt = coco_gt.loadRes("predictions.json")

coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()

loading annotations into memory...
Done (t=0.36s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.61s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=15.40s).
Accumulating evaluation results...
DONE (t=2.43s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.191
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.344
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.179
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.067
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.135
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.147
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.165
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.340
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDet