In [7]:
import sys
import requests
import tarfile
import json
import numpy as np
from os import path
from PIL import Image
from PIL import ImageFont, ImageDraw
from glob import glob
from matplotlib import pyplot as plt
%matplotlib inline
from os import path

In [8]:
import requests
import tarfile

# Download the dataset archive into the current working directory
fname = 'examples.tar.gz'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/' + fname
# The timeout keeps a stalled connection from hanging the cell forever
r = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as the archive
r.raise_for_status()
# Context manager guarantees the archive is flushed and closed before extraction
with open(fname, 'wb') as archive_file:
    archive_file.write(r.content)

# Extract the dataset next to the archive; the handle is closed even if
# extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this against a source you trust.
with tarfile.open(fname) as tar:
    tar.extractall()


In [9]:
# Relative directory the example data is expected in
data_path = "examples/"
# Bare last expression: the notebook displays whether that directory exists
path.exists(data_path)

True

In [10]:
# Colour code: one RGB tuple per label name
colors = dict(
    title=(255, 0, 0),
    text=(0, 255, 0),
    figure=(0, 0, 255),
    table=(255, 255, 0),
    list=(0, 255, 255),
)

In [11]:
from PIL import ImageDraw, ImageFont

def markup(image, annotations, categories):
    """Draw each annotation's bounding box and category label onto ``image``.

    Args:
        image: PIL image to draw on; it is modified in place.
        annotations: iterable of dicts with a ``'bbox'`` entry laid out as
            ``[x, y, width, height]`` and a ``'category_id'`` entry.
        categories: list of dicts carrying a ``'name'`` and, normally, an
            ``'id'``. Entries without an ``'id'`` are numbered by their
            1-based position in the list.

    Returns:
        The same ``image`` object, after drawing.
    """
    draw = ImageDraw.Draw(image)
    try:
        font = ImageFont.truetype("examples/DejaVuSans.ttf", 15)
    except OSError:
        # The font file may be missing; a plain built-in font is better than
        # aborting the whole visualisation.
        font = ImageFont.load_default()

    # Resolve names by category id instead of by list position, so the lookup
    # stays correct when the categories list is not sorted by id.
    names_by_id = {
        category.get('id', position): category['name']
        for position, category in enumerate(categories, start=1)
    }

    for annotation in annotations:
        left, top, box_width, box_height = annotation['bbox']
        category_id = annotation['category_id']
        # Unknown ids are labelled with the raw id rather than raising
        name = names_by_id.get(category_id, str(category_id))
        # Use the per-category colour from the module-level ``colors`` table;
        # red is the fallback for names that have no entry there.
        color = colors.get(name, (255, 0, 0))

        # Draw bounding box (bbox is x, y, width, height -> two corners)
        draw.rectangle(
            (left, top, left + box_width, top + box_height),
            outline=color,
            width=2
        )

        # Draw label in the box colour; white text does not show on a light
        # page background.
        draw.text((left, top), name, font=font, fill=color)

    return image


In [12]:
import json
import os
import shutil

# Load the annotations JSON file (keys used below: 'images', 'annotations', 'categories')
annotations_path = '/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/labels/publaynet/train.json'
with open(annotations_path, 'r') as f:
    annotations = json.load(f)

# Index the image records by id once, so every later lookup is a dict access
# instead of a scan over the whole 'images' list.
image_info_by_id = {image_info['id']: image_info for image_info in annotations['images']}

# Extract images and their annotations: image id -> file name + annotation list
image_data = {
    image_id: {'file_name': image_info['file_name'], 'annotations': []}
    for image_id, image_info in image_info_by_id.items()
}

for ann in annotations['annotations']:
    image_data[ann['image_id']]['annotations'].append(ann)

# Now filter annotations for only the images that are in the dataset
image_dir = '/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/train-0/publaynet/train'
image_files = {f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.jpeg', '.png'))}

filtered_annotations = {image_id: data for image_id, data in image_data.items() if data['file_name'] in image_files}

# Take the first `subset_size` images (fewer if the folder holds fewer matches)
subset_size = 2000
filtered_annotations_2000 = dict(list(filtered_annotations.items())[:subset_size])

# Create new folder for saving the images and annotations
new_image_dir = '/kaggle/working/train_2000_images'
os.makedirs(new_image_dir, exist_ok=True)

# Copy the images to the new folder
for image_id, data in filtered_annotations_2000.items():
    src_path = os.path.join(image_dir, data['file_name'])
    dst_path = os.path.join(new_image_dir, data['file_name'])
    shutil.copyfile(src_path, dst_path)

# Create a new annotations file in COCO format for the selected images.
# Image records and annotations keep the order of the selected subset.
new_annotations = {
    "images": [image_info_by_id[image_id] for image_id in filtered_annotations_2000],
    "annotations": [
        ann
        for data in filtered_annotations_2000.values()
        for ann in data['annotations']
    ],
    "categories": annotations["categories"]  # Assuming categories stay the same
}

# Save the new annotations file
new_annotations_path = '/kaggle/working/train_2000_annotations.json'
with open(new_annotations_path, 'w') as f:
    json.dump(new_annotations, f)

# Report the number actually copied rather than a hard-coded figure
print(f"Done! Copied {len(filtered_annotations_2000)} images and saved the annotations to {new_image_dir} and {new_annotations_path}.")


Done! Copied 2000 images and saved the annotations to /kaggle/working/train_2000_images and /kaggle/working/train_2000_annotations.json.


In [13]:
import shutil
import os

# Base name of the archive; make_archive appends the ".zip" suffix itself
output_zip = '/kaggle/working/train_2000_images'

# Folder whose contents go into the archive
folder_to_zip = '/kaggle/working/train_2000_images'

# Build the zip archive from the folder contents
shutil.make_archive(base_name=output_zip, format='zip', root_dir=folder_to_zip)

print(f"Zipped folder saved as {output_zip}.zip")


Zipped folder saved as /kaggle/working/train_2000_images.zip


In [14]:
import os
import zipfile

# Directory tree to scan (it is walked as-is; nothing is unzipped here)
file_path = '/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/train-0/publaynet/train'
# File-name suffixes counted as images; names are lower-cased before matching
image_extensions = ['.jpg', '.jpeg', '.png']  # Add more extensions if needed

# Walk every sub-directory and count the files that end in one of the suffixes
image_count = sum(
    1
    for _root, _dirs, files in os.walk(file_path)
    for file in files
    if file.lower().endswith(tuple(image_extensions))
)

print(f"Total number of images in train-0: {image_count}")


Total number of images in train-0: 47958


In [15]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import os

class PublayNetDataset(Dataset):
    """Detection dataset built from a per-image annotation dictionary.

    Args:
        image_dir: directory that contains the image files.
        annotations: dict mapping an image id to a dict with a
            ``'file_name'`` and an ``'annotations'`` list; each annotation
            carries ``'bbox'`` as ``[xmin, ymin, width, height]`` and a
            ``'category_id'``.
        transform: optional callable applied to the loaded RGB image only;
            the boxes are not passed through it.
    """

    def __init__(self, image_dir, annotations, transform=None):
        self.image_dir = image_dir
        self.annotations = annotations
        self.transform = transform
        # Snapshot the ids once. Rebuilding list(annotations.keys()) on every
        # __getitem__ call costs O(n) per sample. Entries added to
        # `annotations` after construction are therefore not picked up.
        self._image_ids = list(annotations.keys())

    def __len__(self):
        """Return the number of images captured at construction time."""
        return len(self._image_ids)

    def __getitem__(self, idx):
        """Load one image and its boxes.

        Returns:
            ``(img, target)`` where ``target['boxes']`` is a float32 tensor of
            ``[xmin, ymin, xmax, ymax]`` rows and ``target['labels']`` is an
            int64 tensor of category ids. Both are empty (shape ``(0, 4)`` and
            ``(0,)``) when the image has no valid box.

        Raises:
            FileNotFoundError: if the image file is missing on disk.
        """
        image_id = self._image_ids[idx]
        record = self.annotations[image_id]
        img_path = os.path.join(self.image_dir, record['file_name'])

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found: {img_path}")

        # convert() returns an independent copy, so the file handle can be
        # closed as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")

        # Get bounding boxes and labels, skipping boxes with no area
        boxes = []
        labels = []
        for ann in record['annotations']:
            xmin, ymin, width, height = ann['bbox']
            if width > 0 and height > 0:
                boxes.append([xmin, ymin, xmin + width, ymin + height])
                labels.append(ann['category_id'])

        # Handle cases where there are no valid boxes: keep the (0, 4) / (0,)
        # shapes explicit instead of converting an empty list.
        if len(boxes) == 0:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
        else:
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {'boxes': boxes, 'labels': labels}

        if self.transform:
            img = self.transform(img)

        return img, target


In [16]:
from torch.utils.data import DataLoader
import torchvision
from torchvision.transforms import ToTensor
import torch.optim as optim

# Start from the pre-trained Faster R-CNN detector
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the box predictor with one sized for our classes.
# num_classes counts the background class as well as the categories.
num_classes = 6
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes
)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Dataset and loader. Each batch is collated into a tuple of images and a
# tuple of targets rather than stacked tensors.
# NOTE(review): this trains on `filtered_annotations`, not on the
# `filtered_annotations_2000` subset built earlier - confirm that is intended.
train_dataset = PublayNetDataset(
    image_dir=image_dir,
    annotations=filtered_annotations,
    transform=ToTensor(),
)
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch)),
)

# SGD with momentum and weight decay over all model parameters
optimizer = optim.SGD(
    model.parameters(),
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

# Training loop: one pass over the loader per epoch, printing the mean batch loss
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for images, targets in train_loader:
        # Move every image and every target tensor to the training device
        images = [image.to(device) for image in images]
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        # Called with targets in train mode, the model returns a dict of losses
        loss_dict = model(images, targets)

        # Total loss is the sum of the individual loss terms
        losses = sum(loss_dict.values())
        epoch_loss += losses.item()

        # Backpropagation and parameter update
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}")


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 161MB/s]  


KeyboardInterrupt: 