# Layout Parser - Model trained


The model used here was trained on newspaper pages from the BnF (Gallica). The pages were annotated, and the annotations were exported in COCO format.

It can be used to extract the images contained in newspaper pages.

The only things you need to change are the paths to:
- the model weights and config file (.pth and .yaml)
- the pages from which you want to extract pictures

This notebook is designed to run in Google Colab with your Google Drive account mounted. It can easily be adapted to run in a local Jupyter environment.


In [6]:
# Sanity check before installing anything: print the CUDA version reported by
# the PyTorch build that is already present in the runtime.
import torch
print(torch.version.cuda)

12.1


In [7]:
# Install necessary packages
# (shell commands run through the notebook's `!` escape; `-q` keeps pip quiet)
!pip install -q requests
!pip install -q --upgrade pip
!pip install -q opencv-python
!pip install -q torchvision
# NOTE(review): Pillow is pinned to 9.5.0, presumably for compatibility with the
# detectron2 / layoutparser versions installed further down — TODO confirm.
# This is the only install without `-q`, so its pip log is shown in full.
!pip install Pillow==9.5.0

# Import necessary libraries
import shutil
import requests
import glob
import re
import math
import json
import os
import cv2
from PIL import Image

# Install and import layoutparser and detectron2
# detectron2 is installed from the GitHub repository at tag v0.4;
# layoutparser is upgraded to the latest release (`-U`, no version pin).
!pip install -q 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip install -q -U layoutparser

import layoutparser as lp

# Import PyTorch and empty the CUDA cache
import torch
torch.cuda.empty_cache()

# Print the PyTorch version
print(torch.__version__)

import requests

# Define the Hugging Face repository and model files
repo_id = "mgiardinetti/layout-parser-newspapers"
config_file = "config.yaml"
model_file = "model_final.pth"

# Create a directory to save the model
save_directory = "/content/layout-parser-newspapers"
os.makedirs(save_directory, exist_ok=True)


def _download_file(url, destination, chunk_size=1 << 20, timeout=60):
    """Stream `url` to `destination`, failing loudly on HTTP errors.

    The body is written chunk by chunk to `destination + ".part"` and only
    moved onto `destination` once the transfer has completed, so an HTTP error
    page or an interrupted download never replaces a previously valid file.

    Args:
        url: address of the file to fetch.
        destination: local path the file is saved to.
        chunk_size: number of bytes read from the response at a time.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    partial_path = destination + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this check a 404/401 HTML page would be saved as the model.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, destination)
    finally:
        # Only present here if the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)


# Download the configuration file
config_url = f"https://huggingface.co/{repo_id}/resolve/main/{config_file}"
config_path = os.path.join(save_directory, config_file)
_download_file(config_url, config_path)

# Download the model weights
model_url = f"https://huggingface.co/{repo_id}/resolve/main/{model_file}"
model_path = os.path.join(save_directory, model_file)
_download_file(model_url, model_path)

# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m2.3.0+cu121
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import layoutparser as lp

# Model and configuration file paths
# The weights path has its own name so that `model` only ever refers to the
# loaded model object; this keeps the cell safe to re-run.
model_weights = "/content/layout-parser-newspapers/model_final.pth"
config = "/content/layout-parser-newspapers/config.yaml"

# "id:name" pairs, separated by whitespace, one per class to detect
labels = "0:illustration 1:legende"

# Parse labels into {class_id: class_name}
dictlabels = {}
for asso in labels.split():  # split() tolerates repeated or trailing spaces
    key, value = asso.split(":", 1)
    dictlabels[int(key)] = value

# Load the model
try:
    model = lp.models.Detectron2LayoutModel(
        config_path=config,
        model_path=model_weights,
        # An empty mapping is passed as None, i.e. no label map at all.
        label_map=dictlabels or None,
    )
    print("Modèle importé")
except Exception as e:
    print(f"Le modèle n'a pas été importé: {e}")
    # Stop here: every later cell needs a working `model`, and continuing
    # would only surface as a misleading per-page error further down.
    raise


Modèle importé


In [9]:
import cv2

def resize_image(img, shape):
    """Resize an image so it fits inside `shape`, keeping its aspect ratio.

    The image is scaled until one side matches the target exactly and the
    other side is no larger than the target. This works for square and
    non-square targets alike.

    Args:
        img: image array whose first two dimensions are (height, width).
        shape: (target_height, target_width) of the area to fit into.

    Returns:
        The resized image as returned by cv2.resize. Each side is at least
        one pixel long, even for extremely elongated inputs.

    Raises:
        ValueError: if `img` has a height or width of zero.
    """
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError(f"Cannot resize an empty image (height={h}, width={w})")

    target_h, target_w = shape
    # Compare the two scale factors target_h / h and target_w / w by
    # cross-multiplication; the smaller one is the limiting dimension.
    if target_h * w <= target_w * h:
        new_h = int(target_h)
        new_w = int((target_h * w) // h)
    else:
        new_w = int(target_w)
        new_h = int((target_w * h) // w)

    # Integer truncation can reach 0 for very thin images; cv2.resize rejects that.
    new_w = max(1, new_w)
    new_h = max(1, new_h)

    resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return resized_img

def add_borders(img, shape):
    """Pad an image with black borders so that it is exactly `shape`.

    The image is centred; when the padding needed along an axis is odd, the
    extra pixel goes to the bottom / right side so the output always has the
    requested size.

    Args:
        img: image array whose first two dimensions are (height, width).
        shape: (target_height, target_width) of the padded image.

    Returns:
        The padded image as returned by cv2.copyMakeBorder.

    Raises:
        ValueError: if `img` is larger than `shape` along either axis.
    """
    h, w = img.shape[:2]
    pad_h = shape[0] - h
    pad_w = shape[1] - w
    if pad_h < 0 or pad_w < 0:
        raise ValueError(
            f"Image of size {h}x{w} does not fit in target {shape[0]}x{shape[1]}"
        )

    top = pad_h // 2
    bottom = pad_h - top  # takes the remainder when pad_h is odd
    left = pad_w // 2
    right = pad_w - left  # takes the remainder when pad_w is odd

    color = [0, 0, 0]
    new_img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return new_img

In [5]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# NOTE(review): the setup cell earlier in this notebook already mounts Drive,
# and this cell carries an older execution count (In [5]) than the cells
# around it — it looks redundant; confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import os
import glob
import re
import cv2
from PIL import Image, UnidentifiedImageError
import numpy as np

# Define directories
extractions_path = '/content/drive/My Drive/similaires_rol_extracted/'
pages_directory = '/content/drive/My Drive/similaires_rol/'

# Detections with a score at or below this threshold are discarded
seuil_score = 0.95

# (height, width) of every saved crop
crop_shape = (1024, 1024)

# Which pages to process, relative to what is already in `extractions_path`:
#   True  -> only pages that already have at least one crop there (the
#            selection this cell has always made), e.g. to regenerate crops;
#   False -> only pages that have no crop yet, e.g. to start from an empty
#            output folder or to resume an interrupted run.
# NOTE(review): with True and an empty output folder no page is processed —
# confirm which of the two selections is actually intended.
only_pages_with_existing_crops = True


def _file_stem(path):
    """Return the file name of `path` without its directory and extension."""
    return os.path.splitext(os.path.basename(path))[0]


def _pages_to_process(pages, crops, with_existing_crops):
    """Select the page paths that do (or do not) already have crops.

    Crops are named "<page stem>_<NN>.jpg", so the numeric suffix is removed
    to recover the page stem. Matching is exact, which avoids treating page
    "p1" as extracted just because "p10_01.jpg" exists.
    """
    extracted_stems = set()
    for crop_path in crops:
        stem = _file_stem(crop_path)
        extracted_stems.add(stem)
        extracted_stems.add(re.sub(r'_\d+$', '', stem))
    return [
        page for page in pages
        if (_file_stem(page) in extracted_stems) == with_existing_crops
    ]


def _pixel_box(coordinates, width, height):
    """Turn (x1, y1, x2, y2) floats into integer pixels clamped to the image."""
    x1, y1, x2, y2 = (int(value) for value in coordinates)
    x1, x2 = (min(max(x, 0), width) for x in (x1, x2))
    y1, y2 = (min(max(y, 0), height) for y in (y1, y2))
    return x1, y1, x2, y2


# cv2.imwrite only returns False when the folder is missing, so create it first.
os.makedirs(extractions_path, exist_ok=True)

# Sorted so that pages are processed in a reproducible order.
all_pages_extracted = sorted(glob.glob(os.path.join(extractions_path, '*.jpg')))
all_pages = sorted(glob.glob(os.path.join(pages_directory, '*.jpg')))

print(len(all_pages))

# Full paths (not bare ids) are kept so the pages can be read back below.
all_pages_filtered = _pages_to_process(
    all_pages, all_pages_extracted, only_pages_with_existing_crops
)
print(len(all_pages_filtered))

for page_path in all_pages_filtered:
    page_stem = _file_stem(page_path)

    # Read the page; cv2.imread returns None instead of raising on failure.
    image_bgr = cv2.imread(page_path)
    if image_bgr is None:
        print(page_path)
        print("Error reading image")
        continue

    # Detect layout on an RGB view of the page and keep confident detections.
    try:
        layout = model.detect(image_bgr[..., ::-1])
        layout = [x for x in layout if x.score > seuil_score]
    except Exception as error:
        print(error)
        print("Error detecting layout")
        continue
    print("_________________Extraction of the pictures_________________")

    # Crop and save images
    page_height, page_width = image_bgr.shape[:2]
    for i, block in enumerate(layout):
        try:
            # Read the box from the block itself rather than parsing its
            # printed form, which breaks on negative coordinates.
            x1, y1, x2, y2 = _pixel_box(block.coordinates, page_width, page_height)
            # Crop the BGR array that was loaded above: it is the image the
            # detection ran on, and BGR is the channel order cv2.imwrite expects.
            cropped = image_bgr[y1:y2, x1:x2]
            cropped = resize_image(cropped, crop_shape)
            cropped = add_borders(cropped, crop_shape)
            cropped_file_name = f"{page_stem}_{i+1:02d}.jpg"
            output_path = os.path.join(extractions_path, cropped_file_name)
            if not cv2.imwrite(output_path, cropped):
                raise IOError(f"Could not write {output_path}")
        except Exception as error:
            print(error)
            print("Error in subimage")
            continue


2015
1260
