# climatecasechart, process surya outputs

Merge the Surya layout, reading-order and OCR outputs into a single dataframe.

In [None]:
#@title Import

import re
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
from tqdm.cli import tqdm
from bs4 import BeautifulSoup
from multiprocessing import Pool

from IPython.display import display
from shapely.geometry import box

import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from boxly.ops import iou

# Registers `progress_apply` on pandas objects; used throughout this notebook.
tqdm.pandas()

# All relative `data/...` paths below are resolved against this directory.
# os.chdir('/slow-data/unitednationsclimatehealth/')
os.chdir('/ipfs-storage/ipfs/herbert/unitednationsclimatehealth/')

experiment_date = '2025-03-28-run-01' #@param {"type": "string"}

# One row per case; every case is addressed by its `slug`.
case_details = pd.read_pickle(f'data/{experiment_date}-case-details.p3')

# Expected locations of the layout / OCR results and the PDF directory per case.
case_details = case_details.assign(
    layout=case_details['slug'].apply(lambda x: f'data/{experiment_date}-surya-layout/{x}/results.json'),
    ocr=case_details['slug'].apply(lambda x: f'data/{experiment_date}-surya-ocr/{x}/results.json'),
    pdf=case_details['slug'].apply(lambda x: f'data/{experiment_date}-pdf/{x}/'),
)

# A case counts as finished when both its OCR and its layout result file exist.
finished_cases = case_details[
    case_details[['ocr', 'layout']].map(os.path.exists).all(axis=1)]

# Fraction of finished cases (shown as the cell output).
len(finished_cases) / len(case_details)

0.9497041420118343

In [None]:
#@title Read OCR lines in one dataframe
def read_ocr(slug):
  """Load the OCR results of one case from disk.

  Reads ``data/{experiment_date}-surya-ocr/{slug}/fixed-results.json`` relative
  to the current working directory; ``experiment_date`` is a notebook global.

  NOTE(review): the finished-case filter of this notebook tests for
  ``results.json`` whereas this function reads ``fixed-results.json`` --
  confirm that the fixed file exists for every finished case.

  Args:
    slug: identifier of the case, used as directory name.

  Returns:
    The parsed JSON document. Callers iterate it with ``.items()``, so it is
    presumably a dict mapping a filename to that file's OCR pages.

  Raises:
    FileNotFoundError: if the results file does not exist.
    json.JSONDecodeError: if the file is not valid JSON.
  """
  path = f'data/{experiment_date}-surya-ocr/{slug}/fixed-results.json'
  # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
  with open(path, encoding='utf-8') as f:
    return json.load(f)

def explode_lines(file_ocr):
  """Flatten the OCR pages of one file into one row per text line.

  Args:
    file_ocr: records convertible by ``pd.DataFrame`` that have a ``page`` and
      a ``text_lines`` field; ``text_lines`` holds one dict per text line.

  Returns:
    DataFrame indexed by (``page``, ``line``) with the columns ``polygon``,
    ``confidence``, ``text`` and ``bbox``. ``line`` is a running counter that
    restarts at 0 for every page value.
  """
  per_page = pd.DataFrame(file_ocr).set_index('page')['text_lines']
  exploded = per_page.explode()
  # Running number of each line within its page.
  line_number = exploded.groupby(level=0).cumcount().rename('line')
  indexed = exploded.to_frame().set_index(line_number, append=True)['text_lines']

  columns = {}
  for field in ('polygon', 'confidence', 'text', 'bbox'):
    # `.str[field]` looks the key up in every line dict.
    columns[field] = indexed.str[field]
  return pd.DataFrame(columns)

# Parse every OCR file of every finished case, keyed by (slug, filename).
ocr_lines_ = {
    (case_slug, ocr_filename): explode_lines(ocr_pages)
    for case_slug in tqdm(finished_cases['slug'].unique(), position=0)
    for ocr_filename, ocr_pages in read_ocr(case_slug).items()
}
# Stack the per-file frames; the dict keys become the two outermost index levels.
ocr_lines = pd.concat(
    list(ocr_lines_.values()),
    keys=list(ocr_lines_.keys()),
    names=['slug', 'filename', 'page', 'line'],
)
# Rows without a bbox cannot be turned into a geometry, so drop them first.
ocr_lines = ocr_lines.dropna(subset=['bbox']).copy()
# shapely rectangle per line, used later for the overlap computations.
ocr_lines['box'] = ocr_lines['bbox'].progress_apply(lambda coords: box(*coords))

100%|██████████| 316/316 [00:07<00:00, 42.65it/s]
100%|██████████| 1314437/1314437 [00:18<00:00, 70789.04it/s]


In [None]:
#@title Read PDF lines in one dataframe

import pymupdf


def read_pdf_words(path):
  """Return one word-level dataframe per page of the PDF at ``path``.

  Args:
    path: location of the PDF file.

  Returns:
    List with one DataFrame per page, in page order. Each row is one entry of
    ``page.get_text("words", sort=False)`` with the columns ``x0``, ``y0``,
    ``x1``, ``y1``, ``text``, ``block``, ``line`` and ``word``.
  """
  word_columns = ['x0', 'y0', 'x1', 'y1', 'text', 'block', 'line', 'word']
  # Close the document deterministically instead of leaving it to garbage collection.
  with pymupdf.open(path) as document:
    return [
      pd.DataFrame(page.get_text("words", sort=False), columns=word_columns)
      for page in document
    ]


# One frame per (slug, filename, 1-based page number). The filenames come from
# the 'File' entry of each case's documents; '.docx' names are mapped to the
# '.pdf' name, and files that are not present on disk are skipped.
pdf_lines_ = {
  (slug, filename, pagenum): page_words
  for slug, documents in tqdm(finished_cases.groupby('slug')['documents'], position=0)
  for filename in {document['File'].split('/')[-1].replace('.docx', '.pdf') for document in documents.explode()}
  if filename != ''
  if os.path.exists(f"data/{experiment_date}-pdf/{slug}/{filename}")
  for pagenum, page_words in enumerate(read_pdf_words(f"data/{experiment_date}-pdf/{slug}/{filename}"), start=1)
}

# NOTE(review): the innermost index level is the row number of each per-page
# frame but is named 'block', the same name as the 'block' column; the name is
# kept because the exported pickle may be read elsewhere -- confirm before renaming.
pdf_lines = pd.concat(list(pdf_lines_.values()), keys=list(pdf_lines_.keys()), names=['slug', 'filename', 'page', 'block'])
# Scale the PDF coordinates to the OCR coordinate system. The factor 4/3 was
# found empirically; it equals 96/72, so presumably the pages were rasterised
# at 96 dpi while PDF coordinates are in points (1/72 inch) -- TODO confirm.
pdf_lines['bbox'] = list(4/3 * pdf_lines[['x0', 'y0', 'x1', 'y1']].to_numpy())
pdf_lines = pdf_lines.drop(['x0', 'y0', 'x1', 'y1'], axis=1).sort_index()
pdf_lines['box'] = pdf_lines['bbox'].progress_apply(lambda x: box(*x))

In [None]:
#@title Read Layouts in one dataframe
def read_layout(slug):
  """Load the layout results of one case from disk.

  Reads ``data/{experiment_date}-surya-layout/{slug}/fixed-results.json``
  relative to the current working directory; ``experiment_date`` is a
  notebook global.

  NOTE(review): the finished-case filter of this notebook tests for
  ``results.json`` whereas this function reads ``fixed-results.json`` --
  confirm that the fixed file exists for every finished case.

  Args:
    slug: identifier of the case, used as directory name.

  Returns:
    The parsed JSON document. Callers iterate it with ``.items()``, so it is
    presumably a dict mapping a filename to that file's per-page layouts.

  Raises:
    FileNotFoundError: if the results file does not exist.
    json.JSONDecodeError: if the file is not valid JSON.
  """
  path = f'data/{experiment_date}-surya-layout/{slug}/fixed-results.json'
  # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
  with open(path, encoding='utf-8') as f:
    return json.load(f)

def explode_layout(file_layout):
  """Flatten the per-page layout of one file into one row per layout box.

  Args:
    file_layout: sequence of per-page entries, each with a ``bboxes`` list of
      dicts; every dict must provide a ``position`` field.

  Returns:
    DataFrame with one row per layout box and one column per remaining dict
    field, indexed by (``page``, ``position``). ``page`` is the 1-based
    position of the page entry within ``file_layout``.
  """
  boxes_per_page = pd.Series(file_layout).str['bboxes']
  # One row per box; the dict fields are spread over the columns.
  regions = boxes_per_page.explode().apply(pd.Series)
  # The Series index counts pages from 0; page numbers start at 1.
  regions.index = regions.index + 1
  regions = regions.set_index('position', append=True)
  regions.index.names = ['page', 'position']
  return regions


# Parse every layout file of every finished case, keyed by (slug, filename).
layouts_ = {
    (slug, filename): explode_layout(file_layout)
    for slug in tqdm(finished_cases['slug'].unique(), position=0)
    for filename, file_layout in read_layout(slug).items()
}
# Stack the per-file frames; the dict keys become the two outermost index
# levels. Sorting once is sufficient (the original sorted twice in a row).
layouts = pd.concat(list(layouts_.values()), keys=list(layouts_.keys()), names=['slug', 'filename', 'page', 'position']).sort_index()
# `x == x` is False only for NaN, i.e. rows without a bbox get no geometry.
layouts['box'] = layouts['bbox'].progress_apply(lambda x: box(*x) if x==x else None)

In [None]:
#@title Check if the OCR has much more text than the PDF.

# A ratio far below 1 (OCR has much more text than the PDF) indicates a scanned
# page: it contains pixels but no embedded text layer.
# A ratio far above 1 indicates the PDF has duplicated text, for example to
# create drop shadows.
# (Either can also mean there is a bug and files are mismatched.)

def _text_length_per_file(frame):
  """Total number of characters per (slug, filename), ignoring spaces and newlines."""
  compact = frame['text'].str.replace('\n', '').str.replace(' ', '')
  return compact.str.len().groupby(['slug', 'filename']).sum()


# Cached in the notebook namespace: delete `length_ratios` to force a
# recomputation after `pdf_lines` or `ocr_lines` changed.
if 'length_ratios' not in globals():
  length_ratios = _text_length_per_file(pdf_lines) / _text_length_per_file(ocr_lines)

# Files whose PDF/OCR text length differs by more than 20% (the cell output).
# The original also computed a percentage view, `100 * ratio - 100` cast to
# int, but it was never displayed and the int cast fails on infinite ratios.
length_ratios[np.abs(1 - length_ratios) > 0.2]

np.float64(0.7830923248053393)

In [None]:
#@title Choose between OCR or PDF text, or a mix

# Only the OCR lines are used for now; PDF text (or a mix of both) is not selected here.
lines = ocr_lines.copy()

In [None]:
#@title Assign reading order position to lines via layouts


def assign_position(lines, layout):
  """Assign each text line to the reading-order box it overlaps most.

  Args:
    lines: frame with a ``box`` column of geometries (text lines).
    layout: frame with a ``box`` column of geometries (layout boxes); its
      index labels are what gets assigned.

  Returns:
    Series aligned with ``lines`` holding, per text line, the index label of
    the layout box with the largest intersection area. ``idxmax`` returns the
    first maximum, so a line that overlaps no layout box receives the first
    layout label.
  """
  layout_boxes = layout['box']

  def best_layout_label(text_box):
    # Intersection area of this text line with every layout box.
    overlap = layout_boxes.apply(lambda region: region.intersection(text_box).area)
    return overlap.idxmax()

  return lines['box'].apply(best_layout_label)


# For every page, match its text lines against the layout boxes of the same
# (slug, filename, page); that group key is available as `page_lines.name`.
# reset_index([0,1,2], drop=True) removes the three outermost index levels of
# the result (presumably the group keys that groupby-apply prepends) so that
# the assignment aligns with the index of `lines`.
# NOTE(review): `layouts.loc[...]` raises KeyError for a page that has text
# lines but no layout rows.
lines['position'] = lines.groupby(['slug', 'filename', 'page']).progress_apply(
  lambda page_lines: assign_position(page_lines, layouts.loc[page_lines.name])).reset_index([0,1,2], drop=True)

100%|██████████| 33259/33259 [05:01<00:00, 110.23it/s]


In [None]:
#@title Assign lines and word order per box

def assign_lines_to_boxes(text_box, eps=0.4):
  """Group the text fragments of one layout box into visual lines.

  Fragments are clustered on their vertical extent: two fragments end up in
  the same line when they are connected, directly or transitively, by a
  distance ``1 - IoU`` of at most ``eps``. The clusters are then numbered from
  top to bottom by their mean vertical centre.

  Args:
    text_box: frame with a ``bbox`` column holding ``[x0, y0, x1, y1]``.
    eps: maximum ``1 - IoU`` distance for two fragments to be neighbours.

  Returns:
    Series aligned with ``text_box`` holding the 0-based line number of each
    fragment, 0 being the line with the smallest mean y.
  """
  # (n, 4) -> (n, corner, coordinate): corner 0 is (x0, y0), corner 1 is (x1, y1).
  boxes = np.stack(text_box['bbox']).reshape(-1, 2, 2)

  # Only the y coordinates are passed on, so the overlap is purely vertical.
  # assumes `iou(a, b)` returns the pairwise (n, n) IoU matrix, as required by
  # metric='precomputed' -- TODO confirm against boxly.
  y_extent = boxes[..., 1:]
  iou_distance = 1 - iou(y_extent, y_extent)

  # min_samples=1 makes every fragment a core point, so no fragment is
  # labelled as noise and each one belongs to exactly one cluster.
  labels = DBSCAN(
    eps=eps, min_samples=1, metric='precomputed'
  ).fit(iou_distance).labels_

  # Vertical centre of every fragment. The original applied `.mean(-1)` twice,
  # which collapsed this to one scalar shared by all fragments, so every
  # cluster had the same y and the line order was arbitrary.
  y_centers = boxes[..., 1].mean(-1)
  # A stable sort keeps clusters with equal mean y in label order.
  cluster_y = pd.Series(y_centers).groupby(labels).mean().sort_values(kind='mergesort')
  rank = {label: position for position, label in enumerate(cluster_y.index)}
  return pd.Series([rank[label] for label in labels], index=text_box.index)


def assign_word_order_to_lines(line_box):
  """Number the fragments of one line from left to right.

  Args:
    line_box: frame with a ``bbox`` column holding ``[x0, y0, x1, y1]``.

  Returns:
    Integer Series aligned with ``line_box``: the 0-based rank of each
    fragment when ordered by the horizontal centre of its bbox.
  """
  # (n, 4) -> (n, corner, coordinate); coordinate 0 is x.
  corners = np.stack(line_box['bbox']).reshape(-1, 2, 2)
  x_centers = pd.Series(corners[..., 0].mean(-1))
  # Row numbers of the fragments, listed from the leftmost to the rightmost.
  left_to_right = x_centers.sort_values().index.to_numpy()
  # Invert that permutation: ranks[row] is the position of `row` in the order.
  ranks = np.empty(len(left_to_right), dtype=np.int64)
  ranks[left_to_right] = np.arange(len(left_to_right))
  return pd.Series(ranks, index=line_box.index)

# 'words', i.e. chunks of text within a line; these are OCR fragments and not
# necessarily cleanly separated words.
words = lines.copy()
# Cluster the fragments of every layout box into visual lines. The result is
# stored as 'line_' because 'line' is still an index level at this point.
# reset_index([0,1,2,3], drop=True) removes the four outermost index levels of
# the result (presumably the group keys that groupby-apply prepends).
words['line_'] = words.groupby(['slug', 'filename', 'page', 'position']).progress_apply(assign_lines_to_boxes).reset_index([0,1,2,3], drop=True)
# Left-to-right order of the fragments within each visual line.
words['word'] = words.groupby(['slug', 'filename', 'page', 'position', 'line_']).progress_apply(assign_word_order_to_lines).reset_index([0,1,2,3,4], drop=True)
# Drop the original 'line' index level, rename 'line_' to 'line' and append
# 'position', 'line' and 'word' to the index.
words = words.reset_index(['line'], drop=True).rename({'line_': 'line'}, axis=1).set_index(['position', 'line', 'word'], append=True)
# Look up the layout label of the box each fragment was assigned to, using the
# index without the 'line' and 'word' levels as key into `layouts`.
words['label'] = layouts.loc[words.reset_index(['line', 'word'], drop=True).index]['label'].values
words.to_pickle(f'data/{experiment_date}-words.p3')

# Sanity check, shown as the cell output: is every index entry unique?
words.index.is_unique

100%|██████████| 361760/361760 [12:21<00:00, 488.19it/s]
100%|██████████| 1249706/1249706 [29:54<00:00, 696.59it/s]


True

In [515]:
#@title Export intermediate results (layouts, OCR tiles and PDF lines)
# Persist the intermediate frames next to the other outputs of this run;
# the paths are relative to the working directory.
ocr_lines.to_pickle(f'data/{experiment_date}-ocr-lines.p3')
pdf_lines.to_pickle(f'data/{experiment_date}-pdf-lines.p3')
layouts.to_pickle(f'data/{experiment_date}-layouts.p3')