In [None]:
import json
import os
import time
from tempfile import TemporaryDirectory
from zipfile import ZipFile

import pandas as pd
import requests
from IPython.display import clear_output, FileLink, Image, display
from pyarrow import parquet

# Root URL of the COSMOS service; the submission endpoint is built from it.
COSMOS_BASE_URL = "http://xdd.wisc.edu/cosmos_service"
# Remote location of the sample article that is downloaded for processing.
SAMPLE_PDF_URL = "https://www.nature.com/articles/s41591-020-0883-7.pdf"
# Local filename the sample PDF is saved under.
SAMPLE_PDF_PATH = "sidarthe.pdf"

### Submit a PDF for Processing

Make a POST request to the `/process/` endpoint under the base URL, containing a multipart form with the contents of the file in the `pdf` field.
Other form fields include:
- `compress_images` (default `True`): Return compressed JPG images if `True`, uncompressed PNGs otherwise.  

In [None]:
# Download a local copy of the PDF

# Fetch first and fail loudly on an HTTP error, so that an error response is
# never written to disk under the PDF's filename.
pdf_response = requests.get(SAMPLE_PDF_URL)
pdf_response.raise_for_status()

with open(SAMPLE_PDF_PATH, 'wb') as pdf_writer:
    pdf_writer.write(pdf_response.content)

In [None]:
# Submit the locally copied PDF to the COSMOS processing pipeline

submit_endpoint = COSMOS_BASE_URL + '/process/'

# The file only needs to stay open for the duration of the upload itself.
with open(SAMPLE_PDF_PATH, 'rb') as pdf_to_parse:
    file_form = {'pdf': pdf_to_parse }
    data_form = {'compress_images': False }

    response = requests.post(submit_endpoint, files=file_form, data=data_form)

# Surface a rejected submission as an HTTP error here, rather than as a
# KeyError or JSON decoding failure when the fields are read below.
response.raise_for_status()

response_data = response.json()
job_id = response_data['job_id']

# URLs returned by the service for checking progress and fetching results.
status_endpoint = response_data['status_endpoint']
results_endpoint = response_data['result_endpoint']

print(f"Message: {response_data['message']}\n"
      f"Job ID: {job_id}\n"
      f"Status Endpoint: {status_endpoint}\n"
      f"Results Endpoint: {results_endpoint}")

### Monitor COSMOS Processing Status

Poll the URL returned in the `status_endpoint` field of the initial response object.
For the sample document, processing can take up to 5 minutes. The `/status` endpoint
will return several fields, including a `job_completed` field and an `error` field, which are initially
`false` and `null` respectively. If the `job_completed` field becomes `true`, then the job completed
successfully, and if the `error` field becomes non-`null` then the job failed.

In [None]:
POLL_COUNT = 80     # maximum number of status requests before giving up
POLL_INTERVAL = 5   # seconds to sleep between consecutive status requests

job_done = False

for i in range(POLL_COUNT):
    response = requests.get(status_endpoint)
    response_data = response.json()
    # Replace the previous poll's output instead of appending to it.
    clear_output(wait=True)
    # `i` is zero-based, so the number of polls made so far is i + 1.
    print(f"Polled status endpoint {i + 1} times:\n{response_data}")
    # Stop on either outcome: a reported error or a completed job.
    job_done = bool(response_data['error'] or response_data['job_completed'])
    if job_done:
        break
    time.sleep(POLL_INTERVAL)

if not job_done:
    print(f"ERROR: Job not complete after {POLL_COUNT * POLL_INTERVAL} seconds.")
elif response_data['error']:
    print(f"An unexpected error occurred: {response_data['error']}")
else:
    print(f"Job succeeded after {response_data['time_processing']} seconds.\n"
          f"Results can be viewed at {results_endpoint}")

### Obtain Results

Once the `/status` endpoint indicates that `job_completed` is `true`, the following endpoints will be available via GET request:
- `/result/text`: The text extracted from the document, alongside its bounding boxes and classification, as JSON.
- `/result/extractions/{extraction_type}`: The bounding boxes of individual extraction types, and temporary links to their images, as JSON. 
  Valid extraction types are `equations`, `figures`, and `tables`.
- `/result/images/{image_id}`: Temporary image hosting for the cosmos extractions.
- `/result`: The entire contents of the cosmos processing job as a zip file, containing images of extracted pdf regions and parquet files with those extractions' metadata.

All results are available for 24 hours after image processing completes, after which they will be deleted.

### Extractions (JSON)

In [None]:
# Fetch the extracted document text and bounding boxes, then pretty-print the JSON
text_data = requests.get(f"{results_endpoint}/text")
text_payload = text_data.json()
print(json.dumps(text_payload, indent=2))

In [None]:
# Fetch the extracted equations (bounding boxes and image links), then pretty-print the JSON
equation_data = requests.get(f"{results_endpoint}/extractions/equations")
equation_payload = equation_data.json()
print(json.dumps(equation_payload, indent=2))

In [None]:
# Fetch the extracted figures (bounding boxes and image links), then pretty-print the JSON
figure_data = requests.get(f"{results_endpoint}/extractions/figures")
figure_payload = figure_data.json()
print(json.dumps(figure_payload, indent=2))

In [None]:
# Fetch the extracted tables (bounding boxes and image links), then pretty-print the JSON
table_data = requests.get(f"{results_endpoint}/extractions/tables")
table_payload = table_data.json()
print(json.dumps(table_payload, indent=2))

### Full Results (Zip)

In [None]:
# The archive is saved next to the sample PDF, with a .zip extension.
ZIP_DOWNLOAD_PATH = SAMPLE_PDF_PATH.replace('.pdf', '.zip')

# Download before opening the output file and fail on an HTTP error, so that
# an error response is never written to disk and mistaken for a zip archive.
zip_response = requests.get(results_endpoint)
zip_response.raise_for_status()

with open(ZIP_DOWNLOAD_PATH, 'wb') as writer:
    writer.write(zip_response.content)

print(f'You can access the downloaded zip file at {FileLink(ZIP_DOWNLOAD_PATH)} .\nIt contains the following files:')

# List the archive's contents.
with ZipFile(ZIP_DOWNLOAD_PATH, 'r') as zipfile:
    zipfile.printdir()

### View Figures

The zip file contains a number of image files, as well as Apache Parquet files containing information about those images.
Each Parquet file contains information about a separate extraction type (tables, figures, and equations). The name of each
Parquet file is based on the name of the input PDF file.

In [None]:
def show_extractions(parquet_path, bb_column, page_column):
    """Show every extraction listed in a single COSMOS Parquet file.

    The Parquet file and the images it references are read from the zip
    archive at ``ZIP_DOWNLOAD_PATH``. Each image is displayed after a printed
    summary of its item number, page, bounding box and post-processing score.

    Args:
        parquet_path: Path of the Parquet file inside the zip archive.
        bb_column: Name of the column holding each extraction's bounding box.
        page_column: Name of the column holding each extraction's page.
    """
    # Unparenthesized multi-manager form, which older Python 3 versions also accept.
    with TemporaryDirectory() as td, ZipFile(ZIP_DOWNLOAD_PATH, 'r') as zipfile:
        # extract the parquet file from the zip archive
        zipfile.extract(parquet_path, td)
        # read the parquet file
        parquet_table = parquet.read_table(os.path.join(td, parquet_path))
        pandas_data: pd.DataFrame = parquet_table.to_pandas()
        # TODO the img_pth column currently contains the absolute path of the image on the server,
        # rather than the relative path from the root of the zip folder
        pandas_data['img_pth'] = pandas_data['img_pth'].replace(r'/tmp/tmp.*/', '', regex=True)

        # Print each extraction alongside its score, page, and bounding box.
        # Items are numbered by position, so the numbering does not depend on
        # the DataFrame's index labels being consecutive integers from 0.
        for item_number, (_, row) in enumerate(pandas_data.iterrows(), start=1):
            img_path = row['img_pth']
            zipfile.extract(img_path, td)
            print(f"Extracted Item #{item_number}\n"
                  f"Page: {row[page_column]}\n"
                  f"Bounding Box: {row[bb_column]}\n"
                  f"Score: {row['postprocess_score']}")
            display(Image(os.path.join(td, img_path)))

In [None]:
# Figures: derive the Parquet name from the PDF name, then show every entry
figures_parquet = SAMPLE_PDF_PATH.replace('.pdf','_figures.parquet')
show_extractions(figures_parquet, bb_column='obj_bbs', page_column='obj_page')

In [None]:
# Equations: derive the Parquet name from the PDF name, then show every entry
equations_parquet = SAMPLE_PDF_PATH.replace('.pdf','_equations.parquet')
show_extractions(equations_parquet, bb_column='equation_bb', page_column='equation_page')

In [None]:
# Tables: derive the Parquet name from the PDF name, then show every entry
tables_parquet = SAMPLE_PDF_PATH.replace('.pdf','_tables.parquet')
show_extractions(tables_parquet, bb_column='obj_bbs', page_column='obj_page')