Note that the PDFs have already been downloaded from the Conservation Evidence website.

In [1]:
# get all pdf files in subdirectories, but not at top level
def get_pdf_files(directory):
    """Return the paths of all ``.pdf`` files below ``directory``.

    Files located directly in ``directory`` are skipped; only files inside
    its subdirectories (at any depth) are returned.

    Args:
        directory: Root folder to search, as a ``str`` or ``os.PathLike``.

    Returns:
        A list of path strings in ``os.walk`` order, with backslashes
        replaced by forward slashes.
    """
    import os

    # os.walk always yields string roots, so compare against the string form
    # of the argument; otherwise a pathlib.Path argument would never match
    # and top-level files would slip through.
    top = os.fspath(directory)

    pdf_files = []
    for folder, _subdirs, names in os.walk(top):
        if folder == top:  # top-level files are unvetted, leave them out
            continue
        pdf_files.extend(
            os.path.join(folder, name).replace("\\", "/")
            for name in names
            if name.endswith(".pdf")
        )
    return pdf_files

# Collect every synopsis PDF stored in a subfolder of the synopses directory.
filepaths = get_pdf_files(directory='../../../data/synopses/')
# Bare expression so the notebook displays the resulting list.
filepaths

['../../../data/synopses/Marine Invertebrates/Subtidal Benthic Invertebrate.pdf',
 '../../../data/synopses/Invasive Fish/Invasive Freshwater Species.pdf',
 '../../../data/synopses/Grassland/Grassland.pdf',
 '../../../data/synopses/Mammals/Primate.pdf',
 '../../../data/synopses/Mammals/Terrestrial Mammal.pdf',
 '../../../data/synopses/Amphibians/Amphibian.pdf',
 '../../../data/synopses/Insects/Butterfly and Moth.pdf',
 '../../../data/synopses/Insects/Bee.pdf',
 '../../../data/synopses/Bats/Bat.pdf',
 '../../../data/synopses/Farmland/Sustainable Farming.pdf',
 '../../../data/synopses/Farmland/Mediterranean Farmland.pdf',
 '../../../data/synopses/Farmland/Soil Fertility.pdf',
 '../../../data/synopses/Farmland/Farmland.pdf',
 '../../../data/synopses/Animals Ex-Situ/Management of Captive Animals.pdf',
 '../../../data/synopses/Fish/Marine Fish.pdf',
 '../../../data/synopses/Forests/Forest.pdf',
 '../../../data/synopses/Birds/Bird.pdf',
 '../../../data/synopses/Marine/Biodiversity of Marine A

In [2]:
def populate_json(result):
    """Build the reference record for one Conservation Evidence synopsis.

    Args:
        result: Mapping with the keys ``author``, ``year``, ``title``,
            ``text`` and ``class``.

    Returns:
        A dict holding the values from ``result`` plus fixed values for
        the reference type, institution, publisher and URL. Fields with no
        value for a synopsis are ``None``.

    Raises:
        KeyError: If ``result`` lacks one of the keys listed above.
    """
    record = {
        "reference type": "CE Study",
        "author": result["author"],
        "year": result["year"],
        "title": result["title"],
        "series editor": None,
        "series/book title": None,
        # A real None (not the string "None") so it serialises as JSON null
        # like every other unknown field.
        "place published": None,
        "institution": "Conservation Evidence Team",
        "publisher": "Conservation Evidence",
        "date": None,
        "doi": None,
        "class": result["class"],
        "abstract": None,
        "url": 'https://www.conservationevidence.com/synopsis/index',
        "text": result["text"],
    }
    return record

In [3]:
import pymupdf

def parse_pdf(pdf_path):
    """Extract the text and bibliographic metadata of one synopsis PDF.

    Args:
        pdf_path: Path to the PDF, using "/" separators and containing a
            "synopses/<folder>/" component; the folder name becomes the
            record's class.

    Returns:
        The record produced by ``populate_json`` for this document. Author
        and year are ``None`` when the PDF metadata does not provide them;
        the title falls back to the file name without its extension.

    Raises:
        IndexError: If ``pdf_path`` does not contain "synopses/".
    """
    pdf = pymupdf.open(pdf_path)
    try:
        text = "".join(page.get_text() for page in pdf)
        # PyMuPDF reports metadata under keys such as "author", "title" and
        # "modDate". Lower-casing the keys makes the lookups below
        # independent of key casing. `metadata` may be None, hence `or {}`.
        metadata = {
            key.lower(): value for key, value in (pdf.metadata or {}).items()
        }
    finally:
        # Release the file handle even if text extraction fails.
        pdf.close()

    # Missing metadata entries may be empty strings; normalise them to None.
    author = metadata.get("author") or None

    # PDF date strings look like "D:YYYYMMDDHHmmSS..."; keep the year only.
    year = None
    mod_date = metadata.get("moddate") or ""
    if mod_date.startswith("D:"):
        mod_date = mod_date[2:]
    year_digits = mod_date[:4]
    if len(year_digits) == 4 and year_digits.isdigit():
        year = year_digits

    title = metadata.get("title") or None
    if title is None:
        # Strip only the final extension so dots inside the name survive.
        title = pdf_path.split("/")[-1].rsplit(".", 1)[0]

    # The first folder below "synopses/" names the synopsis class.
    folder = pdf_path.split("synopses/")[1].split('/')[0]

    return populate_json({
        "author": author,
        "year": year,
        "title": title,
        "text": text,
        "class": folder,
    })

In [4]:
from tqdm.notebook import tqdm

def write_json(i, record):
    """Write ``record`` as JSON into the labelled synopses folder.

    The file is named "<i> - <title>.json" and placed in
    '../../../data/labelled/synopses/' relative to the working directory.

    Args:
        i: Index used as the file name prefix.
        record: JSON-serialisable dict with at least a ``title`` key.

    Raises:
        KeyError: If ``record`` has no ``title`` key.
        OSError: If the target file cannot be opened for writing.
    """
    import json
    import re

    # Path separators and characters reserved in file names would make
    # open() fail or write outside the target folder, so replace them.
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", str(record["title"]))
    with open(
        f'../../../data/labelled/synopses/{i} - {safe_title}.json',
        'w',
        encoding='utf-8',
    ) as f:
        json.dump(record, f)

# Parse every PDF listed in `filepaths` and collect the resulting records,
# in the same order, in `records`.
records = []
for i,filepath in enumerate(tqdm(filepaths)):
    record = parse_pdf(filepath)
    # Writing each record to disk is currently disabled; uncomment the call
    # below to save the records as JSON files.
    # write_json(i,record)

    records.append(record)


  0%|          | 0/24 [00:00<?, ?it/s]

In [6]:
# Show the class of every record, one per line, in record order.
print(*(rec['class'] for rec in records), sep='\n')
# Display the distinct classes as the cell's result.
list({rec['class'] for rec in records})

Marine Invertebrates
Invasive Fish
Grassland
Mammals
Mammals
Amphibians
Insects
Insects
Bats
Farmland
Farmland
Farmland
Farmland
Animals Ex-Situ
Fish
Forests
Birds
Marine
Marine
Wetlands
Wetlands
Shrubland
Pests
Reptiles


['Fish',
 'Farmland',
 'Forests',
 'Birds',
 'Bats',
 'Pests',
 'Animals Ex-Situ',
 'Marine',
 'Shrubland',
 'Grassland',
 'Reptiles',
 'Amphibians',
 'Insects',
 'Wetlands',
 'Invasive Fish',
 'Marine Invertebrates',
 'Mammals']