Note that PDFs are already downloaded via the Conservation Evidence website.

In [1]:
# get all pdf files in subdirectories, but not at top level
def get_pdf_files(directory):
    import os
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        if root == directory: # don't include files I'm not sure about
            continue
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file).replace("\\", "/"))
                # pdf_files.append(os.path.join(root, file))
    return pdf_files

filepaths = get_pdf_files('../../../data/unprocessed/synopses/')
filepaths

['../../../data/synopses/Marine Invertebrates/Subtidal Benthic Invertebrate.pdf',
 '../../../data/synopses/Invasive Fish/Invasive Freshwater Species.pdf',
 '../../../data/synopses/Grassland/Grassland.pdf',
 '../../../data/synopses/Mammals/Primate.pdf',
 '../../../data/synopses/Mammals/Terrestrial Mammal.pdf',
 '../../../data/synopses/Amphibians/Amphibian.pdf',
 '../../../data/synopses/Insects/Butterfly and Moth.pdf',
 '../../../data/synopses/Insects/Bee.pdf',
 '../../../data/synopses/Bats/Bat.pdf',
 '../../../data/synopses/Farmland/Sustainable Farming.pdf',
 '../../../data/synopses/Farmland/Mediterranean Farmland.pdf',
 '../../../data/synopses/Farmland/Soil Fertility.pdf',
 '../../../data/synopses/Farmland/Farmland.pdf',
 '../../../data/synopses/Animals Ex-Situ/Management of Captive Animals.pdf',
 '../../../data/synopses/Fish/Marine Fish.pdf',
 '../../../data/synopses/Forests/Forest.pdf',
 '../../../data/synopses/Birds/Bird.pdf',
 '../../../data/synopses/Marine/Biodiversity of Marine A

In [2]:
class_map = {'Bird': ['Birds'],
 'Farmland': ['Farmland'],
 'Natural Pest Control': ['Pests'],
 'Control of Freshwater Invasive Species': ['Fish','Invasive'],
 'Shrubland and Heathland': ['Shrubland'],
 'Reptile': ['Reptiles'],
 'Terrestrial Mammal': ['Mammals'],
 'Marsh and Swamp': ['Wetlands'],
 'Grassland': ['Grassland'],
 'Bat': ['Bats'],
 'Amphibian': ['Amphibians'],
 'Bee': ['Insects'],
 'Butterfly and Moth': ['Insects'],
 'Forest': ['Forests'],
 'Primate': ['Mammals'],
 'Peatland': ['Wetlands'],
 'Mediterranean Farmland': ['Farmland'],
 'Subtidal Benthic Invertebrate': ['Marine Invertebrates'],
 'Marine and Freshwater Mammal': ['Mammals', 'Marine','Rivers and Lakes'],
 'Management of Captive Animals': ['Animals Ex-Situ', 'Captivity'],
 'Soil Fertility': ['Farmland'],
 'Sustainable Aquaculture': ['Marine'],
 'Marine Fish': ['Marine','Fish'],
 'Biodiversity of Marine Artificial Structures': ['Marine','Plants and Algae'],
 'Invasive Freshwater Species': ['Fish','Invasive','Rivers and Lakes'],
 'Sustainable Farming': ['Farmland'],
 'Marine Fish': ['Marine','Fish'],
}

In [3]:
import pymupdf

def parse_pdf(pdf_path):
    pdf = pymupdf.open(pdf_path)
    text = ""
    for page in pdf:
        text += page.get_text()


    author = pdf.metadata.get("Author", None)
    year = pdf.metadata.get("ModDate", None) # convert from bash to year
    if year is not None:
        year = year[2:6]
    title = pdf.metadata.get("Title", None)
    if title is None:
        title = pdf_path.split("/")[-1].split(".")[0]

    folder = pdf_path.split("synopses/")[1].split('/')[1].removesuffix('.pdf')

    pdf.close()
    return populate_json({
        "author": author,
        "year": year,
        "title": title,
        "text": text,
        "class": folder,
    })

In [4]:
def populate_json(result):
    # result has the following keys: id, title, author, year, text, classes
    multiclasses = set(class_map[result['class']])

    json = {
        "reference type": "CE Study",
        "author": result["author"],
        "year": result["year"],
        "title": result["title"],
        "series editor": None,
        "series/book title": None,
        "place published": "None",
        "institution": "Conservation Evidence Team",
        "publisher": "Conservation Evidence",
        "doi": None,
        'relevance': 'relevant',
        "multiclasses":multiclasses,
        "abstract": None,
        "url": 'https://www.conservationevidence.com/synopsis/index',
        "text": result["text"],
    }
    return json

In [5]:
from tqdm.notebook import tqdm

records = []
for i,filepath in tqdm(enumerate(filepaths),total=len(filepaths)):
    record = parse_pdf(filepath)
    # write_json(i,record)

    records.append(record)


  0%|          | 0/24 [00:00<?, ?it/s]

In [6]:
import pandas as pd

data = pd.DataFrame(records)

data.head()

Unnamed: 0,reference type,author,year,title,series editor,series/book title,place published,institution,publisher,date,doi,relevance,multiclasses,abstract,url,text
0,CE Study,,,Subtidal Benthic Invertebrate,,,,Conservation Evidence Team,Conservation Evidence,,,relevant,{Marine Invertebrates},,https://www.conservationevidence.com/synopsis/...,1 \n \n \n2 \n \n \nSubtidal Benthic Invertebr...
1,CE Study,,,Invasive Freshwater Species,,,,Conservation Evidence Team,Conservation Evidence,,,relevant,"{Fish, Rivers and Lakes, Invasive}",,https://www.conservationevidence.com/synopsis/...,\n \n \n Control of freshwater \n invasi...
2,CE Study,,,Grassland,,,,Conservation Evidence Team,Conservation Evidence,,,relevant,{Grassland},,https://www.conservationevidence.com/synopsis/...,1 \n \nGrassland Conservation \n2 \n \nGrassla...
3,CE Study,,,Primate,,,,Conservation Evidence Team,Conservation Evidence,,,relevant,{Mammals},,https://www.conservationevidence.com/synopsis/...,\n \n \nii \n \n \n \n \n \n \n \nPrimate Co...
4,CE Study,,,Terrestrial Mammal,,,,Conservation Evidence Team,Conservation Evidence,,,relevant,{Mammals},,https://www.conservationevidence.com/synopsis/...,CONSERVATION EVIDENCE SERIES SYNOPSES\nTerrest...


In [7]:
data.to_json('../../../data/level-0.5/synopses/synopses.json', orient='records')