In [23]:
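# Prerequisite: start the GROBID server before running this notebook, e.g. in a terminal:
#   docker run -t --rm -p 8070:8070 lfoppiano/grobid:0.8.0
# Keep the container running while the cells below are executed.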

In [4]:
# https://github.com/titipata/scipdf_parser#installation
import scipdf
import pandas as pd
import os
from pathlib import Path
import numpy as np

In [5]:
# Remove every pandas display limit so rows, columns and long values are
# rendered in full instead of being truncated.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
    'display.max_seq_items',
):
    pd.set_option(_display_option, None)

In [6]:
def pdf_reader(path):
    """Parse one PDF with scipdf and hand back its article dictionary.

    Parameters
    ----------
    path : str
        Location of the PDF; forwarded unchanged to
        ``scipdf.parse_pdf_to_dict``.

    Returns
    -------
    Whatever ``scipdf.parse_pdf_to_dict`` returns for ``path``.
    """
    return scipdf.parse_pdf_to_dict(path)

In [7]:
def get_context(path):
    """Return the abstract and the section list of the PDF at ``path``.

    Parameters
    ----------
    path : str
        Location of the PDF; forwarded to ``pdf_reader``.

    Returns
    -------
    tuple
        ``(abstract, sections)`` -- the ``'abstract'`` and ``'sections'``
        entries of the dictionary produced by ``pdf_reader``.
    """
    parsed = pdf_reader(path)
    abstract = parsed['abstract']
    sections = parsed['sections']
    return abstract, sections

In [8]:
def clean_pdf(path):
    """Flatten a parsed PDF into a single ``{name: text}`` dictionary.

    The result starts with the ``'abstract'`` entry, followed by one entry
    per section heading in document order. Only each section's ``'heading'``
    and ``'text'`` are used; reference lists such as ``'publication_ref'``,
    ``'figure_ref'`` and ``'table_ref'`` are ignored.

    When a heading occurs more than once (e.g. several sections with an
    empty heading, or a section literally headed ``'abstract'``), the texts
    are joined with a newline under that heading instead of the later
    section silently replacing the earlier one.

    Parameters
    ----------
    path : str
        Location of the PDF; forwarded to ``get_context``.

    Returns
    -------
    dict
        ``{'abstract': <abstract>, <heading>: <text>, ...}``.
    """
    abstract, sections = get_context(path)

    context = {'abstract': abstract}
    for section in sections:
        heading = section['heading']
        text = section['text']

        earlier = context.get(heading)
        # Merge only when both parts are non-empty-compatible strings; any
        # other value type falls back to plain assignment.
        if isinstance(earlier, str) and earlier and isinstance(text, str):
            text = earlier + "\n" + text
        context[heading] = text

    return context

In [9]:
def pdf_pp(folder_o):
    """Run ``clean_pdf`` on every ``*.pdf`` file directly inside a folder.

    Parameters
    ----------
    folder_o : str or path-like
        Folder to scan (not recursive).

    Returns
    -------
    dict
        ``{index: context}`` where ``index`` counts from 0 in sorted path
        order and ``context`` is the dictionary returned by ``clean_pdf``.
        Empty when the folder contains no PDF files.
    """
    folder = Path(folder_o)
    all_context = {}

    # Path.glob yields entries in no guaranteed order; sorting makes the
    # index assigned to each paper reproducible across runs and machines.
    for index, path in enumerate(sorted(folder.glob('*.pdf'))):
        print(f"Processing {path}...")
        all_context[index] = clean_pdf(str(path))

    return all_context

In [9]:
# Parse every PDF in examples/. scipdf_parser relies on a running GROBID
# service (see the docker command in the first cell).
paper_example = pdf_pp("examples/")
# Echo the result: {index: {'abstract': ..., <section heading>: <section text>, ...}}
paper_example

Processing examples\Afifi_et_al.pdf...


  parsed_article = BeautifulSoup(parsed_article, "lxml")


Processing examples\Cattaneo_et_al.pdf...
Processing examples\Gray.pdf...


{0: {'abstract': 'This article addresses the interrelation between rainfall variability, food insecurity and human mobility in three villages located in the Same District, Kilimanjaro, Tanzania, namely the villages Vudee, Bangalala and Ruvu Mferejini which are of distinct elevation and precipitation levels. It runs a comparison between the three villages and shows that there is a positive relationship between rainfall shortage and out-migration, after taking other important demographic and socioeconomic factors into account, such as age, wealth and education. The article further argues that the mechanism through which rainfall variability affects human mobility in the research site is food insecurity for humans and livestock.',
  'Introduction': "Climate variability refers to a deviation from the long-term meteorological average over a certain period of time. Most simulations predict exacerbated variability and increasing extreme weather events as a result of global warming (IPCC, 2001

In [10]:
# paper_example is a plain dict, so np.save pickles it inside a 0-d object array;
# read it back with np.load('example.npy', allow_pickle=True).item().
np.save('example.npy', paper_example)

In [10]:
# Parse the full literature set in data/ with the same pipeline used for examples/.
paper_test = pdf_pp('data/')

Processing data\Abu_et_al.pdf...


  parsed_article = BeautifulSoup(parsed_article, "lxml")


Processing data\Afifi_et_al_2011.pdf...
Processing data\Afifi_et_al_2012.pdf...
Processing data\Afriyie_et_al_2018.pdf...
Processing data\Barrios_et_al.pdf...
Processing data\Carr_2005.pdf...
Processing data\Cattaneo_et_al_2016.pdf...
Processing data\Doevenspeck_2011.pdf...
Processing data\Dreier_et_al.pdf...
Processing data\Ezra_2011.pdf...
Processing data\Gray_et_al_2011.pdf...
Processing data\Hamza_et_al_2008.pdf...
Processing data\Haug_2002.pdf...
Processing data\Heaney_et_al.pdf...
Processing data\Henry_et_al_2003.pdf...
Processing data\Henry_et_al_2004.pdf...
Processing data\Henry_et_al_2004b.pdf...
Processing data\Hunter_et_al_2017.pdf...
Processing data\Koubi_et_al_2016.pdf...
Processing data\Kubik_et_al.pdf...
Processing data\Leyk_et_al.pdf...
Processing data\Meze-Hausken_2000.pdf...
Processing data\Naudé_2008.pdf...
Processing data\Neumann_et_al.pdf...
Processing data\Ocello_et_al_2014.pdf...
Processing data\Romankiewicz_et_al.pdf...
Processing data\Simatele_et_al_2015.pdf...

In [11]:
# Inspect the parsed literature set (echoes every paper in full, so the output is large).
paper_test

{0: {'abstract': "Migration is at the centre of demographic research on the populationenvironment nexus. Increasing concerns about the impacts of environmental events on human population are fuelling interest on the relationship between migration and environmental change. Using data from the Climate Change Collective Learning and Observatory Network Ghana project, we employ binary logistic regression to examine migration intentions of households in response to major community stressors including climate-related ones. The results indicate that the type of community stressor that affects households most does not differentiate migration intentions in Ghana's forest-savannah transition zone: Even though the majority of the respondents mentioned climate-related events as the stressor that affects them the most, such events do not appear to directly explain migration intentions. However, socio-demographic factors such as age, household size and current migration status are significant predic

In [12]:
# paper_test is a plain dict, so np.save pickles it inside a 0-d object array;
# read it back with np.load('literature.npy', allow_pickle=True).item().
np.save('literature.npy', paper_test)