In [None]:
# Original source code by Yuwei Jiang @YuweiJ98, modified by David Johnson @djcomlab

In [None]:
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader
import glob
import pandas as pd
import plotly.express as px
import os
import tqdm

In [None]:
# Collect every PDF in the papers directory. glob returns matches in
# arbitrary, filesystem-dependent order, so the list is sorted to keep the
# processing order (and the row order of the resulting table) reproducible.
pdf_files = sorted(glob.glob('../data/sdg7-papers/*.pdf'))
pdf_files

In [None]:
# OCR the complete text of a paper
def get_paper_text(path):
    """Return the OCR'd text of every page of the PDF at ``path``.

    The PDF is rendered to one image per page with ``convert_from_path`` and
    each image is passed to ``pytesseract.image_to_string``. The page texts
    are concatenated in page order, each one preceded by a single space.
    """
    page_images = convert_from_path(path)
    page_texts = (pytesseract.image_to_string(image) for image in page_images)
    # every page (including the first) is prefixed with one space
    return ''.join(' ' + text for text in page_texts)

In [None]:
# to extract sentences containing the word "data"
def sentences_containing_data(whole_paper):
    """Return the sentences of ``whole_paper`` that mention "data".

    The text is split on "." and each piece is kept, unmodified, when it
    contains the substring "data" in any letter case. Matching is by
    substring, so "dataset", "database", "metadata" etc. also qualify.

    Parameters
    ----------
    whole_paper : str
        Full text of a paper.

    Returns
    -------
    list of str
        Matching pieces in their original order, without the trailing "."
        and with any leading whitespace left intact.
    """
    return [
        sentence
        for sentence in whole_paper.split(".")
        # compare in lower case so that sentences starting with "Data" or
        # "Dataset" are not missed; 'dataset' needs no separate test because
        # it already contains 'data'
        if 'data' in sentence.lower()
    ]

In [None]:
# Keywords that identify a data source or a data type.
# An acronym and its spelled-out name are listed as separate entries.
data_sources = [
    # collection methods
    "questionnaire", "interview", "survey", "census", "focus group",
    # organisations, agencies and named databases
    # NOTE(review): "organization" is lower-case in the FAO long name below,
    # so a case-sensitive comparison will not match the usual capitalised
    # spelling - confirm whether that is intended.
    "FAO", "FAOSTAT", "Food and Agriculture organization of the United Nations",
    "UN", "United Nations",
    "SDSN", "Sustainable Development Solutions Network",
    "UNESCO", "United Nations Educational, Scientific and Cultural Organization",
    "Eurostat",
    "EDGAR", "Emissions Database for Global Atmospheric Research",
    "Copernicus",
    "ESDAC", "ES-DAC", "European Soil Data Center",
    "World Bank",
    "Freedom House",
    "IEA", "International Energy Agency",
    "OECD", "Organisation for Economic Co-operation and Development",
    "IMF", "International Monetary Fund",
    "Global Carbon Atlas",
    "Global Footprint Network",
    "SESRIC", "Statistical, Economic, and Social Research and Training Center for Islamic Countries",
    "BP Statistical Review of World Energy",
    "ADB", "Asian Development Bank",
    "WHO", "World Health Organization",
    "World Pop",
    "GADM", "Global Administrative Areas",
    "Rosstat", "Federation Federal State Statistics Service",
    "General Statistics Office",
    "Central Statistical Office",
    "National Institute of Statistics",
    "DOSM", "Department of Statistics Malaysia",
    "National Bureau of Statistics of Tanzania",
    "Ministry of Education and Science of Ukraine",
    "BGR", "Federal Institute for Geosciences and Natural Resources",
    "NPC", "National Planning Commission",
    "ANPM", "National Environmental Protection Agency",
    "EIA", "Energy Information Administration",
    "MDTCC", "Ministry of Domestic Trade Co-operatives and Consumerism",
    "NOAA", "National Oceanic and Atmospheric Administration",
    "NASA", "National Aeronautics and Space Administration",
    # generic data-type and geospatial keywords
    "weather", "sensor", "sensing",
    "mineral", "water", "land", "electricity", "biomass", "solar", "heat",
    "satellite imagery", "GPS", "GIS", "OpenStreetMap", "OSM",
]

# find every defined data source that a sentence mentions
def categorise(sentence):
    """Return the ``data_sources`` keywords found in ``sentence``.

    A keyword counts as found when it occurs in ``sentence`` as a
    case-sensitive substring, so a short keyword also matches inside a
    longer word. The matches are returned as one comma-joined string in
    ``data_sources`` order, each keyword at most once; the result is ``''``
    when nothing matches.
    """
    matched = []
    for keyword in data_sources:
        if keyword not in sentence:
            continue
        if keyword in matched:
            # already recorded; keep each keyword only once
            continue
        matched.append(keyword)
    return ','.join(matched)

In [None]:
# to get the paper title from pdf metadata
def get_paper_title(path):
    """Return the title stored in the metadata of the PDF at ``path``.

    The value is converted with ``str()``, so a PDF that has no title, or no
    document-information dictionary at all, yields the string ``'None'``.

    Parameters
    ----------
    path : str
        Location of the PDF file.

    Returns
    -------
    str
        The metadata title, or ``'None'`` when it is missing.
    """
    # the context manager closes the file even if PyPDF2 raises while parsing
    with open(path, 'rb') as pdf_file:
        document_info = PdfFileReader(pdf_file).getDocumentInfo()
        # getDocumentInfo() returns None for a PDF without an info dictionary;
        # getattr treats that case the same as a missing title
        title = getattr(document_info, 'title', None)
    return str(title)

# derive the paper title from the PDF's file name
def get_paper_title2(filepath):
    """Return the file name of ``filepath`` without directory or final extension."""
    file_name = os.path.basename(filepath)
    stem, _extension = os.path.splitext(file_name)
    return stem

In [None]:
# Command that pytesseract runs to invoke the Tesseract OCR engine.
# Set this for your own system depending on where tesseract is installed;
# the bare name 'tesseract' presumably relies on the executable being
# discoverable on PATH - use the full path to the binary otherwise.
pytesseract.pytesseract.tesseract_cmd = r'tesseract'

In [None]:
# Build one table per paper: the sentences that mention data, the data
# sources each sentence names, and the paper title.
paper_tables = []
# `tqdm` is imported as a module, so the progress-bar wrapper is `tqdm.tqdm`;
# calling the module itself raises TypeError
for file_name in tqdm.tqdm(pdf_files):
    paper_text = get_paper_text(file_name)
    key_sentences = sentences_containing_data(paper_text)
    # create a table to store the sentences containing the word "data"
    data_table = pd.DataFrame({"Sentence": key_sentences})
    # add a column to store the data types or databases named in each sentence
    data_table['Data'] = data_table['Sentence'].apply(categorise)
    # drop rows whose Data value is null; categorise returns '' (not null)
    # when nothing matches, so rows with an empty string pass this filter
    data_table = data_table[data_table.Data.notnull()]
    # add a leading column to store the paper title (taken from the file name)
    paper_title = get_paper_title2(file_name)
    data_table.insert(0, 'title', paper_title)
    paper_tables.append(data_table)

In [None]:
# Stack the per-paper tables into a single table and keep only the rows
# whose Data column is not an empty string.
large_paper_table = (
    pd.concat(paper_tables, axis=0)
    .loc[lambda table: table['Data'] != '']
)
large_paper_table

In [None]:
# Checkpoint: write the combined table to an Excel workbook, leaving out the
# DataFrame index.
large_paper_table.to_excel(
    r'../data/sdg7-coding-auto.xlsx',
    index=False,
)