In [1]:
import pandas as pd
from tqdm.notebook import tqdm


In [2]:
data = pd.read_csv('../../../data/unprocessed/raw-grey-literature-sources.csv',encoding='latin-1')

In [3]:
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488 entries, 0 to 487
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Reference type     488 non-null    object 
 1   Author             488 non-null    object 
 2   Year               488 non-null    int64  
 3   Title              488 non-null    object 
 4   Series Editor      8 non-null      object 
 5   Series/Book Title  423 non-null    object 
 6   Place Published    101 non-null    object 
 7   Institution        475 non-null    object 
 8   Volume             53 non-null     object 
 9   Pages              101 non-null    object 
 10  Publisher          48 non-null     object 
 11  Date               21 non-null     object 
 12  Report number      336 non-null    object 
 13  DOI                2 non-null      object 
 14  Folder             488 non-null    object 
 15  Abstract           0 non-null      float64
 16  Research notes     16 non-

Unnamed: 0,Reference type,Author,Year,Title,Series Editor,Series/Book Title,Place Published,Institution,Volume,Pages,Publisher,Date,Report number,DOI,Folder,Abstract,Research notes,URL
0,Report,A. A. Lyle; P. S. Maitland; I. J. Winfield,2005,Translocation of vendace from Derwentwater to ...,,English Nature Research Report,"Natural England, UK",Natural England (English Nature),,,,,Report No. ENRR635,,3. Fish,,,
1,Report,A. Anonymous,2007,World Wetlands Day 2007 brochure: Fish for tom...,,,,Ramsar,,,,,,,2. Birds,,,https://www.ramsar.org/document/world-wetlands...
2,Magazine Article,A. Anonymous,2006,Ex-situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,14.0,9-11,,,,,2. Birds,,,
3,Magazine Article,A. Anonymous,2006,In situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,14.0,5-9,,,,,2. Birds,,,
4,Magazine Article,A. Anonymous,2008,Ex-situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,16.0,12-12,,,,,2. Birds,,,https://www.wetlands.org/wp-content/uploads/20...


In [4]:

# clean up data
# if only a single row is missing a value, just remove it - it's not worth the effort

for column in data.columns:
    if data[column].isnull().values.any():
        print(column.title())
        print(data[column].isnull().sum())
        # get index of rows with missing values
        missing = data[column].index[data[column].isnull()]
        if len(missing) < 10:
            print(missing)
            for i in missing:
                data.drop(i, inplace=True)



#* Note that some columns are missing data for almost all rows
# Specifically, Series Editor, Volume, Pages, Publisher, Date, DOI, Abstract, Research Notes

# Of these, we can immediately drop Volume, Pages, and Research Notes, as they can't be relevant to our analysis
data = data.drop(columns=['Volume', 'Pages', 'Research notes','Date'])

# We'll keep the rest for now but, if we find that they are not useful to the model, we can drop them later

Series Editor
480
Series/Book Title
65
Place Published
387
Institution
13
Volume
435
Pages
387
Publisher
440
Date
467
Report Number
152
Doi
486
Abstract
488
Research Notes
472
Url
291


In [5]:
#* Not all URLs are valid, some get 404-ed. Filter out invalid ones with a test response
import requests

def is_valid_url(url):

    i,url=url
    if (
        pd.isna(url)
        or url.startswith("http://archive.jncc.gov.uk")
        or url.startswith("http://www.snh.org.uk")
        or url.startswith("http://www.ices.dk/sites")
        or url.startswith("http://publications.naturalengland.org.uk/file")
        or url.startswith("http://randd.defra.gov.uk")
    ):  # skip empty and certain common URLs where the whole site doesn't respond, or consistently fail

        pbar.update(1)
        return i, False

    try:
        r = requests.head(url)
        pbar.update(1)
        if r.status_code >= 200 and r.status_code < 400: # 200-399 works
            return i, True
        elif r.status_code == 403:
            print('-----403-----')
            print(url) # handle manually, since I can still access it
            print('-----403-----')
            return i,True
        else:
            print(url, r.status_code)
            return i, False

    except: # 404 and other errors
        pbar.update(1)
        return i, False


from concurrent.futures import ThreadPoolExecutor

urls = data['URL']

with tqdm(total=len(urls)) as pbar:
    with ThreadPoolExecutor(max_workers=10) as executor:
        errors = list(executor.map(is_valid_url, enumerate(urls)))

errors

  0%|          | 0/488 [00:00<?, ?it/s]

https://www.ramsar.org/document/world-wetlands-day-2007-brochure-fish-for-tomorrow%0A 404
https://www.ramsar.org/document/world-wetlands-day-2007-brochure-fish-for-tomorrow%0A 404
https://www.ramsar.org/document/world-wetlands-day-2007-brochure-fish-for-tomorrow%0A 404
https://www.ramsar.org/document/world-wetlands-day-2007-brochure-fish-for-tomorrow%0A 404
-----403-----
https://www.sciencedirect.com/science/article/pii/B9780128014028000093
-----403-----
https://www.ramsar.org/document/ramsar-technical-report-8-waterbird-flyway-initiatives%0A 404
http://www.smru.st-andrews.ac.uk/files/2016/08/Marine-Mammal-Research-at-Wild-Salmon-Fisheries-2013-Annual-Report.pdf 404
http://www.smru.st-andrews.ac.uk/files/2016/08/Marine-Mammal-Research-at-Wild-Salmon-Fisheries-2013-Annual-Report.pdf 404
-----403-----
https://iucn-csg.org/wp-content/uploads/2010/03/Final-IUCN-Publication-Jan-7-2015-1.pdf
-----403-----
https://www.wetlands.org/publications/1304/%0A 404
-----403-----
https://iucn-csg.org/w

[(0, False),
 (1, False),
 (2, False),
 (3, False),
 (4, True),
 (5, True),
 (6, True),
 (7, False),
 (8, False),
 (9, False),
 (10, False),
 (11, True),
 (12, True),
 (13, True),
 (14, False),
 (15, False),
 (16, True),
 (17, False),
 (18, True),
 (19, True),
 (20, True),
 (21, False),
 (22, False),
 (23, False),
 (24, True),
 (25, True),
 (26, False),
 (27, True),
 (28, True),
 (29, True),
 (30, True),
 (31, False),
 (32, False),
 (33, False),
 (34, False),
 (35, False),
 (36, False),
 (37, False),
 (38, False),
 (39, True),
 (40, True),
 (41, False),
 (42, False),
 (43, False),
 (44, False),
 (45, False),
 (46, False),
 (47, False),
 (48, False),
 (49, False),
 (50, False),
 (51, False),
 (52, False),
 (53, False),
 (54, False),
 (55, False),
 (56, False),
 (57, False),
 (58, False),
 (59, False),
 (60, False),
 (61, False),
 (62, False),
 (63, True),
 (64, False),
 (65, False),
 (66, False),
 (67, False),
 (68, False),
 (69, True),
 (70, False),
 (71, False),
 (72, False),
 (73, Tr

In [6]:
print(errors)
errors = list(filter(lambda x: x[1] == False, errors))

print(f'Removing {len(errors)} invalid URLS')

# replace invalid URLs with nan
# for i,x in errors:
    # data["URL"].iat[i] = None

# drop rows with invalid URLs
data = data.drop(index=[x[0] for x in errors])

[(0, False), (1, False), (2, False), (3, False), (4, True), (5, True), (6, True), (7, False), (8, False), (9, False), (10, False), (11, True), (12, True), (13, True), (14, False), (15, False), (16, True), (17, False), (18, True), (19, True), (20, True), (21, False), (22, False), (23, False), (24, True), (25, True), (26, False), (27, True), (28, True), (29, True), (30, True), (31, False), (32, False), (33, False), (34, False), (35, False), (36, False), (37, False), (38, False), (39, True), (40, True), (41, False), (42, False), (43, False), (44, False), (45, False), (46, False), (47, False), (48, False), (49, False), (50, False), (51, False), (52, False), (53, False), (54, False), (55, False), (56, False), (57, False), (58, False), (59, False), (60, False), (61, False), (62, False), (63, True), (64, False), (65, False), (66, False), (67, False), (68, False), (69, True), (70, False), (71, False), (72, False), (73, True), (74, True), (75, True), (76, False), (77, False), (78, False), (79, 

In [7]:
import pymupdf

def extract_pdf_text(url):
    r = requests.get(url)
    try:
        pdf = pymupdf.open(stream=r.content, filetype='pdf')
    except:
        print('Error opening PDF: ', url)
        return None
    return '\n\n'.join([pdf.get_page_text(i) for i in range(len(pdf))])

In [8]:
from bs4 import BeautifulSoup

def crawl_for_data(url): # search for redirects or pdf links

    # check for redirects to pdfs
    r = requests.get(url)
    if (r.url) != url:
        url = r.url
        if "pdf" in url or "download" in url or 'file' in url:
            return url

    # check for pdf links in the page

    html = BeautifulSoup(r.content, 'html.parser')

    for a in html.find_all("a", href=True):
        if "pdf" in a["href"] or "download" in a["href"]:
            return a["href"]


    return url


print(
    crawl_for_data(
        "https://www.researchgate.net/profile/Belinda-Wheeler/publication/346418211_The_effectiveness_of_Higher_Level_Stewardship_for_maintaining_and_restoring_species-rich_grasslands_a_resurvey_of_a_sample_of_grasslands_under_HLS_options_HK6_and_HK7/links/5fc0d1d8299bf104cf8382fc/The-effectiveness-of-Higher-Level-Stewardship-for-maintaining-and-restoring-species-rich-grasslands-a-resurvey-of-a-sample-of-grasslands-under-HLS-options-HK6-and-HK7.pdf"
    )
)


https://www.researchgate.net/profile/Belinda-Wheeler/publication/346418211_The_effectiveness_of_Higher_Level_Stewardship_for_maintaining_and_restoring_species-rich_grasslands_a_resurvey_of_a_sample_of_grasslands_under_HLS_options_HK6_and_HK7/links/5fc0d1d8299bf104cf8382fc/The-effectiveness-of-Higher-Level-Stewardship-for-maintaining-and-restoring-species-rich-grasslands-a-resurvey-of-a-sample-of-grasslands-under-HLS-options-HK6-and-HK7.pdf


In [9]:
#TODO: Remove duplicate identical URLs
# Note there's at least one duplicate URL where the only change is http vs https



In [10]:
# Some URLs are PDFs or something else parseable, but some just link to a page containing them
# So this is just a mapping of which ones do that
# plus some extra 404s that snuck past before
# bit hacky

url_maps = {
    "http://www.gov.scot/Resource/0050/00504418.pdf": "https://www.gov.scot/binaries/content/documents/govscot/publications/research-and-analysis/2014/10/evaluating-assessing-relative-effectiveness-acoustic-deterrent-devices-non-lethal-measures/documents/00504418-pdf/00504418-pdf/govscot%3Adocument/00504418.pdf",
    "http://www.accobams.org/new_accobams/wp-content/uploads/2016/06/ACCOBAMS_MOP2_Res.2.12.pdf": None,
    "http://randd.defra.gov.uk/Default.aspx?Menu=Menu&Module=More&Location=None&ProjectID=19358&FromSearch=Y&Publisher=1&SearchText=LM0443&SortString=ProjectCode&SortOrder=Asc&Paging=10#Descriptionhttp://randd.defra.gov.uk/Document.aspx?Document=14093_LM0443_Resurvey_of_grasslands_2014_FinalReport.pdf": None,
    "https://randd.defra.gov.uk/ProjectDetails?ProjectID=19358&FromSearch=Y&Publisher=1&SearchText=LM0443&SortString=ProjectCode&SortOrder=Asc&Paging=10#Descriptionhttp://randd.defra.gov.uk/Document.aspx?Document=14093_LM0443_Resurvey_of_grasslands_2014_FinalReport.pdf": None,
    "http://ices.dk/sites/pub/Publication%20Reports/Expert%20Group%20Report/acom/2017/WGBYC/wgbyc_2017.pdf": None,
    "http://randd.defra.gov.uk/Default.aspx?Menu=Menu&Module=More&Location=None&ProjectID=14340&FromSearch=Y&Publisher=1&SearchText=MA01031&SortString=ProjectCode&SortOrder=Asc&Paging=10#Descriptionhttp://randd.defra.gov.uk/Document.aspx?Document=13451_MA01031_finalreport.pdf": "https://nora.nerc.ac.uk/id/eprint/505290/1/N505290CR.pdf",
    "http://randd.defra.gov.uk/Document.aspx?Document=MF1003-FINALRevisedAugust2011.pdf": None,
    "https://medwet.org/publications/quelle-occupation-du-sol-au-sein-des-sites-ramsar-de-france-metropolitaine/": None, # in french
}

In [11]:
def get_text(url):
    if url in url_maps:
        url = url_maps[url]

    if pd.isna(url) or url is None:
        pbar.update(1)
        return None

    if 'download' in url or 'pdf' in url or 'file' in url:
        try:
            text = extract_pdf_text(url)
            pbar.update(1)
            return text
        except:
            url = crawl_for_data(url) # second chance
            if 'download' in url or 'pdf' in url or 'file' in url:
                pbar.update(1)
                return extract_pdf_text(url)
            # TODO: Start going through these manually, adding PDF links to url_maps

    else: # second chance
        url = crawl_for_data(url)
        if 'download' in url or 'pdf' in url or 'file' in url:
            pbar.update(1)
            return extract_pdf_text(url)
        else:
            req = requests.get(url)
            soup = BeautifulSoup(req.content, 'html.parser')

            pbar.update(1)
            try:
                return soup.find('p').get_text()
            except:
                return None

        # print(url)
        pass

    pbar.update(1)
    return None


tqdm.pandas()


# multithreading

with tqdm(total=len(data)) as pbar:
    with ThreadPoolExecutor(max_workers=10) as executor:
        data["text"] = list(executor.map(get_text, data["URL"]))

  0%|          | 0/95 [00:00<?, ?it/s]

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error opening PDF:  https://iucn-csg.org/wp-content/uploads/2010/03/Final-IUCN-Publication-Jan-7-2015-1.pdf
Error opening PDF:  https://iucn-csg.org/wp-content/uploads/2010/03/Final-IUCN-Publication-Jan-7-2015-1.pdf
Error opening PDF:  https://iucn-csg.org/wp-content/uploads/2010/03/Final-IUCN-Publication-Jan-7-2015-1.pdf
Error opening PDF:  https://iucn-csg.org/wp-content/uploads/2010/03/IUCNActionPlan2003-009.pdf
Error opening PDF:  https://www.researchgate.net/profile/Dr_Manfred_Jusaitis/publication/260230062_Conservation_translocation_of_the_large-headed_daisy_to_Mount_Bold_South_Australia/links/0046353041f2d0779a000000.pdf#page=52


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95 entries, 4 to 482
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Reference type     95 non-null     object 
 1   Author             95 non-null     object 
 2   Year               95 non-null     int64  
 3   Title              95 non-null     object 
 4   Series Editor      0 non-null      object 
 5   Series/Book Title  65 non-null     object 
 6   Place Published    11 non-null     object 
 7   Institution        93 non-null     object 
 8   Publisher          1 non-null      object 
 9   Date               5 non-null      object 
 10  Report number      19 non-null     object 
 11  DOI                0 non-null      object 
 12  Folder             95 non-null     object 
 13  Abstract           0 non-null      float64
 14  URL                95 non-null     object 
 15  text               84 non-null     object 
dtypes: float64(1), int64(1), object(

In [13]:
# see how many sources have URLs but still no text
print(data[pd.isna(data["text"]) & pd.notna(data["URL"])].size)

data[pd.isna(data["text"]) & pd.notna(data["URL"])]

176


Unnamed: 0,Reference type,Author,Year,Title,Series Editor,Series/Book Title,Place Published,Institution,Publisher,Date,Report number,DOI,Folder,Abstract,URL,text
13,Report,A. Anonymous,2016,"Les milieux humides remarquables, des espaces ...",,,,MedVet,,,,,20. Wetlands,,https://medwet.org/publications/quelle-occupat...,
39,Report,ACCOBAMS,2004,Resolution 2.12 Guidelines for the use of acou...,,ACCOBAMS resolutions for conservation actions,,Agreement on the Conservation of Cetaceans of ...,,,RESOLUTION 2.12,,6. Mammals,,http://www.accobams.org/new_accobams/wp-conten...,
40,Report,ACCOBAMS,2004,Resolution 2.12 Guidelines for the use of acou...,,ACCOBAMS resolutions for conservation actions,,Agreement on the Conservation of Cetaceans of ...,,,RESOLUTION 2.12,,25. Invasive mammals,,http://www.accobams.org/new_accobams/wp-conten...,
195,Generic,I. N. N. Suryadiputra,2008,Recommendation on several demo sites for coast...,,,,Wetlands International,,,,,20. Wetlands,,https://www.wetlands.org/publications/demo-sit...,
205,Report,International Council for the Exploration of t...,2017,Report of the Working Group on Bycatch of Prot...,,Working Group on Bycatch of protected species ...,,International Council for the Exploration of t...,,,ICES WGBYC Report 2017,,6. Mammals,,http://ices.dk/sites/pub/Publication%20Reports...,
383,Report,R. K. Sinha; B. Ahmed,2014,Rivers for Life ï¿½ Proceedings of the Interna...,,IUCN/SSC Cetacean Specialist Group Report,,International Union for Conservation of Nature...,,,,,3. Fish,,https://iucn-csg.org/wp-content/uploads/2010/0...,
384,Report,R. K. Sinha; B. Ahmed,2014,Rivers for Life ï¿½ Proceedings of the Interna...,,IUCN/SSC Cetacean Specialist Group Report,,International Union for Conservation of Nature...,,,,,6. Mammals,,https://iucn-csg.org/wp-content/uploads/2010/0...,
385,Report,R. K. Sinha; B. Ahmed,2014,Rivers for Life ï¿½ Proceedings of the Interna...,,IUCN/SSC Cetacean Specialist Group Report,,International Union for Conservation of Nature...,,,,,7. Reptiles,,https://iucn-csg.org/wp-content/uploads/2010/0...,
410,Report,R. Randall; B. D. Reeves; E. A. C. Smith; G. N...,2003,"Dolphins, whales and porpoises: 2002-2010 cons...",,IUCN/SSC Cetacean Specialist Group Report,,International Union for Conservation of Nature...,,,,,6. Mammals,,https://iucn-csg.org/wp-content/uploads/2010/0...,
448,Report,S. Stiles; J. Stiles; J. C. Godwin; C. Jenkins...,2013,Global Re-introduction Perspectives: 2013. Fur...,,,,"IUCN Species Survival Commission, Re-introduct...",,2013.0,,,7. Reptiles,,https://www.researchgate.net/profile/Dr_Manfre...,


In [14]:
# now just purge the rows with no text and no folder/class
data = data.dropna(subset=["text",'Folder'])


In [15]:
# note a clear divide between species, habitats, and invasive species
# could be useful to split them into three separate classes before using a more specific classifier

class_name_map = {
    "1. Amphibians": "Amphibians",
    "2. Birds": "Birds",
    "3. Fish": "Fish",
    "4. Invertebrates": "Invertebrates",
    "5. Marine Invertebrates": "Marine Invertebrates",
    "6. Mammals": "Mammals",
    "7. Reptiles": "Reptiles",
    "8. Animals ex-situ": "Animals Ex-Situ",
    "9. Individual plants & algae": "Plants and Algae",
    "9. Indiviual plants & algae": "Plants and Algae",  # note mispelling
    "10. Plants ex-situ": "Plants Ex-Situ",
    "11. Fungi": "Fungi",
    "12. Bacteria": "Bacteria",
    "13. Coastal": "Coastal",
    "14. Farmland": "Farmland",
    "15. Forests": "Forests",
    "16. Rivers, lakes": "Rivers and Lakes",
    "16. Rivers,lakes": "Rivers and Lakes",
    "17. Grassland": "Grassland",
    "18. Marine": "Marine",
    "19. Shrubland": "Shrubland",
    "20. Wetlands": "Wetlands",
    "21. Invasive amphibians": "Invasive Amphibians",
    "22. Invasive birds": "Invasive Birds",
    "23. Invasive fish": "Invasive Fish",
    "24. Invasive invertebrates": "Invasive Invertebrates",
    "24. Invasive inverts": "Invasive Invertebrates",  # note abbreviation
    "25. Invasive mammals": "Invasive Mammals",
    "26. Invasive reptiles": "Invasive Reptiles",
    "27. Invasive plants": "Invasive Plants",
    "28. Invasive fungi": "Invasive Fungi",
    "29. Invasive bacteria": "Invasive Bacteria",
    "30. Behaviour Change": "Behaviour Change",
}

data.insert(0,'class',data["Folder"].map(lambda x: class_name_map[x.strip()]))

In [16]:
data.insert(0, 'multiclasses', data['class'].map(lambda x: {x}))
data.insert(1, 'relevance', ['relevant'] * len(data))

In [17]:
data.head()

Unnamed: 0,multiclasses,relevance,class,Reference type,Author,Year,Title,Series Editor,Series/Book Title,Place Published,Institution,Publisher,Date,Report number,DOI,Folder,Abstract,URL,text
4,{Birds},relevant,Birds,Magazine Article,A. Anonymous,2008,Ex-situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,,,,,2. Birds,,https://www.wetlands.org/wp-content/uploads/20...,\n \n \nFlamingo\nFlamingo\nFlamingo\nFlaming...
5,{Birds},relevant,Birds,Magazine Article,A. Anonymous,2009,Ex-situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,,,,,2. Birds,,https://www.wetlands.org/publications/flamingo...,\n\n \n \n \n \n \n \nABOUT THE GROUP \n \nThe...
6,{Birds},relevant,Birds,Magazine Article,A. Anonymous,2011,Front Matter,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,,,,,2. Birds,,https://www.wetlands.org/publications/flamingo...,Flamingo \n \n \n \n \n \n \n \nBulletin of th...
11,{Animals Ex-Situ},relevant,Animals Ex-Situ,Magazine Article,A. Anonymous,2008,Ex-situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,,,,,8. Animals ex-situ,,https://www.wetlands.org/wp-content/uploads/20...,\n \n \nFlamingo\nFlamingo\nFlamingo\nFlaming...
12,{Animals Ex-Situ},relevant,Animals Ex-Situ,Magazine Article,A. Anonymous,2009,Ex-situ breeding summary,,Flamingo: Bulletin of the IUCN-SSC/Wetlands In...,,Wetlands International,,,,,8. Animals ex-situ,,https://www.wetlands.org/publications/flamingo...,\n\n \n \n \n \n \n \nABOUT THE GROUP \n \nThe...


In [18]:
# format as JSON

def write(data):

    data.columns = map(str.lower, data.columns)


    json = data.to_json(path_or_buf="../../../data/level-0.5/scraped/scraped.json",orient='records')

In [19]:
write(data)