In [1]:
import requests
import bs4 as BeautifulSoup

In [2]:
prefix = "https://www.conservationevidence.com/individual-study/" # add number to end, from 2 to 12261
# skip 1, it's a test page

def get_study_text(study_number):
    url = prefix + str(study_number)
    page = requests.get(url)
    if page.status_code == 404:
        raise ValueError("Study not found")
    soup = BeautifulSoup.BeautifulSoup(page.content, 'html.parser')

    summary = soup.find_all('section', class_='summary')
    if len(summary) == 0:
        return None

    # get <p> tags out
    text = summary[0].find_all('p')
    title = soup.find_all('h1', class_='h1')[0].get_text()

    # and classes
    summary_classes = summary[0].find_all('td',attrs={'data-head': 'Category'})
    classes = [c.find('img')['alt'] for c in summary_classes if c.find('img') is not None]


    return {'title': title, 'text':[t.get_text() for t in text], 'classes': classes}

In [3]:
def parse_text(text):
    author = text[0].removeprefix('\nPublished source details\n')
    author = author.strip()
    year = author.split("(", 1)[1].split(")")[0]

    text = set(text[1:])

    return author, year, text

In [4]:
def make_request(study_number):
    try:
        result = get_study_text(study_number)
        if result['text'] is None:
            return None
        result['id'] = study_number
        result['author'], result['year'], result['text'] = parse_text(result['text'])
        return result
    except ValueError as e:
        # print(f"404 Error in {study_number}: {e}")
        return None
    except Exception as e:
        print(f"Error in {study_number}: {e}")
        return None


In [5]:
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [6]:
make_request(2)

{'title': 'The management of grass pastures for brent geese\n\n\n\nPublished source details\n                                            Vickery J.A., Sutherland W.J. & Lane S.J. (1994) The management of grass pastures for brent geese. Journal of Applied Ecology, 31, 283-290.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'text': {'A randomised, replicated, controlled experiment in 1990-1992 on a pasture in Essex, UK (Vickery et al. 1994) found similar grazing intensities of brent geese Branta bernicla (pests) on sheep-grazed plots (averaging 31.6-39.5 total goose droppings/m²/winter), cut and grazed plots (28.2-36.4 droppings), and cut-only plots (28.5-36.8 droppings). The amount of vegetation was similar between grazed (223-236 g dry weight/m²), cut and grazed (195-255 g/m²) and cut-only plots (188-232 g/m²). In another randomised, replicated, controlled experiment, grazing inte

In [7]:
def scrape():
    n  = 12261
    # n = 10

    results = [0 for _ in range(n-2)]

    with tqdm(total=n-2) as pbar:
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = [executor.submit(make_request, i) for i in range(2, n)]

            for future in as_completed(futures):
                pbar.update(1)
                result = future.result()
                # store in results variable
                if result is not None:
                    results[result['id']-2] = result


    return results

results = scrape()

  0%|          | 0/12259 [00:00<?, ?it/s]

In [8]:
# remove None values
results = [r for r in results if type(r) == dict]

In [9]:
results[3]

{'title': 'Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment\n\n\n\nPublished source details\n                                            Verrell P.A. (1987) Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment. The Herpetological Journal, 1, 175-177.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'text': {'As well as losing the benefits afforded by the terrestrial vegetation e.g. used as a terrestrial refuge/foraging area, European rabbit Oryctolagus cuniculus burrows known to be used as hibernacula by newts and common toads Bufo bufo, were also destroyed. Additionally, the opening up of Cleaver Pond has allowed greater ease of access and this has probably exacerbated problems of increased disturbance and capture of animals by members of the public.

In [10]:
# integrity check
results = [r for r in results if type(r['classes']) == list]

In [11]:
def populate_json(result):
    url = prefix + str(result['id'])
    json = {
        "reference type": "CE Study",
        "author": result["author"],
        "year": result["year"],
        "title": result["title"],
        "series editor": None,
        "series/book title": None,
        "place published": 'None',
        "institution": "Conservation Evidence Team",
        "publisher": 'Conservation Evidence',
        "report number": result["id"],
        "doi": None,
        "multiclasses": result['classes'],
        'relevance': 'relevant',
        "abstract": None,
        "url": url,
        "text": ''.join(result["text"]),
    }
    return json

In [12]:
populate_json(results[3])

{'reference type': 'CE Study',
 'author': 'Verrell P.A. (1987) Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment. The Herpetological Journal, 1, 175-177.',
 'year': '1987',
 'title': 'Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment\n\n\n\nPublished source details\n                                            Verrell P.A. (1987) Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment. The Herpetological Journal, 1, 175-177.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'series editor': None,
 'series/book title': None,
 'place published': 'None',
 'institution': 'Conservation Evidence Team',
 'publisher': 'Conservation Evidence',
 'date': None,
 'report number': 7,
 'doi': None,
 'mult

In [13]:
json_results = [populate_json(r) for r in results]
json_results[0]

{'reference type': 'CE Study',
 'author': 'Vickery J.A., Sutherland W.J. & Lane S.J. (1994) The management of grass pastures for brent geese. Journal of Applied Ecology, 31, 283-290.',
 'year': '1994',
 'title': 'The management of grass pastures for brent geese\n\n\n\nPublished source details\n                                            Vickery J.A., Sutherland W.J. & Lane S.J. (1994) The management of grass pastures for brent geese. Journal of Applied Ecology, 31, 283-290.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'series editor': None,
 'series/book title': None,
 'place published': 'None',
 'institution': 'Conservation Evidence Team',
 'publisher': 'Conservation Evidence',
 'date': None,
 'report number': 2,
 'doi': None,
 'multiclasses': ['Bird Conservation',
  'Bird Conservation',
  'Bird Conservation',
  'Farmland Conservation',
  'Natural Pest Control'],
 'relevance': 

In [14]:
import pandas as pd

df = pd.DataFrame(json_results)
df.head()

Unnamed: 0,reference type,author,year,title,series editor,series/book title,place published,institution,publisher,date,report number,doi,multiclasses,relevance,abstract,url,text
0,CE Study,"Vickery J.A., Sutherland W.J. & Lane S.J. (199...",1994,The management of grass pastures for brent gee...,,,,Conservation Evidence Team,Conservation Evidence,,2,,"[Bird Conservation, Bird Conservation, Bird Co...",relevant,,https://www.conservationevidence.com/individua...,Provide 'sacrificial' grasslands to reduce the...
1,CE Study,Anon . (2004) Chemical control of Australian s...,2004,Chemical control of Australian swamp stonecrop...,,,,Conservation Evidence Team,Conservation Evidence,,5,,"[Control of Freshwater Invasive Species, Contr...",relevant,,https://www.conservationevidence.com/individua...,Crassula helmsii: Use lightproof barriers to c...
2,CE Study,"Marrs R.H., Phillips J.D.P., Todd P.A., Ghorba...",2004,Control of Molinia caerulea on upland moors\n\...,,,,Conservation Evidence Team,Conservation Evidence,,6,,"[Shrubland and Heathland Conservation, Shrubla...",relevant,,https://www.conservationevidence.com/individua...,"A randomized, replicated, controlled study in ..."
3,CE Study,Verrell P.A. (1987) Habitat destruction and it...,1987,Habitat destruction and its effect on a popula...,,,,Conservation Evidence Team,Conservation Evidence,,7,,[],relevant,,https://www.conservationevidence.com/individua...,As well as losing the benefits afforded by the...
4,CE Study,"Sorace A., Petrassi F. & Consiglio C. (2004) L...",2004,Long-distance relocation of nestboxes reduces ...,,,,Conservation Evidence Team,Conservation Evidence,,9,,[Bird Conservation],relevant,,https://www.conservationevidence.com/individua...,"A replicated, controlled study from 1995-1998 ..."


In [15]:
# write to json

df.to_json('../../../data/unprocessed/studies/raw_studies.json', orient='records')