In [1]:
import requests
import bs4 as BeautifulSoup

In [2]:
prefix = "https://www.conservationevidence.com/individual-study/" # add number to end, from 2 to 12261
# skip 1, it's a test page

def get_study_text(study_number):
    url = prefix + str(study_number)
    page = requests.get(url)
    if page.status_code == 404:
        raise ValueError("Study not found")
    soup = BeautifulSoup.BeautifulSoup(page.content, 'html.parser')

    summary = soup.find_all('section', class_='summary')
    if len(summary) == 0:
        return None

    # get <p> tags out
    text = summary[0].find_all('p')
    title = soup.find_all('h1', class_='h1')[0].get_text()

    # and classes
    summary_classes = summary[0].find_all('td',attrs={'data-head': 'Category'})
    classes = [c.find('img')['alt'] for c in summary_classes if c.find('img') is not None]



    return {'title': title, 'text':[t.get_text() for t in text], 'classes': classes}

In [3]:
def parse_text(text):
    author = text[0].removeprefix('\nPublished source details\n')
    author = author.strip()
    year = author.split("(", 1)[1].split(")")[0]

    text = set(text[1:])

    return author, year, text

In [4]:
def make_request(study_number):
    try:
        result = get_study_text(study_number)
        if result['text'] is None:
            return None
        result['id'] = study_number
        result['author'], result['year'], result['text'] = parse_text(result['text'])
        return result
    except ValueError as e:
        # print(f"404 Error in {study_number}: {e}")
        return None
    except Exception as e:
        print(f"Error in {study_number}: {e}")
        return None


In [5]:
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [6]:
make_request(2)

{'title': 'The management of grass pastures for brent geese\n\n\n\nPublished source details\n                                            Vickery J.A., Sutherland W.J. & Lane S.J. (1994) The management of grass pastures for brent geese. Journal of Applied Ecology, 31, 283-290.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'text': {'A randomised, replicated, controlled experiment in 1990-1992 on a pasture in Essex, UK (Vickery et al. 1994) found similar grazing intensities of brent geese Branta bernicla (pests) on sheep-grazed plots (averaging 31.6-39.5 total goose droppings/m²/winter), cut and grazed plots (28.2-36.4 droppings), and cut-only plots (28.5-36.8 droppings). The amount of vegetation was similar between grazed (223-236 g dry weight/m²), cut and grazed (195-255 g/m²) and cut-only plots (188-232 g/m²). In another randomised, replicated, controlled experiment, grazing inte

In [7]:
def scrape():
    n  = 12261
    # n = 10

    results = [0 for _ in range(n-2)]

    with tqdm(total=n-2) as pbar:
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = [executor.submit(make_request, i) for i in range(2, n)]

            for future in as_completed(futures):
                pbar.update(1)
                result = future.result()
                # store in results variable
                if result is not None:
                    results[result['id']-2] = result


    return results

def retrieve():
    folder = 'data/labelled/studies/'
    import json
    import os
    for file in os.listdir(folder):
        if file.endswith(".json"):
            with open(os.path.join(folder,file)) as f:
                data = json.load(f)
                data['id'] = file.split(' - ')[0]
                data['classes'] = [data['class']]
                yield data


results = scrape()
# results = retrieve()

  0%|          | 0/12259 [00:00<?, ?it/s]

In [None]:
# remove None values
results = [r for r in results if type(r) == dict]

In [None]:
results[3]

{'title': 'Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment\n\n\n\nPublished source details\n                                            Verrell P.A. (1987) Habitat destruction and its effect on a population of smooth newts Triturus vulgaris: an unfortunate field experiment. The Herpetological Journal, 1, 175-177.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'text': {'As well as losing the benefits afforded by the terrestrial vegetation e.g. used as a terrestrial refuge/foraging area, European rabbit Oryctolagus cuniculus burrows known to be used as hibernacula by newts and common toads Bufo bufo, were also destroyed. Additionally, the opening up of Cleaver Pond has allowed greater ease of access and this has probably exacerbated problems of increased disturbance and capture of animals by members of the public.

In [None]:
import os
from json import load
class_name_map = {}


for synopsis in os.listdir('data/labelled/synopses'):
    if synopsis.endswith(".json"):
        with open(os.path.join('data/labelled/synopses',synopsis)) as f:
            syn = load(f)
            original_name = synopsis.split(' - ')[1].split('.')[0]
            class_name_map[original_name] = syn['class']
            class_name_map[original_name+' Conservation'] = syn["class"]



JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
def populate_json(result):
    classes = result['classes']
    # get most-common class
    # return first most common if there is a tie
    result['classes'] = max(set(classes), key=classes.count)
    url = prefix + str(result['id'])
    json = {
        "reference type": "CE Study",
        "author": result["author"],
        "year": result["year"],
        "title": result["title"],
        "series editor": None,
        "series/book title": None,
        "place published": 'None',
        "institution": "Conservation Evidence Team",
        "publisher": 'Conservation Evidence',
        "date": None,
        "report number": result["id"],
        "doi": None,
        "class": class_name_map[result['classes'][0]],
        "abstract": None,
        "url": url,
        "text": ''.join(result["text"]),
    }
    return json

In [None]:
populate_json(results[3])

KeyError: 'S'

In [None]:
json_results = [populate_json(r) for r in results]
json_results[0]

{'reference type': 'CE Study',
 'author': 'Vickery J.A., Sutherland W.J. & Lane S.J. (1994) The management of grass pastures for brent geese. Journal of Applied Ecology, 31, 283-290.',
 'year': '1994',
 'title': 'The management of grass pastures for brent geese\n\n\n\nPublished source details\n                                            Vickery J.A., Sutherland W.J. & Lane S.J. (1994) The management of grass pastures for brent geese. Journal of Applied Ecology, 31, 283-290.                                    \n\n\n\nShare\n                \n\nTweet\n                \n\n\nIcons/envelope\n\n\n\nEmail\n                \n\n\n',
 'series editor': None,
 'series/book title': None,
 'place published': 'None',
 'institution': 'Conservation Evidence Team',
 'publisher': 'Conservation Evidence',
 'date': None,
 'report number': 2,
 'doi': None,
 'class': 'Bird Conservation',
 'abstract': None,
 'url': 'https://www.conservationevidence.com/individual-study/<built-in function id>',
 'text': "A ser

In [None]:
# get all classes
classes = set()
for r in results:
    classes.update(r['classes'])
classes

NameError: name 'results' is not defined

In [None]:
def write_json(json_results):
    # write to individual files

    import json

    try:

        with open(
            f"data/labelled/studies/{results['id']} - {json_results['title'].replace('.','').replace(':','').replace('/','')[:25]}.json",
            "w",
        ) as f:
            json.dump(json_results, f, indent=4)

    except:
        try:
            with open(
                f"data/labelled/studies/{results['id']} - {json_results['title'].replace('.','').replace(':','').replace('/','')[:10]}.json",
                "w",
            ) as f:
                json.dump(json_results, f, indent=4)
        except:
            with open(f"data/labelled/studies/{results['id']}.json", "w") as f:
                json.dump(json_results, f, indent=4)


for result in tqdm(json_results):
    write_json(result)

  0%|          | 0/10105 [00:00<?, ?it/s]