## Parkrun location scraping

This code scrapes parkrun locations across the world. The parkrun wiki page on anniversaries (https://wiki.parkrun.com/index.php/Anniversaries) is used to validate that all parkrun locations in the UK have been scraped (regular events and junior events).

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import geopandas as gpd
import json

In [3]:
# URL of the parkrun Anniversaries wiki page
url = "https://wiki.parkrun.com/index.php/Anniversaries"

# Headers to mimic a real browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


def extract_table_data(table):
    """Pull the header names and body rows out of one parsed HTML table.

    Parameters
    ----------
    table : bs4 Tag for a <table> element.

    Returns
    -------
    (column_names, rows)
        column_names: stripped text of every <th> found in the table.
        rows: one list of stripped <td> texts per <tr> after the first;
        rows that contain no <td> cells are dropped.
    """
    # Named column_names (not headers) so it cannot be confused with the
    # HTTP request headers defined above.
    column_names = [th.text.strip() for th in table.find_all("th")]
    rows = []
    for tr in table.find_all("tr")[1:]:  # Skip header row
        cells = [td.text.strip() for td in tr.find_all("td")]
        if cells:
            rows.append(cells)
    return column_names, rows


# Fetch the page content; the timeout stops a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all tables on the page
    tables = soup.find_all("table")

    # Regular events are read from the first table on the page
    if len(tables) > 0:
        reg_headers, reg_rows = extract_table_data(tables[0])
        df_reg = pd.DataFrame(reg_rows, columns=reg_headers)
        df_reg.to_csv("data/parkrun_anniversaries.csv", index=False)
        print("Regular Parkrun events saved to parkrun_anniversaries.csv")

    # Junior events are read from tables[2], so the guard must require at
    # least three tables (the previous `> 1` check allowed an IndexError).
    if len(tables) > 2:
        jr_headers, jr_rows = extract_table_data(tables[2])
        df_jr = pd.DataFrame(jr_rows, columns=jr_headers)
        df_jr.to_csv("data/junior_parkrun_anniversaries.csv", index=False)
        print("Junior Parkrun events saved to junior_parkrun_anniversaries.csv")

else:
    # NOTE: df_reg / df_jr are not created on this path, so the cells that
    # follow will fail until the page can be fetched.
    print(f"Failed to retrieve the webpage, status code: {response.status_code}")


Regular Parkrun events saved to parkrun_anniversaries.csv
Junior Parkrun events saved to junior_parkrun_anniversaries.csv


In [4]:
# Label every anniversary row with its event type (regular vs junior) so the
# origin of a row is still known once both tables are stacked for the later join.
for frame, label in ((df_reg, 'Regular'), (df_jr, 'Junior')):
    frame['TYPE'] = label

# Stack the regular and junior anniversary tables into a single frame
park_runs = pd.concat([df_reg, df_jr])

In [5]:
# Get the parkrun location data (lat, long).
# The timeout stops a stalled connection from hanging the notebook.
response = requests.get("https://images.parkrun.com/events.json", timeout=30)
# Stop here with a clear HTTP error instead of a confusing JSON decode error
# when the server answers with an error page.
response.raise_for_status()
events = json.loads(response.content)

# Access the features list from the 'events' dictionary
features = events.get('events', {}).get('features', [])

# Fields copied straight from each feature's 'properties' mapping, in the
# column order used by the output frame.
property_fields = (
    'eventname',
    'EventLongName',
    'EventShortName',
    'LocalisedEventLongName',
    'countrycode',
    'seriesid',
    'EventLocation',
)

# One flat record per feature
data = []
for feature in features:
    # dict.get only applies its default when the key is absent, so `or`
    # is used to also cover a key that is present but null.
    props = feature.get('properties') or {}
    coords = (feature.get('geometry') or {}).get('coordinates') or [None, None]

    event_info = {'event_id': feature.get('id', '')}
    event_info.update((field, props.get(field, '')) for field in property_fields)
    # Coordinates are stored as [longitude, latitude]
    event_info['latitude'] = coords[1]
    event_info['longitude'] = coords[0]
    data.append(event_info)

# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(data)
# Convert to a GeoDataFrame of WGS84 points and save every location as GeoJSON
gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs="EPSG:4326",
).to_file('data/parkrun_locations.geojson')

In [6]:
# Keep only the UK events (country code 97 in the location data)
is_uk = df['countrycode'] == 97
uk_events = df.loc[is_uk]

# Left-join the anniversary data onto the UK locations; the indicator column
# records whether each location found a matching anniversary row.
uk_joined = uk_events.merge(
    park_runs,
    how='left',
    left_on='EventLongName',
    right_on='Event',
    indicator=True,
)

# Check all locations have anniversary data ('left_only' should be 0)
print(uk_joined['_merge'].value_counts())

# Build point geometries and convert to a GeoDataFrame, dropping the
# merge indicator now that the check has been reported.
uk_points = gpd.points_from_xy(uk_joined.longitude, uk_joined.latitude)
uk_park_run = gpd.GeoDataFrame(
    uk_joined.drop(columns=['_merge']),
    geometry=uk_points,
    crs="EPSG:4326",
)

_merge
both          1285
left_only        0
right_only       0
Name: count, dtype: int64


In [8]:
# Visualise the UK parkruns on an interactive map.
# NOTE(review): the execution counts show this cell was run as In [8], i.e. after
# the In [7] cell that follows it. Run top-to-bottom, the map is drawn before that
# cell removes event_id 2529 from uk_park_run.
uk_park_run.explore()

In [7]:
# Drop the Falkland Islands parkrun (event_id 2529) from the UK set
not_falklands = uk_park_run['event_id'].ne(2529)
uk_park_run = uk_park_run.loc[not_falklands]

# Write the remaining UK parkrun locations out as GeoJSON
uk_park_run.to_file('data/uk_park_run.geojson')