# Global Config


In [331]:
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import concurrent.futures
import logging
from bs4 import BeautifulSoup
import pandas as pd
import time

In [265]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
IRONMAN_BASE_LINK = 'https://www.ironman.com'
IRONMAN_RACES_LINK = f'{IRONMAN_BASE_LINK}/races'

# Exploration

In [266]:
def init_web_driver(link, headless = True):
    logging.info("Initializing Web Driver")

    options = Options()
    if headless:
        options.add_argument('--headless=new')

    driver = webdriver.Chrome(options=options)
    driver.get(link)
    return driver

def get_page_source(link):
    driver = init_web_driver(link)
    source = driver.page_source
    driver.quit()
    return BeautifulSoup(source, 'html.parser')

def get_race_ids(headless = True):
    try:
        logging.info('Getting race ids')
        driver = init_web_driver(IRONMAN_RACES_LINK, headless)
        race_ids = []

        next_page_button_css = '.nextPageButton:not(.hidden)'
        next_page_button = driver.find_element(By.CSS_SELECTOR, next_page_button_css)
        page_count = 1

        # While there is a next page button visible
        while next_page_button:
            logging.info(f'Scraping page {page_count}')
            race_ids += [link.get_attribute('href').split('/')[-1] for link in driver.find_elements(By.LINK_TEXT, 'See Race Details')]

            # Check to see if this is the last page
            if('hidden' in next_page_button.get_attribute('class')):
                logging.info(f'Found last page, ending execution')
                break
            else:
                driver.execute_script('arguments[0].click();', next_page_button)
                page_count += 1
        
        logging.info(f"Retrieved {len(race_ids)} race ids")
        return race_ids
    finally:
        driver.quit()

# Data Retrieval

In [267]:
def get_race_info(race_id):
    logging.info(f"Getting race info for {race_id}")
    race_link = f"{IRONMAN_BASE_LINK}/{race_id}"
    race_html = get_page_source(race_link)

    race_object = {
        'id': race_id,
        'title': race_html.find('h1').text,
        'swim_type': race_html.find('div', class_='swim-type').text[5:-1],
        'bike_type': race_html.find('div', class_='bike-type').text[5:-1],
        'run_type': race_html.find('div', class_='run-type').text[4:-1],
        'avg_air_temp': race_html.find('div', class_='airTemp').text.split("Temp")[-1][:-1],
        'avg_water_temp': race_html.find('div', class_='waterTemp').text.split("Temp")[-1][:-1],
        'airport': race_html.find('div', class_='airport').text[8:-1]
    }
    return race_object

def get_all_races_info():
    races = []
    for race in get_race_ids():
        races.append(get_race_info(race))
    return pd.DataFrame(races)

get_all_races_info().to_csv("Race Data.csv", index=False)

INFO:root:Getting race ids
INFO:root:Initializing Web Driver


UnboundLocalError: cannot access local variable 'driver' where it is not associated with a value

In [329]:
def get_competitor_labs_urls(race_id):
    logging.info(f'Getting data page for race {race_id}')
    results_link = f'https://www.ironman.com/{race_id}-results'
    COMPETITOR_LAB_LINK = 'https://labs.competitor.com/result/subevent/'

    driver = init_web_driver(results_link)

    race_competitor_labs_urls = []
    current_year_url_ids = set()
    years = set()

    results_years_buttons = driver.find_elements(By.CSS_SELECTOR, '.tab-remote')

    for result_year in results_years_buttons:
        try:
            result_year.click()
            iframes_list = driver.find_elements(By.TAG_NAME, 'iframe')
            time.sleep(25)
            for iframe in iframes_list:
                iframe_url = iframe.get_attribute('src')
                if COMPETITOR_LAB_LINK in iframe_url and result_year.text.strip() != '':
                    logging.info(f'Competitor labs ID retreived for {race_id} - {result_year.text} with id {iframe_url.split("/")[-1]}')
                    current_year_url_ids.add(iframe_url.split('/')[-1])
                    years.add(result_year.text)
        except (ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException):
            logging.warning(f'Non-clickable/ineractable/stale element on {race_id} - {result_year.text} for year {result_year.text}')

    race_object = {
        'id': race_id,
        'competitor_lab_ids': list(current_year_url_ids),
        'years': years # Included for QA
    }
    race_competitor_labs_urls.append(race_object)

    driver.quit()
    return race_competitor_labs_urls

def get_all_race_data_urls(race_ids):
    race_dimension_info = []

    for idx, race_id in enumerate(race_ids):
        logging.info(f'On race {race_id} out of {idx+1}/{len(race_ids)}')
        race_dimension_info += get_competitor_labs_urls(race_id)

    return pd.DataFrame(race_dimension_info)


race_ids = pd.read_csv('Race Data.csv')['id'].to_list()
get_all_race_data_urls(race_ids).to_csv("Race Data URLs.csv", index=False)

INFO:root:On race im703-pucon out of 1/186
INFO:root:Getting data page for race im703-pucon
INFO:root:Initializing Web Driver
INFO:root:Competitor labs ID retreived for im703-pucon - 2023 with id D3A4FCDC-BC4E-4422-AC03-5404EDCBF683
INFO:root:Competitor labs ID retreived for im703-pucon - 2022 with id D3A4FCDC-BC4E-4422-AC03-5404EDCBF683


KeyboardInterrupt: 

In [None]:
data = pd.read_csv("Race Data URLs.csv")
data

Unnamed: 0,id,competitor_lab_ids,years
0,im703-pucon,"['D16CFDBB-F9D2-E611-9412-005056951BF1', '2746...","{'2016', '2014', '2018', '2019', '2017', '2015..."
1,im703-tasmania,['15141085-CF1B-4A9A-83C1-AD4B3557CF2E'],{'5 February 2023'}
2,5150-camsur,['5EC36C17-C4ED-43DF-9713-A94555F43831'],{'2022'}
3,im703-oman,"['6F4C381E-8C66-E911-A97A-000D3A36478D', 'C200...","{'2023', '2019', '2022', '2020'}"
4,im703-bangsaen,['8DFA084E-A5B7-E911-A986-000D3A364086'],"{'2019', '2020'}"
...,...,...,...
181,im-finland,"['7EE1F377-6038-429C-9682-F32EB13F8892', 'B484...","{'2021 Bike-Run', '2021', '2022'}"
182,im703-tiberias,['0D213280-D97C-4475-ABFF-4F79193AC7D4'],{'2021'}
183,im-israel,[],set()
184,im703-dresden,['6536126C-06E5-44EC-A52D-A85D92EA6161'],{'2022'}


In [330]:
count = 0

for race, year in zip(data['competitor_lab_ids'], data['years']):
    if len(race.split(",")) != len(year.split(",")):
        count += 1
        print(f'{len(race.split(","))}, {len(year.split(","))}')
        print(f'{race.split(",")}, {year.split(",")}')

count

7, 8
["['D16CFDBB-F9D2-E611-9412-005056951BF1'", " '27468F10-B431-E711-9416-005056951BF1'", " 'D3A4FCDC-BC4E-4422-AC03-5404EDCBF683'", " '3B7113D1-D64F-E811-941E-005056951BF1'", " 'D4E3B1E8-49DA-484D-BFC3-AEE02A068F43'", " '9C820508-778B-4ADE-99CC-70E96C2B5554'", " 'B82C94BA-DFBA-E511-940C-005056951BF1']"], ["{'2016'", " '2014'", " '2018'", " '2019'", " '2017'", " '2015'", " '2022'", " '2023'}"]
3, 4
["['6F4C381E-8C66-E911-A97A-000D3A36478D'", " 'C200EDB9-C575-41CA-8ADE-297433B8FE11'", " 'C14C594A-A71D-4AB9-BDDB-E273C429C824']"], ["{'2023'", " '2019'", " '2022'", " '2020'}"]
1, 2
["['8DFA084E-A5B7-E911-A986-000D3A364086']"], ["{'2019'", " '2020'}"]
5, 6
["['170A1686-3DBC-E411-9400-005056951BF1'", " '640E92CB-82AD-42A9-B13C-6B131B749798'", " '054952B5-7A15-E311-9EC7-005056956277'", " 'D9616994-3055-E211-B7A2-005056956277'", " 'F7FDDE9A-C773-464F-872B-5725B50CFCD8']"], ["{'2016'", " '2013'", " '2012'", " '2014'", " '2022'", " '2023'}"]
6, 7
["['3701C834-8179-46FD-97ED-4C1F0DC3840F'", " '

125

In [328]:
count = 0

for race, year in zip(data['competitor_lab_ids'], data['years']):
    if len(race.split(",")) != len(year.split(",")):
        count += 1
        print(f'{len(race.split(","))}, {len(year.split(","))}')
        print(f'{race.split(",")}, {year.split(",")}')

count

7, 8
["['D16CFDBB-F9D2-E611-9412-005056951BF1'", " '27468F10-B431-E711-9416-005056951BF1'", " 'D3A4FCDC-BC4E-4422-AC03-5404EDCBF683'", " '3B7113D1-D64F-E811-941E-005056951BF1'", " 'D4E3B1E8-49DA-484D-BFC3-AEE02A068F43'", " '9C820508-778B-4ADE-99CC-70E96C2B5554'", " 'B82C94BA-DFBA-E511-940C-005056951BF1']"], ["{'2016'", " '2014'", " '2018'", " '2019'", " '2017'", " '2015'", " '2022'", " '2023'}"]
3, 4
["['6F4C381E-8C66-E911-A97A-000D3A36478D'", " 'C200EDB9-C575-41CA-8ADE-297433B8FE11'", " 'C14C594A-A71D-4AB9-BDDB-E273C429C824']"], ["{'2023'", " '2019'", " '2022'", " '2020'}"]
1, 2
["['8DFA084E-A5B7-E911-A986-000D3A364086']"], ["{'2019'", " '2020'}"]
5, 6
["['170A1686-3DBC-E411-9400-005056951BF1'", " '640E92CB-82AD-42A9-B13C-6B131B749798'", " '054952B5-7A15-E311-9EC7-005056956277'", " 'D9616994-3055-E211-B7A2-005056956277'", " 'F7FDDE9A-C773-464F-872B-5725B50CFCD8']"], ["{'2016'", " '2013'", " '2012'", " '2014'", " '2022'", " '2023'}"]
6, 7
["['3701C834-8179-46FD-97ED-4C1F0DC3840F'", " '

125