# Global Config


In [1]:
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
import logging
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
# Lower the root logger's threshold to INFO so the scraper's progress
# messages are not filtered out by the default WARNING level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# ironman.com entry points.
IRONMAN_BASE_LINK = 'https://www.ironman.com'
IRONMAN_RACES_LINK = IRONMAN_BASE_LINK + '/races'

# A Competitor Labs results page URL is assembled as
# COMPETITOR_LAB_LINK + <results id> + COMPETITOR_LAB_LINK_SUFFIX + <page number>.
COMPETITOR_LAB_LINK = 'https://labs.competitor.com/result/subevent/'
COMPETITOR_LAB_LINK_SUFFIX = '?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page='

# Worker threads used by the ThreadPoolExecutor fan-outs.
NUM_THREADS = 4
# Number of failed extraction attempts tolerated per results page.
RETRY_COUNT = 3
# Default for init_web_driver(): False opens a visible Chrome window.
HEADLESS_MODE = False

# Exploration

In [3]:
def init_web_driver(link, headless = HEADLESS_MODE):
    """Start a Chrome WebDriver, open ``link`` and wait for the page body.

    Args:
        link: URL to load in the new browser session.
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        The running ``webdriver.Chrome`` instance, already navigated to
        ``link``. The caller owns it and must call ``quit()`` when done.

    Raises:
        Exception: Whatever Selenium raises while navigating or waiting (for
            example a timeout when no ``<body>`` appears within 60 seconds).
            The browser is closed before the exception propagates.
    """
    options = Options()
    if headless:
        options.add_argument('--headless=new')
    start_time = time.time()
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        wait = WebDriverWait(driver, 60)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    except Exception:
        # On failure the caller never receives the driver, so nobody else
        # could close this browser process - do it here and re-raise.
        driver.quit()
        raise

    logging.info(f'Initializing Web Driver - waited {time.time() - start_time:.2f} for link {link}')
    return driver

def get_page_source(link):
    """Load ``link`` in a throwaway browser and return its parsed HTML.

    Args:
        link: URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the rendered page source
        using the ``html.parser`` backend.
    """
    driver = init_web_driver(link)
    try:
        source = driver.page_source
    finally:
        # Always close the browser, even if reading the source fails.
        driver.quit()
    return BeautifulSoup(source, 'html.parser')

def get_race_ids(headless = True):
    """Collect the race ids listed on the paginated IRONMAN races page.

    A race id is the last path segment of each "See Race Details" link.
    Pages are advanced by clicking the next-page button through JavaScript
    until that button carries the ``hidden`` class.

    Args:
        headless: Passed through to ``init_web_driver``.

    Returns:
        A list of race id strings in the order they were encountered.
    """
    logging.info('Getting race ids')
    # Create the driver outside the try block: if start-up fails there is
    # nothing to quit, and the original error must not be masked by an
    # UnboundLocalError raised from the finally clause.
    driver = init_web_driver(IRONMAN_RACES_LINK, headless)
    try:
        race_ids = []

        next_page_button_css = '.nextPageButton:not(.hidden)'
        # find_elements returns an empty list instead of raising, so a
        # listing that fits on a single page is still scraped.
        next_page_buttons = driver.find_elements(By.CSS_SELECTOR, next_page_button_css)
        next_page_button = next_page_buttons[0] if next_page_buttons else None
        if next_page_button is None:
            logging.warning('No visible next page button found, scraping a single page')
        page_count = 1

        while True:
            logging.info(f'Scraping page {page_count}')
            race_ids += [link.get_attribute('href').split('/')[-1] for link in driver.find_elements(By.LINK_TEXT, 'See Race Details')]

            # The same button element is re-checked on every page; the last
            # page is detected by it having gained the "hidden" class.
            if next_page_button is None or 'hidden' in (next_page_button.get_attribute('class') or ''):
                logging.info('Found last page, ending execution')
                break

            # NOTE(review): there is no explicit wait after the click, so this
            # assumes the listing is updated before the next scrape - confirm.
            driver.execute_script('arguments[0].click();', next_page_button)
            page_count += 1

        logging.info(f"Retrieved {len(race_ids)} race ids")
        return race_ids
    finally:
        driver.quit()

# Data Retrieval

In [4]:
def _race_field_text(race_html, race_id, tag, css_class=None):
    """Return the text of the first matching element, or '' when it is absent."""
    if css_class is None:
        element = race_html.find(tag)
    else:
        element = race_html.find(tag, class_=css_class)

    if element is None:
        logging.warning(f'Race {race_id}: no <{tag}> element with class {css_class!r} found')
        return ''
    return element.text


def get_race_info(race_id):
    """Scrape the descriptive attributes of one race from its IRONMAN page.

    Args:
        race_id: Path segment identifying the race on ironman.com.

    Returns:
        A dict with the keys ``id``, ``title``, ``swim_type``, ``bike_type``,
        ``run_type``, ``avg_air_temp``, ``avg_water_temp`` and ``airport``.
        A field whose element is missing from the page is returned as ``''``
        (and a warning is logged) instead of aborting the whole scrape.
    """
    logging.info(f"Getting race info for {race_id}")
    race_link = f"{IRONMAN_BASE_LINK}/{race_id}"
    race_html = get_page_source(race_link)

    def field(tag, css_class=None):
        return _race_field_text(race_html, race_id, tag, css_class)

    # The fixed slices drop a leading prefix and one trailing character from
    # each element's text.
    # NOTE(review): presumably the prefix is the field label ("Swim", "Bike",
    # "Run", "Airport") plus a separator - confirm against the live markup.
    race_object = {
        'id': race_id,
        'title': field('h1'),
        'swim_type': field('div', 'swim-type')[5:-1],
        'bike_type': field('div', 'bike-type')[5:-1],
        'run_type': field('div', 'run-type')[4:-1],
        # Keep whatever follows the last "Temp" in the text, minus the final character.
        'avg_air_temp': field('div', 'airTemp').split("Temp")[-1][:-1],
        'avg_water_temp': field('div', 'waterTemp').split("Temp")[-1][:-1],
        'airport': field('div', 'airport')[8:-1]
    }
    return race_object

def get_all_races_info(workers=NUM_THREADS):
    """Scrape every listed race and return the results as a DataFrame.

    Args:
        workers: Maximum number of threads scraping race pages at once.

    Returns:
        A ``pandas.DataFrame`` with one row per race, built from the dicts
        returned by ``get_race_info``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        race_ids = get_race_ids()
        # map() yields results in the same order as the submitted race ids.
        race_records = [record for record in pool.map(get_race_info, race_ids)]

    return pd.DataFrame(race_records)

# get_all_races_info().to_csv("Race Data.csv", index=False)
# get_all_races_info()

In [5]:
def _element_text_or_blank(element):
    """Return ``element.text``, or '' when the element reference has gone stale."""
    try:
        return element.text
    except StaleElementReferenceException:
        return ''


def get_competitor_labs_urls(race_id):
    """Collect the Competitor Labs result ids embedded on a race's results page.

    Every ``.tab-remote`` button on the page is clicked in turn. After each
    click all iframes are inspected, and the last path segment of any iframe
    URL containing ``COMPETITOR_LAB_LINK`` is recorded.

    Args:
        race_id: Path segment identifying the race on ironman.com.

    Returns:
        A single-element list holding a dict with the keys ``id``,
        ``competitor_lab_ids`` and ``years``. The latter two are ``', '``-joined
        strings in sorted order so that repeated runs give identical output.
    """
    logging.info(f'Getting data page for race {race_id}')
    results_link = f'{IRONMAN_BASE_LINK}/{race_id}-results'

    current_year_url_ids = set()
    years = set()

    driver = init_web_driver(results_link)
    try:
        results_years_buttons = driver.find_elements(By.CSS_SELECTOR, '.tab-remote')

        for result_year in results_years_buttons:
            try:
                result_year.click()
                year_label = result_year.text
                for iframe in driver.find_elements(By.TAG_NAME, 'iframe'):
                    # An iframe without a src attribute yields None; treat it
                    # as an empty URL so the substring test cannot raise.
                    iframe_url = iframe.get_attribute('src') or ''
                    if COMPETITOR_LAB_LINK in iframe_url and year_label.strip() != '':
                        current_year_url_ids.add(iframe_url.split('/')[-1])
                        years.add(year_label)
                logging.info(f'Competitor labs ID retreived for {race_id} - {year_label}')
            except (ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException, NoSuchElementException) as error:
                # Reading .text on a stale element raises again, which would
                # escape this handler; fall back to a blank label instead.
                failed_label = _element_text_or_blank(result_year)
                logging.warning(f'Error element on {race_id} - {failed_label} for year {failed_label} --- {error}')
    finally:
        # Close the browser on every path, including unexpected errors.
        driver.quit()

    race_object = {
        'id': race_id,
        'competitor_lab_ids': ', '.join(sorted(current_year_url_ids)),
        'years': ', '.join(sorted(years)) # Included for QA
    }
    return [race_object]

def get_all_race_data_urls(race_ids):
    """Look up the Competitor Labs ids for many races concurrently.

    Args:
        race_ids: Iterable of race ids to process.

    Returns:
        A ``pandas.DataFrame`` holding every record returned by
        ``get_competitor_labs_urls``, in the order of ``race_ids``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as pool:
        per_race_records = list(pool.map(get_competitor_labs_urls, race_ids))

    # Each call returns a list of records; flatten them into one list.
    flattened_records = [record for records in per_race_records for record in records]

    return pd.DataFrame(flattened_records)

# race_ids = pd.read_csv('Race Data.csv')['id'].to_list()
# get_all_race_data_urls(race_ids).to_csv('Race Data URLs.csv', index=False)

In [6]:
def get_clab_data(race_data_id: str) -> pd.DataFrame:
    """Scrape every results page of one Competitor Labs sub-event.

    The first page is opened once to read the page count from the pagination
    control. Each page is then scraped by ``clabs_data_extraction_handler``
    on a pool of ``NUM_THREADS`` threads.

    Args:
        race_data_id: Competitor Labs sub-event id used to build the page URLs.

    Returns:
        A ``pandas.DataFrame`` with one row per competitor, in page order.

    Raises:
        IndexError: If fewer than two pagination buttons are found.
        ValueError: If the second-to-last button's text is not an integer.
    """
    page_url_prefix = f'{COMPETITOR_LAB_LINK}{race_data_id}{COMPETITOR_LAB_LINK_SUFFIX}'
    driver = init_web_driver(f'{page_url_prefix}1')
    try:
        # Fixed pause kept from the original before the pagination is read.
        time.sleep(2)

        pagination_buttons = driver.find_elements(By.XPATH, '//ul[contains(@class, "MuiPagination-ul")]/li/button')
        # The second-to-last button's text is taken as the page count.
        # NOTE(review): presumably the last button is the "next" arrow - confirm.
        num_pages = int(pagination_buttons[-2].text)
    finally:
        # Close the browser even when the pagination lookup fails.
        driver.quit()

    data_page_urls = [f'{page_url_prefix}{page_num}' for page_num in range(1, num_pages+1)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        pages_data_list = list(executor.map(clabs_data_extraction_handler, data_page_urls))

    competitor_data_list = []
    for page_data in pages_data_list:
        competitor_data_list += page_data

    return pd.DataFrame(competitor_data_list)

def clabs_data_extraction_handler(data_url):
    """Scrape one Competitor Labs results page, retrying on transient errors.

    Args:
        data_url: Full page URL, i.e. ``COMPETITOR_LAB_LINK`` followed by the
            36-character sub-event id and a query string ending in
            ``page=<n>``.

    Returns:
        The list of competitor dicts produced by
        ``clabs_extract_data_from_page`` for this page.

    Raises:
        StaleElementReferenceException, NoSuchElementException: Re-raised once
            extraction has failed ``RETRY_COUNT`` times.
    """
    # The page number is whatever follows the last "=" in the URL.
    current_page = data_url.split('=')[-1]
    # The sub-event id is the 36 characters right after the base link.
    race_data_id = data_url[len(COMPETITOR_LAB_LINK): len(COMPETITOR_LAB_LINK)+36]
    logging.info(f'Extracting data for page {current_page}')
    driver = init_web_driver(data_url)
    try:
        WebDriverWait(driver, 10).until(
            expected_conditions.presence_of_element_located((By.TAG_NAME, 'tr'))
        )

        error_count = 0
        while True:
            try:
                return clabs_extract_data_from_page(driver, current_page, race_data_id)
            except (StaleElementReferenceException, NoSuchElementException) as error:
                error_count += 1
                # ">=" rather than "==" so a RETRY_COUNT below 1 cannot loop forever.
                if error_count >= RETRY_COUNT:
                    logging.error(f'Retried page {error_count} times, raising error')
                    raise
                logging.error(f'Retrying page {current_page}, retry count : {error_count} --- {error}')
                # NOTE(review): the retry reuses the same page state; a row left
                # expanded by the failed attempt may still be open - confirm
                # whether a reload is needed before retrying.
                time.sleep(5)
    finally:
        # Close the browser on success, on the final failure and on timeouts.
        driver.quit()

def clabs_extract_data_from_page(driver, current_page, race_data_id):
    """Extract one dict of result fields per table row on the loaded page.

    Each row is clicked open, every ``detailsButton`` is clicked to expand
    the per-discipline details, the fields are read, and the row is clicked
    again to close it before moving on.

    Args:
        driver: WebDriver already showing the results page.
        current_page: Page number, used only in log messages.
        race_data_id: Stored in each record as ``data_source_id``.

    Returns:
        A list of dicts, one per ``//tbody/tr`` row, in table order.

    Raises:
        IndexError: If a detail section exposes fewer cells than expected.
        Selenium exceptions (e.g. ``NoSuchElementException``) propagate to the
        caller, which handles retries.
    """
    DNF_DESIGNATIONS = ('DNS', 'DNF', 'DQ')
    COUNTRY_FLAG_XPATH = '//div[contains(@class, "text") and contains(@class, "countryFlag")]/img'
    FOOTER_CELLS_XPATH = '//div[contains(@class, "tableRow") and contains(@class, "tableFooter")]/div/div'
    OVERALL_TIME_XPATH = '//div[contains(@class, "summaryRow") and contains(@class, "overallRow")]/p[contains(@class, "summaryTime")]'
    # (column prefix, id fragment of the detail section) in output order.
    DISCIPLINES = (('Swim', 'swimDetails'), ('Bike', 'bikeDetails'), ('Run', 'runDetails'))

    def labelled_value(label):
        # The value is the <p> sibling preceding the <p> holding the label.
        return driver.find_element(By.XPATH, f'//p[text()="{label}"]/preceding-sibling::p').text

    def section_cells(section_id):
        # One lookup per section instead of one per field.
        return driver.find_elements(By.XPATH, f'//div[contains(@id, "{section_id}")]/div/div/div/div')

    competitor_data_list = []
    table_rows = driver.find_elements(By.XPATH, '//tbody/tr')

    for idx, table_row in enumerate(table_rows):
        logging.info(f'Extracting from row {idx+1} on page {current_page}')
        table_row.click()
        # Wait until country flag has loaded -> implies the rest of the data has loaded as well
        WebDriverWait(driver, 10).until(
            expected_conditions.presence_of_element_located((By.XPATH, COUNTRY_FLAG_XPATH))
        )

        for details_button in driver.find_elements(By.CLASS_NAME, 'detailsButton'):
            time.sleep(0.25)
            details_button.click()

        designation = labelled_value('Designation')
        dnf_flag = designation in DNF_DESIGNATIONS

        footer_cells = driver.find_elements(By.XPATH, FOOTER_CELLS_XPATH)

        # Keys are inserted in the original order, which fixes the column
        # order of the DataFrame built from these records.
        competitor_data = {
            'data_source_id': race_data_id,
            'Name': table_row.find_elements(By.TAG_NAME, 'span')[1].text,
            'Designation': designation,
            # Rank fields are left blank for DNS/DNF/DQ entries.
            'Div Rank': '' if dnf_flag else labelled_value('Div Rank'),
            'Gender Rank': '' if dnf_flag else labelled_value('Gender Rank'),
            'Overall Rank': '' if dnf_flag else labelled_value('Overall Rank'),
            'Bib': footer_cells[0].text,
            'Division': footer_cells[1].text,
            'Country': driver.find_element(By.XPATH, COUNTRY_FLAG_XPATH).get_attribute('alt'),
            'Points': footer_cells[3].text,
        }

        for discipline, section_id in DISCIPLINES:
            cells = section_cells(section_id)
            competitor_data[f'{discipline} Time'] = cells[4].text
            competitor_data[f'{discipline} Div Rank'] = cells[5].text
            competitor_data[f'{discipline} Gender Rank'] = cells[6].text
            competitor_data[f'{discipline} Overall Rank'] = cells[7].text

        transition_cells = section_cells('transitions')
        competitor_data['Transition 1'] = transition_cells[2].text
        competitor_data['Transition 2'] = transition_cells[3].text
        competitor_data['Overall Time'] = driver.find_element(By.XPATH, OVERALL_TIME_XPATH).text

        # Close the current row
        table_row.click()
        competitor_data_list.append(competitor_data)

    return competitor_data_list

import os

data = pd.read_csv('Race Data URLs.csv')

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before any (slow) scraping starts.
os.makedirs('Race_Data', exist_ok=True)

# A race without Competitor Labs ids is written as an empty CSV field, which
# read_csv loads back as NaN; drop those rows instead of calling .split on a float.
for race_data_list in data['competitor_lab_ids'].dropna():
    race_data_ids_list = str(race_data_list).split(', ')
    for race_data_id in race_data_ids_list:
        if not race_data_id.strip():
            # Skip blank ids left over from empty or malformed cells.
            continue
        get_clab_data(race_data_id).to_csv(f'Race_Data/{race_data_id}_Data.csv', index=False)

INFO:root:Initializing Web Driver - waited 2.47 for link https://labs.competitor.com/result/subevent/3B7113D1-D64F-E811-941E-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
INFO:root:Extracting data for page 1
INFO:root:Extracting data for page 2
INFO:root:Extracting data for page 3
INFO:root:Extracting data for page 4
INFO:root:Initializing Web Driver - waited 2.01 for link https://labs.competitor.com/result/subevent/3B7113D1-D64F-E811-941E-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=3
INFO:root:Initializing Web Driver - waited 2.06 for link https://labs.competitor.com/result/subevent/3B7113D1-D64F-E811-941E-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=2
INFO:root:Initializing Web Driver - waited 2.13 for link https://labs.competitor.com/result/subevent/3B7113D1-D64F-E811-941E-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=4
INFO:root:Initializing Web Driver - wait