In [34]:
import re
import statistics
import pandas as pd

In [35]:
# Relative directory that holds the extraction log files.
LOGS_PATH = r"Logs/"
# Name of the log file inside LOGS_PATH that the analysis at the end of this cell opens.
test_filename = "02_10_2024_13_24_clab_HTML_extraction.log"

def find_races(lines: list) -> list:
    """
    Finds all unique race ids mentioned in a log file.

    A race id is whatever sits between "Extracing data for " and the last
    " with " on a line. The misspelling is part of the pattern and has to
    match what is actually in the log, so it is left untouched.

    Args:
        lines: The lines of the log file.

    Returns:
        The unique race ids, in the order they first appear in ``lines``.
    """
    race_id_pattern = re.compile(r"(?<=Extracing data for ).*(?= with )")

    # A dict keeps insertion order, so duplicates are dropped while the result
    # stays the same from run to run. Iterating a set of strings does not give
    # that guarantee because string hashing is randomised per process.
    races = {}
    for line in lines:
        for race_id in race_id_pattern.findall(line):
            races.setdefault(race_id, None)

    return list(races)

def find_num_pages(lines: list, race_id: str) -> str:
    """
    Looks up the page count logged for one race.

    The first line containing "Extracing data for <race_id> with " is used;
    its second-to-last space-separated token is returned as a string.
    When no line matches, the function falls through and returns None.
    """
    announcement = re.compile(rf"(?<=Extracing data for ){re.escape(race_id)}(?= with )")

    # Only the first matching line matters, so stop scanning as soon as it is found.
    first_hit = next((entry for entry in lines if announcement.search(entry)), None)
    if first_hit is not None:
        return first_hit.split(" ")[-2]

def find_total_extraction_time(lines: list, race_id: str) -> float:
    """
    Returns the total extraction time for a given race id (if it is there)

    The first line containing "Total extraction time for race <race_id>" is
    used. Its last space-separated token is expected to be a number followed
    by one trailing unit character (presumably "s" for seconds; confirm
    against the code that writes the log).

    Args:
        lines: The lines of the log file.
        race_id: The race id to look up.

    Returns:
        The parsed time as a float, or None when no line mentions the race.
    """
    race_id_pattern = re.compile(rf"(?<=Total extraction time for race ){re.escape(race_id)}")

    for line in lines:
        if race_id_pattern.search(line):
            # Remove the line terminator explicitly before dropping the unit
            # character. Slicing two characters off blindly would eat a digit
            # on a final line that has no trailing newline.
            last_token = line.rstrip("\r\n").split(" ")[-1]
            return float(last_token[:-1])

    return None

def get_page_extraction_times(lines: list, race_id):
    """
    Collects the extraction time of every page logged for a given race.

    A line belongs to the race when it contains "<race_id> and page ". The
    time is read from the line's last space-separated token, which is expected
    to be a number followed by one trailing unit character.

    Args:
        lines: The lines of the log file.
        race_id: The race id to collect page times for.

    Returns:
        The page extraction times as floats, in log order. The list is empty
        when no page line exists for the race.
    """
    page_execution_string = f"{race_id} and page "

    page_times = []
    for line in lines:
        if page_execution_string in line:
            # Strip the line terminator first so a final line without a
            # newline does not lose its last digit along with the unit.
            last_token = line.rstrip("\r\n").split(" ")[-1]
            page_times.append(float(last_token[:-1]))

    return page_times

def get_page_load_times(lines: list, race_id):
    """
    Gathers the page load times logged for a race.

    A line counts when it contains
    "for link https://labs.competitor.com/result/subevent/<race_id>". The load
    time is the fourth-from-last space-separated token of that line. The times
    are returned as floats in log order.
    """
    marker = f"for link https://labs.competitor.com/result/subevent/{race_id}"
    return [float(entry.split(" ")[-4]) for entry in lines if marker in entry]

def print_high_level_stats(df: pd.DataFrame):
    """
    Prints high level stats for the log file

    Args:
        df: One row per race with the columns "Total Time", "Number of Pages",
            "Total Page Extraction Time" and "Total Page Load Times".

    Returns:
        None. Eight summary lines are written to stdout.
    """
    # Each aggregate is computed once and reused by the lines that need it.
    total_pages = pd.to_numeric(df['Number of Pages']).sum()
    total_time = df['Total Time'].sum()
    total_extraction_time = pd.to_numeric(df['Total Page Extraction Time']).sum()
    total_load_time = pd.to_numeric(df['Total Page Load Times']).sum()

    # Only plain names appear inside the f-strings: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    print(f"Total Number of Races: {df.shape[0]}")
    print(f"Total Page Count: {total_pages}")
    print(f"Total Execution time (hours): {total_time/3600}")
    print(f"Execution Time Per Page (seconds): {total_time / total_pages}")
    print(f"Total Extraction Time (hours): {total_extraction_time / 3600}")
    print(f"Extraction Time Per Page (seconds): {total_extraction_time / total_pages}")
    print(f"Total Page Load Time (hours): {total_load_time / 3600}")
    print(f"Page Load Time Per Page (seconds): {total_load_time / total_pages}")


# Read the whole log up front so the file handle is released before the
# per-race analysis starts.
with open(LOGS_PATH + test_filename) as file:
    lines = file.readlines()

races = find_races(lines)
race_data_list = []
for race in races:
    num_pages = find_num_pages(lines, race)
    total_time = find_total_extraction_time(lines, race)
    page_load_times = get_page_load_times(lines, race)
    page_extraction_times = get_page_extraction_times(lines, race)

    # total_time is None when the log has no "Total extraction time" line for
    # the race, and either list can be empty. Every statistic falls back to 0
    # in those cases so a single incomplete race cannot abort the analysis.
    has_total_time = total_time is not None and total_time != 0
    has_load_times = len(page_load_times) > 0
    has_extraction_times = len(page_extraction_times) > 0

    race_data = {
        "id": race,
        "Total Time": total_time,
        "Number of Pages": num_pages,
        "Average Page Load Time": statistics.mean(page_load_times) if has_load_times else 0,
        "Max Page Load Time": max(page_load_times) if has_load_times else 0,
        "Min Page Load Time": min(page_load_times) if has_load_times else 0,
        "Total Page Load Times": sum(page_load_times),
        "Percent Page Load Time": (sum(page_load_times) / total_time) * 100 if has_total_time else 0,
        "Page Load Times": page_load_times,
        "Average Page Extraction Time": statistics.mean(page_extraction_times) if has_extraction_times else 0,
        "Max Page Extraction Time": max(page_extraction_times) if has_extraction_times else 0,
        "Min Page Extraction Time": min(page_extraction_times) if has_extraction_times else 0,
        "Total Page Extraction Time": sum(page_extraction_times),
        "Percent Page Extraction Time": (sum(page_extraction_times) / total_time) * 100 if has_total_time else 0,
        "Page Extraction Times": page_extraction_times
    }
    race_data_list.append(race_data)

# Build the frame once from the list of per-race records.
final_data = pd.DataFrame(race_data_list)
print_high_level_stats(final_data)
final_data
    

Total Number of Races: 633
Total Page Count: 21972
Total Execution time (hours): 38.79113888888888
Execution Time Per Page (seconds): 6.355730020025486
Total Extraction Time (hours): 34.94144444444445
Extraction Time Per Page (seconds): 5.724977243764792
Total Page Load Time (hours): 2.8630138888888883
Page Load Time Per Page (seconds): 0.4690902057163662


Unnamed: 0,id,Total Time,Number of Pages,Average Page Load Time,Max Page Load Time,Min Page Load Time,Total Page Load Times,Percent Page Load Time,Page Load Times,Average Page Extraction Time,Max Page Extraction Time,Min Page Extraction Time,Total Page Extraction Time,Percent Page Extraction Time,Page Extraction Times
0,F8B5A1CE-B142-4D63-B50D-7EDD6E23FBF0,138.8,30,0.374516,1.99,0.28,11.61,8.364553,"[1.99, 0.34, 0.33, 0.3, 0.46, 0.32, 0.28, 0.28...",3.560000,4.4,0.8,106.8,76.945245,"[3.3, 3.3, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.6, ..."
1,09626994-3055-E211-B7A2-005056956277,270.0,55,0.396964,2.15,0.31,22.23,8.233333,"[2.15, 0.37, 0.31, 0.35, 0.31, 0.36, 0.41, 0.3...",3.880000,9.1,0.3,213.4,79.037037,"[3.5, 3.4, 3.6, 3.6, 4.1, 4.6, 4.1, 4.0, 3.8, ..."
2,8C623579-DCDD-4779-8F5A-B5E441DB337D,104.2,17,0.397222,2.05,0.26,7.15,6.861804,"[2.05, 0.34, 0.29, 0.29, 0.29, 0.28, 0.31, 0.2...",4.935294,23.5,3.3,83.9,80.518234,"[3.3, 3.3, 3.4, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, ..."
3,124952B5-7A15-E311-9EC7-005056956277,154.7,31,0.371563,2.01,0.28,11.89,7.685844,"[2.01, 0.42, 0.32, 0.28, 0.29, 0.3, 0.28, 0.31...",3.564516,3.7,2.9,110.5,71.428571,"[3.5, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.5, 3.6, ..."
4,E23C3D02-D8BA-E511-940C-005056951BF1,135.2,27,0.420714,2.11,0.30,11.78,8.713018,"[2.11, 0.35, 0.37, 0.3, 0.33, 0.32, 0.31, 0.33...",3.851852,4.3,1.3,104.0,76.923077,"[3.5, 3.7, 3.8, 3.8, 3.7, 3.7, 3.7, 4.0, 3.7, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,D4796FC3-D1AC-4C69-9DC8-215D1ECB3853,128.5,28,0.394828,2.14,0.30,11.45,8.910506,"[2.14, 0.39, 0.38, 0.3, 0.36, 0.3, 0.34, 0.3, ...",3.482143,3.7,2.9,97.5,75.875486,"[3.3, 3.3, 3.3, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, ..."
629,216A9127-DABA-E511-940C-005056951BF1,241.0,47,0.370208,2.08,0.28,17.77,7.373444,"[2.08, 0.32, 0.3, 0.31, 0.29, 0.28, 0.66, 0.29...",3.755319,8.9,1.0,176.5,73.236515,"[3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, ..."
630,9B3EC571-8550-E911-A97F-000D3A364086,87.1,16,0.500000,2.11,0.33,8.50,9.758898,"[2.11, 0.35, 0.39, 0.34, 0.37, 0.4, 0.59, 0.36...",4.131250,9.1,3.2,66.1,75.889782,"[3.5, 3.5, 3.5, 3.6, 3.5, 3.8, 4.4, 4.3, 3.9, ..."
631,F73D243C-2D80-E611-9410-005056951BF1,259.1,52,0.397925,2.09,0.30,21.09,8.139714,"[2.09, 0.34, 0.34, 0.33, 0.32, 0.33, 0.33, 0.3...",3.917308,8.8,3.2,203.7,78.618294,"[8.5, 3.7, 3.5, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, ..."


# Estimation

In [37]:
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import statistics as stats
import helper
import os
import logging

# Log file that get_page_counts writes one "Url: ... Pages: ..." record to per
# scraped URL; main() and analyze_page_counts() read it back.
logging_file = "Logging - Page Number Estimation.log"

# Route root-logger records to that file, encoded as UTF-8.
logging.basicConfig(
    filename=logging_file,
    encoding="utf-8")
# Root logger; INFO is enabled so the logging.info calls in get_page_counts are recorded.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def get_page_counts(clab_ids_list: list) -> list:
    """
    Counts the number of pages per event for a list of provided clab events

    For each id the first results page is opened and the pagination buttons
    are inspected. Every result is printed and also written to the log through
    logging.info.

    Args:
        clab_ids_list: Competitor Labs sub-event ids to look up.

    Returns:
        The page count for each id, in the same order as ``clab_ids_list``.
    """
    COMPETITOR_LAB_LINK = "https://labs.competitor.com/result/subevent/"
    COMPETITOR_LAB_LINK_SUFFIX= "?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page="
    PAGINATION_BUTTONS_XPATH = "//ul[contains(@class, 'MuiPagination-ul')]/li/button"

    driver = helper.init_web_driver()
    pages = []
    total_ids = len(clab_ids_list)

    # The driver is shut down in `finally` so it is not left running when a
    # page load or a parse fails part-way through the list.
    try:
        for idx, clab_id in enumerate(clab_ids_list):
            data_url_page = f"{COMPETITOR_LAB_LINK}{clab_id}{COMPETITOR_LAB_LINK_SUFFIX}1"
            driver = helper.driver_get_new_page(driver, data_url_page)
            try:
                WebDriverWait(driver, 2.5).until(
                    EC.presence_of_element_located((By.XPATH, PAGINATION_BUTTONS_XPATH))
                )
                # The second-to-last pagination button is read as the page count
                # (presumably the last button is "next"; confirm against the page markup).
                num_pages = int(driver.find_elements(By.XPATH, PAGINATION_BUTTONS_XPATH)[-2].text)
            except (IndexError, TimeoutException):
                # If there is only one page, handle error
                num_pages = 1

            # This id has just been handled, so it no longer counts as remaining.
            remaining = total_ids - idx - 1
            pages.append(num_pages)
            print(f"Pages: {num_pages}, Remaining: {remaining} Url: {data_url_page}")
            # The message format, including the trailing space, is parsed by
            # analyze_page_counts and must not change.
            logging.info(f"Url: {data_url_page} Pages: {num_pages} ")
    finally:
        driver.quit()

    return pages

def main():
    """
    Main function for execution

    Works out which Competitor Labs ids still need a page count and passes
    them to get_page_counts. An id is skipped when it is missing (NaN), when
    it already appears anywhere in the page-count log, or when
    "./Race Data HTML" already holds an "<id>.html" file.
    """
    # The log is written as UTF-8 by the logging setup, so read it the same way.
    # A missing log simply means nothing has been counted yet.
    try:
        with open(logging_file, "r", encoding="utf-8") as file:
            already_logged_clab_ids = file.read()
    except FileNotFoundError:
        already_logged_clab_ids = ""

    all_clab_ids = pd.read_csv("Competitor Labs URLs.csv")["Competitor Labs ID"].to_list()

    # A set gives constant-time membership tests in the filter below.
    existing_files = {
        os.path.splitext(file_name)[0]
        for file_name in os.listdir(r"./Race Data HTML")
        if file_name.endswith(".html")
    }

    # The NaN check has to come first: a substring test against the log text
    # raises TypeError for a non-string value.
    in_progress_ids = [
        clab_id
        for clab_id in all_clab_ids
        if not pd.isna(clab_id)
        and clab_id not in already_logged_clab_ids
        and clab_id not in existing_files
    ]

    get_page_counts(in_progress_ids)

# Run the page-count scrape when this cell is executed.
main()


In [38]:
def analyze_page_counts(average_page_time_seconds: float = 3, log_path: str = None):
    """Reads the linked logging file to pull out the useful information about the remaining page counts

    Every log line containing "Pages: " counts as one race; the integer right
    after that marker is its page count. The total page count, the number of
    races and a completion-time estimate are printed.

    Args:
        average_page_time_seconds: Assumed processing time per page, used for
            the estimate. Defaults to 3.
        log_path: Log file to read. Defaults to the module-level
            ``logging_file``.

    Returns:
        None. Three summary lines are written to stdout.
    """
    if log_path is None:
        log_path = logging_file
    with open(log_path, "r") as file:
        lines = file.readlines()

    test_string = "Pages: "
    page_counts = []
    for line in lines:
        if test_string in line:
            # Take the number that follows the marker instead of counting
            # tokens from the end of the line, which only works while the
            # logged message keeps its trailing space.
            page_counts.append(int(line.split(test_string, 1)[1].split()[0]))

    total_pages = sum(page_counts)
    print(f"Total: {total_pages}")
    print(f"Races: {len(page_counts)}")
    print(f"Estimated completion time (Minutes): {(total_pages * average_page_time_seconds)/60}")

# Print the totals and the time estimate for the page counts logged so far.
analyze_page_counts()

Total: 4
Races: 4
Estimated completion time (Minutes): 0.2


In [2]:
# Ad-hoc call of get_clab_ids.get_competitor_labs_urls for the single id "im703-pucon".
import get_clab_ids as test
test.get_competitor_labs_urls("im703-pucon")

  (Session info: chrome=131.0.6778.205)
Stacktrace:
0   chromedriver                        0x0000000100c43184 cxxbridge1$str$ptr + 3626716
1   chromedriver                        0x0000000100c3b9d4 cxxbridge1$str$ptr + 3596076
2   chromedriver                        0x00000001006a8968 cxxbridge1$string$len + 89228
3   chromedriver                        0x00000001006f25e4 cxxbridge1$string$len + 391432
4   chromedriver                        0x00000001006f0c34 cxxbridge1$string$len + 384856
5   chromedriver                        0x00000001006eeb78 cxxbridge1$string$len + 376476
6   chromedriver                        0x00000001006edf90 cxxbridge1$string$len + 373428
7   chromedriver                        0x00000001006e3474 cxxbridge1$string$len + 329624
8   chromedriver                        0x00000001006e2ec8 cxxbridge1$string$len + 328172
9   chromedriver                        0x00000001007265b4 cxxbridge1$string$len + 604376
10  chromedriver                        0x00000001006

[{'id': 'im703-pucon',
  'competitor_lab_ids': 'D16CFDBB-F9D2-E611-9412-005056951BF1, 9C820508-778B-4ADE-99CC-70E96C2B5554, 3B7113D1-D64F-E811-941E-005056951BF1, 094952B5-7A15-E311-9EC7-005056956277, D3A4FCDC-BC4E-4422-AC03-5404EDCBF683, D7E31506-79DE-4180-A0F5-7BFE2C7AD47D, D4E3B1E8-49DA-484D-BFC3-AEE02A068F43, B82C94BA-DFBA-E511-940C-005056951BF1, 27468F10-B431-E711-9416-005056951BF1',
  'years': '2024, 2014, 2022, 2015, 2019, 2025, 2017, 2016, 2023, 2018'}]

In [12]:
# Shape (rows, columns) of the CSV that main() reads the Competitor Labs ids from.
pd.read_csv("Competitor Labs URLs.csv").shape

(1272, 4)