In [1]:
import re
import statistics
import pandas as pd

In [2]:
LOGS_PATH = r"Logs/"
test_filename = "02_08_2024_00_03.log"

def find_races(lines: list) -> list:
    """
    Finds all unique races in a log file and their status
    """
    races = set()
    race_id_pattern = re.compile(r"(?<=Extracing data for ).*(?= with )")

    for line in lines:
        race_id_match = race_id_pattern.findall(line)
        if len(race_id_match) > 0:
            races.update(race_id_match)

    return list(races)

def find_num_pages(lines: list, race_id: str) -> str:
    """
    Finds the number of pages for a given race
    """
    race_id_pattern = re.compile(rf"(?<=Extracing data for ){re.escape(race_id)}(?= with )")

    for line in lines:
        race_id_match = race_id_pattern.findall(line)
        if len(race_id_match) > 0:
            return line.split(" ")[-2]

def find_total_extraction_time(lines: list, race_id: str) -> float:
    """
    Returns the total extraction time for a given race id (if it is there)
    """
    race_id_pattern = re.compile(rf"(?<=Total extraction time for race ){re.escape(race_id)}")

    for line in lines:
        race_id_match = race_id_pattern.findall(line)
        if len(race_id_match) > 0:
            return float(line.split(" ")[-1][:-2])

def get_page_extraction_times(lines: list, race_id):
    """
    Reads all lines for a given log and returns the average page extraction time by race
    """
    page_exectuion_string = f"{race_id} and page "
    
    page_times = []
    for line in lines:
        if page_exectuion_string in line:
            page_times.append(float(line.split(" ")[-1][:-2]))

    return page_times

def get_page_load_times(lines: list, race_id):
    page_load_string = f"for link https://labs.competitor.com/result/subevent/{race_id}"
    
    page_load_times = []
    for line in lines:
        if page_load_string in line:
            page_load_times.append(float(line.split(" ")[-4]))

    return page_load_times


with open(LOGS_PATH + test_filename) as file:
    lines = file.readlines()
    races = find_races(lines)
    race_data_list = []
    for race in races:
        num_pages = find_num_pages(lines, race)
        total_time = find_total_extraction_time(lines, race)
        page_load_times = get_page_load_times(lines, race)
        page_extraction_times = get_page_extraction_times(lines, race)
        
        race_data = {
            "id": race,
            "Total Time": total_time,
            "Number of Pages": num_pages,
            "Average Page Load Time": statistics.mean(page_load_times),
            "Max Page Load Time": max(page_load_times),
            "Min Page Load Time": min(page_load_times),
            "Total Page Load Times": sum(page_load_times),
            "Percent Page Load Time": (sum(page_load_times) / total_time) * 100 if total_time != None else 0,
            "Page Load Times": page_load_times,
            "Average Page Extraction Time": statistics.mean(page_extraction_times),
            "Max Page Extraction Time": max(page_extraction_times),
            "Min Page Extraction Time": min(page_extraction_times),
            "Total Page Extraction Time": sum(page_extraction_times),
            "Percent Page Extraction Time": (sum(page_extraction_times) / total_time) * 100 if total_time != None else 0,
            "Page Extraction Times": page_extraction_times
        }
        race_data_list.append(race_data)
final_data = pd.DataFrame(race_data_list)
final_data
    

Unnamed: 0,id,Total Time,Number of Pages,Average Page Load Time,Max Page Load Time,Min Page Load Time,Total Page Load Times,Percent Page Load Time,Page Load Times,Average Page Extraction Time,Max Page Extraction Time,Min Page Extraction Time,Total Page Extraction Time,Percent Page Extraction Time,Page Extraction Times
0,8B6CF837-4E20-E711-9415-005056951BF1,57.2,16,2.042941,2.56,1.73,34.73,60.716783,"[1.95, 2.05, 2.12, 2.24, 2.51, 1.95, 2.56, 1.8...",7.3375,18.1,1.4,117.4,205.244755,"[8.1, 8.1, 13.1, 2.5, 18.1, 7.6, 2.5, 2.9, 12...."
1,8DFA084E-A5B7-E911-A986-000D3A364086,73.1,33,2.211176,2.82,1.93,75.18,102.845417,"[2.13, 1.93, 2.05, 2.45, 2.46, 2.25, 2.34, 2.3...",2.981818,3.3,1.3,98.4,134.610123,"[3.2, 3.2, 3.3, 3.2, 3.1, 3.1, 3.1, 3.2, 3.1, ..."
2,033E4941-E422-E811-941D-005056951BF1,70.9,31,2.102812,2.37,1.86,67.29,94.908322,"[2.1, 2.05, 2.11, 2.23, 2.37, 2.25, 2.0, 2.01,...",3.387097,8.3,2.0,105.0,148.09591,"[3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.3, 3.3, ..."
3,03652C5C-EEB0-E111-80AE-005056956277,72.8,29,2.132333,2.46,1.88,63.97,87.870879,"[2.14, 2.14, 2.14, 2.28, 2.42, 1.91, 2.06, 2.3...",3.141379,3.4,1.2,91.1,125.137363,"[3.2, 3.2, 3.2, 3.2, 3.3, 3.4, 3.4, 3.4, 3.3, ..."
4,15141085-CF1B-4A9A-83C1-AD4B3557CF2E,44.6,20,2.135714,2.55,1.85,44.85,100.560538,"[2.26, 2.24, 2.29, 2.29, 2.43, 2.18, 1.88, 2.0...",2.875,3.0,1.5,57.5,128.923767,"[2.9, 2.9, 3.0, 2.9, 2.9, 3.0, 3.0, 3.0, 3.0, ..."
5,D9616994-3055-E211-B7A2-005056956277,44.9,14,2.078,3.04,1.83,31.17,69.420935,"[2.08, 1.98, 2.0, 2.15, 3.04, 1.86, 1.88, 2.24...",4.428571,8.7,2.7,62.0,138.084633,"[3.0, 3.1, 8.4, 8.3, 2.9, 2.7, 3.3, 2.8, 2.7, ..."
6,EB17D0B1-4EAE-E111-80AE-005056956277,70.5,31,2.144687,3.24,1.77,68.63,97.347518,"[2.14, 2.13, 2.18, 2.22, 2.36, 1.95, 2.16, 2.2...",2.767742,3.3,0.5,85.8,121.702128,"[3.2, 3.3, 3.3, 3.3, 3.1, 3.2, 3.2, 3.1, 2.4, ..."
7,640E92CB-82AD-42A9-B13C-6B131B749798,41.3,16,2.162941,2.39,1.95,36.77,89.031477,"[2.01, 2.18, 2.24, 2.33, 2.39, 2.25, 2.05, 2.1...",4.0,4.2,3.8,64.0,154.96368,"[3.9, 3.9, 3.8, 3.8, 4.0, 4.1, 4.1, 4.0, 4.1, ..."
8,AB9F0371-C93F-4256-B734-EAE3C3F45BB2,46.9,20,2.167619,2.45,1.97,45.52,97.057569,"[2.1, 2.1, 2.25, 2.27, 2.34, 2.01, 2.07, 2.02,...",3.07,3.3,1.0,61.4,130.916844,"[3.2, 3.3, 3.3, 3.3, 3.2, 3.2, 3.3, 3.3, 3.1, ..."
9,6ad80410-1063-ea11-a811-000d3a5a1cf8,34.6,13,2.130714,2.37,1.83,29.83,86.213873,"[2.13, 2.23, 2.23, 2.26, 2.32, 1.86, 2.37, 1.9...",3.038462,3.4,0.0,39.5,114.16185,"[3.4, 3.3, 3.4, 3.3, 3.3, 3.3, 3.3, 3.3, 3.2, ..."


# Estimation

In [3]:
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import statistics as stats
import helper
import os
import logging


logging.basicConfig(
    filename=f"Page Number Estimation.log",
    encoding="utf-8")
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def get_page_counts(clab_ids_list: list) -> list:
    """
    Counts the number of pages per event for a list of provided clab events
    """
    COMPETITOR_LAB_LINK = "https://labs.competitor.com/result/subevent/"
    COMPETITOR_LAB_LINK_SUFFIX= "?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page="

    driver = helper.init_web_driver()
    pages = []
    
    for idx, clab_id in enumerate(clab_ids_list):
        data_url_page = f"{COMPETITOR_LAB_LINK}{clab_id}{COMPETITOR_LAB_LINK_SUFFIX}1"
        driver = helper.driver_get_new_page(driver, data_url_page)
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//ul[contains(@class, 'MuiPagination-ul')]/li/button"))
            )
            num_pages = int(driver.find_elements(By.XPATH, "//ul[contains(@class, 'MuiPagination-ul')]/li/button")[-2].text)
        except (IndexError, TimeoutException):
            # If there is only one page, handle error
            num_pages = 1

        remaining = len(clab_ids_list) - idx
        pages.append(num_pages)
        print(f"Pages: {num_pages}, Remaining: {remaining} Url: {data_url_page}")
        logging.info(f"Url: {data_url_page} Pages: {num_pages} ")

    driver.quit()
    return pages

def  main():
    """
    Main function for execution
    """

    with open("Page Number Estimation.log", "r") as file:
        already_logged_clab_ids = file.read()
    
    all_clab_ids = pd.read_csv("Competitor Labs URLs.csv")["Competitor Labs ID"].to_list()
    already_accounted_for_ids = [id for id in all_clab_ids if not pd.isna(id) and id in already_logged_clab_ids]
    existing_files = [file[:-5] for file in os.listdir(r"./Race Data HTML") if file.endswith(".html")]
    in_progress_ids = [clab_id for clab_id in all_clab_ids if clab_id not in already_accounted_for_ids and clab_id not in existing_files]
    
    get_page_counts(in_progress_ids)

main()


Pages: 1, Remaining: 541 Url: https://labs.competitor.com/result/subevent/nan?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 49, Remaining: 540 Url: https://labs.competitor.com/result/subevent/77B45A74-3F87-E711-9419-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 42, Remaining: 539 Url: https://labs.competitor.com/result/subevent/8BE47099-E69A-444B-98C5-F5BE134EBACC?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 54, Remaining: 538 Url: https://labs.competitor.com/result/subevent/2F9635A9-4187-E711-9419-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 57, Remaining: 537 Url: https://labs.competitor.com/result/subevent/F5F53B8A-2F41-E511-9409-005056951BF1?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 41, Remaining: 536 Url: https://labs.competitor.com/result/subevent/8BC5A0B9-F40F-E311-9EC7-005056956277?filter=%7B%7D&order=ASC&perPage=50&so

In [5]:
def analyze_page_counts():
    """Reads the linked logging file to pull out the useful information about the remaining page counts"""
    average_page_time_seconds = 4
    with open("Page Number Estimation.log", "r") as file:
        lines = file.readlines()
    
    page_counts = []
    test_string = "Pages: "
    for line in lines:
        if test_string in line:
            page_counts.append(int(line.split(" ")[-2]))

    print(f"Total: {sum(page_counts)}")
    print(f"Estimated completion time (Minutes): {(sum(page_counts) * average_page_time_seconds)/60}")

analyze_page_counts()

Total: 37794
Estimated completion time (Minutes): 2519.6
