In [1]:
import re
import statistics
import pandas as pd

In [2]:
LOGS_PATH = r"Logs/"
test_filename = "02_08_2024_00_03.log"

def find_races(lines: list) -> list:
    """
    Finds all unique races in a log file and their status
    """
    races = set()
    race_id_pattern = re.compile(r"(?<=Extracing data for ).*(?= with )")

    for line in lines:
        race_id_match = race_id_pattern.findall(line)
        if len(race_id_match) > 0:
            races.update(race_id_match)

    return list(races)

def find_num_pages(lines: list, race_id: str) -> str:
    """
    Finds the number of pages for a given race
    """
    race_id_pattern = re.compile(rf"(?<=Extracing data for ){re.escape(race_id)}(?= with )")

    for line in lines:
        race_id_match = race_id_pattern.findall(line)
        if len(race_id_match) > 0:
            return line.split(" ")[-2]

def find_total_extraction_time(lines: list, race_id: str) -> float:
    """
    Returns the total extraction time for a given race id (if it is there)
    """
    race_id_pattern = re.compile(rf"(?<=Total extraction time for race ){re.escape(race_id)}")

    for line in lines:
        race_id_match = race_id_pattern.findall(line)
        if len(race_id_match) > 0:
            return float(line.split(" ")[-1][:-2])

def get_page_extraction_times(lines: list, race_id):
    """
    Reads all lines for a given log and returns the average page extraction time by race
    """
    page_exectuion_string = f"{race_id} and page "
    
    page_times = []
    for line in lines:
        if page_exectuion_string in line:
            page_times.append(float(line.split(" ")[-1][:-2]))

    return page_times

def get_page_load_times(lines: list, race_id):
    page_load_string = f"for link https://labs.competitor.com/result/subevent/{race_id}"
    
    page_load_times = []
    for line in lines:
        if page_load_string in line:
            page_load_times.append(float(line.split(" ")[-4]))

    return page_load_times


with open(LOGS_PATH + test_filename) as file:
    lines = file.readlines()
    races = find_races(lines)
    race_data_list = []
    for race in races:
        num_pages = find_num_pages(lines, race)
        total_time = find_total_extraction_time(lines, race)
        page_load_times = get_page_load_times(lines, race)
        page_extraction_times = get_page_extraction_times(lines, race)
        
        race_data = {
            "id": race,
            "Total Time": total_time,
            "Number of Pages": num_pages,
            "Average Page Load Time": statistics.mean(page_load_times),
            "Max Page Load Time": max(page_load_times),
            "Min Page Load Time": min(page_load_times),
            "Total Page Load Times": sum(page_load_times),
            "Percent Page Load Time": (sum(page_load_times) / total_time) * 100 if total_time != None else 0,
            "Page Load Times": page_load_times,
            "Average Page Extraction Time": statistics.mean(page_extraction_times),
            "Max Page Extraction Time": max(page_extraction_times),
            "Min Page Extraction Time": min(page_extraction_times),
            "Total Page Extraction Time": sum(page_extraction_times),
            "Percent Page Extraction Time": (sum(page_extraction_times) / total_time) * 100 if total_time != None else 0,
            "Page Extraction Times": page_extraction_times
        }
        race_data_list.append(race_data)
final_data = pd.DataFrame(race_data_list)
final_data
    

# Estimation

In [3]:
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import statistics as stats
import helper
import os
import logging


logging.basicConfig(
    filename=f"Page Number Estimation.log",
    encoding="utf-8")
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def get_page_counts(clab_ids_list: list) -> list:
    """
    Counts the number of pages per event for a list of provided clab events
    """
    COMPETITOR_LAB_LINK = "https://labs.competitor.com/result/subevent/"
    COMPETITOR_LAB_LINK_SUFFIX= "?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page="

    driver = helper.init_web_driver()
    pages = []
    
    for idx, clab_id in enumerate(clab_ids_list):
        data_url_page = f"{COMPETITOR_LAB_LINK}{clab_id}{COMPETITOR_LAB_LINK_SUFFIX}1"
        driver = helper.driver_get_new_page(driver, data_url_page)
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//ul[contains(@class, 'MuiPagination-ul')]/li/button"))
            )
            num_pages = int(driver.find_elements(By.XPATH, "//ul[contains(@class, 'MuiPagination-ul')]/li/button")[-2].text)
        except (IndexError, TimeoutException):
            # If there is only one page, handle error
            num_pages = 1

        remaining = len(clab_ids_list) - idx
        pages.append(num_pages)
        print(f"Pages: {num_pages}, Remaining: {remaining} Url: {data_url_page}")
        logging.info(f"Url: {data_url_page} Pages: {num_pages} ")

    driver.quit()
    return pages

def  main():
    """
    Main function for execution
    """

    with open("Page Number Estimation.log", "r") as file:
        already_logged_clab_ids = file.read()
    
    all_clab_ids = pd.read_csv("Competitor Labs URLs.csv")["Competitor Labs ID"].to_list()
    already_accounted_for_ids = [id for id in all_clab_ids if not pd.isna(id) and id in already_logged_clab_ids]
    existing_files = [file[:-5] for file in os.listdir(r"./Race Data HTML") if file.endswith(".html")]
    in_progress_ids = [clab_id for clab_id in all_clab_ids if clab_id not in already_accounted_for_ids and clab_id not in existing_files]
    
    get_page_counts(in_progress_ids)

main()


Pages: 33, Remaining: 1117 Url: https://labs.competitor.com/result/subevent/1C2D90D3-3FD3-4989-BBFC-5E636CBABEBF?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 20, Remaining: 1116 Url: https://labs.competitor.com/result/subevent/15141085-CF1B-4A9A-83C1-AD4B3557CF2E?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 9, Remaining: 1115 Url: https://labs.competitor.com/result/subevent/5EC36C17-C4ED-43DF-9713-A94555F43831?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 10, Remaining: 1114 Url: https://labs.competitor.com/result/subevent/C14C594A-A71D-4AB9-BDDB-E273C429C824?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 10, Remaining: 1113 Url: https://labs.competitor.com/result/subevent/C200EDB9-C575-41CA-8ADE-297433B8FE11?filter=%7B%7D&order=ASC&perPage=50&sort=FinishRankOverall&page=1
Pages: 18, Remaining: 1112 Url: https://labs.competitor.com/result/subevent/6F4C381E-8C66-E911-A97A-000D3A36478

In [None]:
def analyze_page_counts():
    """Reads the linked logging file to pull out the useful information about the remaining page counts"""
    average_page_time_seconds = 4
    with open("Page Number Estimation.log", "r") as file:
        lines = file.readlines()
    
    page_counts = []
    test_string = "Pages: "
    for line in lines:
        if test_string in line:
            page_counts.append(int(line.split(" ")[-2]))

    print(f"Total: {sum(page_counts)}")
    print(f"Estimated completion time (Minutes): {(sum(page_counts) * average_page_time_seconds)/60}")

analyze_page_counts()