# WEB SCRAPING

#### This code scrapes race results data from the Leipzig Marathon 'Ergebnis-Datenbank' (Results Database).

#### The browser:

- Selects the year from a dropdown menu.  

- Checks how many pages of results exist for that year.  

- Scrapes the table data on each page and adds it to a CSV file.  

- Clicks the 'Weiter' (Next) button to move through the pages until all data for that year has been scraped.

- Then, the script moves on to the next year in the list and repeats the process.

#### Note: The loop runs from index 1 to 46, which covers the years 1977 to 2024 (1999 and 2022 are missing from the data).

#### There's a short delay after each interaction to avoid overloading the server or triggering bot protection.

In [None]:
# Libraries

import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

In [None]:
# Driver Setup
# Creates the module-level `driver` that every function below uses.

chrome_options = webdriver.ChromeOptions()
# "detach" keeps the Chrome window open after the launching process ends.
chrome_options.add_experimental_option("detach", True)
# Override the default user-agent header; note the string identifies the
# browser as Firefox 131 on Windows even though Chrome is being driven.
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0")

driver = webdriver.Chrome(options=chrome_options)
driver.get("https://leipzigmarathon.de/ergebnis-suche/")

# Fixed pause to give the results page time to finish loading before the
# first interaction.
time.sleep(3)

In [None]:
def calculate_number_of_clicks():
    """Return the highest page number shown in the pagination bar.

    Waits up to 10 seconds for the buttons inside ``.gridjs-pages`` to become
    visible on the module-level ``driver``, then inspects their labels.

    Returns:
        int: The largest purely numeric button label, or 0 when no numeric
        label is present or anything goes wrong while reading the buttons.
    """
    try:
        # Wait up to 10 seconds for all visible pagination buttons to load
        pagination_buttons = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".gridjs-pages button"))
        )

        # Read each label exactly once: every `.text` access is a separate
        # request to the browser, and a second read could see a re-rendered
        # (stale) button.
        labels = [button.text for button in pagination_buttons]

        # Keep only numeric labels ("Weiter", "..." etc. are dropped)
        page_numbers = [int(label) for label in labels if label.isdigit()]

        # Use the true maximum rather than relying on button order
        return max(page_numbers, default=0)

    except Exception:
        # Deliberate best-effort: if the buttons never appear or cannot be
        # read, report zero pages so the caller skips this year.
        return 0

In [None]:
def click_with_retry(driver, button_aria_label, max_attempts=3):
    """Click the button with the given ``aria-label``, retrying on failure.

    Args:
        driver: The Selenium WebDriver to operate on.
        button_aria_label: Value of the button's ``aria-label`` attribute.
        max_attempts: How many times to try before giving up.

    Returns:
        bool: True as soon as one click goes through, False if every attempt
        ended in a stale element or a timeout.
    """
    locator = (By.CSS_SELECTOR, f'button[aria-label="{button_aria_label}"]')

    for _ in range(max_attempts):
        try:
            # Give the button up to 10 seconds to become clickable
            target = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(locator)
            )
            # Trigger the click from JavaScript instead of a native .click()
            driver.execute_script("arguments[0].click();", target)
        except (StaleElementReferenceException, TimeoutException):
            # Button went stale or never became clickable: pause, then retry
            time.sleep(2)
        else:
            # Let the page update before reporting success
            time.sleep(2)
            return True

    # Every attempt failed
    return False

In [None]:
def scrape_table_data(year):
    """Append the "Marathon" rows of the currently displayed table to a CSV.

    Rows are read from the module-level ``driver`` and appended to
    ``scraped_data_<year>.csv``; the header row is written first when the
    file is still empty.

    Args:
        year: Value used to build the output filename.
    """
    header = ["Datum", "Wettbewerb", "PlatzGesamt", "Name", "Vorname", "Nettozeit",
              "Ort/Verein", "Nation", "Jahrgang", "Alterklasse", "PlatzAk", "Bruttozeit"]
    width = len(header)

    # Append mode so successive pages of the same year land in one file
    with open(f"scraped_data_{year}.csv", "a", newline="", encoding="utf-8") as csv_file:
        out = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

        # A position of 0 means nothing has been written to this file yet
        if not csv_file.tell():
            out.writerow(header)

        for table_row in driver.find_elements(By.XPATH, "//table/tbody/tr"):
            cells = table_row.find_elements(By.TAG_NAME, "td")

            # Rows with fewer than two cells carry no result data
            if len(cells) <= 1:
                continue

            # The second column holds the event name; keep only the marathon
            if cells[1].text.strip() != "Marathon":
                continue

            # Trim to the header width, then pad short rows with blanks
            values = [cell.text.strip() for cell in cells][:width]
            values.extend([""] * (width - len(values)))
            out.writerow(values)

In [None]:
def iterate_through_years_index():
    """Scrape every results page for every year offered in the dropdown.

    For each dropdown entry (index 1 to 46) the year is selected, the number
    of result pages is determined, and each page is scraped into that year's
    CSV file, clicking "Weiter" (Next) between pages.

    The dropdown has no entries for 1999 and 2022, so the year belonging to a
    dropdown index is taken from a list that skips those years rather than
    computed as ``1977 + offset`` (which would mislabel every year from 1999
    onwards).
    """
    first_year, last_year = 1977, 2024
    missing_years = {1999, 2022}

    # NOTE(review): assumes the dropdown lists years in ascending order with a
    # non-year placeholder at index 0 -- confirm against the live page.
    available_years = [
        year for year in range(first_year, last_year + 1)
        if year not in missing_years
    ]

    for option_index, current_year in enumerate(available_years, start=1):
        # Re-locate the dropdown on every pass and select the year by its index
        years_element = driver.find_element(By.NAME, "wettkampf_select")
        Select(years_element).select_by_index(option_index)
        time.sleep(3)  # Wait for the page to load results for the selected year

        # Calculate how many pages of results there are
        total_pages = calculate_number_of_clicks()

        for page_number in range(1, total_pages + 1):
            # Scrape the table data for the current page and year
            scrape_table_data(current_year)

            # If there are more pages, click the "Next" button
            if page_number < total_pages:
                if not click_with_retry(driver, "Weiter"):
                    break  # Stop this year if unable to click "Next"

            time.sleep(2)  # Wait before loading the next page

In [None]:
# Run the full scrape: loops over every year in the dropdown and writes one
# CSV file per year.
iterate_through_years_index()

In [None]:
# Close the browser and end the WebDriver session once scraping is done.
driver.quit()