# WEB SCRAPING

#### This code scrapes race results data from the Leipzig Marathon 'Ergebnis-Datenbank' (Results Databank). 

#### The browser:

- Selects the year from a dropdown menu.  

- Checks how many pages of results exist for that year.  

- Scrapes the table data on each page and adds it to a CSV file.  

- Clicks the 'Weiter' (Next) button to move through the pages until all data for that year has been scraped.

- Then, the script moves on to the next year in the list and repeats the process.

#### Note: The loop runs from index 1 to 46, which covers the years 1977 to 2024 (1999 and 2022 are missing from the data).

#### There's a short delay after each interaction to avoid overloading the server or triggering bot protection.

In [None]:
# Libraries

import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

In [None]:
# Driver Setup

# User-agent string sent by the browser (a desktop Firefox identity).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) "
    "Gecko/20100101 Firefox/131.0"
)
RESULTS_URL = "https://leipzigmarathon.de/ergebnis-suche/"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(f"user-agent={USER_AGENT}")
# NOTE(review): "detach" presumably keeps the browser window open after the
# script finishes -- confirm against the ChromeDriver documentation.
chrome_options.add_experimental_option("detach", True)

# `driver` is the module-level browser handle used by every function below.
driver = webdriver.Chrome(options=chrome_options)
driver.get(RESULTS_URL)

# Fixed pause after the initial page request, before any interaction.
time.sleep(3)

### calculate_number_of_clicks()
#### This function finds all the pagination buttons. It waits up to 10 seconds for them to appear, then filters out any buttons that don’t have numeric text. From the list of numbers, it grabs the last one, which represents the highest page number (i.e. total pages), and returns it as an integer. If no buttons are found or if something goes wrong, it safely returns 0.

In [None]:
def calculate_number_of_clicks():
    """Return the total number of result pages shown by the pagination bar.

    Waits up to 10 seconds for the ``.gridjs-pages button`` elements to become
    visible, keeps the ones whose text is purely numeric, and returns the last
    of those as an integer.

    Returns:
        int: The last numeric pagination label, or 0 when there is no numeric
        button or when anything at all goes wrong (best-effort by design).
    """
    try:
        # Everything stays inside the try so that any failure maps to 0.
        visible_buttons = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located(
                (By.CSS_SELECTOR, ".gridjs-pages button")
            )
        )

        numeric_labels = []
        for btn in visible_buttons:
            # Skip non-numeric pagination controls (previous/next, ellipsis).
            if btn.text.isdigit():
                numeric_labels.append(btn.text)

        if not numeric_labels:
            return 0
        return int(numeric_labels[-1])
    except Exception:
        return 0

### click_with_retry()
#### This function tries to click on the 'Weiter' (Next) button, retrying up to 3 times if it fails. It looks for a button using its aria-label, waits until it’s clickable, and then clicks it using JavaScript. If the click succeeds, it returns True. If the button is stale or takes too long to appear, it waits 2 seconds and tries again. If all attempts fail, it returns False.

In [None]:
def click_with_retry(driver, button_aria_label, max_attempts=3):
    """Click the button with the given ``aria-label``, retrying on failure.

    Each attempt waits up to 10 seconds for the button to become clickable and
    then clicks it through JavaScript, followed by a 2-second pause.

    Args:
        driver: The Selenium WebDriver controlling the browser.
        button_aria_label: Value of the button's ``aria-label`` attribute.
        max_attempts: How many times to try before giving up.

    Returns:
        bool: True as soon as one click goes through, False if every attempt
        hit a stale element or a timeout.
    """
    for _ in range(max_attempts):
        css = f'button[aria-label="{button_aria_label}"]'
        try:
            target = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, css))
            )
            driver.execute_script("arguments[0].click();", target)
            time.sleep(2)
        except (StaleElementReferenceException, TimeoutException):
            # Pause before looking the button up again.
            time.sleep(2)
        else:
            return True

    return False

### scrape_table_data()
#### This function collects race results from the results table and appends them to a CSV file named after the given year. It opens the file in append mode and writes the header row only if the file is empty. Then it finds all table rows, keeps only those whose second column is exactly "Marathon", and strips surrounding whitespace from each cell. It ensures each row has exactly 12 columns by truncating or padding with empty strings as needed, then writes the row to the file.

In [None]:
def scrape_table_data(year):
    """Append the Marathon rows of the currently displayed table to a CSV file.

    The output file is ``scraped_data_<year>.csv``. It is opened in append
    mode so that successive pages of one year accumulate in the same file;
    the header row is written only while the file is still empty.

    Args:
        year: Value used to build the output file name.
    """
    header = ["Datum", "Wettbewerb", "PlatzGesamt", "Name", "Vorname", "Nettozeit",
              "Ort/Verein", "Nation", "Jahrgang", "Alterklasse", "PlatzAk", "Bruttozeit"]

    with open(f"scraped_data_{year}.csv", "a", newline="", encoding="utf-8") as out:
        csv_writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)

        # Position 0 in append mode means nothing has been written yet.
        if out.tell() == 0:
            csv_writer.writerow(header)

        for table_row in driver.find_elements(By.XPATH, "//table/tbody/tr"):
            cells = table_row.find_elements(By.TAG_NAME, "td")

            # Rows without a second cell cannot name a competition.
            if len(cells) <= 1:
                continue
            # Only rows whose competition column is exactly "Marathon" are kept.
            if cells[1].text.strip() != "Marathon":
                continue

            values = [cell.text.strip() for cell in cells]
            # Force exactly 12 fields: truncate extras, pad shortfalls with "".
            padding = [""] * (12 - len(values))
            csv_writer.writerow(values[:12] + padding)

### iterate_through_years_index()
#### This function loops through each year from 1977 onward by selecting the corresponding index in a dropdown menu on the website. For each selected year, it waits for the page to load, checks how many pagination pages exist, and then scrapes the table data page by page using the scrape_table_data() function. If there are multiple pages, it clicks the "Weiter" (Next) button using click_with_retry() to move to the next one. If the click fails at any point, it stops scraping that year and moves on. The function includes short pauses to let pages load and reduce strain on the server.

In [None]:
def iterate_through_years_index(start_year=1977, end_year=2024, missing_years=(1999, 2022)):
    """Scrape every available year by stepping through the year dropdown.

    The dropdown is addressed by index, starting at 1. Index ``i`` is mapped
    to the i-th year in ``start_year..end_year`` after removing
    ``missing_years``. A plain ``start_year + i - 1`` mapping would mislabel
    every year from the first gap onward, because the missing years have no
    dropdown entry. With the defaults this yields 46 years, i.e. indices 1-46.

    For each year the function selects the dropdown entry, reads the number
    of result pages, scrapes each page with ``scrape_table_data`` and moves
    forward with the 'Weiter' button via ``click_with_retry``. If a click
    fails, the remaining pages of that year are skipped.

    NOTE(review): assumes the dropdown lists the years in ascending order
    starting at index 1 -- confirm against the live page.

    Args:
        start_year: First year offered by the dropdown (at index 1).
        end_year: Last year offered by the dropdown (inclusive).
        missing_years: Years in the range that have no dropdown entry.
    """
    skipped = set(missing_years)
    available_years = [
        year for year in range(start_year, end_year + 1) if year not in skipped
    ]

    for index, current_year in enumerate(available_years, start=1):
        # Re-locate the dropdown on every iteration instead of reusing a
        # reference obtained before the previous year's page interactions.
        years_element = driver.find_element(By.NAME, "wettkampf_select")
        Select(years_element).select_by_index(index)
        time.sleep(3)  # pause after selecting the year

        total_pages = calculate_number_of_clicks()

        for page_number in range(1, total_pages + 1):
            scrape_table_data(current_year)

            # The last page has no further page to move to.
            if page_number < total_pages:
                if not click_with_retry(driver, "Weiter"):
                    # Could not advance: give up on this year, go to the next.
                    break

            time.sleep(2)  # short delay between page interactions

In [None]:
# Run the full scrape: walk through every year in the dropdown and write the
# Marathon rows of each year to its own CSV file.
iterate_through_years_index()

In [None]:
# Shut down the WebDriver session created in the driver-setup cell.
driver.quit()