In [1]:
#### CONFIGURATION
import os  # imported here because this cell runs before the notebook's import cell

# Path to the ChromeDriver executable, install from: https://chromedriver.chromium.org/downloads
# A leading "~" is shell syntax that Python does not expand on its own, so it is resolved to
# the user's home directory here, before the path is handed to Selenium.
chromedriver_path = os.path.expanduser("~/workspace/foosball_world_ranking/chromedriver_mac64")
# Name of the output file (CSV with the columns year, competition_id)
output_csv = "output/itsf_competitions_per_year.csv"
####

In [2]:
from typing import List
from collections import defaultdict
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm

import pandas as pd
import numpy as np

import csv
import html5lib
import re
import os
import requests

In [3]:
def make_tournament_url(comp_id: str):
    """
    Builds the URL of the fast4foos "players station" home page for a tournament.

    :param comp_id: the fast4foos ID that is inserted as the originalTournamentId query parameter
    :returns: the full URL as a string
    """
    page = "http://extranet.fast4foos.org/fast/tournament/players_station/players_station_home.jsp"
    query = f"originalTournamentId={comp_id}&screenIndex=-1"
    return f"{page}?{query}"

def get_tournaments_from_year(year: int) -> List[str]:
    """
    Returns the fast4foos tournament IDs of all ITSF tournaments that were played in a given year.

    The URL https://www.tablesoccer.org/tournaments?sort_by=field_date_value&sort_order=ASC&field_tour_value=2004
    shows a list of all ITSF tournaments played in 2004. Every link on that page that points to
    http(s)://extranet.fast4foos.org/fast/live/<id> contributes its <id> to the result.

    :param year: the year for which to obtain the ITSF tournaments
    :returns: the list of all IDs (strings of digits) of the ITSF tournaments that were played in that year
    :raises requests.HTTPError: if the overview page answers with an HTTP error status
    :raises requests.Timeout: if the server does not answer within 30 seconds
    """
    url = f"https://www.tablesoccer.org/tournaments?sort_by=field_date_value&sort_order=ASC&field_tour_value={year}"
    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, timeout=30)
    # Fail loudly rather than parsing an error page into an empty tournament list
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html5lib')

    # Find all <a> tags whose href points to a fast4foos live page. The dots are escaped so
    # they only match literal dots, and both http and https links are accepted.
    pattern = re.compile(r'https?://extranet\.fast4foos\.org/fast/live/(\d+)')
    a_tags = soup.find_all('a', href=pattern)

    # Extract the tournament IDs from the href itself (not from the serialised tag), so that
    # other attributes or the link text can never be matched by accident
    tournament_ids = [pattern.search(a['href']).group(1) for a in a_tags]

    return tournament_ids

def get_competitions_for_tournament(tournament_id: str, driver = None) -> List[str]:
    """
    A fast4foos tournament may consist of several competitions within the tournament. I.e.,

    http://extranet.fast4foos.org/fast/tournament/players_station/players_station_home.jsp?originalTournamentId=483042770&screenIndex=-1
    consists of a "Mixed Doubles", "Open Doubles", "Women Doubles", "Open Singles", etc.

    This method returns a list of new tournament IDs for all the competitions within a tournament.

    :param tournament_id: the fast4foos ID of the tournament for which to obtain the competitions
    :param driver: an existing Selenium WebDriver to re-use. If None, a new ChromeDriver is created
        for this call and shut down again before returning. Re-using an existing driver speeds up
        the requests since we can prevent creation and termination of a new driver for each request.
    :returns: the list of IDs of all finished competitions of the tournament; an empty list if no
        finished-competition button became visible within 10 seconds
    """
    # Create a ChromeDriver if no driver was provided
    new_driver_was_created = driver is None
    if new_driver_was_created:
        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service)

    try:
        driver.get(make_tournament_url(tournament_id))

        # Find all the buttons with class 'competition_button_FINISHED' using Selenium
        try:
            buttons = WebDriverWait(driver, 10).until(
                EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'input.competition_button_FINISHED'))
            )
        except TimeoutException:
            return []

        # Extract the second arguments of the goToCompetition methods. These are the competition IDs.
        competition_ids = []
        for button in buttons:
            # get_attribute returns None when the button has no onclick handler
            onclick_value = button.get_attribute('onclick') or ''
            arguments = onclick_value.split(',')
            if len(arguments) < 2:
                # No second argument to extract, so this button carries no competition ID
                continue
            # Drop the closing parenthesis of the call (and a trailing semicolon, if any)
            competition_ids.append(arguments[1].strip().rstrip(' );'))

        return competition_ids
    finally:
        # Clean up driver only if we created it within the scope of this method call. Doing this
        # in a finally block also covers the timeout return above and any exception, which would
        # otherwise leave a Chrome process running.
        if new_driver_was_created:
            driver.quit()
        
def get_competitions_for_year(year: int) -> List[str]:
    """
    Obtains a list of all competitions that were played at an ITSF tournament in a year.

    A single ChromeDriver is started for the whole year and shared by all tournament lookups.

    :param year: the year for which to obtain the ITSF competitions
    :returns: the list of all IDs of the competitions at the ITSF tournaments in a given year
    """
    tournaments = get_tournaments_from_year(year)
    if not tournaments:
        # Nothing to look up, so there is no need to launch a browser
        return []

    competitions = []

    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service)

    try:
        for tournament in tournaments:
            # Hand over the shared driver; without it every call launches its own browser
            # and the driver created above is never used
            competitions += get_competitions_for_tournament(tournament, driver=driver)
    finally:
        # Shut the browser down even if one of the lookups raises
        driver.quit()

    return competitions

In [4]:
def write_competitions_to_csv(year: int, csv_filename=output_csv) -> None:
    """
    Obtains all competitions of a year and appends them to a CSV file.

    The header row (year, competition_id) is written when the file does not exist yet or is
    still empty; after that one row per competition is appended. Rows are always appended, so
    calling this twice for the same year stores that year's competitions twice.

    :param year: the year for which to obtain and store the ITSF competitions
    :param csv_filename: path of the CSV file to append to; a missing parent directory is created.
        If empty or None, the function returns immediately without scraping or writing anything.
    """
    # Check the filename before it is used (and before the slow scrape), not afterwards
    if not csv_filename:
        return

    # Create the output directory up front so a fresh checkout does not fail on open()
    # after all the scraping work has already been done
    output_dir = os.path.dirname(csv_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    subcompetitions = get_competitions_for_year(year)

    header = ['year', 'competition_id']

    # A file that exists but is empty still needs its header row
    needs_header = not os.path.isfile(csv_filename) or os.path.getsize(csv_filename) == 0

    with open(csv_filename, "a", newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerows([year, competition_id] for competition_id in subcompetitions)

In [5]:
# Process every year from 2004 up to and including 2023, one after the other
first_year, last_year = 2004, 2023
for year in range(first_year, last_year + 1):
    print(f"processing year {year}")
    write_competitions_to_csv(year)

processing year 2004
processing year 2005
