In [1]:
#### CONFIGURATION
# Input CSV listing the competitions to scrape; it is loaded further down and
# only its "competition_id" column is used by this notebook.
competitions_per_year_csv = "output/itsf_competitions_per_year.csv" # output file of 01_scape_tournament_entries.ipynb
# Output CSV that scraped results are appended to, one row per competition and
# ranking view (columns: competition_id, view, teams, ranks).
output_csv = "output/scraped_itsf_tournament_results.csv"
####

In [2]:
from typing import List
from bs4 import BeautifulSoup
from tqdm import tqdm

import pandas as pd

import csv
import html5lib
import re
import os
import requests

In [3]:
# Make sure the results CSV exists and starts with a header row, then load
# whatever has been scraped so far so that a later cell can skip those
# competitions.
file_exists = os.path.isfile(output_csv)

# A zero-byte file has no header either, and pd.read_csv would raise
# EmptyDataError on it, so it is treated the same as a missing file.
if not file_exists or os.path.getsize(output_csv) == 0:
    # open(..., "w") does not create missing parent directories.
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)

    # Explicit UTF-8 so the file matches pd.read_csv's default encoding
    # regardless of the platform locale.
    with open(output_csv, "w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(['competition_id', 'view', 'teams', 'ranks'])

itsf_results = pd.read_csv(output_csv)

In [4]:
def get_and_parse_competition_results(url, session=None, timeout=30):
    """
    Downloads the competition ranking page at the URL and parses it.

    The page is searched for the text "Team"; every element that follows the
    grandparent of that text as a sibling is treated as one ranking row. Each
    row yields a list of player names and the rank found at the start of the row.

    The output is in the form: [(["Bert", "Ernie"], "1"), (["Harry", "Voldemort"], "2")]
    Ranks are strings of digits, or '' when a row does not start with a number.
    A row without " - " is returned as a single-player team.

    :param url: the URL of the ranking page to download and parse
    :param session: optional requests.Session to reuse; when omitted, a temporary
        session is created for this call and closed again afterwards
    :param timeout: seconds to wait for the server before requests gives up with
        a timeout error, so that a stalled connection cannot block forever
    :returns: the list of (players, rank) tuples, or None if the page contains
        no "Team" text
    """
    # Only close the session if it was created here; a caller-supplied session
    # stays open so the caller can keep reusing it.
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        # The body is fully downloaded by get() (no streaming), so it remains
        # available on the response after the session is closed.
        response = session.get(url, timeout=timeout)
    finally:
        if owns_session:
            session.close()

    soup = BeautifulSoup(response.content, 'html5lib')
    target_element = soup.find(string="Team")
    if not target_element:
        return None

    results = []
    # NOTE(review): the grandparent of the "Team" text is presumably the table
    # header row, with the ranking rows as its following siblings - confirm
    # against the page markup.
    anchor_element = target_element.find_parent().find_parent()

    for sibling in anchor_element.find_next_siblings():
        raw_text = sibling.get_text(strip=True)

        # The rank is the run of digits at the very start of the row
        rank_match = re.search(r'^\d+', raw_text)
        rank = rank_match.group(0) if rank_match else ''

        # Exclude blocks containing "TS" between parentheses
        team_text = re.sub(r'\([^()]*TS[^()]*\)', '', raw_text)

        # Remove the leading rank digits from the team text
        team_text = re.sub(r'^\d+', '', team_text)

        # Remove trailing digits for player age group indication (e.g., 65+)
        team_text = re.sub(r'\d+[+-]$', '', team_text)
        team_text = team_text.strip()

        if " - " in team_text:
            players = team_text.split(" - ")
            if len(players) != 2:
                # More than one " - " separator: the player names cannot be
                # told apart reliably, so the row is dropped.
                print(f"WARNING: cannot parse players from {team_text}, skipping team")
                continue
            results.append((players, rank))
        else:
            results.append(([team_text], rank))

    return results

In [5]:
def parse_and_persist_competition_result(competition_id: str, csv_filename: str) -> None:
    """
    Wraps around get_and_parse_competition_results by obtaining results for both the
    qualifiers-phase and the final-phase (knockout) of the competition and
    writing result rows to CSV.

    One row is appended per view that returned results, with the columns
    competition_id, view, teams and ranks. Players within a team are joined
    with "|", teams with "||", and ranks with "||" in the same order as the
    teams. Views without results are skipped, so a competition may produce
    zero, one or two rows.

    :param competition_id: the competition id to process; it is interpolated into
        the request URL and written unchanged to the CSV
    :param csv_filename: the CSV filename to append the results to; if empty or
        None the pages are still fetched but nothing is written
    """
    views = ["qualif_ranking", "final_ranking"]

    # Share one HTTP session between both views so the connection can be reused,
    # and let the with-block close it when done.
    with requests.Session() as session:
        for view in views:
            url = f"https://extranet.fast4foos.org/fast/tournament/players_station/players_station_competition.jsp?lang=en&competitionId=0&originalCompetitionId={competition_id}&playerId=0&view={view}&noTimeout=false&screenIndex=-1&barCode=null"
            results = get_and_parse_competition_results(url, session=session)
            if not results:
                # None (no ranking found on the page) or an empty ranking
                continue
            if not csv_filename:
                continue

            teams = [team for team, _ in results]
            ranks = [rank for _, rank in results]

            # "|".join also covers single-player teams: it returns the lone name.
            team_string = "||".join("|".join(team) for team in teams)
            rank_string = "||".join(ranks)

            # Explicit UTF-8: player names may contain non-ASCII characters, and
            # pd.read_csv reads the file back as UTF-8 by default, so relying on
            # the platform default encoding could fail or corrupt names.
            with open(csv_filename, "a", newline='', encoding="utf-8") as file:
                csv.writer(file).writerow([competition_id, view, team_string, rank_string])

In [6]:
# Load the competitions to scrape; only the "competition_id" column is used below.
tournaments = pd.read_csv(competitions_per_year_csv)

In [7]:
# Scrape every competition that is not yet present in the output CSV. Results are
# appended to the CSV as they arrive, so after an error (e.g. a connection error)
# the notebook can simply be re-run and continues from where it left off.
already_scraped_competitions = set(itsf_results.competition_id)

# Iterating the list directly (instead of enumerate) lets tqdm see its length
# and show the total and remaining time.
competition_ids = list(tournaments["competition_id"])

for competition_id in tqdm(competition_ids):
    if competition_id in already_scraped_competitions:
        continue
    parse_and_persist_competition_result(competition_id, csv_filename=output_csv)
    # Record the id that was actually scraped. Recording id + 1 instead would
    # cause a not-yet-scraped competition with the next consecutive id to be skipped.
    already_scraped_competitions.add(competition_id)

100it [00:00, 316312.52it/s]
