In [1]:
#### CONFIGURATION
competition_results_csv = "output/scraped_itsf_tournament_results.csv" # output file of 02_scape_tournament_results.ipynb
output_csv = "output/world_ranking.csv"
#### 

In [2]:
from typing import List
from collections import defaultdict
from bs4 import BeautifulSoup
from tqdm import tqdm

import pandas as pd
import numpy as np

import csv
import re
import openskill
import os
import requests

In [3]:
itsf_results = pd.read_csv(competition_results_csv)

In [4]:
itsf_results

Unnamed: 0,competition_id,view,teams,ranks
0,176951,final_ranking,Frédéric COLLIGNON (BEL)||Marcello MANGANIELLO...,1||2||3||4||5||5||7||7||9||9||9||9||13||13||13...
1,120412,final_ranking,Frédéric COLLIGNON (BEL)||Nicolas BELLIOT (FRA...,1||2||3||4||5||5||7||7||9||9||9||9||13||13||13...
2,8851297,final_ranking,Estelle JACQUOT (FRA)||Marie COLLIGNON (FRA)||...,1||2||3||4||5||5||5||5||9||9||9||9||9||9||9||9...
3,8851449,final_ranking,Augustin BOUCHARD (FRA)||Mathieu CHAPRON (FRA)...,1||2||3||4||5||5||5||5||9
4,8851469,final_ranking,Patrick FOURNIER (BEL)||Fabien BOREZ (FRA)||Di...,1||2||3||4||5||5||5||5||9||9||9||9||9||9||9||9...
...,...,...,...,...
10520,652278846,final_ranking,Tania KRENCHEVA (BGR)|Ekaterina ATANASOVA (BGR...,1||2||3||4||5||5||7
10521,652278900,qualif_ranking,Krasimir DIMITROV (BGR)||Stanislav GEORGIEV (B...,1||2||3||4||5||6||7||8||9||10||11||12||13||14|...
10522,652278900,final_ranking,Nikola VELIKOV (BGR) -55+||Stanislav GEORGIEV ...,1||2||3||4||5||5||5||5||9||9||9||9||9||9||9||9...
10523,652278822,qualif_ranking,Rosen KYUCHUKOV (BGR)|Anton BOZHIKOV (BGR)||Di...,1||2||3||4||5||6||7||8||9||10||11||12||13||14|...


In [5]:
class Leaderboard:
    def __init__(self):
        self.players = defaultdict(openskill.Rating)
        self.num_total_matches = defaultdict(int)
        self.num_singles_matches = defaultdict(int)
        self.num_doubles_matches = defaultdict(int)
    
    @staticmethod
    def is_valid_player(name: str) -> bool:
        """
        Some tournaments contain player names like "absent (#1)" to record no-shows.
        We remove them from results processing to prevent matches with no-shows from biasing results.
        """
        if "absent(#" in name or name == "absent":
            # Remove absent player entries
            return False
        elif name.upper() == name and len(name.split(" ")) == 1:
            # Remove country entries
            return False
        return True
    
    @staticmethod
    def is_valid_team(team: List[str]) -> bool:
        for player in team:
            if not Leaderboard.is_valid_player(player):
                return False
        return True
    
    
    @staticmethod
    def remove_last_tail(s: str) -> str:
        last_closing_parenthesis_index = s.rfind(')')

        if last_closing_parenthesis_index != -1:
            # Extract the substring containing all but the last closing parenthesis
            extracted_substring = s[:last_closing_parenthesis_index+1]

            # Print the extracted substring
            return extracted_substring
        else:
            return s

        
    def process_competition(self, teams: List[str], ranks: List[int]):
        """
        Update the player rating objects based on the competition results.

        :param teams: the list of teams that participated in the competition
        :param ranks: the list of teams' positions in the final ranking
        :returns: nothing, it just updates the players in the players dict

        :example:
        >>> # Method call to process a tournament with Bert and Ernie in first and Harry and Voldemort in 2nd place.
        >>> process_competition(teams = [["Bert", "Ernie"], ["Harry", "Voldemort"]], ranks = [1, 2])
        """
        team_validity = [Leaderboard.is_valid_team(team) for team in teams]
        teams = [team for i, team in enumerate(teams) if team_validity[i]]
        ranks = [rank for i, rank in enumerate(ranks) if team_validity[i]]
        
        team_lineups = [[self.players[playername] for playername in team] for team in teams]
        team_lineups = openskill.rate(team_lineups, rank=ranks, model=openskill.models.BradleyTerryPart)
        
        for t_idx, team in enumerate(teams):
            for p_idx, playername in enumerate(team):
                self.num_total_matches[playername] += 1
                if len(team) > 1:
                    self.num_doubles_matches[playername] += 1
                else:
                    self.num_singles_matches[playername] += 1
                self.players[playername] = team_lineups[t_idx][p_idx]
            if p_idx >= 2:
                break
        
    def process_competitions(self, results_df: pd.DataFrame, exclude_competitions: List[int] = None):
        """
        Process all competitions in a dataframe.
        
        :param results_df: a Pandas dataframe with one row per competition, where columns "teams" and "ranks"
            determine the competition results.
        :param exclude_competitions: a list of competition IDs to exclude from the ranking. A mechanism to prevent
            competitions with problems in the results table from affecting the world ranking.
        """
        if exclude_competitions is None:
            exclude_competitions = []
            
        for row in tqdm(results_df.iterrows()):
            if row[1].competition_id in exclude_competitions:
                continue
                
            teams = row[1].teams.split("||")
            teams = [team.split("|") for team in teams]

            for i, team in enumerate(teams):
                team = [self.remove_last_tail(player) for player in team]
                teams[i] = team

            ranks = row[1].ranks.split("||")
            self.process_competition(teams, ranks)

                
    def make_ranking(self) -> pd.DataFrame:
        """
        Builds the world ranking based on the processed competitions.
        """
        player_names = list(self.players.keys())
        singles = [[self.players[elem]] for elem in player_names]
        scores = openskill.rate(singles)
        mus = [score[0].mu for score in scores]
        sigmas = [score[0].sigma for score in scores]

        num_matches = [self.num_total_matches[elem] for elem in player_names]
        num_singles = [self.num_singles_matches[elem] for elem in player_names]
        num_doubles = [self.num_doubles_matches[elem] for elem in player_names]

        df = pd.DataFrame([player_names, mus, sigmas, num_matches, num_singles, num_doubles]).T
        df.columns = ["player", "mu", "sigma", "num_matches", "num_singles", "num_doubles"]
        # Following the formula in https://openskill.me/en/stable/manual.html#ranks
        df["ranking_score"] = df["mu"] - 2* df["sigma"]

        df = df.sort_values("mu", ascending=False).reset_index(drop=True)
        df['rank_mu'] = np.arange(len(df)) + 1

        df = df.sort_values("ranking_score", ascending=False).reset_index(drop=True)
        df['rank_ranking_score'] = np.arange(len(df)) + 1
        return df
    
    
    def predict_team1_win_prob(self, team1: List[str], team2: List[str]):
        team1_ = [self.players[elem] for elem in team1]
        team2_ = [self.players[elem] for elem in team2]
        return openskill.predict_win(teams=[team1_, team2_])[0]

In [6]:
# There seems something off with this particular competition id
odd_tournaments = [5836564]

In [7]:
itsf_ranking = Leaderboard()

In [8]:
itsf_results

Unnamed: 0,competition_id,view,teams,ranks
0,176951,final_ranking,Frédéric COLLIGNON (BEL)||Marcello MANGANIELLO...,1||2||3||4||5||5||7||7||9||9||9||9||13||13||13...
1,120412,final_ranking,Frédéric COLLIGNON (BEL)||Nicolas BELLIOT (FRA...,1||2||3||4||5||5||7||7||9||9||9||9||13||13||13...
2,8851297,final_ranking,Estelle JACQUOT (FRA)||Marie COLLIGNON (FRA)||...,1||2||3||4||5||5||5||5||9||9||9||9||9||9||9||9...
3,8851449,final_ranking,Augustin BOUCHARD (FRA)||Mathieu CHAPRON (FRA)...,1||2||3||4||5||5||5||5||9
4,8851469,final_ranking,Patrick FOURNIER (BEL)||Fabien BOREZ (FRA)||Di...,1||2||3||4||5||5||5||5||9||9||9||9||9||9||9||9...
...,...,...,...,...
10520,652278846,final_ranking,Tania KRENCHEVA (BGR)|Ekaterina ATANASOVA (BGR...,1||2||3||4||5||5||7
10521,652278900,qualif_ranking,Krasimir DIMITROV (BGR)||Stanislav GEORGIEV (B...,1||2||3||4||5||6||7||8||9||10||11||12||13||14|...
10522,652278900,final_ranking,Nikola VELIKOV (BGR) -55+||Stanislav GEORGIEV ...,1||2||3||4||5||5||5||5||9||9||9||9||9||9||9||9...
10523,652278822,qualif_ranking,Rosen KYUCHUKOV (BGR)|Anton BOZHIKOV (BGR)||Di...,1||2||3||4||5||6||7||8||9||10||11||12||13||14|...


In [9]:
itsf_ranking.process_competitions(
    results_df=itsf_results, 
    exclude_competitions=odd_tournaments
)

10525it [00:08, 1182.82it/s]


In [11]:
ranking_df = itsf_ranking.make_ranking()

In [12]:
pd.set_option("display.max_rows", 300, "display.max_columns", 200)
ranking_df[ranking_df.sigma<2.5][:300].reset_index()

Unnamed: 0,index,player,mu,sigma,num_matches,num_singles,num_doubles,ranking_score,rank_mu,rank_ranking_score
0,0,Frédéric COLLIGNON (BEL),32.449201,1.66096,265,90,175,29.127281,5,1
1,1,Tony SPREDEMAN (USA),31.628125,1.30345,347,147,200,29.021226,8,2
2,2,Kevin HUNDSTORFER (AUT),28.748221,1.031432,424,145,279,26.685357,79,3
3,3,Ekaterina ATANASOVA (BGR),27.116644,0.836281,612,230,382,25.444083,323,4
4,4,Stefan BURMETLER (AUT),27.469877,1.100825,304,116,188,25.268227,236,5
5,5,Marina TABAKOVIC (AUT),27.166712,1.032959,375,114,261,25.100794,314,6
6,6,Cinderella POIDEVIN (FRA),27.499889,1.206446,238,77,161,25.086996,227,7
7,7,Cindy KUBIATOWICZ (CHE),27.180101,1.07237,347,122,225,25.035361,310,8
8,8,Miguel DOS SANTOS LOTE (FRA),26.718325,0.866477,566,201,365,24.985372,489,9
9,9,Amalie BREMER (DNK),27.103198,1.071436,351,127,224,24.960326,330,10


In [14]:
ranking_df.to_csv(output_csv)

In [17]:
# We can simulate match outcomes and calculate win probabilities for hypothetical matchups
itsf_ranking.predict_team1_win_prob(
    ["Tony SPREDEMAN (USA)", "Kevin HUNDSTORFER (AUT)"],
    ["Ryan MOORE (USA)", "Sven WONSYLD (DNK)"]
)

0.8084007638464163

In [15]:
# Easy to filter ranking per country
pd.set_option("display.max_rows", 300, "display.max_columns", 200)
ranking_df[ranking_df.player.str.contains("\(USA\)")][:100]

Unnamed: 0,player,mu,sigma,num_matches,num_singles,num_doubles,ranking_score,rank_mu,rank_ranking_score
1,Tony SPREDEMAN (USA),31.628125,1.30345,347,147,200,29.021226,8,2
10,Ryan MOORE (USA),27.619732,1.341963,237,109,128,24.935807,190,11
44,Tiffany MOORE (USA),28.410869,2.223943,92,44,48,23.962983,94,45
57,Todd LOFFREDO (USA),26.536077,1.377846,243,97,146,23.780385,578,58
114,Billy PAPPAS (USA),26.228918,1.417125,205,74,131,23.394668,810,115
120,Bruce NARDOCI (USA),25.967344,1.289906,266,111,155,23.387533,1209,121
131,Sullivan RUE (USA),27.133232,1.899913,109,36,73,23.333407,320,132
141,Blake ROBERTSON (USA),26.503448,1.602265,155,63,92,23.298918,592,142
149,Dan BARBER (USA),26.18151,1.453643,200,79,121,23.274225,874,150
160,Tracy MCMILLIN (USA),26.354941,1.559987,161,63,98,23.234967,703,161


In [16]:
ranking_df[ranking_df.player.str.contains("\(GBR\)")][:100]

Unnamed: 0,player,mu,sigma,num_matches,num_singles,num_doubles,ranking_score,rank_mu,rank_ranking_score
37,Robert ATHA (GBR),26.210428,1.033983,434,150,284,24.142462,837,38
61,David ZIEMANN (GBR),25.463706,0.874692,538,216,322,23.714322,3777,62
73,Stephen LYALL (GBR),25.710671,1.041494,336,109,227,23.627683,1987,74
79,Callum OAKES (GBR),25.786254,1.114411,297,108,189,23.557433,1674,80
103,Richard MARSH (GBR),25.365687,0.959254,389,125,264,23.447178,4857,104
111,Matthew WARR (GBR),26.052598,1.323859,202,81,121,23.404881,1041,112
148,Olga LASECKA (GBR),26.017983,1.367429,198,83,115,23.283126,1100,149
251,Rhys ROBERTS (GBR),26.139927,1.568609,129,56,73,23.00271,916,252
293,Jonathan MAY (GBR),25.913995,1.505205,145,62,83,22.903585,1321,294
368,Boris ATHA (GBR),25.072138,1.154586,296,131,165,22.762966,10150,369
