In [1]:
import json
import re
import urllib.request

from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Overview page listing the finished lichess tournaments that gets scraped.
ROCHADE_URL = "https://rochadeeuropa.com/lichess-turniere-beendet/"

# Tournament names that count as a Bundesliga round. The naming differs between
# rounds, so each alternative covers one naming scheme. Raw strings with an
# escaped dot make sure the "." after the round number is matched literally
# (an unescaped "." would also accept e.g. "12 DE-Quarantäne Team Battle").
BUNDESLIGA_REGEXP = r"|".join([
    r"(1\. DE-Quarantäne Team Battle)",
    r"([0-9]+\. ?DE[ -]Quarantäne Teams 1-10)",
    r"(5\. Quarantäne-Liga Teams 1-10)",
    r"([0-9]+\. ?Quarantäne-Bundesliga)",
])


In [3]:
def get_bundesliga_tournaments():
    """
    Scrape rochadeeuropa.com to find the lichess Quarantäne-Bundesliga tournaments.

    The page URL and the regular expression that decides which tournaments are
    Bundesliga tournaments are taken from the configurable globals
    ``ROCHADE_URL`` and ``BUNDESLIGA_REGEXP``.

    Returns:
        list[list[str]]: one list of stripped cell texts per matching table row.
        The cell at index 1 is the text matched against ``BUNDESLIGA_REGEXP``.

    Raises:
        ValueError: if the expected tournament table is not present on the page.
    """
    # Fetch the list of finished lichess tournaments; the context manager
    # closes the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(ROCHADE_URL) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the tournament table.
    table = soup.find('table', attrs={'class': 'tablepress tablepress-id-3'})
    if table is None:
        raise ValueError(f"Tournament table not found on {ROCHADE_URL}")
    table_body = table.find('tbody')
    if table_body is None:
        # No explicit <tbody>: read the rows directly from the table.
        table_body = table

    data = [[cell.text.strip() for cell in row.find_all('td')]
            for row in table_body.find_all('tr')]

    # Keep only Bundesliga rounds. Rows with fewer than two <td> cells
    # (e.g. header or spacer rows) have no cell at index 1 and are skipped.
    buli_tournaments = [cells for cells in data
                        if len(cells) > 1 and re.match(BUNDESLIGA_REGEXP, cells[1])]
    return buli_tournaments

In [4]:
def get_individual_results(tournament_id):
    """
    Download the individual results of one tournament from the lichess API.

    The endpoint answers with one JSON document per line; every non-blank line
    becomes one row of the returned frame.

    Args:
        tournament_id: lichess tournament id, inserted into the API URL.

    Returns:
        pandas.DataFrame: one row per JSON line, with one column per key found
        in the JSON objects.
    """
    api_url = f"https://lichess.org/api/tournament/{tournament_id}/results"
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(api_url) as api_response:
        player_results = api_response.read()
    # Skip blank lines so that stray empty lines do not break json.loads.
    return pd.DataFrame([json.loads(line)
                         for line in player_results.splitlines()
                         if line.strip()])

In [5]:
def get_team_results(tournament_url):
    """
    Scrape the team standing of a tournament from its lichess page.

    The API does not provide team results, so the standing is taken from the
    JSON that the tournament page embeds in a <script> tag after the marker
    ``lichess.tournament=``.

    Args:
        tournament_url: URL of the lichess tournament page.

    Returns:
        pandas.DataFrame: one row per entry of ``data.teamStanding`` in the
        embedded JSON, restricted to the keys "rank", "id" and "score".

    Raises:
        ValueError: if no script containing the marker is found, or (as
            json.JSONDecodeError) if the text after the marker is not JSON.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(tournament_url) as tournament_response:
        tournament_html = tournament_response.read()
    tournament_soup = BeautifulSoup(tournament_html, 'html.parser')

    # The JSON is preceded by this marker, which has to be cut off.
    str_to_delete = "lichess.tournament="
    tournament_strings = [script.text for script in tournament_soup.find_all("script")
                          if str_to_delete in script.text]
    if not tournament_strings:
        raise ValueError(
            f"No <script> containing {str_to_delete!r} found on {tournament_url}")
    # If several scripts mention the marker, use the longest one.
    tournament_string = sorted(tournament_strings, key=len)[-1]

    json_start = tournament_string.find(str_to_delete) + len(str_to_delete)
    # raw_decode parses exactly one JSON value and ignores whatever follows,
    # so trailing JavaScript after the object does not break parsing. It does
    # not skip leading whitespace, hence the lstrip().
    tournament_json, _ = json.JSONDecoder().raw_decode(tournament_string[json_start:].lstrip())

    relevant_keys = ("rank", "id", "score")
    team_results = [{k: v for k, v in team_dict.items() if k in relevant_keys}
                    for team_dict in tournament_json["data"]["teamStanding"]]
    return pd.DataFrame(team_results)

In [6]:
# Collect one frame per tournament and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously collected rows on every iteration.
team_frames = []
individual_frames = []

buli_tournaments = get_bundesliga_tournaments()
for tournament in buli_tournaments:
    # Scraped row layout: [0] date, [1] tournament name, [4] lichess tournament URL.
    tournament_date, tournament_name, tournament_url = tournament[0], tournament[1], tournament[4]
    print(f"Downloading data of tournament {tournament_name} on {tournament_date}")
    team_frames.append(get_team_results(tournament_url))
    # The tournament id is the last path segment of the tournament URL.
    individual_frames.append(get_individual_results(tournament_url.split("/")[-1]))

team_df = pd.concat(team_frames) if team_frames else pd.DataFrame()
individual_df = pd.concat(individual_frames) if individual_frames else pd.DataFrame()

# Helper columns for the aggregation: "count" is counted per participant,
# "champion" is 1 for a first place and 0 otherwise.
team_df["count"] = 1
team_df["champion"] = (team_df["rank"] == 1).astype(int)
individual_df["count"] = 1
individual_df["champion"] = (individual_df["rank"] == 1).astype(int)

Downloading data of tournament 14. Quarantäne-Bundesliga on 02.04.
Downloading data of tournament 13. Quarantäne-Bundesliga on 31.03.
Downloading data of tournament 12. Quarantäne-Bundesliga on 29.03.
Downloading data of tournament 11. Quarantäne-Bundesliga on 26.03.
Downloading data of tournament 10. Quarantäne-Bundesliga on 25.03.
Downloading data of tournament 9. Quarantäne-Bundesliga on 24.03.
Downloading data of tournament 8. Quarantäne-Bundesliga on 23.03.
Downloading data of tournament 7. Quarantäne-Bundesliga on 22.03.
Downloading data of tournament 6. Quarantäne-Bundesliga on 21.03.
Downloading data of tournament 5. Quarantäne-Liga Teams 1-10 on 19.03.
Downloading data of tournament 4. DE Quarantäne Teams 1-10 on 18.03.
Downloading data of tournament 3. DE-Quarantäne Teams 1-10 on 17.03.
Downloading data of tournament 2. DE-Quarantäne Teams 1-10 on 16.03.
Downloading data of tournament 1. DE-Quarantäne Team Battle on 15.03.


In [7]:
# All-time team table: totals per team, best total score first.
team_column_names = {
    "id": "Team",
    "score": "Gesamtpunkte",
    "count": "Teilnahmen",
    "champion": "Meisterschaften",
    "rank": "Durchschnittsplatzierung",
}
all_time_teams = (
    team_df
    .groupby("id", as_index=False)
    .agg({"score": "sum", "count": "count", "champion": "sum", "rank": "mean"})
    .sort_values("score", ascending=False)
    .rename(columns=team_column_names)
)
# Number the rows 1..n so the index doubles as the table position.
all_time_teams.index = range(1, len(all_time_teams) + 1)

# All-time individual table: totals per player, best total score first.
indiv_column_names = {
    "username": "Name",
    "score": "Gesamtpunkte",
    "count": "Teilnahmen",
    "champion": "Turniersiege",
    "performance": "Durchschnittsperformance",
    "rank": "Durchschnittsplatzierung",
}
all_time_indiv = (
    individual_df
    .groupby("username", as_index=False)
    .agg({"score": "sum", "count": "count", "champion": "sum", "performance": "mean", "rank": "mean"})
    .sort_values("score", ascending=False)
    .rename(columns=indiv_column_names)
)
all_time_indiv.index = range(1, len(all_time_indiv) + 1)
# Average score per participation.
all_time_indiv["Durchschnittsscore"] = all_time_indiv["Gesamtpunkte"].div(all_time_indiv["Teilnahmen"])

In [8]:
# Show the team table with the average placement rounded to one decimal.
all_time_teams.round(decimals={"Durchschnittsplatzierung": 1})

Unnamed: 0,Team,Gesamtpunkte,Teilnahmen,Meisterschaften,Durchschnittsplatzierung
1,bierstube-muc,3619,14,3,2.6
2,hsk-lister-turm,2755,10,4,3.3
3,ksk-dr-lasker-1861-ev,2365,9,0,5.2
4,oxfordgang,2357,11,0,5.9
5,sk-landau-1908-ev,2254,8,0,5.9
6,tsv-schoniach,2226,9,0,5.2
7,die-nordlichter,1859,9,3,5.1
8,hessische-schachjugend,1408,5,1,5.8
9,sv-werder-bremen,1193,6,0,6.5
10,die-oagen,1174,4,0,7.0


In [9]:
# Show the individual table with all averaged columns rounded to one decimal.
all_time_indiv.round(decimals={
    "Durchschnittsplatzierung": 1,
    "Durchschnittsperformance": 1,
    "Durchschnittsscore": 1,
})

Unnamed: 0,Name,Gesamtpunkte,Teilnahmen,Turniersiege,Durchschnittsperformance,Durchschnittsplatzierung,Durchschnittsscore
1,SomeMightSay,772,14,4,2564.4,4.0,55.1
2,Satscho,458,14,0,2346.6,35.8,32.7
3,psammenitos,456,9,4,2584.0,18.1,50.7
4,Kabeljaukrieger,451,13,0,2405.6,27.7,34.7
5,Chillkroete77,431,11,1,2338.1,16.3,39.2
6,Blend99,388,8,0,2506.6,8.0,48.5
7,P1W4,384,14,0,2172.1,57.0,27.4
8,fluffy_cactus,383,14,0,2229.6,56.5,27.4
9,Eldrail,376,10,0,2534.7,27.3,37.6
10,Veez,359,11,0,2357.2,32.1,32.6
