Collects league, scorer, and schedule data from the _Saisonmanager_ API.

In [None]:
import itertools
import httpx
import pandas as pd

# Root of the Saisonmanager REST API (version 2).
API_URL = "https://saisonmanager.de/api/v2"

# Endpoint templates. Doubled braces keep a literal ``{league_id}`` /
# ``{game_id}`` placeholder in the string, to be filled in later via
# ``str.format``.
leagues_url = f"{API_URL}/leagues.json"
league_url = f"{API_URL}/leagues/{{league_id}}.json"
standings_url = f"{API_URL}/leagues/{{league_id}}/table.json"
scorers_url = f"{API_URL}/leagues/{{league_id}}/scorer.json"
schedule_url = f"{API_URL}/leagues/{{league_id}}/schedule.json"
game_url = f"{API_URL}/games/{{game_id}}.json"

In [None]:
def collect_leagues(filename="leagues.pkl"):
    """Download the list of leagues and pickle it to *filename*.

    The leagues endpoint is read into a DataFrame with ``pd.read_json`` and
    written out with ``DataFrame.to_pickle``. Progress messages are printed
    to stdout.
    """
    print('Collecting leagues')
    pd.read_json(leagues_url).to_pickle(filename)
    print('Done')
    print()
    
def collect_players(filename="scorers.pkl"):
    """Download the scorer table of every reachable league and pickle it.

    Every league id from the leagues endpoint is probed with a HEAD request
    first; leagues whose scorer endpoint does not answer successfully are
    skipped (a few respond with a 500 Internal Server Error). The remaining
    scorer tables are stacked into a single DataFrame with a leading
    ``league_id`` column, which is written to *filename*.
    """
    print('Collecting scorers')
    league_table = pd.read_json(leagues_url)

    # league id -> scorer DataFrame, in the order the leagues are listed
    per_league = {}
    for league_id in league_table.id:
        url = scorers_url.format(league_id=league_id)
        # probe first so a failing endpoint is skipped instead of read
        if not httpx.head(url).is_success:
            continue
        per_league[league_id] = pd.read_json(url)

    # one league id per scorer row, in the same order as the stacked frames
    id_column = pd.Series(
        itertools.chain.from_iterable(
            itertools.repeat(league_id, len(frame))
            for league_id, frame in per_league.items()
        ),
        name="league_id",
    )
    # stack all scorer tables on a fresh 0..n-1 index so they line up with
    # the id column built above
    stacked = pd.concat(per_league.values(), ignore_index=True)
    # place the league id column in front of the scorer columns
    combined = pd.concat([id_column, stacked], axis="columns")
    combined.to_pickle(filename)
    print('Done')
    print()
    
def collect_matches(filename="matches.pkl"):
    """Download the schedule of every reachable league and pickle it.

    Every league id from the leagues endpoint is probed with a HEAD request
    first; leagues whose schedule endpoint does not answer successfully are
    skipped (a few respond with a 500 Internal Server Error). The remaining
    schedules are stacked, the nested ``result`` column is flattened into
    separate columns, and the combined table (league id, result columns,
    remaining schedule columns) is written to *filename*.
    """
    print('Collecting schedules')
    league_table = pd.read_json(leagues_url)

    # league id -> schedule DataFrame, in the order the leagues are listed
    per_league = {}
    for league_id in league_table.id:
        url = schedule_url.format(league_id=league_id)
        # probe first so a failing endpoint is skipped instead of read
        if not httpx.head(url).is_success:
            continue
        per_league[league_id] = pd.read_json(url)

    # one league id per match row, in the same order as the stacked frames
    id_column = pd.Series(
        itertools.chain.from_iterable(
            itertools.repeat(league_id, len(frame))
            for league_id, frame in per_league.items()
        ),
        name="league_id",
    )

    # stack all schedules on a fresh 0..n-1 index
    stacked = pd.concat(per_league.values(), ignore_index=True)

    # flatten the nested result records into their own columns
    results = pd.json_normalize(stacked["result"])
    # nullable integer dtype, so missing goal counts do not force floats
    for goal_column in ("home_goals", "guest_goals"):
        results[goal_column] = results[goal_column].astype("Int64")

    # the flattened columns replace the nested one
    remaining = stacked.drop(columns=["result"])
    # NOTE(review): reset_index() without drop=True also adds the previous
    # index as an extra column in the output -- confirm this is intended.
    combined = pd.concat(
        [id_column, results, remaining.reset_index()], axis="columns"
    )
    combined.to_pickle(filename)
    print('Done')
    print()

In [None]:
# Run every collector in turn, each writing to its default pickle file.
for collector in (collect_leagues, collect_players, collect_matches):
    collector()