In [7]:
import json
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd

In [8]:
# Base folder that holds replays.zip, replay_downloader/ and rank_samples/.
# Everything below is resolved relative to it, so this is the one place to
# change if the notebook is launched from a different working directory.
root_folder = Path("./")

# don't re-extract if the folder already exists
if not (root_folder / "replay_downloader/replays").exists():
    with zipfile.ZipFile(root_folder / "replays.zip", "r") as zip_ref:
        # Extract into root_folder (not the process working directory) so the
        # output lands where the existence check above looks for it.
        # NOTE(review): assumes the archive's top-level entry is
        # replay_downloader/ -- confirm against the zip layout.
        zip_ref.extractall(root_folder)

In [54]:
# The data lives in JSON files at
# "project/replay_downloader/replays/{rank-name}/parsed/{id}.json".
# There are too many of them to load at once, so a random sample is loaded
# from each rank and collected into one DataFrame per rank.

# Every per-player stat to extract, written as "category/stat_name".
# The groups below are merged into a single set.
all_keys = set().union(
    (  # boost
        "boost/amount_collected",
        "boost/amount_collected_big",
        "boost/amount_collected_small",
        "boost/amount_overfill",
        "boost/amount_overfill_stolen",
        "boost/amount_stolen",
        "boost/amount_stolen_big",
        "boost/amount_stolen_small",
        "boost/amount_used_while_supersonic",
        "boost/avg_amount",
        "boost/bcpm",
        "boost/bpm",
        "boost/count_collected_big",
        "boost/count_collected_small",
        "boost/count_stolen_big",
        "boost/count_stolen_small",
        "boost/percent_boost_0_25",
        "boost/percent_boost_25_50",
        "boost/percent_boost_50_75",
        "boost/percent_boost_75_100",
        "boost/percent_full_boost",
        "boost/percent_zero_boost",
        "boost/time_boost_0_25",
        "boost/time_boost_25_50",
        "boost/time_boost_50_75",
        "boost/time_boost_75_100",
        "boost/time_full_boost",
        "boost/time_zero_boost",
    ),
    (  # core
        "core/assists",
        "core/goals",
        "core/goals_against",
        "core/mvp",
        "core/saves",
        "core/score",
        "core/shooting_percentage",
        "core/shots",
        "core/shots_against",
    ),
    (  # demo
        "demo/inflicted",
        "demo/taken",
    ),
    (  # movement
        "movement/avg_powerslide_duration",
        "movement/avg_speed",
        "movement/avg_speed_percentage",
        "movement/count_powerslide",
        "movement/percent_boost_speed",
        "movement/percent_ground",
        "movement/percent_high_air",
        "movement/percent_low_air",
        "movement/percent_slow_speed",
        "movement/percent_supersonic_speed",
        "movement/time_boost_speed",
        "movement/time_ground",
        "movement/time_high_air",
        "movement/time_low_air",
        "movement/time_powerslide",
        "movement/time_slow_speed",
        "movement/time_supersonic_speed",
        "movement/total_distance",
    ),
    (  # positioning
        "positioning/avg_distance_to_ball",
        "positioning/avg_distance_to_ball_no_possession",
        "positioning/avg_distance_to_ball_possession",
        "positioning/avg_distance_to_mates",
        "positioning/percent_behind_ball",
        "positioning/percent_closest_to_ball",
        "positioning/percent_defensive_half",
        "positioning/percent_defensive_third",
        "positioning/percent_farthest_from_ball",
        "positioning/percent_infront_ball",
        "positioning/percent_most_back",
        "positioning/percent_most_forward",
        "positioning/percent_neutral_third",
        "positioning/percent_offensive_half",
        "positioning/percent_offensive_third",
        "positioning/time_behind_ball",
        "positioning/time_closest_to_ball",
        "positioning/time_defensive_half",
        "positioning/time_defensive_third",
        "positioning/time_farthest_from_ball",
        "positioning/time_infront_ball",
        "positioning/time_most_back",
        "positioning/time_most_forward",
        "positioning/time_neutral_third",
        "positioning/time_offensive_half",
        "positioning/time_offensive_third",
        "positioning/goals_against_while_last_defender",
    ),
)

# Loaded samples, keyed by full rank name ("{rank}-{division}").
dfs: dict[str, pd.DataFrame] = {}

# Build one DataFrame of per-player stats for every rank/division that has
# parsed replay files on disk, and store it in dfs under "{rank}-{division}".
# Each row is one player from one sampled replay; each column is one entry
# of all_keys, with NaN where a replay does not provide that stat.

MAX_REPLAYS_PER_RANK = 2000  # upper bound on replay files sampled per rank/division
SAMPLE_SEED = 0  # fixed so that Restart & Run All draws the same sample

rng = np.random.default_rng(SAMPLE_SEED)

# all_keys is a set of strings, so its iteration order can change from one
# kernel session to the next; sorting pins the column order of every frame.
stat_columns = sorted(all_keys)
# Split each "category/stat_name" key once instead of once per replay file.
stat_paths = [(key, *key.split("/")) for key in stat_columns]

for rank in [
    "grandchampion",
    "champion",
    "diamond",
    "platinum",
    "gold",
    "silver",
    "bronze",
]:
    for div in ("1", "2", "3"):
        full_rank = f"{rank}-{div}"

        path = root_folder / f"replay_downloader/replays/{full_rank}/parsed"
        # glob order depends on the filesystem; sort so the seeded sample
        # below is reproducible across machines.
        files = sorted(path.glob("*.json"))
        if len(files) == 0:
            # nothing downloaded for this rank/division
            continue

        # load at most MAX_REPLAYS_PER_RANK random samples (without replacement);
        # sampling indices avoids converting the Path objects to a numpy array
        n = min(MAX_REPLAYS_PER_RANK, len(files))
        chosen = rng.choice(len(files), size=n, replace=False)
        files = [files[idx] for idx in chosen]

        rank_data: dict[str, list] = {key: [] for key in stat_columns}

        for file in files:
            # explicit UTF-8 so parsing does not depend on the platform's
            # default text encoding
            with open(file, "r", encoding="utf-8") as f:
                full_data = json.load(f)

            players = full_data["blue"]["players"] + full_data["orange"]["players"]

            # flatten the nested stats, e.g. stats["core"]["shots"] becomes the
            # "core/shots" column, appending one value per player to every column
            for player in players:
                player_stats = player["stats"]
                for key, category, sub_key in stat_paths:
                    if category in player_stats and sub_key in player_stats[category]:
                        value = player_stats[category][sub_key]
                    else:
                        value = np.nan
                    rank_data[key].append(value)

        dfs[full_rank] = pd.DataFrame(rank_data)

In [55]:
# Save every loaded rank sample as "<rank>.csv" inside the rank_samples folder.

rank_samples_folder = root_folder.joinpath("rank_samples")
rank_samples_folder.mkdir(exist_ok=True)  # fine if an earlier run already created it

for rank in dfs:
    df = dfs[rank]
    csv_path = rank_samples_folder / f"{rank}.csv"
    # index=False keeps the DataFrame's row index out of the file
    df.to_csv(csv_path, index=False)