In [3]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

In [4]:
# boost/bcpm
# boost/amount_stolen
# movement/count_powerslide
# movement/avg_speed
# movement/percent_ground
# movement/percent_high_air
# positioning/avg_distance_to_mates

In [5]:
# Load the per-rank sample CSVs and stack them into one labelled frame.
#
# Produces:
#   rank_data      -- {file stem: DataFrame restricted to include_columns}
#   rank_to_number -- numeric encoding of each modelled rank name
#   combined_data  -- every loaded frame concatenated, plus a string "rank"
#                     column holding the stem of the file the row came from
rank_data: dict[str, pd.DataFrame] = {}

# Feature columns kept from every CSV. The order of this list is also the
# column order of each loaded frame.
include_columns = [
    "positioning/goals_against_while_last_defender",
    "movement/percent_high_air",
    "movement/time_high_air",
    "positioning/avg_distance_to_mates",
    "movement/count_powerslide",
    "movement/percent_supersonic_speed",
    "movement/avg_powerslide_duration",
    "movement/avg_speed",
    "movement/percent_ground",
    "movement/avg_speed_percentage",
    "boost/bcpm",
    "boost/amount_collected_small",
    "movement/time_powerslide",
    "boost/amount_used_while_supersonic",
    "boost/bpm",
    "positioning/percent_closest_to_ball",
    "positioning/percent_neutral_third",
    "boost/count_collected_small",
    "positioning/avg_distance_to_ball_no_possession",
    "movement/time_supersonic_speed",
    "movement/percent_boost_speed",
    "movement/percent_slow_speed",
    "positioning/avg_distance_to_ball",
    "positioning/avg_distance_to_ball_possession",
    "movement/percent_low_air",
    "positioning/percent_most_forward",
    "boost/percent_boost_50_75",
    "positioning/percent_farthest_from_ball",
    "boost/percent_zero_boost",
    "boost/avg_amount",
    "boost/percent_boost_0_25",
    "boost/percent_boost_25_50",
    "core/score",
    "boost/time_zero_boost",
    "positioning/time_closest_to_ball",
    "boost/percent_full_boost",
    "boost/amount_overfill",
    "positioning/percent_most_back",
    "movement/time_ground",
    "boost/percent_boost_75_100",
    "boost/time_boost_50_75",
    "boost/time_full_boost",
    "positioning/time_most_forward",
    "boost/time_boost_25_50",
    "boost/time_boost_0_25",
    "positioning/percent_behind_ball",
    "positioning/percent_infront_ball",
    "boost/amount_stolen_small",
    "boost/time_boost_75_100",
    "boost/amount_stolen",
    "positioning/time_infront_ball",
    "movement/time_boost_speed",
    "movement/time_slow_speed",
    "positioning/time_farthest_from_ball",
    "positioning/time_neutral_third",
    "boost/amount_collected_big",
    "positioning/percent_offensive_third",
    "boost/amount_collected",
    "positioning/time_offensive_third",
    "positioning/time_most_back",
    "positioning/percent_defensive_third",
    "movement/time_low_air",
    "boost/amount_overfill_stolen",
]
# Alternative, shorter feature list (19 columns), kept disabled for experiments:
# include_columns = [
#     "positioning/goals_against_while_last_defender",
#     "movement/percent_high_air",
#     "movement/time_high_air",
#     "positioning/avg_distance_to_mates",
#     "movement/percent_supersonic_speed",
#     "movement/count_powerslide",
#     "movement/avg_powerslide_duration",
#     "movement/avg_speed_percentage",
#     "movement/percent_ground",
#     "movement/avg_speed",
#     "boost/bcpm",
#     "boost/amount_collected_small",
#     "positioning/percent_neutral_third",
#     "boost/bpm",
#     "boost/amount_used_while_supersonic",
#     "positioning/percent_closest_to_ball",
#     "movement/time_powerslide",
#     "movement/time_supersonic_speed",
#     "boost/count_collected_small",
# ]

# Mapping from rank name to number. It is defined before the files are read so
# that the loader can use it as the single source of truth for which ranks are
# modelled: a CSV whose stem is not a key here is never loaded.
rank_to_number = {
    # "bronze-1": 1,
    # "bronze-2": 2,
    # "bronze-3": 3,
    # "silver-1": 4,
    # "silver-2": 5,
    "silver-3": 6,
    "gold-1": 7,
    "gold-2": 8,
    "gold-3": 9,
    "platinum-1": 10,
    "platinum-2": 11,
    "platinum-3": 12,
    "diamond-1": 13,
    "diamond-2": 14,
    "diamond-3": 15,
    "champion-1": 16,
    "champion-2": 17,
    "champion-3": 18,
    "grand-champion-1": 19,
    "grand-champion-2": 20,
    "grand-champion-3": 21,
}

root_folder = Path("./") if Path("./rank_samples").exists() else Path("/project")
# sorted(): glob yields files in a filesystem-dependent order, which would make
# the row order of combined_data (and therefore any seeded split of it) differ
# between machines.
for file in sorted(root_folder.glob("rank_samples/*.csv")):
    # Skip ranks that are disabled in (or unknown to) rank_to_number; their rows
    # would only end up with a NaN label further down the notebook.
    if file.stem not in rank_to_number:
        continue

    # usecols avoids materialising unused columns but ignores the order of the
    # list, so re-select afterwards to pin the column order, e.g. boost/bcpm.
    rank_data[file.stem] = pd.read_csv(file, usecols=include_columns)[include_columns]

if not rank_data:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects" error.
    raise FileNotFoundError(
        f"No CSV matching a key of rank_to_number found in {root_folder / 'rank_samples'}"
    )

# Optional down-sampling (disabled): cap each rank at an equal share of 400000 rows.
# for key, df in rank_data.items():
#     rank_data[key] = df.sample(min(400000 // len(rank_to_number.keys()), len(df)))

# Combine the data, adding the file stem as the "rank" column. ignore_index
# gives the combined frame unique row labels instead of one 0..n-1 run per file.
combined_data = pd.concat(
    [df.assign(rank=key) for key, df in rank_data.items()],
    ignore_index=True,
)

In [6]:
# "rank" is something like 'bronze-1', 'bronze-2', 'silver-1', etc.
# we need to convert this to a number so we can use it in our models

# Encode the rank names as numbers. A name that is missing from rank_to_number
# becomes NaN; such rows are not removed by this cell.
# The dtype guard makes the cell safe to re-run: mapping an already-numeric
# column through the string-keyed dict would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(combined_data["rank"]):
    combined_data["rank"] = combined_data["rank"].map(rank_to_number)

In [7]:
# Impute missing values: every NaN is replaced by the mean of its column,
# computed over the rows that share the same "rank".
# NOTE(review): the means depend on the "rank" column and on every row of
# combined_data. If "rank" is later the prediction target and the data is split
# afterwards, this may leak label/test information -- confirm this is intended.

# one row of column means per distinct rank value
mean_data = combined_data.groupby("rank").mean()

# patch one rank at a time, touching only the rows that belong to it
for rank, rank_means in mean_data.iterrows():
    in_rank = combined_data["rank"] == rank
    combined_data.loc[in_rank] = combined_data.loc[in_rank].fillna(rank_means)

# discard every row that still contains a NaN after the fill
combined_data = combined_data.dropna()

In [8]:
# Separate the target ("rank") from the feature columns, then hold out 20% of
# the rows as a test set using a fixed seed.
y = combined_data["rank"]
X = combined_data.drop(columns="rank")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# show the dimensions of each split on a single line
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(486968, 63) (486968,) (121742, 63) (121742,)


In [None]:
# Fit one random forest per candidate forest size, report its test error rate
# and the mean squared error of the predicted rank numbers, save each fitted
# model to disk, and finally report the size with the lowest error rate.
errors = []
# num_estimators = list(range(350, 500, 50))
num_estimators = [700]

# Plain array of the test labels, extracted once for the per-model metrics.
y_true = y_test.to_numpy()

for i in num_estimators:
    print(f"Training with {i} estimators", end="")
    # random_state pins the bootstrap samples and feature sub-sampling so that
    # a re-run reproduces the same forest and the same reported numbers.
    model = RandomForestClassifier(n_estimators=i, n_jobs=24, random_state=42)
    model.fit(X_train, y_train)

    # Predict the test set once and derive both metrics from it; model.score
    # would run the whole forest over X_test a second time.
    predictions = model.predict(X_test)

    # error rate = 1 - accuracy (fraction of exactly matching labels)
    errors.append(1 - np.mean(predictions == y_true))
    print(f"\r{i} estimators had an error of {errors[-1]*100:.2f}%; ", end="")

    # squared difference between predicted and true rank numbers
    mse = np.mean((predictions - y_true) ** 2)
    print(f"MSE: {mse}")

    # persist the fitted model with joblib
    joblib.dump(model, f"model-{i}.joblib")

# Error-rate curve; only informative when several sizes are tried.
# plt.plot(num_estimators, errors)
# plt.xlabel("Number of estimators")
# plt.ylabel("Error rate")
# plt.show()

least_error_idx = np.argmin(errors)
least_error = errors[least_error_idx]
least_error_estimators = num_estimators[least_error_idx]
print(f"The least error rate is {least_error:.2f} with {least_error_estimators} estimators.")

700 estimators had an error of 41.56%; MSE: 3.2545054295148756
