In [None]:
import copy
import random
import warnings
from typing import Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from numpy import ndarray
from pandas import DataFrame
from sklearn.metrics import mean_absolute_error
from statsbombpy import sb

warnings.filterwarnings("ignore")

In [None]:
# Load the match list for the 1. Bundesliga 2015/2016 season from StatsBomb.
# COMPETITION_ID / SEASON_ID are the identifiers passed to StatsBomb to select
# that competition and season.
COMPETITION_ID = 9
SEASON_ID = 27
# Presumably one row per match - later cells read match_id, match_week,
# the team names and the scores from this table.
bundesliga_matches = sb.matches(competition_id=COMPETITION_ID, season_id=SEASON_ID)

### Explorative Analytics

The goal of this block is to get familiar with the data and to do some preprocessing and visualisation that prepares for the Elo calculation and experimentation that follow.

#### Sanity checks

In [None]:
# Sanity check for NAs: number of missing values per column of the raw match table.
# This is the last expression of the cell, so the resulting Series is displayed.
bundesliga_matches.isna().sum()

In [None]:
def unique_value_counts(df: DataFrame, column_name: str) -> List[int]:
    """Return the distinct frequencies with which values occur in one column.

    Every value of the column is counted first; the result then lists each
    different count once. A column whose values all occur equally often
    therefore yields a single-element list.

    Args:
        df: DataFrame containing the data.
        column_name: Name of the column to analyze.

    Returns:
        The distinct occurrence counts found in the specified column.

    """
    occurrences_per_value = df[column_name].value_counts()
    distinct_counts = occurrences_per_value.unique()
    return list(distinct_counts)

In [None]:
# some basic sanity checks:
# 1. All match weeks should have the same number of matches played
# 2. All teams should have the same number of matches played at home
# 3. Each team should play the same number of matches at home and away
# 4. The number of matches a team plays at home (or, equally, away) should equal
#    2 * (the number of matches played in a match week) - 1
# unique_value_counts returns the distinct occurrence counts of a column, so a
# single-element list means every value of that column occurs equally often.
assert len(unique_value_counts(bundesliga_matches, "match_week")) == 1
assert len(unique_value_counts(bundesliga_matches, "home_team")) == 1
assert unique_value_counts(bundesliga_matches, "home_team") == unique_value_counts(
    bundesliga_matches, "away_team"
)
assert (
    unique_value_counts(bundesliga_matches, "home_team")[0]
    == 2 * (unique_value_counts(bundesliga_matches, "match_week")[0]) - 1
)

#### Preprocessing

In [None]:
# Preprocessing stage
# Keep only the columns needed for the Elo calculation
cols_to_save = [
    "match_id",
    "match_week",
    "home_team",
    "away_team",
    "home_score",
    "away_score",
]
# Column selection yields a new frame; reset_index gives it a fresh 0..n-1 index
bundesliga_matches_filt = bundesliga_matches[cols_to_save].reset_index(drop=True)
# Feature engineering: create the column 'match_result' holding the outcome of each match:
# HW - home win, AW - away win, D - draw
# conditions[i] is paired with choices[i]; rows matching neither condition
# (equal scores) receive the default "D"
conditions = [
    bundesliga_matches_filt["home_score"] > bundesliga_matches_filt["away_score"],
    bundesliga_matches_filt["home_score"] < bundesliga_matches_filt["away_score"],
]
choices = ["HW", "AW"]
bundesliga_matches_filt["match_result"] = np.select(conditions, choices, default="D")

#### Performance Data Visualisation

In [None]:
# Data visualisation stage focusing on the teams' performance; it gives a first
# impression of what the Elo ratings should look like

# Build one table from the home team's perspective and one from the away team's
# perspective, renamed to a common schema (team, goals_scored, goals_conceded)
home_stats = bundesliga_matches_filt[
    ["home_team", "home_score", "away_score", "match_result"]
].rename(
    columns={
        "home_team": "team",
        "home_score": "goals_scored",
        "away_score": "goals_conceded",
    }
)
away_stats = bundesliga_matches_filt[
    ["away_team", "away_score", "home_score", "match_result"]
].rename(
    columns={
        "away_team": "team",
        "away_score": "goals_scored",
        "home_score": "goals_conceded",
    }
)

# Translate match_result into per-team boolean flags:
# "HW" is a win for the home side and a loss for the away side, "AW" the reverse
home_stats["win"] = home_stats["match_result"] == "HW"
home_stats["loss"] = home_stats["match_result"] == "AW"
home_stats["draw"] = home_stats["match_result"] == "D"

away_stats["win"] = away_stats["match_result"] == "AW"
away_stats["loss"] = away_stats["match_result"] == "HW"
away_stats["draw"] = away_stats["match_result"] == "D"

# Concatenating home and away data: every match now contributes two rows,
# one per participating team
all_stats = pd.concat([home_stats, away_stats])

# Aggregating results for each team (summing the boolean flags counts the matches)
teams_performance_stats = (
    all_stats.groupby("team")
    .agg(
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
        wins=("win", "sum"),
        losses=("loss", "sum"),
        draws=("draw", "sum"),
    )
    .reset_index()
)

# League points: 3 per win, 1 per draw
teams_performance_stats["points"] = (
    teams_performance_stats["wins"] * 3 + teams_performance_stats["draws"]
)

In [None]:
# Preparing data for the goals visualisation.
# Ascending sort: barh draws the first row at the bottom, so the team with the
# most goals scored ends up at the top of the chart.
sorted_teams_stats_goals = teams_performance_stats.sort_values(
    by="goals_scored", ascending=True
)
teams_sorted = sorted_teams_stats_goals["team"]
goals_scored_sorted = sorted_teams_stats_goals["goals_scored"]
goals_conceded_sorted = sorted_teams_stats_goals["goals_conceded"]

# Creating the horizontal bar plot: goals scored extend to the right of zero,
# goals conceded are negated so that they extend to the left
plt.figure(figsize=(10, 6))
plt.barh(teams_sorted, goals_scored_sorted, color="blue", label="Goals Scored")
plt.barh(teams_sorted, -goals_conceded_sorted, color="red", label="Goals Conceded")

plt.xlabel("Number of Goals")
plt.title("Goals Scored vs. Goals Conceded by Team (Sorted by Goals Scored)")
# Vertical line at zero separates the scored side from the conceded side
plt.axvline(0, color="black", linewidth=0.5)
plt.legend()
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# Preparing data for the match result visualisation (ascending by wins, so the
# team with the most wins is drawn at the top of the horizontal chart)
sorted_teams_stats_wins = teams_performance_stats.sort_values(by="wins", ascending=True)
teams_sorted_wins = sorted_teams_stats_wins["team"]
wins_sorted = sorted_teams_stats_wins["wins"]
losses_sorted = sorted_teams_stats_wins["losses"]
draws_sorted = sorted_teams_stats_wins["draws"]

# Plotting grouped bars: each team occupies one integer y position; the three
# bars of a team are placed at position + bar_width (wins), position (losses)
# and position - bar_width (draws)
bar_width = 0.2
positions = range(len(teams_sorted_wins))

plt.figure(figsize=(14, 8))
plt.barh(
    [p + bar_width for p in positions],
    wins_sorted,
    height=bar_width,
    color="green",
    label="Wins",
)
plt.barh(positions, losses_sorted, height=bar_width, color="red", label="Losses")
plt.barh(
    [p - bar_width for p in positions],
    draws_sorted,
    height=bar_width,
    color="gray",
    label="Draws",
)

# Label each group with its team name at the centre (losses) position
plt.yticks(positions, teams_sorted_wins)
plt.xlabel("Number of Matches")
plt.title("Wins, Losses, and Draws per Team (Sorted by Wins)")
plt.legend()
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# Preparing data for the points visualisation.
# Sorted ascending by points; a later cell reverses this order to obtain the
# actual final ranking (best team first).
sorted_teams_stats_points = teams_performance_stats.sort_values(
    by="points", ascending=True
)

# Plotting the horizontal bar graph
plt.figure(figsize=(10, 6))
bars = plt.barh(
    sorted_teams_stats_points["team"],
    sorted_teams_stats_points["points"],
    color="skyblue",
)

# Adding text annotations beside each bar: the bar width is the team's points;
# the label is centred 2 data units to the right of the bar end, at the bar's
# vertical middle
for bar in bars:
    plt.text(
        bar.get_width() + 2,
        bar.get_y() + bar.get_height() / 2,
        f"{int(bar.get_width())}",
        ha="center",
        va="center",
        fontsize=10,
        color="gray",
        fontweight="bold",
    )
plt.xlabel("Points")
plt.title("Points Gained by Teams (Highest to Least)")
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()

The main performance indicators, such as goals ratio, match results, and total points gained, reveal several common trends:

1. **Top Performers**  
   Two main teams, **Bayern Munich** and **Borussia Dortmund**, significantly outperformed all others. Although both are evidently the strongest, there is a discrepancy in their power, with Bayern Munich holding a lead.

2. **Second Tier Teams**  
   The next level of teams, following the top two, includes **Bayer Leverkusen**, **Borussia Mönchengladbach**, and **Schalke 04**.

3. **Weakest Team**  
   The weakest team in the 2015/2016 Bundesliga season was **Hannover 96**, by a considerable margin.


### Calculate Elo Rating with default hyperparams

The goal of this block is to create an Elo calculation pipeline, tested with default values: `initial_elo`, `s_factor`, and `k_factor`. This pipeline will be used in the next block for Elo experimentation, fine-tuning the `s_factor` and `k_factor` variables.

In [None]:
# functions to initialise the input for Elo calculation
def initialize_elo_ratings(
    team_names: List[str], initial_elo: int = 100
) -> Dict[str, int]:
    """Give every team the same starting Elo rating.

    Args:
        team_names: Names of the teams that need a starting rating.
        initial_elo (optional): Starting Elo value shared by all teams, default is 100.

    Returns:
        Dictionary mapping each team name to the initial Elo value.

    """
    return dict.fromkeys(team_names, initial_elo)


def initialize_teams_elo_stats(team_names: List[str]) -> DataFrame:
    """Create the per-team table that collects Elo-related histories.

    Args:
        team_names: Names of teams, used as the index of the created DataFrame.

    Returns:
        DataFrame indexed by team name with the three columns
        'expected_win_prob_history', 'actual_results_history' and
        'elo_rating_history'. Every cell holds its own empty list, ready to
        receive one entry per processed match.

    """
    history_columns = (
        "expected_win_prob_history",
        "actual_results_history",
        "elo_rating_history",
    )
    # A fresh list per cell: sharing one list object would make all teams
    # accumulate the same history.
    empty_histories = {
        column: [[] for _ in team_names] for column in history_columns
    }
    return pd.DataFrame(empty_histories, index=team_names)


def initialize_input_for_elo_calc(
    team_names: List[str], initial_elo: int = 100
) -> Tuple[DataFrame, Dict[str, int]]:
    """Build both inputs required by the Elo calculation in one call.

    Args:
        team_names: Names of teams; used as the DataFrame index and as the rating keys.
        initial_elo: Value of initial Elo, default is 100.

    Returns:
        A tuple ``(teams_elo_stats, elo_ratings)``: the history DataFrame created by
        initialize_teams_elo_stats and the initial rating dictionary created by
        initialize_elo_ratings.

    """
    return (
        initialize_teams_elo_stats(team_names),
        initialize_elo_ratings(team_names, initial_elo),
    )

In [None]:
def calculate_expected_win_prob(
    elo_rating_diff: Union[int, float], s_factor: int = 15, base: int = 10
) -> float:
    """Calculate the expected win probability based on Elo ratings difference.

    Implements the logistic Elo curve ``1 / (1 + base ** (-diff / s_factor))``.

    Args:
        elo_rating_diff: Difference between the Elo ratings of the home and away teams
            (home minus away).
        s_factor (optional): A scaling factor of Elo rating. Default is 15.
        base (optional): The base value used in the power calculation, defaults to 10.
            Although it is uncommon to change this parameter in Elo calculations,
            it is added for more flexibility.

    Returns:
        The expected probability of a home team win. When the power term is too
        large to be represented as a float, the probability is reported as 0.0.

    """
    exponent = -elo_rating_diff / s_factor
    try:
        return 1 / (1 + base**exponent)
    except OverflowError:
        # Python floats raise instead of saturating to infinity. An overflowing
        # power term means the denominator is astronomically large, i.e. the
        # probability is 0 to float precision.
        return 0.0


def get_actual_result(match_result: str) -> Tuple[float, float]:
    """Convert a string match result into numeric results for the home and away teams.

    The numeric values are the actual scores used by the Elo update:
    1 for winning, 0 for losing, 0.5 for a draw.

    Args:
        match_result: Outcome code of a match: "HW" (home win), "AW" (away win)
            or "D" (draw).

    Returns:
        A tuple ``(home_result, away_result)``.

    Raises:
        ValueError: If match_result is not one of the three known codes. Failing
            here avoids returning None, which would only surface later as an
            opaque unpacking error in the caller.

    """
    outcomes = {
        "HW": (1.0, 0.0),
        "AW": (0.0, 1.0),
        "D": (0.5, 0.5),
    }
    try:
        return outcomes[match_result]
    except (KeyError, TypeError):
        # KeyError: unknown code; TypeError: unhashable value passed by mistake
        raise ValueError(
            f"Unknown match result {match_result!r}; expected one of 'HW', 'AW', 'D'."
        ) from None


def update_elo_rating(
    current_rating: Union[float, int],
    actual_result: float,
    expected_win_prob: float,
    k_factor: int = 15,
) -> Union[float, int]:
    """Return a team's new Elo rating after one match.

    Args:
        current_rating: Elo rating before the match.
        actual_result: Actual result for the team: 1, 0.5 or 0 (see get_actual_result function).
        expected_win_prob: Predicted win probability for the team
            (see calculate_expected_win_prob function).
        k_factor (optional): Controls how strongly the gap between the actual and the
            predicted result moves the rating, default is 15.

    Returns:
        The updated rating, rounded to 4 decimals and never below 0.

    """
    rating_change = k_factor * (actual_result - expected_win_prob)
    new_rating = round(current_rating + rating_change, 4)
    # Clamp at zero so that an Elo rating can never become negative
    return max(new_rating, 0)

In [None]:
def process_match_week(
    matches_by_week: DataFrame,
    current_elo_ratings: Dict[str, Union[float, int]],
    s_factor: int = 15,
    k_factor: int = 15,
) -> Dict[str, List[Union[float, int]]]:
    """Compute the Elo updates produced by all matches of one match week.

    Every match is evaluated against the ratings passed in; current_elo_ratings
    itself is only read, never modified.

    Args:
        matches_by_week: Dataframe with all matches played in the same match week; needs the
            columns 'home_team', 'away_team' and 'match_result'.
        current_elo_ratings: Dictionary mapping team names to the Elo ratings they hold
            before this match week.
        s_factor (optional): Scaling factor of Elo rating, default is 15.
        k_factor (optional): A factor influencing how much the difference between actual and
            predicted results will change a team's current Elo rating, default is 15.
    Returns:
        A dictionary keyed by team name. Each value is the list
        ``[expected win probability, actual result, updated Elo rating]`` for this week.

    """
    fixtures = zip(
        matches_by_week["home_team"],
        matches_by_week["away_team"],
        matches_by_week["match_result"],
    )
    week_updates = {}

    for home_team, away_team, result in fixtures:
        home_elo = current_elo_ratings[home_team]
        away_elo = current_elo_ratings[away_team]

        # Expected win probabilities; the two sides always sum to 1
        home_win_prob = calculate_expected_win_prob(home_elo - away_elo, s_factor)
        away_win_prob = 1 - home_win_prob

        # Actual numeric results of the match
        home_outcome, away_outcome = get_actual_result(result)

        # One record per team: [expected probability, actual result, new rating]
        week_updates[home_team] = [
            home_win_prob,
            home_outcome,
            update_elo_rating(home_elo, home_outcome, home_win_prob, k_factor),
        ]
        week_updates[away_team] = [
            away_win_prob,
            away_outcome,
            update_elo_rating(away_elo, away_outcome, away_win_prob, k_factor),
        ]

    return week_updates


def update_teams_stats(
    teams_elo_stats: DataFrame, week_updates: Dict[str, List[Union[float, int]]]
) -> DataFrame:
    """Append one week's measurements to the history lists of each updated team.

    The i-th value of a team's update is appended to the list stored in the i-th
    column of that team's row, so the column order of teams_elo_stats must match
    the order of the values in week_updates. The lists are extended in place.

    Args:
        teams_elo_stats: DataFrame indexed by team name whose columns hold lists with the
            weekly history of Elo-related measurements.
        week_updates: A dictionary with this week's Elo-related measurements per team
            (see process_match_week function).
    Returns:
        The same DataFrame object, with the histories extended.

    """
    for team, updates in week_updates.items():
        # The row Series references the very list objects stored in the frame,
        # so appending through it updates teams_elo_stats itself.
        team_row = teams_elo_stats.loc[team]
        for position, value in enumerate(updates):
            team_row.iat[position].append(value)

    return teams_elo_stats

In [None]:
def calculate_elo_ratings(
    preprocessed_matches_df: DataFrame,
    teams_elo_stats_init: DataFrame,
    elo_ratings_init: Dict[str, int],
    s_factor: int = 15,
    k_factor: int = 15,
) -> DataFrame:
    """Main function to calculate Elo ratings based on match data.

    Neither teams_elo_stats_init nor elo_ratings_init is modified, so the same
    initial inputs can be reused for several hyperparameter combinations.

    Args:
        preprocessed_matches_df: Preprocessed match data (see Preprocessing part); must contain
            the columns 'match_week', 'home_team', 'away_team' and 'match_result'.
        teams_elo_stats_init: An empty df with a predefined structure to store Elo calculation
            results (see initialize_teams_elo_stats function).
        elo_ratings_init: Dictionary where keys are teams' names, values are initial Elo
            (see initialize_elo_ratings function).
        s_factor (optional): Scaling factor of Elo rating, default is 15.
        k_factor (optional): A factor influencing how much the difference between actual and
            predicted results will change a team's current Elo rating, default is 15.
    Returns:
        A df with index as teams' names, sorted by 'final_elo_rating' in descending order,
        with the following columns:
        'expected_win_prob_history', 'actual_results_history', 'elo_rating_history',
        'final_elo_rating'.

    """

    def _copy_if_list(cell):
        # DataFrame.copy() only copies references to the list objects stored in
        # the cells, so each history list has to be duplicated explicitly.
        return copy.deepcopy(cell) if isinstance(cell, list) else cell

    # Independent working copy of the (empty) history table. Done column by column
    # with Series.map because DataFrame.applymap is deprecated in recent pandas.
    teams_elo_stats = teams_elo_stats_init.copy()
    for column in teams_elo_stats.columns:
        teams_elo_stats[column] = teams_elo_stats_init[column].map(_copy_if_list)

    # start with initial Elo ratings, after each week current_elo_ratings changes
    current_elo_ratings = dict(elo_ratings_init)

    # Process each match week and consequently update teams_elo_stats
    for _, matches in preprocessed_matches_df.groupby("match_week"):
        week_updates = process_match_week(
            matches, current_elo_ratings, s_factor, k_factor
        )
        update_teams_stats(teams_elo_stats, week_updates)
        # Updated Elo ratings are the third elements in week_updates. Merge them
        # into the running ratings instead of replacing the dictionary, so that a
        # team without a match this week keeps its rating (and stays available
        # for lookups in later weeks).
        current_elo_ratings.update(
            {team: updates[2] for team, updates in week_updates.items()}
        )

    # final_elo_rating - Elo rating based on the whole season. A team that never
    # played has an empty history and falls back to its current (initial) rating.
    teams_elo_stats["final_elo_rating"] = [
        history[-1] if history else current_elo_ratings.get(team)
        for team, history in teams_elo_stats["elo_rating_history"].items()
    ]

    return teams_elo_stats.sort_values(by="final_elo_rating", ascending=False)

In [None]:
# Run the Elo rating pipeline on the whole season with the default values
# (initial Elo of 100, s_factor = 15, k_factor = 15).
teams = teams_performance_stats["team"].unique()
# The initial inputs are kept in their own variables because later cells pass
# them to calculate_elo_ratings again for other hyperparameter combinations.
teams_elo_stats_init, elo_ratings_init = initialize_input_for_elo_calc(teams)
teams_elo_stats = calculate_elo_ratings(
    bundesliga_matches_filt, teams_elo_stats_init, elo_ratings_init
)

In [None]:
plt.figure(figsize=(14, 6))

# Iterate through each team and plot their Elo rating history.
# Entry i of a history is the rating after the team's (i + 1)-th processed match
# week, so the x values start at 1 to agree with the "Match Week" axis label
# (a 0-based range would show the first week as week 0).
for team in teams_elo_stats.index:
    elo_history = teams_elo_stats.loc[team, "elo_rating_history"]
    plt.plot(range(1, len(elo_history) + 1), elo_history, marker="o", label=team)

plt.xlabel("Match Week")
plt.ylabel("Elo Rating")
plt.title("Elo Rating History of Teams")
plt.legend(title="Teams", loc="upper left")
plt.grid(True)
plt.show()

The pipeline with default hyperparameters was tested successfully. However, the visualization of Elo ratings for Bundesliga teams shows that the ranking is chaotic and does not reflect the trends detected during the performance analysis. **Therefore, a better set of hyperparameters needs to be found, which will be addressed in the next block**.

### Find the best s_factor and k_factor for Elo calculation

The next step in developing the Elo rating system is to identify the best combination of `s_factor` and `k_factor`. The approach involves calculating the Mean Absolute Error (MAE) between the expected win probabilities and actual match results for each combination of `s_factor` and `k_factor` across the entire season. To evaluate the performance of each hyperparameter combination, the following visualizations will be used:

1. **Line Graph of Teams' Elo Ratings Evolution Per Week**  
   This graph shows how the Elo ratings of each team change throughout the season.

2. **Line Graph of Teams' MAE Losses Evolution Per Week**  
   This visualization tracks the MAE losses for each team over the course of the season.

3. **Bar Plot of Mean Seasonal MAE Losses**  
   This plot provides a summary of the average MAE loss for each combination across the whole season.

4. **Bar Plot of Accuracy between Elo Rankings and Actual Rankings (Based on Points Gained)**  
   This comparison assesses how closely the Elo rankings align with the actual team rankings based on points earned.

The most straightforward method for selecting the optimal combination is to choose the `s_factor` and `k_factor` pair with the lowest seasonal MAE loss and highest accuracy. However, additional visualizations will also be considered to guide the final decision.

*Note:* Initially, Mean Squared Error (MSE) was included as a loss metric to be tracked and visualized. However, experimental results indicated that MSE did not offer any additional insights beyond what was already provided by Mean Absolute Error (MAE). As a result, MSE was excluded from the evaluation to enhance the readability of the graphs and keep the visualizations focused and concise, showing only the most relevant information.


In [None]:
def update_loss_history(teams_elo_stats: DataFrame) -> DataFrame:
    """Add the per-match MAE loss history of every team to teams_elo_stats.

    For a single match the mean absolute error reduces to the absolute difference
    between the actual result and the expected win probability, so it is computed
    directly with ``abs`` rather than through a metrics-library call on
    one-element lists.

    Args:
        teams_elo_stats: A dataframe with the Elo-related list columns
            'expected_win_prob_history' and 'actual_results_history'.
            It is modified in place.

    Returns:
        The same dataframe with a new (or overwritten) column 'mae_loss_history'
        holding one list of losses per team.

    """
    # Create the column with list cells first so that it has object dtype and a
    # whole list can be stored in a single cell below.
    teams_elo_stats["mae_loss_history"] = [[] for _ in range(len(teams_elo_stats))]

    for team in teams_elo_stats.index:
        expected = teams_elo_stats.at[team, "expected_win_prob_history"]
        actual = teams_elo_stats.at[team, "actual_results_history"]
        teams_elo_stats.at[team, "mae_loss_history"] = [
            abs(a - e) for a, e in zip(actual, expected)
        ]

    return teams_elo_stats

In [None]:
# plot evaluation metrics
def create_metrics_line_graph(
    df: DataFrame,
    column: str,
    y_label: str,
    title: str,
    ax: Axes,
) -> None:
    """Helper function to create a line graph for metrics of different teams over time.

    One line is drawn per row of the DataFrame (i.e. per team), based on the list of
    historical values stored in the specified column.

    Args:
        df: A dataframe containing the historical metrics data for multiple teams, with teams
            as index and a column of lists representing metrics history.
        column: The name of the column in the DataFrame containing lists of historical data
            points for each team.
        y_label: The label for the Y-axis, representing the metric being visualized.
        title: The title of the graph, usually describing the metric being plotted.
        ax: Axes object where the graph will be plotted.

    Returns:
        This function does not return anything. It directly modifies the plot on the given
        Axes object.

    """
    for team, history in df[column].items():
        # Entry i of a history belongs to the team's (i + 1)-th match week, so the
        # x values are 1-based to agree with the "Match Week" axis label.
        match_weeks = range(1, len(history) + 1)
        ax.plot(match_weeks, history, marker="o", label=team)

    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel("Match Week")
    ax.grid(True)
    ax.legend(title="Teams", loc="upper left", fontsize="small")

In [None]:
def plot_weekly_evaluation_metrics(
    teams_stats: DataFrame, s_factor: float, k_factor: float
) -> None:
    """Create a combined evaluation plot with Elo rating and MAE loss.

    This function generates one figure with two stacked line graphs: the Elo rating
    history (top) and the MAE loss history (bottom) of all teams. It helps in assessing
    the performance and consistency of the Elo calculation for a given s_factor and
    k_factor.

    Args:
        teams_stats: A dataframe containing historical data for teams, including the
            'elo_rating_history' and 'mae_loss_history' columns.
        s_factor: The scaling factor used in Elo calculation, shown in the plot title.
        k_factor: The adjustment factor for rating changes, shown in the plot title.

    Returns:
        This function does not return anything; it directly creates and shows the plot.

    """
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    create_metrics_line_graph(
        teams_stats,
        "elo_rating_history",
        "Elo Rating",
        "Elo Rating History of Teams",
        axes[0],
    )
    create_metrics_line_graph(
        teams_stats,
        "mae_loss_history",
        "MAE Loss",
        "MAE Loss History of Teams",
        axes[1],
    )

    # Set main title reflecting current scale and k factors
    fig.suptitle(
        f"Evaluation Metrics for s_factor={s_factor:.2f} and k_factor={k_factor:.2f}",
        fontsize=14,
    )
    # Leave a little room at the top so the suptitle does not overlap the upper graph
    plt.tight_layout(rect=[0, 0, 1, 0.98])
    plt.show()
    # This function is called once per hyperparameter combination; release the
    # figure explicitly so that figures do not pile up in memory on backends
    # where show() does not close them.
    plt.close(fig)

In [None]:
def calculate_seasonal_evaluational_metrics(
    teams_elo_stats: pd.DataFrame, actual_teams_rankings: List[str]
) -> Tuple[float, int]:
    """Summarise one Elo run with two season-level numbers.

    1. Seasonal Mean MAE: the average over every MAE value of every team recorded
       during the season.
    2. Seasonal Accuracy: the number of ranking positions at which the team given by the
       row order of teams_elo_stats equals the team in actual_teams_rankings.

    Args:
        teams_elo_stats: DataFrame containing Elo-related statistics for each team, including
            the 'mae_loss_history' column. Its row order is taken as the Elo ranking, so it is
            expected to be sorted by final Elo rating in descending order already.
        actual_teams_rankings: A list of final actual teams' rankings for the season based on
            the points gained.

    Returns:
        seasonal_mean_mae (float): The average MAE for the entire season, rounded to 4 decimals.
        seasonal_accuracy (int): The count of matching positions between Elo rankings and
            actual team rankings.

    """
    # Flatten the per-team loss lists into one list covering the whole season
    flattened_losses = []
    for loss_history in teams_elo_stats["mae_loss_history"]:
        flattened_losses.extend(loss_history)
    seasonal_mean_mae = round(sum(flattened_losses) / len(flattened_losses), 4)

    # Position-by-position comparison of the Elo order with the actual order
    elo_teams_rankings = list(teams_elo_stats.index)
    matching_positions = [
        elo_team
        for elo_team, actual_team in zip(elo_teams_rankings, actual_teams_rankings)
        if elo_team == actual_team
    ]
    seasonal_accuracy = len(matching_positions)

    return seasonal_mean_mae, seasonal_accuracy

In [None]:
def create_metrics_bar_graph(
    data: List[float],
    keys: List[Tuple[float, float]],
    title: str,
    ylabel: str,
    ax: Axes,
) -> None:
    """Draw one bar per hyperparameter configuration on the given axes.

    Args:
        data: The values to be plotted as bars, one per configuration.
        keys: The (s_factor, k_factor) tuples used as x-axis tick labels, in the same order
            as data.
        title: The title of the plot.
        ylabel: The label for the y-axis.
        ax: The axes object where the plot will be drawn.

    Returns:
        This function does not return anything; it draws directly on ax.

    """
    bar_positions = range(len(data))
    tick_labels = [f"{key}" for key in keys]

    ax.bar(bar_positions, data, color="skyblue")
    ax.set_title(title)
    ax.set_xlabel("(s_factor, k_factor)", fontsize=12)
    ax.set_ylabel(ylabel)
    # One tick per bar, labelled with the textual form of its configuration
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(tick_labels, rotation=45)
    ax.grid(axis="y", linestyle="--", alpha=0.7)


def plot_seasonal_evaluation_history(
    results: Dict[Tuple[float, float], List[float]]
) -> None:
    """
    Show two stacked bar charts comparing (s_factor, k_factor) configurations:
    the mean MAE loss of the whole season (top) and the Elo ranking accuracy (bottom).

    Args:
        results: A dictionary where keys are tuples representing (s_factor, k_factor)
            configurations, and values are sequences whose first element is the
            seasonal_mean_mae and whose second element is the seasonal_accuracy.

    Returns:
        This function does not return anything; it directly creates a plot.

    """
    # Split the dictionary into the configurations and their two metrics
    configurations = list(results)
    metric_pairs = list(results.values())
    mean_maes = [pair[0] for pair in metric_pairs]
    accuracies = [pair[1] for pair in metric_pairs]

    # One figure, two rows: MAE on top, accuracy below
    fig, (mae_ax, accuracy_ax) = plt.subplots(2, 1, figsize=(14, 10))

    create_metrics_bar_graph(
        mean_maes, configurations, "Mean MAE of the season", "Mean MAE", mae_ax
    )
    create_metrics_bar_graph(
        accuracies,
        configurations,
        "Accuracy of Elo teams' ranking",
        "Accuracy",
        accuracy_ax,
    )

    fig.suptitle("Seasonal Evaluation Metrics for different hyperparams", fontsize=14)
    plt.tight_layout(rect=[0, 0, 1, 0.98])
    plt.show()

To start finding the best combination, it is useful to understand the approximate ranges of values and the relationships between `s_factor` and `k_factor` that are best suited for the Elo rating. Based on existing literature on this topic, we can assume some potential good combinations.

Below are manually defined combinations aiming to test some 'general cases':

1. **`s_factor = 15`, `k_factor = 15`**  
   Both hyperparameters are equal and small.

2. **`s_factor = 400`, `k_factor = 400`**  
   Both hyperparameters are equal and large.

3. **`s_factor = 15`, `k_factor = 400`**  
   `s_factor` is small, while `k_factor` is large.

4. **`s_factor = 400`, `k_factor = 15`**  
   `s_factor` is large, while `k_factor` is small.

Let's test these combinations and see which one performs the best.


In [None]:
# Maps each (s_factor, k_factor) combination to its (seasonal mean MAE, accuracy) pair
seasonal_evaluation_history = {}
# sorted_teams_stats_points is sorted ascending by points; reversing the team list
# yields the actual final ranking with the best team first
actual_teams_rankings = sorted_teams_stats_points["team"].to_list()[::-1]

# The four manually chosen 'general cases', each given as (s_factor, k_factor)
combinations = [(15, 15), (400, 400), (15, 400), (400, 15)]

for combination in combinations:

    s_factor, k_factor = combination

    # Every run starts from the same initial inputs
    teams_elo_stats = calculate_elo_ratings(
        bundesliga_matches_filt,
        teams_elo_stats_init,
        elo_ratings_init,
        s_factor,
        k_factor,
    )

    # Adds the 'mae_loss_history' column needed by the plots and metrics below
    teams_elo_stats = update_loss_history(teams_elo_stats)

    plot_weekly_evaluation_metrics(teams_elo_stats, s_factor, k_factor)

    seasonal_evaluation_history[combination] = calculate_seasonal_evaluational_metrics(
        teams_elo_stats, actual_teams_rankings
    )

# Compare all tested combinations side by side
plot_seasonal_evaluation_history(seasonal_evaluation_history)

The results clearly show the advantage and strength of the combination `s_factor = 400` and `k_factor = 15`.

- **`s_factor = 15` & `k_factor = 15` and `s_factor = 400` & `k_factor = 400`**  
  These combinations show almost similar performance. The Elo ratings evolution produced by these settings is chaotic and does not improve over time, contradicting the performance trends identified earlier. The losses are high and final predictions are not accurate.

- **`s_factor = 15` & `k_factor = 400`**  
  This is by far the worst combination (please do not be misled by accuracy, it is very small). The Elo system evolution is total chaos, failing to show any smoothed trends. The losses are extremely high, with a substantial portion being equal to 1, indicating that the predicted results are completely incorrect.

- **`s_factor = 400` & `k_factor = 15`**  
  This combination is a clear winner. It is the first configuration that, at a high level, reflects real performance results while having the lowest loss rates and good accuracy.

Based on existing literature ([Elo Rating System](https://en.wikipedia.org/wiki/Elo_rating_system), [Rpubs Analysis](https://rpubs.com/DTS098/SPE5AMS_Portfolio02), etc.), the success of the final configuration is understandable, as these values are quite common in general in Elo calculations. The scale factor and the K-factor serve distinct purposes in the Elo rating system, and their values are set differently to balance the system's responsiveness and stability:

- A higher scale factor ensures that expected scores are not overly sensitive to rating differences.
- The K-factor is deliberately kept lower to moderate the impact of individual match results. If it was higher than the scale factor, ratings would change too drastically based on each game, leading to an unstable rating environment.

Thus, the relationship and approximate ranges of `s_factor` and `k_factor` are generally well understood. While `s_factor = 400` and `k_factor = 15` performed very well, exploring other combinations following the consistent logic described above may yield even better results.

In [None]:
def round_to_nearest_half_decade(values: ndarray) -> ndarray:
    """Round every value to the nearest multiple of 5 (e.g., 5, 10, 15).

    The values are expressed in units of 5, rounded to whole units with NumPy's
    rounding, and converted back by multiplying with 5.

    Args:
        values: An array of numerical values to be rounded.

    Returns:
        A NumPy array of values rounded to the nearest half-decade.

    Example:
        >>> values = np.array([12, 23, 37, 49])
        >>> round_to_nearest_half_decade(values)
        array([10., 25., 35., 50.])

    """
    step = 5
    whole_steps = np.round(values / step)
    return whole_steps * step

In [None]:
def generate_combinations(
    s_factor_range: List[int] = [100, 800],
    k_factor_range: List[int] = [10, 50],
    n: int = 10,
    seed: int = 42,
) -> List[Tuple[int, int]]:
    """Generate structured combinations of scale factors and K-factors.

    ``n`` linearly spaced candidates are created for each factor inside the
    given ranges. Scale factors are rounded to the nearest decade (multiple of
    10) and K-factors to the nearest half-decade (multiple of 5). Each list of
    candidates is then shuffled independently (sampling without replacement)
    and the two shuffled lists are paired position by position, so every
    candidate of each factor appears in exactly one returned pair.

    Note:
        The function reseeds the *global* ``random`` and ``np.random``
        generators with ``seed``; code that runs afterwards and relies on
        those generators is affected by this call.

    Args:
        s_factor_range: The range for the scale factors, specified as a list of
            two integers [min, max]. Defaults to [100, 800]. The list is only
            read, never modified.
        k_factor_range: The range for the K-factors, specified as a list of two
            integers [min, max]. Defaults to [10, 50]. The list is only read,
            never modified.
        n (optional): The number of candidate values generated per factor,
            which is also the number of returned pairs. Defaults to 10.
        seed: Random seed for reproducibility. Defaults to 42.

    Returns:
        A list of ``n`` tuples ``(s_factor, k_factor)`` with rounded values.
        Rounding can make neighbouring candidates collapse to the same value,
        so a factor value may occur in more than one pair.

    """
    # Set the seed for reproducibility (this touches the global RNG state)
    random.seed(seed)
    np.random.seed(seed)

    # Linearly spaced candidates give an even coverage of both ranges:
    # s_factors are rounded to the nearest decade,
    # k_factors are rounded to the nearest half decade.
    s_factors = np.round(np.linspace(s_factor_range[0], s_factor_range[1], n), -1)
    k_factors = round_to_nearest_half_decade(
        np.linspace(k_factor_range[0], k_factor_range[1], n)
    )

    # Two independent permutations of the candidate positions; pairing them
    # position by position yields a random one-to-one matching.
    random_indices_s = np.random.choice(range(n), size=n, replace=False)
    random_indices_k = np.random.choice(range(n), size=n, replace=False)

    # round() turns the NumPy floats into plain integers for readable dict keys
    return [
        (round(s_factors[i_s]), round(k_factors[i_k]))
        for i_s, i_k in zip(random_indices_s, random_indices_k)
    ]

In [None]:
# Season-level metrics of every tested pair, keyed by (s_factor, k_factor)
seasonal_evaluation_history = {}

# The s_factor range is kept high so the model does not overreact to small rating differences.
# The k_factor range stays wide: the open question is whether ratings should move
# drastically or steadily after each match.
combinations = generate_combinations(s_factor_range=[300, 600], n=10)

for s_factor, k_factor in combinations:
    # Replay the season with this pair, then attach the loss history to the result
    teams_elo_stats = update_loss_history(
        calculate_elo_ratings(
            bundesliga_matches_filt,
            teams_elo_stats_init,
            elo_ratings_init,
            s_factor,
            k_factor,
        )
    )

    # Week-by-week view for this pair
    plot_weekly_evaluation_metrics(teams_elo_stats, s_factor, k_factor)

    # Season summary for this pair, compared against the real final table
    seasonal_evaluation_history[(s_factor, k_factor)] = (
        calculate_seasonal_evaluational_metrics(teams_elo_stats, actual_teams_rankings)
    )

# Side-by-side comparison of all tested pairs
plot_seasonal_evaluation_history(seasonal_evaluation_history)

All tested combinations **successfully captured the overall trends of team performances**, accurately reflecting the top five teams—Bayern Munich, Borussia Dortmund, Bayer Leverkusen, Borussia Mönchengladbach, and Schalke 04. The majority of the remaining teams were positioned in the middle, with frequent mixing, and two clear outliers, Stuttgart and Hannover 96, which aligns with the actual results of the season.

Interestingly, **the different models showed minimal variation in MAE loss**. Despite this, a noticeable pattern emerged: the Elo rating system performed exceptionally well in predicting wins for some matches (with losses nearly at 0) but exhibited a higher error margin for others (with MAE fluctuating around the 0.5 mark). 

However, some combinations stood out for their accuracy and for producing Elo rating differences that were more distinct and closer to reality. These combinations consistently featured a smaller `k_factor` (while keeping `s_factor` high), suggesting that, **with this dataset, the Elo algorithm performs best when rating adjustments occur more steadily and gradually.**


In [None]:
# Hyperparameters treated as the best pair for the final run
# (presumably picked from the combinations evaluated above: high s_factor, small k_factor)
s_factor_best = 570
k_factor_best = 10

# Recompute the Elo ratings for the whole season with the chosen pair
teams_elo_stats_best = calculate_elo_ratings(
    bundesliga_matches_filt, teams_elo_stats_init, elo_ratings_init, s_factor_best, k_factor_best
)

In [None]:
# Convert every recorded result into points: 1 -> 3, 0.5 -> 1, anything else -> 0
# (the league scoring for win / draw / loss).
points_per_match = teams_elo_stats_best["actual_results_history"].apply(
    lambda results: [
        3 if outcome == 1 else (1 if outcome == 0.5 else 0) for outcome in results
    ]
)

# Store the running total so each list entry is the points collected so far
teams_elo_stats_best["points_gained_history"] = points_per_match.apply(
    lambda points: pd.Series(points).cumsum().tolist()
)

In [None]:
# Two stacked panels: cumulative points actually gained (top) and the Elo rating
# history of the final model (bottom), so both progressions can be compared.
fig, axes = plt.subplots(2, 1, figsize=(14, 15))
create_metrics_line_graph(
    teams_elo_stats_best, 
    'points_gained_history', 
    'Points gained',
    'Actuals points progression by match weeks', 
    axes[0]
)
# NOTE(review): the 3rd and 4th positional arguments look swapped compared with the
# call above (there: short label, then long title; here: long title, then short
# label) - confirm against the signature of create_metrics_line_graph.
create_metrics_line_graph(
    teams_elo_stats_best, 
    'elo_rating_history',
    'Elo rating with best hyperparameters', 
    'Elo ratings', 
    axes[1]
)

# rect limits the layout area to the lower 98% of the figure height
plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()

### The single most surprising win

The most surprising win is defined as the one that produces the highest positive value of `actual_result - expected_win_prob`. Since the `k_factor` is constant throughout the Elo calculation, the most unexpected win is the one that causes the largest increase in Elo rating compared to its previous value. This follows from the Elo rating update formula: `new_rating = current_rating + k_factor * (actual_result - expected_win_prob)`.

In [None]:
def find_highest_increase(df: DataFrame, column_name: str) -> Tuple[str, int]:
    """Locate the largest jump between two consecutive entries of any team's history.

    Every team's history list is scanned pairwise (entry ``i - 1`` vs. entry ``i``)
    and the biggest difference over all teams is kept. On ties the first
    occurrence wins: earlier teams in ``df.index`` first, then earlier positions
    in the list.

    NOTE(review): the reported week is the list position of the higher entry
    plus one, which assumes position 0 of each history belongs to match week 1
    and that no week is missing - confirm against how the history lists are built.

    Args:
        df: DataFrame indexed by team, where ``column_name`` holds one list of
            historical metric values per team.
        column_name: Name of the column with the per-team history lists
            (e.g. the Elo rating history).

    Returns:
        max_team: The team with the highest increase, or ``None`` when no
            history has at least two entries.
        match_week: The match week of that increase (``0`` when no team was found).

    """
    best_gain, best_team, best_position = -np.inf, None, -1

    for team in df.index:
        history = df.loc[team, column_name]
        # Walk through consecutive (previous, current) pairs; position is the
        # list index of `current`.
        consecutive = zip(history, history[1:])
        for position, (previous, current) in enumerate(consecutive, start=1):
            gain = current - previous
            if gain > best_gain:
                best_gain, best_team, best_position = gain, team, position

    # StatsBomb numbers match weeks from 1, list positions start at 0
    return best_team, best_position + 1

In [None]:
# Team and match week of the largest single-step Elo gain in the final model
the_most_suprising_increase_params = find_highest_increase(
    teams_elo_stats_best, "elo_rating_history"
)
surprise_team, surprise_week = the_most_suprising_increase_params

# Look the match up in the raw data: the team took part (home or away) in that week
played_at_home = bundesliga_matches["home_team"] == surprise_team
played_away = bundesliga_matches["away_team"] == surprise_team
in_surprise_week = bundesliga_matches["match_week"] == surprise_week
suprising_match = bundesliga_matches[(played_at_home | played_away) & in_surprise_week]

# This result is logical, as Eintracht Frankfurt was among the worst teams in the season,
# while Borussia Dortmund was the second-best team in the 2015/2016 Bundesliga.
# However, this was the second-to-last match of the season, with nothing at stake for
# Borussia Dortmund, while Eintracht Frankfurt was fighting against relegation -
# a context that is challenging to capture in the Elo algorithm.
display(suprising_match)

### Conclusion

The research developed **a flexible pipeline for Elo calculation**, enabling not only the computation of results but also providing a tool to fine-tune hyperparameters based on evaluation metrics. The pipeline was successfully tested on Bundesliga 2015/2016 data, with the final model accurately predicting the final positions of 13 out of 18 teams. **The model performs well for teams with relatively consistent match outcomes** (e.g., frequently winning or losing); however, it struggles with teams in the middle of the table that have fluctuating results, making them more challenging to predict accurately.

One of **the most critical parameters** in achieving accurate Elo predictions was the **K-factor**, which determines the dynamics of rating changes. Smaller K-factor values tend to yield better results, especially given the limited and inconsistent match data, as gradual and steady rating adjustments are more suitable.

While the final model performs well, there are several suggestions to improve the Elo rating algorithm further:

1. **Use of Historical Data for Initial Elo Ratings**  
   Incorporate previous seasons' data to set better-informed initial Elo ratings, improving the model's starting accuracy.

2. **Dynamic K-factor Adjustment**  
   Implement a dynamic K-factor that is relatively high at the beginning of the season—when teams are still finding their form—and decreases as the season progresses and teams' performances stabilize.

3. **Incorporate Additional Match Context**  
   Consider additional factors beyond the actual match result when updating Elo ratings, such as whether a team plays at home or away, or if the game has higher stakes (e.g., relegation battles). These factors could be integrated with simple additions or subtractions to the rating.

4. **Comprehensive Training and Testing Process**  
   Introduce a full training process using separate train and test datasets to evaluate how well the Elo calculation generalizes to unseen data. For example, the model could be trained on previous seasons to find the best hyperparameters and then used to predict the final results for an unseen season.

These recommendations aim to improve Elo calculation, enhancing its predictive power and adaptability to various team performance patterns and match contexts.