# Rival Rush - EDA

In [1]:
pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import sys
import json
from typing import List, Dict, Union, Any  # Importiere Any
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:

def zeige_ordnerstruktur(ordner_pfad: str, einrueckung: str = "") -> None:
    """
    Zeigt die Ordnerstruktur rekursiv an, beginnend bei einem gegebenen Pfad.

    Args:
        ordner_pfad: Der Pfad des Ordners, dessen Struktur angezeigt werden soll.
        einrueckung: Ein String, der für die Einrückung der Ausgabe verwendet wird (für die Rekursion).
    """
    pfad = Path(ordner_pfad)

    if not pfad.is_dir():
        print(f"Fehler: Der Pfad '{ordner_pfad}' ist kein gültiger Ordner.")
        return

    print(einrueckung + pfad.name + "/")  # Ordnernamen mit Schrägstrich kennzeichnen
    einrueckung += "  "  # Einrückung für die nächste Ebene erhöhen

    # Dateien und Unterordner im aktuellen Ordner auflisten
    inhalte = sorted(pfad.iterdir()) # sortiert, damit die Ausgabe reproduzierbar ist.
    for inhalt in inhalte:
        if inhalt.is_dir():
            # Rekursiver Aufruf für Unterordner
            zeige_ordnerstruktur(str(inhalt), einrueckung)
        else:
            # Dateien ohne Schrägstrich ausgeben
            print(einrueckung + inhalt.name)

# --- Beispielaufruf ---
# Ersetze 'Replays/Replay Data/' durch den Pfad, dessen Struktur du sehen möchtest
ordner_pfad = 'Replays/Replay Data/'
zeige_ordnerstruktur(ordner_pfad)


Replay Data/
  76561197960320249_DrunKingan/
    replay_data_76561197960320249_1.json
    replay_data_76561197960320249_10.json
    replay_data_76561197960320249_11.json
    replay_data_76561197960320249_12.json
    replay_data_76561197960320249_13.json
    replay_data_76561197960320249_14.json
    replay_data_76561197960320249_15.json
    replay_data_76561197960320249_16.json
    replay_data_76561197960320249_17.json
    replay_data_76561197960320249_18.json
    replay_data_76561197960320249_19.json
    replay_data_76561197960320249_2.json
    replay_data_76561197960320249_20.json
    replay_data_76561197960320249_21.json
    replay_data_76561197960320249_22.json
    replay_data_76561197960320249_23.json
    replay_data_76561197960320249_24.json
    replay_data_76561197960320249_25.json
    replay_data_76561197960320249_26.json
    replay_data_76561197960320249_27.json
    replay_data_76561197960320249_28.json
    replay_data_76561197960320249_29.json
    replay_data_76561197960320249



def lese_json_datei(pfad: str) -> Union[Dict, None]:
    """
    Liest eine JSON-Datei und gibt den Inhalt als Dictionary zurück.
    Behandelt Fehler beim Lesen der Datei.

    Args:
        pfad (str): Der Pfad zur JSON-Datei.

    Returns:
        Union[Dict, None]: Der Inhalt der JSON-Datei als Dictionary, oder None bei Fehler.
    """
    try:
        with open(pfad, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Fehler beim Lesen der JSON-Datei '{pfad}': {e}")
        return None

def extrahiere_daten(daten: Dict) -> Union[Dict, None]:
    """
    Extrahiert die relevanten Daten aus dem geladenen JSON-Dictionary,
    indem es rekursiv durch die Struktur geht.  Gibt None zurück, wenn
    das Format unerwartet ist.

    Args:
        daten (Dict): Das Dictionary, das aus der JSON-Datei geladen wurde.

    Returns:
        Union[Dict, None]: Ein Dictionary mit den extrahierten Daten, oder None, wenn die Struktur nicht passt.
    """
    if not isinstance(daten, dict):
        if daten is None:
            return {}
        else:
            return {"wert": daten} # Wrappe den Wert in ein Dictionary, um ihn einheitlich zu behandeln

    extrahierte_daten = {}
    for schluessel, wert in daten.items():
        if isinstance(wert, (dict, list)):
            # Rekursiver Aufruf für verschachtelte Strukturen
            verschachtelte_daten = extrahiere_daten(wert)
            if verschachtelte_daten: # Prüfe, ob etwas zurückkam
               for k, v in verschachtelte_daten.items():
                    extrahierte_daten[f"{schluessel}_{k}"] = v
        else:
            extrahierte_daten[schluessel] = wert
    return extrahierte_daten


def verarbeite_ordner(ordner_pfad: str) -> pd.DataFrame:
    """
    Verarbeitet alle JSON-Dateien in einem Ordner und gibt ein kombiniertes DataFrame zurück.
    Diese Funktion sucht nach Dateien im Format "replay_data_[id]_[number].json".

    Args:
        ordner_pfad (str): Der Pfad zum Ordner, der die JSON-Dateien enthält.

    Returns:
        pd.DataFrame: Ein Pandas DataFrame, das aus den Daten aller passenden JSON-Dateien im Ordner erstellt wurde.
                      Gibt ein leeres DataFrame zurück, wenn keine Daten verarbeitet werden konnten.
    """
    pfad = Path(ordner_pfad)
    if not pfad.is_dir():
        print(f"Fehler: Der Pfad '{ordner_pfad}' ist kein gültiger Ordner.")
        return pd.DataFrame()  # Leeres DataFrame zurückgeben

    daten_liste: List[Dict] = []
    for datei_pfad in pfad.glob("replay_data_*.json"):  # Geht nur Dateien durch, die mit "replay_data_" beginnen und auf ".json" enden
        datei_inhalt = lese_json_datei(str(datei_pfad))
        if datei_inhalt:  # Nur fortfahren, wenn das JSON erfolgreich gelesen wurde
            extrahierte_daten = extrahiere_daten(datei_inhalt)
            if extrahierte_daten:
                # Füge die Daten der Liste hinzu, zusammen mit dem Dateinamen als identifizierende Spalte.
                extrahierte_daten['dateiname'] = datei_pfad.name  # Speichert den Dateinamen
                daten_liste.append(extrahierte_daten)

    if not daten_liste:
        print(f"Keine verwertbaren JSON-Daten im Ordner '{ordner_pfad}' gefunden (oder keine im richtigen Format).")
        return pd.DataFrame()  # Leeres DataFrame zurückgeben

    return pd.DataFrame(daten_liste)



def verarbeite_alle_ordner(basis_pfad: str) -> pd.DataFrame:
    """
    Verarbeitet alle Unterordner eines gegebenen Basisordners und gibt ein kombiniertes DataFrame zurück.

    Args:
        basis_pfad (str): Der Pfad zum Basisordner, der die Unterordner enthält.

    Returns:
        pd.DataFrame: Ein Pandas DataFrame, das alle Daten aus allen Unterordnern enthält.
                      Gibt ein leeres DataFrame zurück, wenn keine Daten verarbeitet werden konnten.
    """
    basis_pfad_obj = Path(basis_pfad)
    if not basis_pfad_obj.is_dir():
        print(f"Fehler: Der Pfad '{basis_pfad}' ist kein gültiger Basisordner.")
        return pd.DataFrame()

    alle_daten = []
    for unterordner_pfad in basis_pfad_obj.iterdir():
        if unterordner_pfad.is_dir():
            df = verarbeite_ordner(str(unterordner_pfad)) # Ruft die Funktion auf, die Dateien im richtigen Format verarbeitet
            if not df.empty:
                alle_daten.append(df)

    if not alle_daten:
        print(f"Keine verwertbaren JSON-Daten in den Unterordnern von '{basis_pfad}' gefunden.")
        return pd.DataFrame()  # Leeres DataFrame zurückgeben

    return pd.concat(alle_daten, ignore_index=True)



# --- Hauptprogramm ---
if __name__ == "__main__":
    basis_ordner = "Replays/Replay Data"  # Ersetze dies durch deinen tatsächlichen Pfad
    alle_replay_daten_df = verarbeite_alle_ordner(basis_ordner) # Ändere den Namen der DataFrame-Variable

    if not alle_replay_daten_df.empty:
        print("Verarbeitung abgeschlossen. Das kombinierte DataFrame:")
        print(alle_replay_daten_df.head())
    else:
        print("Keine Daten zu verarbeiten.")


def plot_verteilung(df: pd.DataFrame, titel: str):
    """
    Erstellt und speichert einen Boxplot und einen kombinierten Barplot mit KDE für alle numerischen Spalten
    in einem DataFrame.

    Args:
        df (pd.DataFrame): Der DataFrame, der die zu visualisierenden Daten enthält.
        titel (str): Ein Titel für die generierten Plots.
    """
    numerische_spalten = df.select_dtypes(include=['number']).columns
    anzahl_spalten = len(numerische_spalten)
    if anzahl_spalten == 0:
        print(f"Keine numerischen Spalten zum Plotten im DataFrame für '{titel}'.")
        return

    # Boxplots erstellen
    plt.figure(figsize=(15, 5 * anzahl_spalten))  # Dynamische Höhe basierend auf der Anzahl der Spalten
    plt.suptitle(f"Verteilungsplots (Boxplots) für: {titel}", fontsize=16)
    for i, spalte in enumerate(numerische_spalten):
        plt.subplot(anzahl_spalten, 1, i + 1)
        sns.boxplot(data=df[spalte], orient='h')
        plt.title(spalte)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Platz für den Gesamttitel
    plt.savefig(f"{titel}_boxplots.png")
    plt.close()

    # Barplots mit KDE erstellen
    plt.figure(figsize=(15, 5 * anzahl_spalten))
    plt.suptitle(f"Verteilungsplots (Barplots mit KDE) für: {titel}", fontsize=16)
    for i, spalte in enumerate(numerische_spalten):
        plt.subplot(anzahl_spalten, 1, i + 1)
        sns.histplot(data=df, x=spalte, kde=True)
        plt.title(spalte)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
plot_verteilung(alle_replay_daten_df, "Alle Replay Daten")

In [4]:
import json
import pandas as pd
from pathlib import Path
from typing import List, Dict

def process_match_file(file_path: Path) -> List[Dict]:
    """
    Liest eine JSON-Match-Datei ein und gibt eine Liste von Dictionaries
    zurück, jeweils eines pro Spieler mit Match‑ und Spieler‑Daten kombiniert.
    Zusätzlich wird 'team_result' gesetzt:
      1 = win, 0 = lose, 2 = draw
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        match = json.load(f)

    # 1) Match‑Metadaten (ohne Teams)
    match_meta = {k: v for k, v in match.items() if k not in ('blue', 'orange')}

    # 2) Team‑Goals berechnen, sicherheitshalber mit .get
    blue_players   = match.get('blue', {}).get('players', [])
    orange_players = match.get('orange', {}).get('players', [])
    goals_blue     = sum(p.get('stats', {}).get('core', {}).get('goals', 0) for p in blue_players)
    goals_orange   = sum(p.get('stats', {}).get('core', {}).get('goals', 0) for p in orange_players)

    # 3) Ergebnis-Klassen festlegen
    if   goals_blue  > goals_orange:
        result_blue, result_orange = 1, 0
    elif goals_blue  < goals_orange:
        result_blue, result_orange = 0, 1
    else:
        result_blue, result_orange = 2, 2

    # 4) Zeilen pro Spieler erzeugen
    rows = []
    for team_color, players, team_result in [
        ('blue',   blue_players,   result_blue),
        ('orange', orange_players, result_orange)
    ]:
        for player in players:
            row = {
                **match_meta,
                'team_color':  team_color,
                'team_result': team_result,
                **player
            }
            # Stats flach extrahieren
            stats = row.pop('stats', {})
            for cat, d in stats.items():
                for stat_name, stat_val in d.items():
                    row[f'stats_{cat}_{stat_name}'] = stat_val
            # rank, camera, id flachziehen
            for fld in ('rank', 'camera', 'id'):
                sub = row.pop(fld, {})
                for k, v in sub.items():
                    row[f'{fld}_{k}'] = v
            rows.append(row)

    return rows

def build_player_dataframe(base_folder: str) -> pd.DataFrame:
    """
    Durchläuft alle JSON-Dateien in base_folder und Unterordnern,
    erstellt eine flache Zeile pro Spieler pro Match und gibt das
    kombinierte DataFrame zurück.
    """
    all_rows = []
    for fn in Path(base_folder).rglob('replay_data_*.json'):
        try:
            all_rows.extend(process_match_file(fn))
        except Exception as e:
            print(f"Warnung: Fehler in {fn.name}: {e}")

    df = pd.DataFrame(all_rows)

    # Spalten neu ordnen (optional)
    cols = ['match_guid', 'team_color', 'team_result'] + \
           [c for c in df.columns if c not in ('match_guid','team_color','team_result')]
    return df[cols]

# Beispielaufruf:
df_players = build_player_dataframe("Replays/Replay Data")
print(df_players[['match_guid','team_color','team_result']].head())


                         match_guid team_color  team_result
0  FE7C43C611EFB191C11E4E9D5E677929       blue            1
1  FE7C43C611EFB191C11E4E9D5E677929       blue            1
2  FE7C43C611EFB191C11E4E9D5E677929     orange            0
3  FE7C43C611EFB191C11E4E9D5E677929     orange            0
4  8C2D2C3811EFED587ABA1BA3E3DA7507       blue            0


In [5]:
df_players.head(1)

Unnamed: 0,match_guid,team_color,team_result,link,created,uploader,status,rocket_league_id,title,map_code,...,camera_pitch,camera_distance,camera_stiffness,camera_swivel_speed,camera_transition_speed,id_platform,id_id,stats_positioning_goals_against_while_last_defender,recorder,id_player_number
0,FE7C43C611EFB191C11E4E9D5E677929,blue,1,https://ballchasing.com/api/replays/16ad9f5b-f...,2024-12-03T16:23:20.966649Z,"{'steam_id': '76561198164643612', 'name': 'jlt...",ok,D7DC7CC54C2FD50CEA673CAF1CC86532,2024-12-03.17.23 jlt. Ranked Doubles Loss,stadium_foggy_p,...,-5,270,0.35,4.0,1.7,epic,170e3432cb96487db2e91417e70fdc33,,,


In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 2000)

# Ersten 5 Zeilen anzeigen
display(df_players.head(1))

Unnamed: 0,match_guid,team_color,team_result,link,created,uploader,status,rocket_league_id,title,map_code,match_type,team_size,playlist_id,duration,overtime,overtime_seconds,season,season_type,date,date_has_timezone,visibility,min_rank,max_rank,groups,playlist_name,map_name,server,start_time,end_time,name,mvp,car_id,car_name,steering_sensitivity,stats_core_shots,stats_core_shots_against,stats_core_goals,stats_core_goals_against,stats_core_saves,stats_core_assists,stats_core_score,stats_core_mvp,stats_core_shooting_percentage,stats_boost_bpm,stats_boost_bcpm,stats_boost_avg_amount,stats_boost_amount_collected,stats_boost_amount_stolen,stats_boost_amount_collected_big,stats_boost_amount_stolen_big,stats_boost_amount_collected_small,stats_boost_amount_stolen_small,stats_boost_count_collected_big,stats_boost_count_stolen_big,stats_boost_count_collected_small,stats_boost_count_stolen_small,stats_boost_amount_overfill,stats_boost_amount_overfill_stolen,stats_boost_amount_used_while_supersonic,stats_boost_time_zero_boost,stats_boost_percent_zero_boost,stats_boost_time_full_boost,stats_boost_percent_full_boost,stats_boost_time_boost_0_25,stats_boost_time_boost_25_50,stats_boost_time_boost_50_75,stats_boost_time_boost_75_100,stats_boost_percent_boost_0_25,stats_boost_percent_boost_25_50,stats_boost_percent_boost_50_75,stats_boost_percent_boost_75_100,stats_movement_avg_speed,stats_movement_total_distance,stats_movement_time_supersonic_speed,stats_movement_time_boost_speed,stats_movement_time_slow_speed,stats_movement_time_ground,stats_movement_time_low_air,stats_movement_time_high_air,stats_movement_time_powerslide,stats_movement_count_powerslide,stats_movement_avg_powerslide_duration,stats_movement_avg_speed_percentage,stats_movement_percent_slow_speed,stats_movement_percent_boost_speed,stats_movement_percent_supersonic_speed,stats_movement_percent_ground,stats_movement_percent_low_air,stats_movement_percent_high_air,stats_positioning_avg_distance_to_ball,stats_positioning_avg_distance_to_ball_possession,stats_positioning_avg_distance_to_ball_no_possession,stats_positioning_avg_distance_to_mates,stats_positioning_time_defensive_third,stats_positioning_time_neutral_third,stats_positioning_time_offensive_third,stats_positioning_time_defensive_half,stats_positioning_time_offensive_half,stats_positioning_time_behind_ball,stats_positioning_time_infront_ball,stats_positioning_time_most_back,stats_positioning_time_most_forward,stats_positioning_time_closest_to_ball,stats_positioning_time_farthest_from_ball,stats_positioning_percent_defensive_third,stats_positioning_percent_offensive_third,stats_positioning_percent_neutral_third,stats_positioning_percent_defensive_half,stats_positioning_percent_offensive_half,stats_positioning_percent_behind_ball,stats_positioning_percent_infront_ball,stats_positioning_percent_most_back,stats_positioning_percent_most_forward,stats_positioning_percent_closest_to_ball,stats_positioning_percent_farthest_from_ball,stats_demo_inflicted,stats_demo_taken,rank_id,rank_tier,rank_division,rank_name,camera_fov,camera_height,camera_pitch,camera_distance,camera_stiffness,camera_swivel_speed,camera_transition_speed,id_platform,id_id,stats_positioning_goals_against_while_last_defender,recorder,id_player_number
0,FE7C43C611EFB191C11E4E9D5E677929,blue,1,https://ballchasing.com/api/replays/16ad9f5b-f4bd-48f2-a5d7-b6275fed6948,2024-12-03T16:23:20.966649Z,"{'steam_id': '76561198164643612', 'name': 'jlt.', 'profile_url': 'https://steamcommunity.com/id/jlt_rl/', 'avatar': 'https://avatars.steamstatic.com/ec8d9c6c8a5c6d7c91e21b4ca4843884ee9bf83c.jpg'}",ok,D7DC7CC54C2FD50CEA673CAF1CC86532,2024-12-03.17.23 jlt. Ranked Doubles Loss,stadium_foggy_p,Online,2.0,ranked-doubles,362.0,True,11.0,16,free2play,2024-12-03T17:23:20+01:00,True,public,"{'id': 'grand-champion-3', 'tier': 21, 'division': 3, 'name': 'Grand Champion III Division 3'}","{'id': 'grand-champion-3', 'tier': 21, 'division': 3, 'name': 'Grand Champion III Division 3'}","[{'id': 'grand-champion-2s-replays-2024-1-ywj89pqkjn', 'name': 'Grand Champion 2s Replays (2024-12-01) Part 14', 'link': 'https://ballchasing.com/api/groups/grand-champion-2s-replays-2024-1-ywj89pqkjn'}]",Ranked Doubles,DFH Stadium (Stormy),"{'name': 'EU716-Burnout11', 'region': 'EU'}",0.0,362.5843,05Mehdi,True,4284.0,Fennec,1.5,4,10,2,2,6,0,879,True,50.0,476,504.68756,49.69,2898,519,2158,310,740,209,27,5,66,19,556,195,549,47.77,13.865266,39.85,11.566482,120.72,87.92,58.08,87.04,34.12483,24.853006,16.41791,24.60425,1642,557986,86.34,140.9,134.86,186.18,154.41,21.51,8.88,99,0.09,71.391304,37.243855,38.9119,23.844242,51.416737,42.642914,5.940348,2383,2068,2606,3109.0,169.77,125.38,66.96,237.04,125.06,289.51,72.6,162.1,175.2,176.6,161.0,46.88354,18.491617,34.624836,65.462585,34.537422,79.95084,20.049156,47.049606,50.851887,51.258236,46.730328,0,0,grand-champion-3,21.0,3.0,Grand Champion III Division 3,110,100,-5,270,0.35,4.0,1.7,epic,170e3432cb96487db2e91417e70fdc33,,,


# Übersicht der Rocket League Features

## Match‑Metadaten
- **map_code**  
  Code des gespielten Stadions (z. B. `chn_stadium_p` für Champions Field).
- **match_type**  
  „Online“, „Local“ etc.
- **team_size**  
  Spieler pro Team (z. B. 2 für Doubles, 3 für Standard).
- **duration**  
  Spieldauer in Sekunden (ohne Overtime).
- **overtime**  
  Boolean: ging das Match in die Verlängerung?
- **overtime_seconds**  
  Länge der Overtime in Sekunden (falls zutreffend).
- **season**  
  Saison‑Nummer im Ranked‑Play (z. B. 18).
- **date**  
  Datum/Uhrzeit des Matches (Zeitstempel).

## Zeitangaben pro Spieler
- **start_time** / **end_time**  
  Sekunden seit Matchbeginn, in denen der Spieler aktiv war.
  
## Spieler‑Attribute
- **player_id**  
  Eindeutige ID pro Spieler-Name.
- **name**  
  In‑Game‑Name.
- **mvp**  
  Boolean: war dieser Spieler MVP des Matches?
- **steering_sensitivity**  
  Lenksensitivität (Input‑Empfindlichkeit).

## Core‑Stats (`stats_core_*`)
- **shots** / **shots_against**  
  Eigene Schüsse / auf dieses Tor abgegebene Schüsse.
- **goals** / **goals_against**  
  Eigene Tore / Gegentore erhalten (als Torwart).
- **saves**  
  Abgefangene Schüsse.
- **assists**  
  Vorlagen‑Assists.
- **score**  
  Punkte‑Summe (Tore, Saves, Assists, Cleans, etc.).
- **mvp**  
  Boolean: MVP‑Status des Spielers.
- **shooting_percentage**  
  Verhältnis Tore zu Schüssen (%).

## Boost‑Stats (`stats_boost_*`)
- **bpm**  
  Boost‑Picks pro Minute.
- **bcpm**  
  Boost‑Picks pro Minute, gewichtet.
- **avg_amount**  
  Durchschnittlicher Boost‑Füllstand.
- **amount_collected** / **amount_stolen**  
  Gesammelter / gestohlener Boost.
- **amount_collected_big/small**, **count_collected_big/small**  
  Große vs. kleine Boost‑Packs und deren Zählung.
- **amount_overfill**  
  Überfüllter Boost (verlorener Überschuss).
- **amount_used_while_supersonic**  
  Boost‑Nutzung bei Überschalltempo.
- **time_zero_boost** / **time_full_boost**  
  Zeit ohne/mit voller Boost‑Bar.
- **time_boost_0_25**, … , **time_boost_75_100**  
  Zeit in Boost‑Bereichen (0–25 %, 25–50 % etc.).
- **percent_…**  
  Entsprechende Zeiten in %.

## Movement‑Stats (`stats_movement_*`)
- **avg_speed**  
  Durchschnittsgeschwindigkeit (Studs/s).
- **total_distance**  
  Gesamte zurückgelegte Distanz (Studs).
- **time_supersonic_speed**, **time_boost_speed**, **time_slow_speed**  
  Zeit in verschiedenen Geschwindigkeitszonen.
- **time_ground** / **time_low_air** / **time_high_air**  
  Zeit auf Boden, niedriger bzw. hoher Luft.
- **time_powerslide** / **count_powerslide**  
  Zeit und Anzahl von Powerslides.
- **avg_powerslide_duration**  
  Durchschnittliche Dauer pro Powerslide.
- **avg_speed_percentage**  
  Durchschnittsgeschwindigkeit relativ zur Maximalgeschwindigkeit.
- **percent_…**  
  Entsprechende Zeiten in %.

## Positioning‑Stats (`stats_positioning_*`)
- **avg_distance_to_ball**  
  Durchschnittlicher Abstand zum Ball (Studs).
- **avg_distance_to_ball_possession** / **_no_possession**  
  Abstand bei eigener Ballkontrolle / ohne Kontrolle.
- **avg_distance_to_mates**  
  Abstand zu Teamkollegen.
- **time_defensive_third**, **time_neutral_third**, **time_offensive_third**  
  Zeit in den Spielfeld-Dritteln (abwehrend / neutral / offensiv).
- **time_defensive_half** / **time_offensive_half**  
  Zeit in der eigenen / gegnerischen Spielfeldhälfte.
- **time_behind_ball** / **time_infront_ball**  
  Zeit hinter / vor dem Ball.
- **time_most_back** / **time_most_forward**  
  Zeit in der rückwärtigsten bzw. vordersten Position im Team.
- **time_closest_to_ball** / **time_farthest_from_ball**  
  Zeit am nächsten / am weitesten entfernt vom Ball.
- **percent_…**  
  Entsprechende Zeiten in %.
- **goals_against_while_last_defender**  
  Gegentore erhalten als letzter Verteidiger.

## Demo‑Stats (`stats_demo_*`)
- **inflicted**  
  Gegner durch Demos zerstört.
- **taken**  
  Selbst durch Gegner-Demos zerstört.

## Rank‑Daten
- **rank_tier**  
  Tier‑Ebene (z. B. 20 = Grand Champion II).
- **rank_division**  
  Division innerhalb des Tiers (1–4).


In [7]:
features = [
    "team_color", "team_result",
    "map_code", "match_type", "team_size", "duration", "overtime", "overtime_seconds",
    "season", "date", "start_time", "end_time", "name", "mvp", "steering_sensitivity",
    "stats_core_shots", "stats_core_shots_against", "stats_core_goals", "stats_core_goals_against",
    "stats_core_saves", "stats_core_assists", "stats_core_score", "stats_core_mvp",
    "stats_core_shooting_percentage", "stats_boost_bpm", "stats_boost_bcpm",
    "stats_boost_avg_amount", "stats_boost_amount_collected", "stats_boost_amount_stolen",
    "stats_boost_amount_collected_big", "stats_boost_amount_stolen_big",
    "stats_boost_amount_collected_small", "stats_boost_amount_stolen_small",
    "stats_boost_count_collected_big", "stats_boost_count_stolen_big",
    "stats_boost_count_collected_small", "stats_boost_count_stolen_small",
    "stats_boost_amount_overfill", "stats_boost_amount_overfill_stolen",
    "stats_boost_amount_used_while_supersonic", "stats_boost_time_zero_boost",
    "stats_boost_percent_zero_boost", "stats_boost_time_full_boost",
    "stats_boost_percent_full_boost", "stats_boost_time_boost_0_25",
    "stats_boost_time_boost_25_50", "stats_boost_time_boost_50_75",
    "stats_boost_time_boost_75_100", "stats_boost_percent_boost_0_25",
    "stats_boost_percent_boost_25_50", "stats_boost_percent_boost_50_75",
    "stats_boost_percent_boost_75_100", "stats_movement_avg_speed",
    "stats_movement_total_distance", "stats_movement_time_supersonic_speed",
    "stats_movement_time_boost_speed", "stats_movement_time_slow_speed",
    "stats_movement_time_ground", "stats_movement_time_low_air",
    "stats_movement_time_high_air", "stats_movement_time_powerslide",
    "stats_movement_count_powerslide", "stats_movement_avg_powerslide_duration",
    "stats_movement_avg_speed_percentage", "stats_movement_percent_slow_speed",
    "stats_movement_percent_boost_speed", "stats_movement_percent_supersonic_speed",
    "stats_movement_percent_ground", "stats_movement_percent_low_air",
    "stats_movement_percent_high_air", "stats_positioning_avg_distance_to_ball",
    "stats_positioning_avg_distance_to_ball_possession",
    "stats_positioning_avg_distance_to_ball_no_possession",
    "stats_positioning_avg_distance_to_mates", "stats_positioning_time_defensive_third",
    "stats_positioning_time_neutral_third", "stats_positioning_time_offensive_third",
    "stats_positioning_time_defensive_half", "stats_positioning_time_offensive_half",
    "stats_positioning_time_behind_ball", "stats_positioning_time_infront_ball",
    "stats_positioning_time_most_back", "stats_positioning_time_most_forward",
    "stats_positioning_time_closest_to_ball", "stats_positioning_time_farthest_from_ball",
    "stats_positioning_percent_defensive_third", "stats_positioning_percent_offensive_third",
    "stats_positioning_percent_neutral_third", "stats_positioning_percent_defensive_half",
    "stats_positioning_percent_offensive_half", "stats_positioning_percent_behind_ball",
    "stats_positioning_percent_infront_ball", "stats_positioning_percent_most_back",
    "stats_positioning_percent_most_forward", "stats_positioning_percent_closest_to_ball", "stats_positioning_goals_against_while_last_defender",
    "stats_positioning_percent_farthest_from_ball", "stats_demo_inflicted", "stats_demo_taken",
    "rank_tier", "rank_division"
]

# Filtere das DataFrame
df_selected = df_players[features].copy()

# Erzeuge eindeutige Spieler-IDs
df_selected.insert(0, 'player_id', pd.factorize(df_selected['name'])[0] + 1)

# Stelle sicher, dass 'name' an zweiter Stelle steht
cols = df_selected.columns.tolist()
cols.insert(1, cols.pop(cols.index('name')))
df_selected = df_selected[cols]

In [8]:
df_selected.head(10)

Unnamed: 0,player_id,name,team_color,team_result,map_code,match_type,team_size,duration,overtime,overtime_seconds,season,date,start_time,end_time,mvp,steering_sensitivity,stats_core_shots,stats_core_shots_against,stats_core_goals,stats_core_goals_against,stats_core_saves,stats_core_assists,stats_core_score,stats_core_mvp,stats_core_shooting_percentage,stats_boost_bpm,stats_boost_bcpm,stats_boost_avg_amount,stats_boost_amount_collected,stats_boost_amount_stolen,stats_boost_amount_collected_big,stats_boost_amount_stolen_big,stats_boost_amount_collected_small,stats_boost_amount_stolen_small,stats_boost_count_collected_big,stats_boost_count_stolen_big,stats_boost_count_collected_small,stats_boost_count_stolen_small,stats_boost_amount_overfill,stats_boost_amount_overfill_stolen,stats_boost_amount_used_while_supersonic,stats_boost_time_zero_boost,stats_boost_percent_zero_boost,stats_boost_time_full_boost,stats_boost_percent_full_boost,stats_boost_time_boost_0_25,stats_boost_time_boost_25_50,stats_boost_time_boost_50_75,stats_boost_time_boost_75_100,stats_boost_percent_boost_0_25,stats_boost_percent_boost_25_50,stats_boost_percent_boost_50_75,stats_boost_percent_boost_75_100,stats_movement_avg_speed,stats_movement_total_distance,stats_movement_time_supersonic_speed,stats_movement_time_boost_speed,stats_movement_time_slow_speed,stats_movement_time_ground,stats_movement_time_low_air,stats_movement_time_high_air,stats_movement_time_powerslide,stats_movement_count_powerslide,stats_movement_avg_powerslide_duration,stats_movement_avg_speed_percentage,stats_movement_percent_slow_speed,stats_movement_percent_boost_speed,stats_movement_percent_supersonic_speed,stats_movement_percent_ground,stats_movement_percent_low_air,stats_movement_percent_high_air,stats_positioning_avg_distance_to_ball,stats_positioning_avg_distance_to_ball_possession,stats_positioning_avg_distance_to_ball_no_possession,stats_positioning_avg_distance_to_mates,stats_positioning_time_defensive_third,stats_positioning_time_neutral_third,stats_positioning_time_offensive_third,stats_positioning_time_defensive_half,stats_positioning_time_offensive_half,stats_positioning_time_behind_ball,stats_positioning_time_infront_ball,stats_positioning_time_most_back,stats_positioning_time_most_forward,stats_positioning_time_closest_to_ball,stats_positioning_time_farthest_from_ball,stats_positioning_percent_defensive_third,stats_positioning_percent_offensive_third,stats_positioning_percent_neutral_third,stats_positioning_percent_defensive_half,stats_positioning_percent_offensive_half,stats_positioning_percent_behind_ball,stats_positioning_percent_infront_ball,stats_positioning_percent_most_back,stats_positioning_percent_most_forward,stats_positioning_percent_closest_to_ball,stats_positioning_goals_against_while_last_defender,stats_positioning_percent_farthest_from_ball,stats_demo_inflicted,stats_demo_taken,rank_tier,rank_division
0,1,05Mehdi,blue,1,stadium_foggy_p,Online,2.0,362.0,True,11.0,16,2024-12-03T17:23:20+01:00,0.0,362.5843,True,1.5,4,10,2,2,6,0,879,True,50.0,476,504.68756,49.69,2898,519,2158,310,740,209,27,5,66,19,556,195,549,47.77,13.865266,39.85,11.566482,120.72,87.92,58.08,87.04,34.12483,24.853006,16.41791,24.60425,1642,557986,86.34,140.9,134.86,186.18,154.41,21.51,8.88,99,0.09,71.391304,37.243855,38.9119,23.844242,51.416737,42.642914,5.940348,2383,2068,2606,3109.0,169.77,125.38,66.96,237.04,125.06,289.51,72.6,162.1,175.2,176.6,161.0,46.88354,18.491617,34.624836,65.462585,34.537422,79.95084,20.049156,47.049606,50.851887,51.258236,,46.730328,0,0,21.0,3.0
1,2,Velho.,blue,1,stadium_foggy_p,Online,2.0,362.0,True,11.0,16,2024-12-03T17:23:20+01:00,0.0,362.5843,,1.44,4,10,1,2,0,0,327,False,25.0,342,398.10757,53.56,2286,380,1562,205,724,175,20,4,68,15,477,196,192,12.68,3.680376,53.61,15.560329,79.88,93.43,66.27,118.85,22.28608,26.066458,18.488964,33.158497,1602,542475,54.0,156.23,151.67,215.61,140.42,5.87,6.81,52,0.13,69.652176,41.909367,43.169384,14.921249,59.577232,38.800774,1.621995,2483,2295,2595,3109.0,172.29,129.87,59.74,254.47,107.43,280.66,81.24,175.5,162.9,161.5,176.6,47.607075,16.507322,35.885605,70.315,29.684996,77.55181,22.44819,50.93896,47.281803,46.875454,2.0,51.258236,1,0,21.0,3.0
2,3,jlt.,orange,0,stadium_foggy_p,Online,2.0,362.0,True,11.0,16,2024-12-03T17:23:20+01:00,0.0,362.5843,,2.0,6,8,2,3,3,0,760,False,33.333332,420,447.91455,45.04,2572,810,1796,542,776,268,22,6,69,23,209,57,86,51.31,14.892753,21.22,6.159115,146.36,74.73,52.42,80.7,41.320118,21.097654,14.799129,22.783092,1562,528098,56.33,150.43,155.21,204.24,145.81,11.91,9.11,95,0.1,67.91304,42.879242,41.558685,15.562062,56.426125,40.28346,3.290419,2097,1988,2196,2804.0,169.45,106.16,86.36,226.26,135.71,271.5,90.47,157.9,183.7,182.5,159.0,46.81327,23.85833,29.328398,62.507942,37.49206,75.00622,24.993784,45.830547,53.319016,52.970715,2.0,46.149826,0,0,21.0,3.0
3,4,Feetlover34,orange,0,stadium_foggy_p,Online,2.0,362.0,True,11.0,16,2024-12-03T17:23:20+01:00,0.0,362.5843,,1.5,4,8,0,3,3,2,510,False,0.0,416,405.42188,44.65,2328,348,1670,176,658,172,18,2,58,16,149,41,261,60.25,17.487593,27.14,7.877398,150.65,71.42,51.7,73.89,43.332565,20.54306,14.870852,21.253525,1506,506552,42.69,146.4,169.61,203.85,135.67,19.18,3.85,48,0.08,65.478264,47.284637,40.814045,11.90131,56.83022,37.822693,5.347086,2357,2419,2332,2804.0,178.78,112.1,67.82,248.85,109.85,266.33,92.37,184.3,154.2,155.9,182.7,49.84109,18.907164,31.251741,69.37552,30.624475,74.24867,25.751326,53.493164,44.756626,45.25005,1.0,53.028763,0,1,21.0,3.0
4,4,Feetlover34,blue,0,eurostadium_night_p,Online,1.0,389.0,False,,17,2025-02-17T19:03:45+01:00,0.0,389.00842,,1.5,12,9,4,5,3,0,1035,False,33.333332,460,467.7343,43.92,2799,560,2226,313,573,247,26,4,51,22,385,88,306,55.63,15.493665,34.09,9.4945,153.41,75.36,52.45,77.99,42.70761,20.979372,14.601487,21.711535,1459,504026,51.29,125.85,205.08,206.22,162.89,13.1,5.38,76,0.07,63.434784,53.654964,32.926064,13.418973,53.95463,42.617935,3.427435,1664,1208,2073,,158.48,116.47,107.26,226.06,156.15,274.12,108.1,351.9,351.9,351.9,351.9,41.46412,28.063107,30.472778,59.145496,40.854504,71.71786,28.28214,98.00864,98.00864,98.00864,5.0,98.00864,2,2,19.0,3.0
5,5,Pineappl,orange,1,eurostadium_night_p,Online,1.0,389.0,False,,17,2025-02-17T19:03:45+01:00,0.0,389.00842,True,2.9,9,12,5,4,6,0,1259,True,55.555557,414,476.92523,45.51,2854,687,2074,421,780,266,28,5,65,23,672,25,110,43.66,12.159866,46.9,13.062248,116.57,93.15,69.91,93.19,31.2671,24.985247,18.751677,24.995975,1457,501510,49.39,138.96,193.6,207.1,172.69,2.17,7.71,90,0.09,63.347828,50.68726,36.381725,12.931011,54.220333,45.211536,0.568122,1665,1627,1761,,178.49,120.95,82.51,246.47,135.48,314.99,66.96,351.7,351.7,351.7,351.7,46.731247,21.602303,31.666447,64.52939,35.47061,82.46891,17.531092,97.952934,97.952934,97.952934,4.0,97.952934,2,2,19.0,4.0
6,6,Jrxnz,blue,0,cs_p,Online,2.0,336.0,False,,18,2025-04-14T22:45:53+01:00,0.0,336.3606,,1.4,2,11,0,2,3,0,404,False,0.0,395,431.35925,57.94,2332,774,1721,558,611,216,22,8,48,20,495,245,127,21.71,6.692974,72.03,22.206123,68.74,67.18,67.32,131.66,20.52553,20.05972,20.101522,39.31323,1501,478646,54.48,120.55,161.18,195.33,127.5,13.38,3.84,33,0.12,65.26087,47.940273,35.855568,16.204159,58.097614,37.922726,3.979655,1879,1645,2062,2407.0,152.46,104.72,79.04,208.59,127.62,238.81,97.4,165.0,173.6,174.3,164.3,45.345314,23.508417,31.146273,62.041584,37.95842,71.030014,28.96999,50.867836,53.51913,53.734932,1.0,50.652035,1,0,,
7,7,Ekb511,blue,0,cs_p,Online,2.0,336.0,False,,18,2025-04-14T22:45:53+01:00,0.0,336.3606,,1.5,4,11,1,2,0,0,308,False,25.0,359,379.01163,48.21,2049,526,1278,339,771,187,16,4,70,16,354,60,294,19.11,5.89142,45.12,13.910041,81.36,101.54,67.83,79.0,24.674736,30.794895,20.571379,23.958998,1501,473361,55.67,113.34,163.99,208.95,99.23,24.83,2.88,21,0.14,65.26087,49.246246,34.036037,16.717718,62.74587,29.797905,7.456233,1855,1654,2011,2407.0,159.32,96.84,76.84,212.28,120.73,256.42,76.59,172.6,155.9,155.2,173.3,47.843845,23.075073,29.081081,63.74583,36.254166,77.000694,22.999308,53.210842,48.062397,47.846596,1.0,53.426643,2,1,,
8,8,Spieglain,orange,1,cs_p,Online,2.0,336.0,False,,18,2025-04-14T22:45:53+01:00,0.0,336.3606,True,1.5,5,6,1,1,3,0,603,True,20.0,360,381.4163,50.08,2062,662,1197,366,865,296,15,4,77,26,332,47,146,38.27,11.798255,20.86,6.430928,114.8,79.94,66.7,71.93,34.43621,23.979362,20.007797,21.576628,1514,486546,43.99,132.11,160.08,188.72,126.09,21.36,8.21,78,0.11,65.82609,47.617348,39.2974,13.085252,56.138268,37.50781,6.353929,2462,2186,2794,3072.0,152.74,111.15,72.29,212.99,123.18,265.19,70.99,185.8,147.4,158.3,175.1,45.434,21.503363,33.062645,63.357822,36.642174,78.88334,21.116665,57.280266,45.441933,48.802296,,53.981564,1,0,,
9,4,Feetlover34,orange,1,cs_p,Online,2.0,336.0,False,,18,2025-04-14T22:45:53+01:00,0.0,336.3606,,1.5,6,6,1,1,2,1,500,False,16.666666,372,365.87848,43.86,1978,344,1406,214,572,130,16,3,46,12,203,85,200,40.79,12.575146,17.88,5.512223,141.93,79.2,31.25,65.77,44.61103,24.893917,9.822411,20.672638,1397,434153,36.2,116.24,174.72,173.93,131.03,22.19,4.27,50,0.09,60.739132,53.40506,35.530014,11.064922,53.165215,40.051964,6.782822,2229,1989,2515,3072.0,147.52,105.3,74.34,207.16,120.0,258.63,68.52,147.2,176.9,166.7,157.2,45.091087,22.722824,32.186085,63.320698,36.6793,79.05548,20.944519,45.380276,54.536488,51.39193,1.0,48.463177,0,3,,


In [9]:
df_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242308 entries, 0 to 242307
Columns: 102 entries, player_id to rank_division
dtypes: bool(2), float64(62), int64(32), object(6)
memory usage: 185.3+ MB


In [10]:
df_selected['date'] = pd.to_datetime(df_selected['date'])

# Überprüfen des Datentyps und Anzeige
df_selected['date'].dtype, df_selected
df_selected.to_csv("matches.csv", index=False)

  df_selected['date'] = pd.to_datetime(df_selected['date'])


In [11]:
df_selected.to_csv("matches.csv", index=False)

In [12]:
output_dir = "plots"
os.makedirs(output_dir, exist_ok=True)

# Numerische Features auswählen
numeric = df_selected.select_dtypes(include=['number']).loc[:, df_selected.nunique() > 1]

# Viridis Colormap
cmap = plt.cm.viridis

for idx, feature in enumerate(numeric.columns):
    # Farbwerte aus Viridis generieren
    hist_color = cmap(0.6)
    kde_color = cmap(0.3)
    box_color = cmap(0.9)
    
    fig, axes = plt.subplots(1, 2, figsize=(10, 4), facecolor='#2b2b2b')
    for ax in axes:
        ax.set_facecolor('#3c3f41')
        for spine in ax.spines.values():
            spine.set_color('#bbbbbb')
        ax.tick_params(colors='#dddddd')
        ax.xaxis.label.set_color('#ffffff')
        ax.yaxis.label.set_color('#ffffff')
        ax.title.set_color('#ffffff')
    
    # Histogram + KDE
    axes[0].hist(numeric[feature].dropna(), bins=30, density=True,
                 facecolor=hist_color, edgecolor='none', alpha=0.7)
    numeric[feature].plot(kind='kde', ax=axes[0], color=kde_color, linewidth=2)
    axes[0].set_title(f'{feature}\nHistogram & KDE')
    axes[0].set_xlabel('Wert')
    axes[0].set_ylabel('Dichte')

    # Boxplot
    axes[1].boxplot(numeric[feature].dropna(), vert=False,
                    patch_artist=True,
                    boxprops=dict(facecolor=box_color, edgecolor='white', alpha=0.8),
                    medianprops=dict(color='white'),
                    whiskerprops=dict(color='white'),
                    capprops=dict(color='white'),
                    flierprops=dict(markerfacecolor='white', markeredgecolor='white'))
    axes[1].set_title(f'{feature}\nBoxplot')
    axes[1].set_xlabel('Wert')

    plt.tight_layout()
    filename = os.path.join(output_dir, f"{feature}.png")
    plt.savefig(filename, facecolor=fig.get_facecolor())
    plt.close(fig)

In [13]:
rank_counts = df_selected['rank_tier'].value_counts(dropna=False).sort_index()

# Als DataFrame aufbereiten
df_rank_counts = pd.DataFrame({
    'rank_tier': rank_counts.index,
    'count': rank_counts.values
})

# Ausgabe
print(df_rank_counts)

    rank_tier   count
0         1.0       1
1         2.0       1
2         7.0       3
3         8.0      14
4         9.0      16
5        10.0      24
6        11.0      44
7        12.0      68
8        13.0     188
9        14.0     395
10       15.0     921
11       16.0    2911
12       17.0    7641
13       18.0   16802
14       19.0   32901
15       20.0   33615
16       21.0   16444
17       22.0    4899
18        NaN  125420


In [14]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# --- 1) Custom Transformer for datetime features ---
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, col='date'):
        self.col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Parse timezone-aware and convert to naive UTC
        dt = pd.to_datetime(X[self.col], utc=True)
        dt_naive = dt.dt.tz_convert('UTC').dt.tz_localize(None)
        # Epoch timestamp in seconds and cyclical features
        ts = dt_naive.view('int64') // 10**9
        return pd.DataFrame({
            'timestamp': ts,
            'hour_sin': np.sin(2 * np.pi * dt_naive.dt.hour / 24),
            'hour_cos': np.cos(2 * np.pi * dt_naive.dt.hour / 24),
            'weekday_sin': np.sin(2 * np.pi * dt_naive.dt.weekday / 7),
            'weekday_cos': np.cos(2 * np.pi * dt_naive.dt.weekday / 7)
        }, index=X.index)

# --- 2) Functions for feature engineering ---

def aggregate_player_profiles(df, window=None):
    group = df.groupby('player_id')
    stats = df.filter(regex='stats_core_|stats_boost_|stats_movement_|stats_positioning_')
    aggs = ['mean', 'std', 'min', 'max']
    profile = group[stats.columns].agg(aggs)
    profile.columns = [f"{col}_{stat}" for col, stat in profile.columns]
    profile = profile.reset_index()
    if window:
        profile = profile.groupby('player_id').rolling(window).mean().reset_index(level=0, drop=True).reset_index()
    return profile

def compute_team_features(match_df):
    # Assumes match_df has 'match_guid', 'team_color', and stats columns
    stats = match_df.filter(regex='stats_')
    blue = match_df[match_df['team_color']=='blue'].set_index('match_guid')
    orange = match_df[match_df['team_color']=='orange'].set_index('match_guid')
    # Difference features
    diff = blue[stats.columns].subtract(orange[stats.columns], axis=1).add_prefix('diff_')
    # Correlation features
    corr = blue[stats.columns].corrwith(orange[stats.columns]).to_frame().T.add_prefix('corr_')
    corr.index = diff.index
    return pd.concat([diff, corr], axis=1).reset_index()

# --- 3) Build preprocessing pipeline ---

# Define feature lists
numeric_features = [
    col for col in df_selected.columns 
    if df_selected[col].dtype in [np.int64, np.float64] and col not in ['player_id']
]
categorical_features = ['map_code', 'match_type']

# Pipelines for numeric and categorical
numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine into ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
    ('dt', DateTimeTransformer(col='date'), ['date']),
], remainder='drop')

# --- 4) Example: apply preprocessing ---

# Assume df_selected is your cleaned DataFrame
X = df_selected.copy()
# Placeholder target for pipeline demonstration
y = (df_selected['mvp'] == True).astype(int)

# Fit and transform
X_preprocessed = preprocessor.fit_transform(X)

print("Shape of preprocessed feature matrix:", X_preprocessed.shape)


Shape of preprocessed feature matrix: (242308, 177)


  ts = dt_naive.view('int64') // 10**9


In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# -----------------------------
# 1) JSON → DataFrame Aufbau
# -----------------------------
def process_match_file(file_path: Path) -> List[Dict]:
    with open(file_path, 'r', encoding='utf-8') as f:
        match = json.load(f)
    # Match‑Meta
    meta = {k: v for k, v in match.items() if k not in ('blue','orange')}
    # Team Goals
    blue_players   = match.get('blue', {}).get('players', [])
    orange_players = match.get('orange', {}).get('players', [])
    goals_blue     = sum(p.get('stats', {}).get('core', {}).get('goals', 0) for p in blue_players)
    goals_orange   = sum(p.get('stats', {}).get('core', {}).get('goals', 0) for p in orange_players)
    # Determine result
    if   goals_blue > goals_orange: res_blue, res_orange = 1, 0
    elif goals_blue < goals_orange: res_blue, res_orange = 0, 1
    else:                           res_blue, res_orange = 2, 2

    rows = []
    for team_color, players, team_res in [
        ('blue', blue_players, res_blue),
        ('orange', orange_players, res_orange)
    ]:
        for p in players:
            row = {**meta,
                   'team_color':   team_color,
                   'team_result':  team_res,
                   **p}
            stats = row.pop('stats', {})
            for cat, d in stats.items():
                for name, val in d.items():
                    row[f'stats_{cat}_{name}'] = val
            for fld in ('rank','camera','id'):
                sub = row.pop(fld, {})
                for k,v in sub.items():
                    row[f'{fld}_{k}'] = v
            rows.append(row)
    return rows

def build_player_dataframe(folder: str) -> pd.DataFrame:
    all_rows = []
    for fn in Path(folder).rglob('replay_data_*.json'):
        try:
            all_rows.extend(process_match_file(fn))
        except Exception:
            continue
    df = pd.DataFrame(all_rows)
    cols = ['match_guid','team_color','team_result'] + [c for c in df.columns 
           if c not in ('match_guid','team_color','team_result')]
    return df[cols]

# Load data
df_players = build_player_dataframe("Replays/Replay Data")

# -----------------------------
# 2) Zielvariable & Features
# -----------------------------
# Target: team_result (0=lose,1=win,2=draw)
y = df_players['team_result']

# Feature selection: drop identifiers and nested
drop_cols = ['match_guid','team_result','team_color','name',
             'id_platform','id_id','title','uploader','link']
X = df_players.drop(columns=[c for c in drop_cols if c in df_players.columns])

# -----------------------------
# 3) Preprocessing Pipeline
# -----------------------------
# datetime transformer
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,col='date'): self.col=col
    def fit(self,X,y=None): return self
    def transform(self,X):
        dt = pd.to_datetime(X[self.col], utc=True)
        dtn = dt.dt.tz_convert('UTC').dt.tz_localize(None)
        ts = dtn.view('int64')//10**9
        return pd.DataFrame({
            'timestamp': ts,
            'hour_sin':  np.sin(2*np.pi*dtn.dt.hour/24),
            'hour_cos':  np.cos(2*np.pi*dtn.dt.hour/24),
            'wd_sin':    np.sin(2*np.pi*dtn.dt.weekday/7),
            'wd_cos':    np.cos(2*np.pi*dtn.dt.weekday/7),
        }, index=X.index)

# Numeric & Cat features
num_feats = [c for c in X.columns if X[c].dtype in [np.int64,np.float64]]
cat_feats = ['map_code','match_type']

num_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])
cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_feats),
    ('cat', cat_pipe, cat_feats),
    ('dt',  DateTimeTransformer(col='date'), ['date'])
], remainder='drop')

# -----------------------------
# 4) Train/Test Split & Model
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_tr = preprocessor.fit_transform(X_train)
X_te = preprocessor.transform(X_test)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_tr, y_train)

print("CV AUC:", cross_val_score(model, X_tr, y_train, cv=5, scoring='roc_auc_ovr').mean())
print("Test Score:", model.score(X_te, y_test))


  ts = dtn.view('int64')//10**9
  ts = dtn.view('int64')//10**9


In [None]:
df_selected