In [44]:
# Read the parquet files
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the three pre-filtered parquet tables from the local "filtered_data" folder.
df_match_info = pd.read_parquet(
    "filtered_data/match_info.parquet"
)
df_match_player_general = pd.read_parquet(
    "filtered_data/match_player_general.parquet"
)
df_timestamp_entfaellt = pd.read_parquet(
    "filtered_data/match_player_timestamp.parquet"
)

In [45]:
# Preview the first rows of df_match_info.
df_match_info.head()

Unnamed: 0,match_id,winning_team,duration_s,objectives_mask_team0,objectives_mask_team1,objectives.destroyed_time_s,objectives.team_objective,objectives.team
0,43993369,Team0,1664,65477,45124,"[392, 361, 649, 397, 483, 651, 1352, 0, 0, 982...","[Tier1Lane3, Tier1Lane1, Tier1Lane4, Tier1Lane...","[Team1, Team1, Team1, Team0, Team0, Team0, Tea..."
1,43993437,Team0,2053,65509,41028,"[472, 1718, 775, 694, 965, 994, 753, 0, 1612, ...","[Tier1Lane3, Tier2Lane3, Tier1Lane4, Tier1Lane...","[Team1, Team1, Team1, Team0, Team1, Team0, Tea..."
2,43993682,Team1,1794,41028,65125,"[910, 625, 719, 576, 617, 617, 1607, 0, 933, 1...","[Tier1Lane1, Tier1Lane1, Tier1Lane4, Tier1Lane...","[Team0, Team1, Team0, Team1, Team1, Team0, Tea..."
3,43993801,Team0,1775,65093,8260,"[755, 257, 807, 534, 536, 1220, 1268, 1686, 81...","[Tier1Lane1, Tier1Lane4, Tier1Lane3, Tier1Lane...","[Team0, Team1, Team0, Team1, Team1, Team1, Tea..."
4,43994099,Team1,1803,41028,65221,"[428, 755, 451, 407, 1510, 952, 1350, 827, 0, ...","[Tier1Lane4, Tier1Lane3, Tier1Lane1, Tier1Lane...","[Team0, Team1, Team1, Team0, Team1, Team0, Tea..."


In [46]:
# Collect every distinct objective name that occurs in any match.
# Cells that are not a list / ndarray (e.g. missing values) are skipped.
all_objs = sorted(
    {
        objective
        for objectives in df_match_info["objectives.team_objective"]
        if isinstance(objectives, (list, np.ndarray))
        for objective in objectives
    }
)
print(all_objs)

['BarrackBossLane1', 'BarrackBossLane3', 'BarrackBossLane4', 'Core', 'Tier1Lane1', 'Tier1Lane3', 'Tier1Lane4', 'Tier2Lane1', 'Tier2Lane3', 'Tier2Lane4', 'Titan', 'TitanShieldGenerator1', 'TitanShieldGenerator2']


In [47]:
# Dense one-hot encoder that returns a pandas DataFrame and ignores unseen categories.
# NOTE(review): `ohe` is not referenced by any of the cells visible here
# (encode_heroes builds its own encoder) - confirm it is still needed.
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
).set_output(transform="pandas")

In [48]:
def encode_objectives_with_team(
    df,
    obj_col="objectives.team_objective",
    team_col="objectives.team",
    match_col="match_id",
    base_cols=("winning_team", "duration_s"),
):
    """Turn the per-match objective lists into one indicator column per (objective, team).

    Each row of ``df`` is expected to hold two equally long list-likes in
    ``obj_col`` and ``team_col``. Every (objective, team) pair found there
    becomes a boolean column named ``"<objective>_Team<n>_lost"``, where ``<n>``
    is the first run of digits in the team label (e.g. ``"Team1"`` -> ``1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Match-level table; it is not modified.
    obj_col : str
        Column holding the list of objective names per match.
    team_col : str
        Column holding the list of team labels, parallel to ``obj_col``.
    match_col : str
        Match identifier used for grouping and merging.
    base_cols : sequence of str
        Additional columns of ``df`` that are carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per match id with ``match_col``, ``base_cols`` and the
        indicator columns. Matches without any usable objective entry are
        dropped because of the inner merge.

    Raises
    ------
    ValueError
        If a team label contains no digits.

    Notes
    -----
    NOTE(review): the function never looks at a destruction-time column; every
    listed (objective, team) pair is flagged as lost. Confirm that this matches
    the meaning of the source data.
    """
    # Only the key and the two list columns are needed for the explode step;
    # dragging the remaining columns along would just copy them per objective.
    exploded = (
        df[[match_col, obj_col, team_col]]
        .explode([obj_col, team_col])
        .dropna(subset=[obj_col, team_col])
    )

    # expand=False yields a Series (not a one-column DataFrame).
    team_digits = exploded[team_col].astype(str).str.extract(r"(\d+)", expand=False)
    unparsed = team_digits.isna()
    if unparsed.any():
        bad_labels = sorted(
            exploded.loc[unparsed.to_numpy(), team_col].astype(str).unique()
        )
        raise ValueError(
            f"Cannot parse a team number from {team_col!r} values: {bad_labels}"
        )
    team_id = team_digits.astype(int)

    labels = (
        exploded[obj_col].astype(str)
        + "_Team"
        + team_id.astype(str)
        + "_lost"
    )

    # One indicator column per label; max() per match collapses the exploded
    # rows back to a single row ("was this pair listed at least once?").
    dummies = pd.get_dummies(labels, dtype=bool)
    df_obj = dummies.groupby(exploded[match_col]).max().reset_index()

    df_base = df[[match_col, *base_cols]].drop_duplicates(subset=[match_col])

    return df_base.merge(df_obj, on=match_col, how="inner")

In [49]:
def reorder_objective_columns(df):
    """Return ``df`` with its columns arranged in a fixed, readable order.

    Order of the result:

    1. the base columns ``match_id``, ``winning_team``, ``duration_s``
       (those that exist),
    2. all string-labelled columns ending in ``"_lost"``, sorted by the
       priority below, then tier, lane, objective name and team number,
    3. every remaining column in its original order.

    Priorities of the ``*_lost`` columns:

    * 0  - Core / Titan
    * 1  - TitanShieldGenerator
    * 2  - Tier1 lanes
    * 3  - Tier2 lanes
    * 4  - Tier3 lanes
    * 5  - Barracks / BarrackBoss
    * 10 - everything else

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be reordered; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` indexed with the new column order.
    """
    base_cols = [c for c in ("match_id", "winning_team", "duration_s") if c in df.columns]

    # Non-string labels cannot be objective columns; skipping them avoids an
    # AttributeError on ``endswith``.
    obj_cols = [c for c in df.columns if isinstance(c, str) and c.endswith("_lost")]

    def sort_key(col):
        # e.g. "Tier1Lane3_Team0_lost" -> "tier1lane3"
        name_lower = col.rsplit("_Team", 1)[0].lower()

        team_match = re.search(r"_Team(\d+)_lost$", col)
        team_id = int(team_match.group(1)) if team_match else 0

        # Tier / lane numbers if the name carries them; 99 sorts them last otherwise.
        tier_lane = re.match(r"tier(\d+)lane(\d+)", name_lower)
        if tier_lane:
            tier, lane = int(tier_lane.group(1)), int(tier_lane.group(2))
        else:
            tier = lane = 99

        # The shield generator must be tested before the generic "titan"
        # check, otherwise its own priority can never be reached.
        if "titanshieldgenerator" in name_lower:
            prio = 1
        elif "core" in name_lower or "titan" in name_lower:
            prio = 0
        elif tier_lane and tier in (1, 2, 3):
            prio = tier + 1  # Tier1 -> 2, Tier2 -> 3, Tier3 -> 4
        elif "barrack" in name_lower:
            prio = 5
        else:
            prio = 10

        return (prio, tier, lane, name_lower, team_id)

    obj_cols_sorted = sorted(obj_cols, key=sort_key)

    already_placed = set(base_cols) | set(obj_cols)
    other_cols = [c for c in df.columns if c not in already_placed]

    return df[base_cols + obj_cols_sorted + other_cols]


In [50]:
# Encode the objectives per match, bring the columns into a stable order
# and persist the result.
encoded_dir = Path("filtered_data/encoded")
# to_parquet does not create missing parent folders, so make sure it exists.
encoded_dir.mkdir(parents=True, exist_ok=True)

df_encoded = reorder_objective_columns(encode_objectives_with_team(df_match_info))
df_encoded.to_parquet(encoded_dir / "match_info_encoded.parquet", index=False)
df_encoded.head()

Unnamed: 0,match_id,winning_team,duration_s,Core_Team0_lost,Core_Team1_lost,Titan_Team0_lost,Titan_Team1_lost,TitanShieldGenerator1_Team0_lost,TitanShieldGenerator1_Team1_lost,TitanShieldGenerator2_Team0_lost,...,Tier2Lane3_Team0_lost,Tier2Lane3_Team1_lost,Tier2Lane4_Team0_lost,Tier2Lane4_Team1_lost,BarrackBossLane1_Team0_lost,BarrackBossLane1_Team1_lost,BarrackBossLane3_Team0_lost,BarrackBossLane3_Team1_lost,BarrackBossLane4_Team0_lost,BarrackBossLane4_Team1_lost
0,43993369,Team0,1664,False,True,False,True,False,True,False,...,True,True,True,True,True,True,False,True,False,True
1,43993437,Team0,2053,False,True,False,True,False,True,False,...,True,True,True,True,False,True,False,True,False,True
2,43993682,Team1,1794,True,False,True,False,True,False,True,...,True,True,True,True,True,False,True,False,True,False
3,43993801,Team0,1775,False,True,False,True,False,True,False,...,True,True,True,True,False,True,True,True,True,True
4,43994099,Team1,1803,True,False,True,False,True,False,True,...,True,True,True,True,True,False,True,False,True,False


In [51]:
# Number of rows in the encoded match table.
len(df_encoded)

51335

## Anteil Match_Player Table

In [52]:
# Preview the first rows of df_match_player_general.
df_match_player_general.head()

Unnamed: 0,match_id,account_id,team,hero_name,net_worth,ability_points,player_level
0,43921898,39226397,Team1,Mina,45370,30,33
1,43921898,87624911,Team1,Vyper,39765,27,31
2,43921898,91484677,Team1,Dynamo,32042,23,27
3,43921898,108815945,Team0,The Doorman,35551,25,29
4,43921898,235571100,Team1,Yamato,38998,26,30


In [53]:
def encode_heroes(df, hero_col="hero_name"):
    """Replace the hero column by one-hot indicator columns.

    A fresh ``OneHotEncoder`` (dense output, pandas output container) is fitted
    on ``df[[hero_col]]``; the resulting indicator columns are appended to the
    remaining columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    hero_col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``hero_col``, followed by the one-hot columns, on the
        same index as ``df``.
    """
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoder.set_output(transform="pandas")

    hero_flags = encoder.fit_transform(df[[hero_col]])
    # Keep the row labels of the input so the concat below aligns row by row.
    hero_flags.index = df.index

    remaining = df.drop(columns=[hero_col])
    return pd.concat([remaining, hero_flags], axis=1)

# One-hot encode the hero names of the player table and persist the result.
encoded_dir = Path("filtered_data/encoded")
# to_parquet does not create missing parent folders, so make sure it exists.
encoded_dir.mkdir(parents=True, exist_ok=True)

df_match_player_general_encoded = encode_heroes(df_match_player_general)
df_match_player_general_encoded.to_parquet(
    encoded_dir / "match_player_general_heroes_encoded.parquet",
    index=False,
)