In [9]:
# Read the parquet files
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import re

# Match-level table (one row per match in the preview shown by the next cell).
df_match_info = pd.read_parquet("filtered_data/match_info.parquet")
# Player-level table.
df_match_player_general = pd.read_parquet("filtered_data/match_player_general.parquet")
# NOTE(review): df_timestamp_entfaellt is not referenced by any cell visible in this
# part of the notebook — confirm it is still needed before keeping the load.
df_timestamp_entfaellt = pd.read_parquet("filtered_data/match_player_timestamp.parquet")

In [10]:
# Preview the first rows of the match-level table.
df_match_info.head()

Unnamed: 0,match_id,winning_team,duration_s,objectives_mask_team0,objectives_mask_team1,objectives.destroyed_time_s,objectives.team_objective,objectives.team
0,45941880,Team0,1754,65253,8260,"[660, 573, 479, 778, 577, 1081, 1752, 912, 0, ...","[Tier1Lane3, Tier1Lane3, Tier1Lane1, Tier1Lane...","[Team0, Team1, Team1, Team1, Team0, Team1, Tea..."
1,45941931,Team1,2356,41028,32325,"[662, 782, 506, 634, 724, 717, 1408, 1215, 182...","[Tier1Lane3, Tier1Lane1, Tier1Lane4, Tier1Lane...","[Team1, Team1, Team1, Team0, Team0, Team0, Tea..."
2,45941981,Team1,1362,8260,65477,"[297, 604, 563, 509, 448, 1292, 539, 1006, 0, ...","[Tier1Lane3, Tier1Lane1, Tier1Lane1, Tier1Lane...","[Team0, Team1, Team0, Team0, Team1, Team0, Tea..."
3,45941984,Team0,1645,65221,8260,"[263, 471, 655, 600, 786, 813, 1016, 1149, 138...","[Tier1Lane4, Tier1Lane1, Tier1Lane3, Tier1Lane...","[Team1, Team1, Team0, Team0, Team1, Team0, Tea..."
4,45942008,Team1,1966,8260,65509,"[773, 599, 603, 512, 843, 1007, 868, 1673, 0, ...","[Tier1Lane1, Tier1Lane3, Tier1Lane1, Tier1Lane...","[Team0, Team0, Team1, Team0, Team1, Team0, Tea..."


In [11]:
# Collect every distinct objective name that occurs in any match.
# Cells that are not a list / ndarray (e.g. missing values) are skipped.
all_objs = sorted(
    {
        objective
        for entries in df_match_info["objectives.team_objective"]
        if isinstance(entries, (list, np.ndarray))
        for objective in entries
    }
)
print(all_objs)

['BarrackBossLane1', 'BarrackBossLane3', 'BarrackBossLane4', 'Core', 'Tier1Lane1', 'Tier1Lane3', 'Tier1Lane4', 'Tier2Lane1', 'Tier2Lane3', 'Tier2Lane4', 'Titan', 'TitanShieldGenerator1', 'TitanShieldGenerator2']


In [12]:
# Dense one-hot encoder that ignores unseen categories and returns pandas output.
# NOTE(review): no later cell visible here uses `ohe` (encode_heroes builds its own
# encoder) — confirm whether this instance is still needed.
ohe = OneHotEncoder(handle_unknown='ignore',sparse_output= False).set_output(transform='pandas')

In [13]:
def encode_objectives_with_team(
    df,
    obj_col="objectives.team_objective",
    team_col="objectives.team",
    match_col="match_id",
):
    """Turn the per-match objective lists into one indicator column per (objective, team).

    The paired list columns ``obj_col`` / ``team_col`` are exploded to long form,
    every remaining row is labelled ``"<objective>_Team<digits>_lost"``, the labels
    are one-hot encoded and then collapsed to a single row per match (an indicator
    is set if the label occurs at least once for that match).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``obj_col``, ``team_col``, ``match_col`` as well as the
        columns ``"winning_team"`` and ``"duration_s"``.
    obj_col, team_col : str
        Columns holding list-like cells that are exploded together.
    match_col : str
        Match identifier used for grouping and for the final join.

    Returns
    -------
    pandas.DataFrame
        ``match_col``, ``"winning_team"``, ``"duration_s"`` followed by the
        indicator columns. The join is an inner join, so matches that have no
        objective row left after dropping missing values are absent.

    NOTE(review): ``objectives.destroyed_time_s`` is not consulted here — confirm
    that every listed entry really represents a lost objective.
    """
    paired_cols = [obj_col, team_col]

    # Long form: one row per listed objective; rows lacking either value are dropped.
    long_df = df.explode(paired_cols).dropna(subset=paired_cols)

    # Team id = first run of digits found in the team label.
    long_df = long_df.assign(
        team_id=lambda d: d[team_col]
        .astype(str)
        .str.extract(r"(\d+)", expand=False)
        .astype(int)
    )
    long_df = long_df.assign(
        obj_team=lambda d: d[obj_col].astype(str)
        + "_Team"
        + d["team_id"].astype(str)
        + "_lost"
    )

    # One indicator column per label, reduced with max to one row per match.
    indicators = pd.get_dummies(long_df["obj_team"])
    per_match = indicators.groupby(long_df[match_col]).max().reset_index()

    match_meta = df[[match_col, "winning_team", "duration_s"]].drop_duplicates(
        subset=[match_col]
    )
    return match_meta.merge(per_match, on=match_col, how="inner")

In [14]:
def reorder_objective_columns(df):
    """Return ``df`` with columns ordered as: base columns, objective columns, the rest.

    Base columns are ``"match_id"``, ``"winning_team"`` and ``"duration_s"`` (only
    those present in ``df``). Objective columns are all string-labelled columns
    ending in ``"_lost"``; they are sorted by the key described below. Every other
    column keeps its original relative order at the end.

    Objective sort key, in order of significance:
      1. priority group
           0  Core / Titan
           1  TitanShieldGenerator
           2  Tier1 lanes
           3  Tier2 lanes
           4  Tier3 lanes
           5  Barracks / BarrackBoss
           10 anything else
      2. tier number, 3. lane number (99 when the name has no tier/lane),
      4. lower-cased objective name, 5. team id.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        ``df`` indexed with the new column order (no columns added or removed).
    """
    base_cols = [c for c in ["match_id", "winning_team", "duration_s"] if c in df.columns]

    # Non-string labels cannot be objective columns; skipping them avoids an
    # AttributeError on .endswith for e.g. integer column labels.
    obj_cols = [c for c in df.columns if isinstance(c, str) and c.endswith("_lost")]

    def sort_key(col: str):
        # Objective name without the "_Team<id>_lost" suffix,
        # e.g. "tier1lane3", "core", "barrackbosslane1".
        name_lower = col.rsplit("_Team", 1)[0].lower()

        m_team = re.search(r"_Team(\d+)_lost$", col)
        team_id = int(m_team.group(1)) if m_team else 0

        # Extract tier / lane when the name starts with "tier<N>lane<M>".
        m_tier_lane = re.match(r"tier(\d+)lane(\d+)", name_lower)
        tier = int(m_tier_lane.group(1)) if m_tier_lane else 99
        lane = int(m_tier_lane.group(2)) if m_tier_lane else 99

        # The shield generator must be tested before the generic "titan" check:
        # "titanshieldgenerator" contains "titan", so the other order would make
        # priority 1 unreachable.
        if "titanshieldgenerator" in name_lower:
            prio = 1
        elif "core" in name_lower or "titan" in name_lower:
            prio = 0
        elif m_tier_lane and tier == 1:
            prio = 2
        elif m_tier_lane and tier == 2:
            prio = 3
        elif m_tier_lane and tier == 3:
            prio = 4
        elif "barrack" in name_lower:
            prio = 5
        else:
            prio = 10

        # Sort by priority first, then tier / lane, then name, then team.
        return (prio, tier, lane, name_lower, team_id)

    obj_cols_sorted = sorted(obj_cols, key=sort_key)

    # Set lookup instead of rebuilding and scanning a list for every column.
    already_placed = set(base_cols) | set(obj_cols)
    other_cols = [c for c in df.columns if c not in already_placed]

    new_order = base_cols + obj_cols_sorted + other_cols
    return df[new_order]


In [15]:
from pathlib import Path

# Encode the objectives of every match, bring the columns into a stable order
# and persist the result as parquet.
output_path = Path("filtered_data/encoded/match_info_encoded.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)  # create the target folder if missing

df_encoded = encode_objectives_with_team(df_match_info)
df_encoded = reorder_objective_columns(df_encoded)

# Write to output_path itself (not a second copy of the literal) so the directory
# created above and the file location cannot drift apart.
df_encoded.to_parquet(output_path, index=False)
df_encoded.head()

Unnamed: 0,match_id,winning_team,duration_s,Core_Team0_lost,Core_Team1_lost,Titan_Team0_lost,Titan_Team1_lost,TitanShieldGenerator1_Team0_lost,TitanShieldGenerator1_Team1_lost,TitanShieldGenerator2_Team0_lost,...,Tier2Lane3_Team0_lost,Tier2Lane3_Team1_lost,Tier2Lane4_Team0_lost,Tier2Lane4_Team1_lost,BarrackBossLane1_Team0_lost,BarrackBossLane1_Team1_lost,BarrackBossLane3_Team0_lost,BarrackBossLane3_Team1_lost,BarrackBossLane4_Team0_lost,BarrackBossLane4_Team1_lost
0,45941880,Team0,1754,False,True,False,True,False,True,False,...,True,True,True,True,False,True,False,True,False,True
1,45941931,Team1,2356,True,False,True,False,True,True,True,...,True,True,True,True,True,True,True,False,True,True
2,45941981,Team1,1362,True,False,True,False,True,False,True,...,True,True,True,True,True,True,True,False,True,False
3,45941984,Team0,1645,False,True,False,True,False,True,False,...,True,True,True,True,False,True,False,True,False,True
4,45942008,Team1,1966,True,False,True,False,True,False,True,...,True,True,True,True,True,False,True,False,True,False


In [16]:
# Row count of the encoded match table.
len(df_encoded)

58021

---- Match_Player table section

In [17]:
# Preview the first rows of the player-level table.
df_match_player_general.head()

Unnamed: 0,match_id,account_id,team,hero_name,net_worth,ability_points,player_level
0,45981489,68834866,Team0,Wraith,28815,21,25
1,45981489,87898718,Team1,Haze,35072,24,28
2,45981489,108350717,Team0,Lash,26432,21,24
3,45981489,135154491,Team1,Paradox,31368,22,26
4,45981489,288341550,Team0,Billy,23477,19,23


In [18]:
def encode_heroes(df, hero_col="hero_name"):
    """Replace ``hero_col`` with one-hot columns produced by a ``OneHotEncoder``.

    A fresh encoder is fitted on ``df[[hero_col]]`` inside this call and is not
    returned, so the categories are exactly those present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``hero_col``.
    hero_col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` except ``hero_col``, followed by the encoder's
        output columns, aligned on the index of ``df``.
    """
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False).set_output(
        transform="pandas"
    )

    hero_flags = encoder.fit_transform(df[[hero_col]])
    # Pin the index to the input's index so the column-wise concat aligns row by row.
    hero_flags.index = df.index

    remaining = df.drop(columns=[hero_col])
    return pd.concat([remaining, hero_flags], axis=1)

# One-hot encode the hero column of the player table and store the result as parquet.
df_match_player_general_encoded = encode_heroes(df_match_player_general)
df_match_player_general_encoded.to_parquet(
    "filtered_data/encoded/match_player_general_heroes_encoded.parquet",
    index=False,
)