In [10]:
import pandas as pd
import numpy as np
import os

In [11]:
# Input and output locations, given relative to the notebook's working directory.
# RAW_DATA_FILE is the source CSV handed to load_data(); OUTPUT_FILE is where the
# final merged table is written at the end of the notebook.
RAW_DATA_FILE = "../data/bronze/all_dota_matches.csv"
OUTPUT_FILE = "../data/silver/data_transformed.csv"

In [12]:
def load_data(path, nrows=None):
    """Read the pipe-delimited match export and add derived helper columns.

    Parameters
    ----------
    path : str or file-like
        Source of the ``|``-separated CSV; forwarded unchanged to ``pd.read_csv``.
    nrows : int, optional
        Maximum number of data rows to read. ``None`` (the default) reads the
        whole file; pass a small number for quick experiments instead of
        editing the function body.

    Returns
    -------
    pandas.DataFrame
        The parsed rows with ``start_time`` converted from epoch seconds to
        datetimes, plus two new columns: ``year`` (taken from ``start_time``)
        and ``team_id``.
    """
    df = pd.read_csv(path, sep='|', nrows=nrows)
    df['start_time'] = pd.to_datetime(df['start_time'], unit='s')
    df['year'] = df['start_time'].dt.year

    # When a row's `win` flag agrees with `radiant_win`, the row gets the radiant
    # team id; otherwise it gets the dire team id.
    on_radiant = df['radiant_win'] == df['win']
    df['team_id'] = np.where(on_radiant, df['radiant_team_id'], df['dire_team_id'])
    return df

def engineer_roles_attributes(df):
    """Add 0/1 role flags and one-hot columns for attribute and attack type.

    For every known role name a column of the same name is added, holding 1
    when the text in ``roles`` contains that name and 0 otherwise. Rows whose
    ``roles`` value is missing get 0 for every role instead of raising.
    ``primary_attr`` and ``attack_type`` are replaced by dummy columns
    prefixed ``attribute_`` and ``attack_`` respectively.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``roles``, ``primary_attr`` and
        ``attack_type``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the role flags and dummy columns added.
    """
    role_list = ['Support','Nuker','Initiator','Escape','Durable','Disabler','Carry','Jungler','Pusher']

    # regex=False: role names are plain tokens, not patterns.
    # na=False: a missing `roles` value counts as "role absent"; without it the
    # NaN result cannot be cast to int.
    role_flags = {
        role: df['roles'].str.contains(role, regex=False, na=False).astype(int)
        for role in role_list
    }
    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.assign(**role_flags)

    df = pd.get_dummies(df, prefix='attribute', columns=['primary_attr'])
    df = pd.get_dummies(df, prefix='attack', columns=['attack_type'])
    return df

def generate_team_stats(df):
    """Build a de-duplicated (year, match, team) table with a one-hot outcome.

    Only the columns ``year``, ``match_id``, ``team_id`` and ``win`` are kept.
    Rows with a missing value in any of them are discarded, exact duplicates
    are collapsed, and ``win`` is replaced by dummy columns prefixed ``win_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        The reduced table; the original row index labels are retained.
    """
    wanted = ['year', 'match_id', 'team_id', 'win']
    team_rows = df.loc[:, wanted]
    team_rows = team_rows.dropna()
    team_rows = team_rows.drop_duplicates()
    return pd.get_dummies(team_rows, columns=['win'], prefix='win')

def group_player_level(df):
    """Aggregate statistics per (outcome, match, account, hero) combination.

    Rows are grouped by ``win``, ``year``, ``match_id``, ``start_time``,
    ``account_id``, ``hero_id`` and ``name``. Within each group the role,
    attribute, attack-type, kill/death/assist, observer and tower columns are
    summed, while duration, last hits, denies, gold per minute and XP per
    minute are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already carries the role flag columns and the
        ``attribute_*`` / ``attack_*`` dummy columns referenced below.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys kept as ordinary columns.
    """
    group_keys = ['win', 'year', 'match_id', 'start_time', 'account_id', 'hero_id', 'name']

    # output column -> (source column, aggregation); dict order fixes column order
    aggregations = {
        'duration': ('duration', 'mean'),
        'sum_support': ('Support', 'sum'),
        'sum_nuker': ('Nuker', 'sum'),
        'sum_initiator': ('Initiator', 'sum'),
        'sum_escape': ('Escape', 'sum'),
        'sum_durable': ('Durable', 'sum'),
        'sum_disabler': ('Disabler', 'sum'),
        'sum_carry': ('Carry', 'sum'),
        'sum_jungler': ('Jungler', 'sum'),
        'sum_pusher': ('Pusher', 'sum'),
        'sum_agi': ('attribute_agi', 'sum'),
        'sum_int': ('attribute_int', 'sum'),
        'sum_str': ('attribute_str', 'sum'),
        'sum_melee': ('attack_Melee', 'sum'),
        'sum_ranged': ('attack_Ranged', 'sum'),
        'sum_kills': ('kills', 'sum'),
        'sum_deaths': ('deaths', 'sum'),
        'sum_assists': ('assists', 'sum'),
        'mean_lasthits': ('last_hits', 'mean'),
        'mean_denies': ('denies', 'mean'),
        'sum_observers': ('observers_placed', 'sum'),
        'sum_towers': ('towers_killed', 'sum'),
        'mean_gold_min': ('gold_per_min', 'mean'),
        'mean_exp_min': ('xp_per_min', 'mean'),
    }

    per_player = df.groupby(group_keys, as_index=False)
    return per_player.agg(**aggregations)

def group_match_level(df):
    """Aggregate statistics per (outcome, match) and rescale composition counts.

    Rows are grouped by ``win``, ``year``, ``match_id`` and ``start_time``.
    After aggregation:

    * every ``sum_*`` column other than kills, deaths, assists, observers and
      towers is added up row-wise into ``sum_roles``;
    * each of those columns is divided by ``sum_roles``;
    * ``sum_melee`` and ``sum_ranged`` are additionally divided by 5;
    * ``win`` is replaced by dummy columns prefixed ``win_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already carries the role flag columns and the
        ``attribute_*`` / ``attack_*`` dummy columns referenced below.

    Returns
    -------
    pandas.DataFrame
        One row per group with the rescaled columns described above.
    """
    match_keys = ['win', 'year', 'match_id', 'start_time']

    # (output column, source column, aggregation) in final column order
    stat_plan = [
        ('duration', 'duration', 'mean'),
        ('sum_support', 'Support', 'sum'),
        ('sum_nuker', 'Nuker', 'sum'),
        ('sum_initiator', 'Initiator', 'sum'),
        ('sum_escape', 'Escape', 'sum'),
        ('sum_durable', 'Durable', 'sum'),
        ('sum_disabler', 'Disabler', 'sum'),
        ('sum_carry', 'Carry', 'sum'),
        ('sum_jungler', 'Jungler', 'sum'),
        ('sum_pusher', 'Pusher', 'sum'),
        ('sum_agi', 'attribute_agi', 'sum'),
        ('sum_int', 'attribute_int', 'sum'),
        ('sum_str', 'attribute_str', 'sum'),
        ('sum_melee', 'attack_Melee', 'sum'),
        ('sum_ranged', 'attack_Ranged', 'sum'),
        ('sum_kills', 'kills', 'sum'),
        ('sum_deaths', 'deaths', 'sum'),
        ('sum_assists', 'assists', 'sum'),
        ('mean_lasthits', 'last_hits', 'mean'),
        ('mean_denies', 'denies', 'mean'),
        ('sum_observers', 'observers_placed', 'sum'),
        ('sum_towers', 'towers_killed', 'sum'),
        ('mean_gold_min', 'gold_per_min', 'mean'),
        ('mean_exp_min', 'xp_per_min', 'mean'),
    ]
    named_aggs = {target: (source, how) for target, source, how in stat_plan}
    per_side = df.groupby(match_keys, as_index=False).agg(**named_aggs)

    # Columns that stay as raw totals and are excluded from the rescaling.
    raw_totals = ('sum_kills', 'sum_deaths', 'sum_assists', 'sum_observers', 'sum_towers')
    share_columns = [
        name for name in per_side.columns
        if name.startswith('sum_') and name not in raw_totals
    ]

    # The denominator mixes role, attribute and attack-type counts.
    per_side['sum_roles'] = per_side[share_columns].sum(axis=1)
    denominator = per_side['sum_roles']
    for name in share_columns:
        per_side[name] = per_side[name] / denominator

    # NOTE(review): sum_melee / sum_ranged were already divided by sum_roles in
    # the loop above, so this divides them a second time (by 5). Confirm that
    # the double scaling is intended before relying on these two columns.
    for attack_col in ('sum_melee', 'sum_ranged'):
        per_side[attack_col] = per_side[attack_col] / 5

    return pd.get_dummies(per_side, prefix='win', columns=['win'])

def merge_team_info(df_grouped, df_teams):
    """Left-join team information onto the aggregated match table.

    The join uses ``year``, ``match_id``, ``win_False`` and ``win_True``, which
    must be present in both frames. Every row of ``df_grouped`` is kept; rows
    with no matching entry in ``df_teams`` receive missing values in the
    columns that come from ``df_teams``.

    Parameters
    ----------
    df_grouped : pandas.DataFrame
        Left-hand table (all rows retained).
    df_teams : pandas.DataFrame
        Right-hand table supplying the extra columns.

    Returns
    -------
    pandas.DataFrame
        The joined table.
    """
    join_keys = ['year', 'match_id', 'win_False', 'win_True']
    merged = pd.merge(df_grouped, df_teams, how='left', on=join_keys)
    return merged

In [13]:
# Run the whole transformation: load the raw rows, add the role/attribute
# features, build the team lookup, aggregate per match side, attach team ids,
# and write the result to OUTPUT_FILE.
df = load_data(RAW_DATA_FILE)
df = engineer_roles_attributes(df)
df_teams = generate_team_stats(df)
df_grouped = group_match_level(df)
df_final = merge_team_info(df_grouped, df_teams)

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (skipped when OUTPUT_FILE has no directory part).
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_final.to_csv(OUTPUT_FILE, sep='|', index=False)

  df = pd.read_csv(path, sep='|'
