# Data Wrangling

In [25]:
import pandas as pd
from sklearn.cluster import MeanShift
from sklearn.preprocessing import StandardScaler 
import numpy as np
import matplotlib.pyplot as plt
import pickle

## Read in CSV data

In [21]:
# Load the per-player match statistics from disk
players_csv = "players.csv"
player_df = pd.read_csv(players_csv)
print(player_df)

          match_id  hero_id  player_slot     team  gold  gold_spent  \
0       7083727502       70            0  Radiant  1965       27465   
1       7083727502      120            1  Radiant  4887       22930   
2       7083727502       33            2  Radiant  1620       21315   
3       7083727502       86            3  Radiant   188       17420   
4       7083727502       75            4  Radiant  2066       13930   
5       7083727502       58          128     Dire  5457       20270   
6       7083727502      106          129     Dire  4576       28085   
7       7083727502       51          130     Dire   956       17370   
8       7083727502      109          131     Dire  5720       37295   
9       7083727502      128          132     Dire  1476       23135   
10      7083723526      106            0  Radiant   995        9650   
11      7083723526       92            1  Radiant   569        9690   
12      7083723526       87            2  Radiant   481        4700   
13    

In [37]:
# Load the ability-upgrade log (ability_id, level, player_slot, match_id)
abilities_csv = "abilities_upgrades.csv"
abilities_df = pd.read_csv(abilities_csv)
print(abilities_df)

     ability_id  level  player_slot    match_id
0          5357      1            0  7083727502
1          5359      2            0  7083727502
2          5359      3            0  7083727502
3          5358      4            0  7083727502
4          5359      5            0  7083727502
5          5360      6            0  7083727502
6          5359      7            0  7083727502
7          5358      8            0  7083727502
8          5358      9            0  7083727502
9          6926     10            0  7083727502
10         5358     11            0  7083727502
11         5360     12            0  7083727502
12         5357     13            0  7083727502
13         5357     14            0  7083727502
14         1062     15            0  7083727502
15         5357     16            0  7083727502
16         5360     17            0  7083727502
17         6132     18            0  7083727502
18          424     19            0  7083727502
19         6975     20            0  708

## Clean data

In [6]:
# Assign numeric values to all non-numeric features
# (team: Radiant -> 1, Dire -> 0; any other value is left as-is by replace)
player_df['team'] = player_df['team'].replace({'Radiant': 1, 'Dire': 0})

# Remove rows with NaN values
# This ends up removing 160 rows (figure from the original author's run)
player_df['lane'] = player_df['lane'].astype(float)
# .copy() gives the filtered frame its own data: later cells add columns to
# player_df, which can otherwise raise SettingWithCopyWarning on a dropna() result
player_df = player_df.dropna(subset=['lane']).copy()

## Calculate features -- Resource Priority

In [16]:
# Sum gold and XPM over each side of each match in one grouped pass;
# transform('sum') broadcasts the team total back onto every player row
team_totals = player_df.groupby(['match_id', 'team'])[['gold_total', 'xpm']].transform('sum')

# Gold: team total, then the player's share of it
player_df['team_gold_total'] = team_totals['gold_total']
player_df['gold_priority'] = player_df['gold_total'].div(player_df['team_gold_total'])

# XPM: team total, then the player's share of it
player_df['team_xpm_total'] = team_totals['xpm']
player_df['xpm_priority'] = player_df['xpm'].div(player_df['team_xpm_total'])

# Resource Priority Dataframe: identifying columns plus the two priority shares
resource_columns = ['match_id', 'team', 'hero_id', 'player_slot', 'xpm_priority', 'gold_priority']
resource_priority_df = player_df.loc[:, resource_columns]

print(resource_priority_df)

          match_id  team  hero_id  player_slot  xpm_priority  gold_priority
0       7083727502     1       70            0      0.243839       0.258643
1       7083727502     1      120            1      0.241894       0.244468
2       7083727502     1       33            2      0.200065       0.201563
3       7083727502     1       86            3      0.183528       0.154747
4       7083727502     1       75            4      0.130674       0.140580
5       7083727502     0       58          128      0.198818       0.178239
6       7083727502     0      106          129      0.246733       0.226278
7       7083727502     0       51          130      0.156192       0.126964
8       7083727502     0      109          131      0.209085       0.298012
9       7083727502     0      128          132      0.189172       0.170507
10      7083723526     1      106            0      0.240911       0.237331
11      7083723526     1       92            1      0.163354       0.228725
12      7083

## Calculate features -- Ability Priority

In [None]:
# Get how many times each ability_id is present for each player in each match
# (how many times the ability was upgraded) together with the highest hero
# level recorded for it (the level at which it was upgraded for the last time).
# The result is stored in grouped_abilities_df; the raw abilities_df is left
# untouched, so this cell can be re-run and later cells still see the raw data.
grouped_abilities_df = (
    abilities_df
    .groupby(['match_id', 'player_slot', 'ability_id'])
    .agg({'ability_id': 'count', 'level': 'max'})
    # Rename columns for readability
    .rename(columns={'ability_id': 'ability_count', 'level': 'max_hero_level'})
    .reset_index()
)

# Calculate ability priority for each ability
# Refer to formula in paper
grouped_abilities_df['ability_priority'] = (
    grouped_abilities_df['ability_count'] / grouped_abilities_df['max_hero_level']
)
print(grouped_abilities_df)

# Group by match_id and player_slot
grouped = grouped_abilities_df.groupby(['match_id', 'player_slot'])

# Function to get the top 4 ability prioritization values for each group
def get_top_4_priorities(group):
    """Return the four highest ability priorities of one player and their ability ids.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single (match_id, player_slot) group. Must contain the
        columns 'ability_priority' and 'ability_id'.

    Returns
    -------
    pandas.Series
        Indexed 'A1'..'A4' (priorities, highest first) followed by
        'A1_id'..'A4_id' (the matching ability ids). When the group has fewer
        than four rows the missing slots are NaN instead of raising IndexError.
    """
    # Stable sort so that equal priorities keep their original row order
    ranked = group.sort_values('ability_priority', ascending=False, kind='mergesort')

    # Positional index 0..3; reindex pads absent positions with NaN rows
    top_4 = ranked.head(4).reset_index(drop=True).reindex(range(4))

    slots = ('A1', 'A2', 'A3', 'A4')
    values = dict(zip(slots, top_4['ability_priority']))
    values.update(
        (slot + '_id', ability_id)
        for slot, ability_id in zip(slots, top_4['ability_id'])
    )
    return pd.Series(values)

# Run the top-4 selection once per (match_id, player_slot) group, then turn
# the group keys back into ordinary columns
result = grouped.apply(get_top_4_priorities)
result = result.reset_index()

# Ability Priority Dataframe: group keys, the four priorities, and their ability ids
ability_columns = ['match_id', 'player_slot', 'A1', 'A2', 'A3', 'A4', 'A1_id', 'A2_id', 'A3_id', 'A4_id']
ability_priority_df = result[ability_columns]

print(ability_priority_df)

In [38]:
# Get how many times each ability_id is present for each player in each match
# (how many times the ability was upgraded) together with the highest hero
# level recorded for it (the level at which it was upgraded for the last time).
# One aggregation is enough: its output already has exactly one row per
# (match_id, player_slot, ability_id), so re-grouping on the same keys is a no-op.
# NOTE: this overwrites the raw abilities_df; re-run the read_csv cell before
# re-running this one, because the aggregated frame no longer has a 'level' column.
abilities_df = (
    abilities_df
    .groupby(['match_id', 'player_slot', 'ability_id'])
    .agg({'ability_id': 'count', 'level': 'max'})
    # Rename columns for readability
    .rename(columns={'ability_id': 'ability_count', 'level': 'max_hero_level'})
    .reset_index()
)

# Calculate ability priority for each ability
# Refer to formula in paper
abilities_df['ability_priority'] = abilities_df['ability_count'] / abilities_df['max_hero_level']
print(abilities_df)

# Group by match_id and player_slot
grouped = abilities_df.groupby(['match_id', 'player_slot'])

# Function to get the top 4 ability prioritization values for each group
def get_top_4_priorities(group):
    """Return the four highest ability priorities of one player and their ability ids.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single (match_id, player_slot) group. Must contain the
        columns 'ability_priority' and 'ability_id'.

    Returns
    -------
    pandas.Series
        Indexed 'A1'..'A4' (priorities, highest first) followed by
        'A1_id'..'A4_id' (the matching ability ids). When the group has fewer
        than four rows the missing slots are NaN instead of raising IndexError.
    """
    # Stable sort so that equal priorities keep their original row order
    ranked = group.sort_values('ability_priority', ascending=False, kind='mergesort')

    # Positional index 0..3; reindex pads absent positions with NaN rows
    top_4 = ranked.head(4).reset_index(drop=True).reindex(range(4))

    slots = ('A1', 'A2', 'A3', 'A4')
    values = dict(zip(slots, top_4['ability_priority']))
    values.update(
        (slot + '_id', ability_id)
        for slot, ability_id in zip(slots, top_4['ability_id'])
    )
    return pd.Series(values)

# Run the top-4 selection once per (match_id, player_slot) group, then turn
# the group keys back into ordinary columns
result = grouped.apply(get_top_4_priorities)
result = result.reset_index()

# Ability Priority Dataframe: group keys, the four priorities, and their ability ids
ability_columns = ['match_id', 'player_slot', 'A1', 'A2', 'A3', 'A4', 'A1_id', 'A2_id', 'A3_id', 'A4_id']
ability_priority_df = result[ability_columns]

print(ability_priority_df)

       match_id  player_slot  ability_id  ability_count  max_hero_level  \
0    7083690375            0        6343              3              17   
1    7083690375            0        6344              4               7   
2    7083690375            0        6461              4               9   
3    7083690375            0        7307              4              15   
4    7083690375            0        9063              1              11   
5    7083690375            0        9135              1              16   
6    7083690375            1         850              1              10   
7    7083690375            1        5237              4               7   
8    7083690375            1        5238              4              11   
9    7083690375            1        5239              4              15   
10   7083690375            1        5240              2              12   
11   7083690375            2         343              4               8   
12   7083690375          

  stacked_values = np.vstack(map(np.asarray, values))


## Save Dataframes to File

In [39]:
# Persist each dataframe to its own pickle file, in this order:
# resource priorities, ability priorities, then player_df (contains lane info)
frames_to_save = {
    'resource_priority_df.pickle': resource_priority_df,
    'ability_priority_df.pickle': ability_priority_df,
    'player_df.pickle': player_df,
}

for pickle_path, frame in frames_to_save.items():
    # Binary write mode is required by pickle; the with-block closes the file
    with open(pickle_path, 'wb') as f:
        pickle.dump(frame, f)