# Data Wrangling

In [2]:
import pandas as pd
from sklearn.cluster import MeanShift
from sklearn.preprocessing import StandardScaler 
import numpy as np
import matplotlib.pyplot as plt
import pickle

## Read in CSV data

In [3]:
# Load the per-player match data (path is relative to the notebook's working directory)
players_csv_path = "datasets/players.csv"
player_df = pd.read_csv(players_csv_path)
print(player_df)

          match_id  hero_id  player_slot     team  gold  gold_spent  \
0       7083727502       70            0  Radiant  1965       27465   
1       7083727502      120            1  Radiant  4887       22930   
2       7083727502       33            2  Radiant  1620       21315   
3       7083727502       86            3  Radiant   188       17420   
4       7083727502       75            4  Radiant  2066       13930   
5       7083727502       58          128     Dire  5457       20270   
6       7083727502      106          129     Dire  4576       28085   
7       7083727502       51          130     Dire   956       17370   
8       7083727502      109          131     Dire  5720       37295   
9       7083727502      128          132     Dire  1476       23135   
10      7083723526      106            0  Radiant   995        9650   
11      7083723526       92            1  Radiant   569        9690   
12      7083723526       87            2  Radiant   481        4700   
13    

In [166]:
# Load the ability-upgrade log: one row per (match, player slot, ability upgrade, level)
abilities_csv_path = "datasets/7_abilities_upgrades.csv"
abilities_df = pd.read_csv(abilities_csv_path)
print(abilities_df)

        match_id  player_slot  ability_id  level
0     7083305097            1        5023      1
1     7083305097            1        5024      2
2     7083305097            1        5025      3
3     7083305097            1        5023      4
4     7083305097            1        5023      5
5     7083305097            1        5026      6
6     7083305097            1        5023      7
7     7083305097            1        5025      8
8     7083305097            1        5025      9
9     7083305097            1         744     10
10    7083305097            1        5025     11
11    7083305097            1        5026     12
12    7083305097            1        5024     13
13    7083305097            1        5024     14
14    7083305097            1         858     15
15    7083305097            1        5024     16
16    7083305097            1        5026     17
17    7083305097            1        6511     18
18    7082722396          131        5023      1
19    7082722396    

## Clean data

In [167]:
# Encode the team name as a number (Radiant -> 1, Dire -> 0).
# 'team' is the only column converted here.
player_df['team'] = player_df['team'].replace({'Radiant': 1, 'Dire': 0})

# Remove rows whose 'lane' value is NaN (only 'lane' is checked, not every column)
# This ends up removing 160 rows
player_df['lane'] = player_df['lane'].astype(float)
# .copy() makes the filtered frame independent of the original, so the column
# assignments done in later cells cannot trigger SettingWithCopyWarning.
player_df = player_df.dropna(subset=['lane']).copy()

## Calculate features -- Resource Priority

In [168]:
# A player's "priority" for a resource is their share of the team's total for it.
# Each entry: (per-player column, team-total column to create, priority column to create)
team_keys = ['match_id', 'team']
resource_columns = [
    ('gold_total', 'team_gold_total', 'gold_priority'),
    ('xpm', 'team_xpm_total', 'xpm_priority'),
]
for stat_col, team_total_col, priority_col in resource_columns:
    # Sum of the stat over all players on the same team in the same match
    player_df[team_total_col] = player_df.groupby(team_keys)[stat_col].transform('sum')
    # The player's fraction of that team total
    player_df[priority_col] = player_df[stat_col] / player_df[team_total_col]

# Resource Priority Dataframe
priority_columns = ['match_id', 'team', 'hero_id', 'player_slot', 'xpm_priority', 'gold_priority']
resource_priority_df = player_df.loc[:, priority_columns]

print(resource_priority_df)

          match_id  team  hero_id  player_slot  xpm_priority  gold_priority
0       7083727502     1       70            0      0.243839       0.258643
1       7083727502     1      120            1      0.241894       0.244468
2       7083727502     1       33            2      0.200065       0.201563
3       7083727502     1       86            3      0.183528       0.154747
4       7083727502     1       75            4      0.130674       0.140580
5       7083727502     0       58          128      0.198818       0.178239
6       7083727502     0      106          129      0.246733       0.226278
7       7083727502     0       51          130      0.156192       0.126964
8       7083727502     0      109          131      0.209085       0.298012
9       7083727502     0      128          132      0.189172       0.170507
10      7083723526     1      106            0      0.240911       0.237331
11      7083723526     1       92            1      0.163354       0.228725
12      7083

## Calculate features -- Ability Priority

### Keep only the 4 main abilities of a hero

In [169]:
# Keep only upgrades whose ability_id lies in the inclusive range 5003..5754
# NOTE(review): presumably this range covers the regular hero abilities - confirm
in_ability_id_range = abilities_df['ability_id'].between(5003, 5754)
abilities_df = abilities_df[in_ability_id_range]

In [170]:
# Collapse the upgrade log to one row per (match_id, player_slot, ability_id):
#   ability_count  -> how many times the ability was upgraded
#   max_hero_level -> hero level when the ability was upgraded for the last time
# One aggregation is sufficient: its result already has exactly one row per key,
# so grouping by the same keys a second time would not change anything.
abilities_df = abilities_df.groupby(['match_id', 'player_slot', 'ability_id']).agg({
    'ability_id': 'count',
    'level': 'max'
}).rename(columns={'ability_id': 'ability_count', 'level': 'max_hero_level'}).reset_index()

# Calculate the priority of each ability:
# number of upgrades divided by the hero level at the last upgrade
# Refer to formula in paper
abilities_df['ability_priority'] = abilities_df['ability_count'] / abilities_df['max_hero_level']
print(abilities_df)

# Group by match_id and player_slot
grouped = abilities_df.groupby(['match_id', 'player_slot'])

def get_top_4_priorities(group, ability_ids):
    """Return the four highest ability priorities of one player, with their ability ids.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single (match_id, player_slot) group. Must contain the
        columns 'ability_id' and 'ability_priority'.
    ability_ids : list
        Ability ids to consider; rows with any other ability_id are ignored.

    Returns
    -------
    pandas.Series
        Entries 'A1'..'A4' hold the priorities in descending order and
        'A1_id'..'A4_id' hold, as strings, the ability id that each of those
        priorities belongs to. If fewer than four matching rows exist, every
        entry is NaN.
    """
    slots = ('A1', 'A2', 'A3', 'A4')
    id_slots = tuple(slot + '_id' for slot in slots)

    # Filter the group to only include rows with the desired ability_ids
    candidates = group[group['ability_id'].isin(ability_ids)]

    # Not enough abilities to fill all four slots: report everything as missing
    if len(candidates) < len(slots):
        return pd.Series({name: np.nan for name in slots + id_slots})

    # Highest priority first. mergesort is stable, so tied priorities keep
    # their incoming row order instead of being ordered arbitrarily.
    top_4 = candidates.sort_values(
        'ability_priority', ascending=False, kind='mergesort'
    ).head(len(slots))

    values = dict(zip(slots, top_4['ability_priority'].tolist()))
    # Take the ids from the sorted rows so that Ak_id always names the ability
    # whose priority is stored in Ak. (Previously the ids were copied from
    # `ability_ids` in input order and did not match the sorted values.)
    values.update(zip(id_slots, (str(int(a)) for a in top_4['ability_id'].tolist())))
    return pd.Series(values)

# Ability ids whose priorities are extracted for every (match_id, player_slot) group
target_ability_ids = [5023, 5024, 5025, 5026]

# Run get_top_4_priorities on each group, then turn the group keys back into columns
result = grouped.apply(get_top_4_priorities, target_ability_ids)
result = result.reset_index()

# Ability Priority Dataframe
ability_priority_columns = [
    'match_id', 'player_slot',
    'A1', 'A2', 'A3', 'A4',
    'A1_id', 'A2_id', 'A3_id', 'A4_id',
]
ability_priority_df = result[ability_priority_columns]

print(ability_priority_df)

        match_id  player_slot  ability_id  ability_count  max_hero_level  \
0     6905504043          130        5023              4              11   
1     6905504043          130        5024              4              16   
2     6905504043          130        5025              4               7   
3     6905504043          130        5026              3              17   
4     6905733931            3        5023              4               8   
5     6905733931            3        5024              4              16   
6     6905733931            3        5025              4              11   
7     6905733931            3        5026              3              17   
8     6905867438            4        5023              4              10   
9     6905867438            4        5024              4              16   
10    6905867438            4        5025              4               8   
11    6905867438            4        5026              2              12   
12    690596

  stacked_values = np.vstack(map(np.asarray, values))


## Save Dataframes to File

In [171]:
# Pickle each dataframe to its own file, in this order:
# resource priorities, ability priorities, then player_df (contains lane info)
pickle_targets = [
    ('pickles/resource_priority_df.pickle', resource_priority_df),
    ('pickles/ability_priority_df.pickle', ability_priority_df),
    ('pickles/player_df.pickle', player_df),
]
for pickle_path, frame in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(frame, f)