In [1]:
import requests
import json
import time
import numpy as np
import pandas as pd
from statistics import mean
from sklearn import preprocessing, impute
from collections import Counter

In [2]:
# Base URL of the OpenDota REST API; get_request appends the endpoint path to it.
api_url = "https://api.opendota.com/api"

In [3]:
# Shared, mutable bookkeeping for the API client; get_request updates it on every call.
rate_limit = dict(
    count=0,                # calls made in the current one-minute window
    curr_time=time.time(),  # timestamp at which the current window started
    total_cnt=0,            # calls made since this cell was run
)

In [4]:
def get_request(url_extensiton, rate_limit, params=None, timeout=30):
    """Send a rate-limited GET request to the API and return a result dict.

    Parameters
    ----------
    url_extensiton : str
        Endpoint path appended to ``api_url`` (e.g. ``'/heroes'``).
    rate_limit : dict
        Mutable counters with keys ``'count'``, ``'curr_time'`` and
        ``'total_cnt'``; updated in place on every call.
    params : dict, optional
        Query-string parameters. Defaults to no parameters.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang a long-running download loop.

    Returns
    -------
    dict
        ``{'response_code': 200, 'body': <decoded JSON>}`` on success,
        ``{'response_code': <status>, 'error': <response text>}`` for a
        non-200 answer, and ``{'response_code': None, 'error': <message>}``
        when the request itself failed (timeout, connection error, ...).
    """
    # Ensure rate limit of 60 calls/min is not exceeded. ">=" (rather than "==")
    # keeps the guard effective even if the counter was changed elsewhere.
    if rate_limit['count'] >= 60:
        rate_limit['count'] = 0
        time_elapsed = time.time() - rate_limit['curr_time']
        if time_elapsed < 60:
            # Sleep slightly past the end of the window (2 s safety margin)
            time.sleep(62 - time_elapsed)
        rate_limit['curr_time'] = time.time()

    rate_limit['count'] += 1
    rate_limit['total_cnt'] += 1
    if rate_limit['total_cnt'] == 50000:
        print('monthly limit reached')

    # A None default avoids sharing one mutable dict between calls
    query_params = {} if params is None else params

    # make get request
    try:
        response = requests.get(api_url + url_extensiton,
                                params=query_params,
                                timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report transport failures through the same error-dict contract
        # that callers already use for non-200 answers.
        return {'response_code': None,
                'error': str(exc)}

    if response.status_code != 200:
        return {'response_code': response.status_code,
                'error': response.text}

    json_response = json.loads(response.text)
    return {'response_code': 200,
            'body': json_response}

In [5]:
# Fetch the hero catalogue once and keep an id -> display-name lookup;
# later cells use it to name the per-hero feature columns.
heroes_json = get_request('/heroes', rate_limit)
heroes = {entry['id']: entry['localized_name'] for entry in heroes_json['body']}

len(heroes)

122

## Large Instances and Small dimensionality

### Build the dataset with API SQL query

#### Create single SQL Query

In [30]:
# Function that makes the full SQL query to the DOTA2 API
def sql_query(match_id=None):
    """Fetch one batch (at most 20000 rows) of team matches via the API's SQL explorer.

    Parameters
    ----------
    match_id : int, optional
        When given, only matches with a strictly smaller ``match_id`` are
        returned. Passing the smallest id of the previous batch pages
        through the results from newest to oldest.

    Returns
    -------
    pandas.DataFrame or None
        One row per match with the selected columns, or ``None`` (after
        printing the status code and error text) when the request failed.
    """
    # Optional paging filter. "is not None" lets an id of 0 still act as a filter,
    # and int() guarantees that only a plain integer is spliced into the SQL text.
    id_filter = "" if match_id is None else "AND matches.match_id < {}".format(int(match_id))

    # Define full query.
    # player_slot values 0-127 are the Radiant side and 128-255 the Dire side
    # (OpenDota schema), so the hero lists are split at 128. The previous
    # "< 5" / "> 5" split attached each hero list to the opposite team.
    query = """
    SELECT

    matches.match_id,
    matches.radiant_team_id,
    matches.dire_team_id,
    matches.game_mode,
    matches.cluster,
    matches.lobby_type,
    matches.radiant_win,
    dire_team.her_o_ids as dire_heros,
    radiant_team.her_o_ids as radiant_heros,
    leagues.tier,
    tr1.rating as dire_rating,
    tr1.wins as dire_wins,
    tr1.losses as dire_losses,
    tr2.rating as radiant_rating,
    tr2.wins as radiant_wins,
    tr2.losses as radiant_losses
    FROM matches
    JOIN (SELECT match_id, string_agg(pl.hero_id::text, ',') as her_o_ids FROM player_matches as pl where player_slot >= 128 group by match_id) as dire_team using (match_id)
    JOIN (SELECT match_id, string_agg(pl.hero_id::text, ',') as her_o_ids FROM player_matches as pl where player_slot < 128 group by match_id) as radiant_team using (match_id)
    JOIN leagues using(leagueid)
    JOIN team_rating as tr1 ON tr1.team_id = matches.dire_team_id
    JOIN team_rating as tr2 ON tr2.team_id = matches.radiant_team_id
    WHERE matches.human_players = 10
    AND matches.radiant_team_id IS NOT NULL
    AND matches.dire_team_id IS NOT NULL
    %s
    ORDER BY matches.match_id DESC
    LIMIT 20000;
    """ % id_filter

    # Request data matching query
    response = get_request('/explorer', rate_limit, {'sql': query})

    # Ensure an error is not returned
    if response['response_code'] == 200:
        df = pd.DataFrame(response['body']['rows'])
    else:
        df = None
        print("{} {}".format(response['response_code'], response['error']))

    return df

#### Fetch all possible matches that fit the query

THIS CODE MAY NEED TO BE RUN A COUPLE TIMES FOR IT TO WORK. (API is glitchy)

In [62]:
# All instances cannot be retrieved at once, so they are fetched in batches.
BATCH_SIZE = 20000   # must equal the LIMIT used inside sql_query
MAX_FAILURES = 5     # consecutive failed requests tolerated before giving up

# Batches are collected in a list and concatenated once at the end
# (concatenating inside the loop copies all previous rows every time).
batches = []

# Smallest match_id retrieved so far; None requests the newest matches
min_id = None
failures = 0

# Keep fetching until the API has no older matches left
while True:
    # Get a batch of matches
    query_df = sql_query(min_id)

    # sql_query returns None when the request failed: retry a few times
    # instead of crashing on the attribute access below.
    if query_df is None:
        failures += 1
        if failures >= MAX_FAILURES:
            raise RuntimeError(
                'sql_query failed {} times in a row; giving up'.format(failures))
        time.sleep(5)
        continue
    failures = 0

    # An empty batch means every matching row has already been fetched
    if query_df.empty:
        break
    print(query_df.shape)

    # Store the min match_id so the next batch continues below it
    min_id = min(query_df['match_id'])
    print(min_id)

    # Add batch to storage
    batches.append(query_df)

    # A short batch is the last one. This replaces the hand-tuned row-count
    # threshold, which would loop forever if fewer rows were available.
    if len(query_df) < BATCH_SIZE:
        break

# ignore_index=True gives every row a unique label (each batch starts at 0)
df = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

df.shape

(20000, 16)
5627655668
(20000, 16)
5068880258
(20000, 16)
3604296148
(20000, 16)
1943364461
(20000, 16)
357589264
(4945, 16)
19150047


(104945, 16)

In [63]:
## Double the dataset: every match is also stored from the opposite point of view
df_inv = df.copy()
df_inv['radiant_win'] = ~df_inv['radiant_win']

# Per-team columns exist once with a 'dire_' prefix and once with a 'radiant_' prefix
flip = ['heros', 'rating', 'wins', 'losses']
dire_flip = ['dire_{}'.format(suffix) for suffix in flip]
radiant_flip = ['radiant_{}'.format(suffix) for suffix in flip]

# Swap the two sides column by column, always reading from the untouched original
for dire_col, radiant_col in zip(dire_flip, radiant_flip):
    df_inv[radiant_col] = df[dire_col]
    df_inv[dire_col] = df[radiant_col]

# Append the mirrored instances to the original ones
df = pd.concat([df, df_inv], ignore_index=True)


### Preprocess Dataset

In [64]:
# Integer-code the league tier, then one-hot encode cluster, game_mode and tier:
# their values are arbitrary identifiers, not quantities.
le = preprocessing.LabelEncoder()
df['tier'] = le.fit_transform(df['tier'])

enc = preprocessing.OneHotEncoder()
for column in ['cluster', 'game_mode', 'tier']:
    column_values = np.array(df[column]).reshape(-1, 1)
    encoded = enc.fit_transform(column_values)
    dense = encoded.toarray()
    # New columns are named <column>_<position of the category in the encoding>
    encoded_names = ['{}_{}'.format(column, i) for i in range(len(dense[0]))]
    df.loc[:, encoded_names] = dense

In [65]:
# Snapshot of the frame before any hero columns are added; the
# "without encoding" section below restarts from this copy.
dfe = df.copy()

#### Create dataset with one-hot encoding on heroes (dire and radiant separately)

In [66]:
# One-hot encode the drafted heroes: one 0/1 column per hero for the dire team
# and one per hero for the radiant team (len(heroes) columns per team).
hero_ids = [str(hero_id) for hero_id in heroes.keys()]

for team in ('dire', 'radiant'):
    names = ['{}_{}'.format(team, hero_name) for hero_name in heroes.values()]

    # Split every comma-separated pick string once per row (not once per hero)
    # and use a set so each membership test is an exact id match.
    picks = [set(row.split(',')) for row in df[team + '_heros']]

    onehot = pd.DataFrame(
        [[1 if hero_id in row_picks else 0 for hero_id in hero_ids] for row_picks in picks],
        columns=names,
        index=df.index,
    )

    # Attach all columns in one concat: inserting them one at a time fragments
    # the frame and triggers pandas' PerformanceWarning. Dropping any existing
    # copies first keeps the cell safe to re-run.
    df = pd.concat([df.drop(columns=names, errors='ignore'), onehot], axis=1)

  self[col] = igetitem(value, i)


In [67]:
# Export every column except identifiers, the raw hero strings and the
# categorical columns that were replaced by their one-hot versions.
skip = ['match_id', 'radiant_team_id', 'dire_team_id', 'dire_heros', 'radiant_heros', 'cluster', 'game_mode', 'tier']
columns = list(filter(lambda name: name not in skip, df.columns))
export = df[columns]
export.to_csv("../data/dota2_matches_large_encoded.csv", index=False)

In [68]:
# (rows, columns) of the data just written to the encoded CSV
df[columns].shape

(209890, 326)

#### Create dataset without encoding

In [69]:
# Start again from the snapshot taken before the per-team hero one-hot columns were added
df = dfe.copy()

In [70]:
# Function to determine whether hero is chosen by a player in either team
def _team_has_hero(hero, team):
    """Return True when ``hero`` is one of the ids in ``team``.

    ``team`` may be a comma-separated id string (as stored in the
    ``dire_heros`` / ``radiant_heros`` columns) or any container of ids.
    """
    if isinstance(team, str):
        # Compare whole ids: a plain substring test would let "1" match "12"
        return str(hero) in team.split(',')
    return hero in team


def hero_exists(hero, dires, radiants):
    """Tell which team picked ``hero``.

    Parameters
    ----------
    hero : str
        Hero id to look for.
    dires, radiants : str or container
        Hero ids of the dire / radiant team, either as a comma-separated
        string or as a list/set of ids.

    Returns
    -------
    int
        1 if the hero is in ``dires``, 2 if it is in ``radiants``, 0 otherwise.
    """
    if _team_has_hero(hero, dires):
        return 1
    elif _team_has_hero(hero, radiants):
        return 2
    else:
        return 0

In [71]:
# Create all hero choice columns (0 = not picked, 1 = dire, 2 = radiant)
hero_ids = [str(hero_id) for hero_id in heroes.keys()]

hero_choices = []
for dire_picks, radiant_picks in zip(df['dire_heros'], df['radiant_heros']):
    # Split the comma-separated pick strings once per match. Passing sets of ids
    # makes the lookup an exact match; testing against the raw string would be
    # a substring test, where id "1" also matches "12".
    dire_set = set(dire_picks.split(','))
    radiant_set = set(radiant_picks.split(','))
    hero_choices.append([hero_exists(hero_id, dire_set, radiant_set) for hero_id in hero_ids])

df[list(heroes.values())] = hero_choices

In [72]:
# Export every column except identifiers, the raw hero strings and the
# categorical columns that were replaced by their one-hot versions.
skip = ['match_id', 'radiant_team_id', 'dire_team_id', 'dire_heros', 'radiant_heros', 'cluster', 'game_mode', 'tier']
columns = list(filter(lambda name: name not in skip, df.columns))
export = df[columns]
export.to_csv("../data/dota2_matches_large.csv", index=False)

In [73]:
# (rows, columns) of the data just written to the non-encoded CSV
df[columns].shape

(209890, 204)

## Small Instances and Large Dimensionality

WARNING: This section takes 1-2 hours to run.

### Build dataset from API

##### Retrieve pro matches that have complete teams

In [23]:
# Collect the ids of at least TARGET_MATCH_COUNT pro matches, one page per request
TARGET_MATCH_COUNT = 500

matchIds = set()
min_id = float('inf')
while len(matchIds) < TARGET_MATCH_COUNT:
    # After the first page, ask only for matches older than the ones already
    # seen so that the same matches are not returned again.
    params = {} if min_id == float('inf') else {'less_than_match_id': min_id}
    pro_matches_json = get_request('/proMatches', rate_limit, params)

    # Rate limited: wait briefly and ask for the same page again
    if pro_matches_json['response_code'] == 429:
        time.sleep(5)
        continue
    # Any other failure has no 'body'; stop with a clear message instead of a KeyError
    if pro_matches_json['response_code'] != 200:
        raise RuntimeError('proMatches request failed: {} {}'.format(
            pro_matches_json['response_code'], pro_matches_json.get('error')))

    page_ids = [match['match_id'] for match in pro_matches_json['body']]
    # An empty page means there are no older matches; without this check the
    # loop would repeat the same request forever.
    if not page_ids:
        break

    matchIds.update(page_ids)
    min_id = min(min_id, min(page_ids))

##### Retrieve match information

In [24]:
# Download the full match record for every collected pro match id.
# matchIds is consumed in the process; ids hit by rate limiting are put back.
matches = list()
while matchIds:
    match = matchIds.pop()
    match_json = get_request('/matches/{}'.format(match), rate_limit)
    status = match_json['response_code']

    if status == 200:
        matches.append(match_json['body'])
    elif status == 429:
        # Rate limited: re-queue the id so it is requested again later
        matchIds.add(match)
    else:
        # Any other failure: report the match and move on without it
        print('{} --> {}'.format(match, status))

In [25]:
# Number of match records that were downloaded successfully
len(matches)

1000

##### Retrieve player information 

In [None]:
# Gather the distinct account ids of everyone who played in the downloaded matches
player_ids = set()
for match in matches:
    player_ids |= {player['account_id'] for player in match['players']}
len(player_ids)

In [None]:
# Retrieve individual player information.
# player_ids is consumed in the process; an id whose request was rate limited
# (HTTP 429) is put back and the player is fetched again from scratch later.
rank_attr = ['solo_competitive_rank', 'leaderboard_rank', 'rank_tier', 'competitive_rank']
kda = ['kills', 'deaths', 'assists']

players = dict()
while player_ids:
    # current player
    accountId = player_ids.pop()
    players[accountId] = dict()

    # Retrieve ranking information for player
    playerInfo = get_request('/players/' + str(accountId), rate_limit)
    if playerInfo['response_code'] == 200:
        playerInfo = playerInfo['body']
        for attr in rank_attr:
            players[accountId][attr] = playerInfo[attr] if attr in playerInfo else np.nan
        # "or {}" also covers an explicit null mmr_estimate in the response
        mmr = playerInfo.get('mmr_estimate') or {}
        players[accountId]['mmr_estimate'] = mmr['estimate'] if 'estimate' in mmr else np.nan
    elif playerInfo['response_code'] == 429:
        player_ids.add(accountId)
        continue
    else:
        for attr in rank_attr:
            players[accountId][attr] = np.nan
        players[accountId]['mmr_estimate'] = np.nan

    # Retrieve win/lose ratio
    playerInfo = get_request('/players/' + str(accountId) + '/wl', rate_limit)
    if playerInfo['response_code'] == 200:
        playerInfo = playerInfo['body']
        games = playerInfo['win'] + playerInfo['lose']
        # A player without recorded games has no ratio (avoids ZeroDivisionError)
        players[accountId]['wl_ratio'] = playerInfo['win'] / games if games else None
    elif playerInfo['response_code'] == 429:
        player_ids.add(accountId)
        continue
    else:
        players[accountId]['wl_ratio'] = None

    # Retrieve win/lose ratio for hero
    playerInfo = get_request('/players/' + str(accountId) + '/heroes', rate_limit)
    players[accountId]['wl_hero_ratio'] = dict()
    if playerInfo['response_code'] == 200:
        playerInfo = playerInfo['body']
        for hero in playerInfo:
            win_ratio = 0 if hero['games'] == 0 else hero['win'] / hero['games']
            # Always key by the id as a string, matching the fallback branch
            # below and the str(hero_id) lookup used when building the dataset.
            players[accountId]['wl_hero_ratio'][str(hero['hero_id'])] = win_ratio
    elif playerInfo['response_code'] == 429:
        player_ids.add(accountId)
        continue
    else:
        for hero in heroes.keys():
            players[accountId]['wl_hero_ratio'][str(hero)] = 0

    # Retrieve average kills/deaths/assists over the player's previous matches
    playerMatches = get_request('/players/' + str(accountId) + '/matches', rate_limit)
    if playerMatches['response_code'] == 200:
        playerMatches = playerMatches['body']
        for attr in kda:
            values = [mat[attr] for mat in playerMatches]
            # No matches -> no average (np.mean([]) would warn and yield NaN)
            players[accountId]['avg_' + attr] = np.mean(values) if values else None
    # Check the response of THIS request; playerInfo belongs to the previous one
    # and, after a successful call, is a list that cannot be indexed by key.
    elif playerMatches['response_code'] == 429:
        player_ids.add(accountId)
        continue
    else:
        for attr in kda:
            players[accountId]['avg_' + attr] = None
    
    # # Retrieve behavioral score and percent rank
    # response = requests.get(api_url + '/players/' + str(accountId) + 'rankings')
    # playerInfo = json.loads(response.text)
    # players[accountId]['behavior_score'] = playerInfo['score']
    # players[accountId]['percent_rank'] = playerInfo['percent_rank']

In [None]:
# Number of players for which a statistics entry was created
len(players)

#### Store values in a formatted dataframe

In [None]:
# Encode the side a match player was on
def player_team(player):
    """Return the team code of a match player record.

    ``player`` must provide an ``'isRadiant'`` entry. A truthy value gives
    2 (radiant); any falsy value gives 1 (dire). No other result is
    possible, since the two tests cover every value.
    """
    return 2 if player['isRadiant'] else 1

In [None]:
# Build one row per match: basic match info, the hero picked by each side and
# max/min/avg/std of the players' statistics per team.
columns = ['match_id', 'game_mode', 'cluster', 'league_tier' ]
team_stats_names = ['solo_competitive_rank', 'leaderboard_rank', 'mmr_estimate', 'rank_tier', 'competitive_rank', 'wl_ratio', 'wl_hero_ratio', 'avg_kills', 'avg_deaths', 'avg_assists']
columns.extend(heroes.values())

# Player attributes that are collected but not aggregated per team
skip = ['leaderboard_rank', 'rank_tier']

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
rows = []
for match in matches:
    # add basic match info to new rows
    new_row = {'match_id': match['match_id'],
               'game_mode': match['game_mode'],
               'cluster': match['cluster'],
               'human_players': match['human_players'],
               'league_tier': match['league']['tier'] if ('league' in match and 'tier' in match['league']) else 'public'}

    # add heroes chosen by each team (1 = dire, 2 = radiant)
    for player in match['players']:
        hero_name = heroes[player['hero_id']]
        new_row[hero_name] = player_team(player)

    # Split the player records by side. Keeping whole records (not just account
    # ids) assigns each player to exactly one team even if two ids coincide.
    teams = dict()
    teams['dire'] = [player for player in match['players'] if not player['isRadiant']]
    teams['radiant'] = [player for player in match['players'] if player['isRadiant']]

    # calculate distribution of player statistics per team
    for attr in team_stats_names:
        if attr in skip:
            continue
        for team, team_players in teams.items():
            if attr == 'wl_hero_ratio':
                # Win ratio of each player on the hero played in this match.
                # Ratios may be keyed by str or int hero id; a hero the player
                # has no entry for yields None and is filtered out below.
                team_stats = list()
                for player in team_players:
                    ratios = players[player['account_id']][attr]
                    hero_id = player['hero_id']
                    team_stats.append(ratios.get(str(hero_id), ratios.get(hero_id)))
            else:
                team_stats = [players[player['account_id']][attr] for player in team_players]

            name = attr + '_' + team

            # Handle n/a cases: drop None as well as NaN, since a NaN makes
            # max()/min() depend on element order and turns the mean into NaN.
            team_stats = [value for value in team_stats if pd.notna(value)]

            # add player statistics
            new_row['max_' + name] = max(team_stats) if team_stats else np.nan
            new_row['min_' + name] = min(team_stats) if team_stats else np.nan
            new_row['avg_' + name] = np.mean(team_stats) if team_stats else np.nan
            new_row['std_' + name] = np.std(team_stats) if team_stats else np.nan

    # add target
    new_row['radiant_win'] = match['radiant_win']

    rows.append(new_row)

data = pd.DataFrame(rows)
# Match info and one column per hero first (heroes never picked stay all-NaN),
# then the remaining columns in the order in which they were created.
data = data.reindex(columns=columns + [col for col in data.columns if col not in columns])

### Preprocessing

#### Impute the missing data, as some players have missing data

In [None]:
# A hero that nobody picked in a match has no value yet: record it as 0
hero_columns = heroes.values()
data.loc[:, hero_columns] = data[hero_columns].fillna(0)

In [None]:
# Remove rows with large amounts of missing values (8 or more missing fields).
# NOTE(review): the earlier comment said ">=7" while the code tests ">= 8";
# the code's threshold is kept here. Confirm which one was intended.
missing_per_row = data.isnull().sum(axis=1)
to_remove = list(data.index[missing_per_row >= 8])
data.drop(to_remove, inplace=True)

In [None]:
# One-hot encode cluster, game_mode and league_tier in a single pass (their values
# are arbitrary identifiers), then drop the three original columns.
enc = preprocessing.OneHotEncoder()
categorical = ['cluster', 'game_mode', 'league_tier']
encoded = enc.fit_transform(data[categorical])
dense = encoded.toarray()
# The new columns are simply numbered in the order produced by the encoder
encoded_names = ['cluster_mode_league{}'.format(i) for i in range(len(dense[0]))]
data.loc[:, encoded_names] = dense
data.drop(['game_mode', 'cluster', 'league_tier'], axis=1, inplace=True)

In [None]:
# Replace the remaining missing values using SimpleImputer's default settings,
# then rebuild the frame with the original column names.
imputer = impute.SimpleImputer()
imputed_values = imputer.fit_transform(data)
data = pd.DataFrame(imputed_values, columns=data.columns)

#### Double instances by reversing existing instances

In [None]:
# Work on a copy so that `data` stays available for the alternative encoding below
df = data.copy()

In [54]:
# Helper that swaps the team code of a hero pick
def hero_selection(team):
    """Return the opposite team code: 1 becomes 2, 2 becomes 1, anything else 0."""
    for own, opposite in ((1, 2), (2, 1)):
        if team == own:
            return opposite
    return 0

In [55]:
## Create inverse sets: every match seen from the opposite side
df_inv = df.copy()

# Flip the target but keep its dtype. After imputation the column is numeric;
# assigning Python booleans would leave a mix of 1.0/0.0 and True/False in
# the concatenated frame and in the CSV.
target_dtype = df['radiant_win'].dtype
df_inv['radiant_win'] = (~df['radiant_win'].astype(bool)).astype(target_dtype)

# Flip all the radiant and dire team statistics.
# The columns are named <stat>_<attribute>_<team> (e.g. max_wl_ratio_dire), so
# they are matched by suffix. Stripping only the team suffix and comparing with
# the attribute names never matched, because of the max_/min_/avg_/std_ prefix.
flip = ['solo_competitive_rank', 'leaderboard_rank', 'mmr_estimate', 'rank_tier', 'competitive_rank', 'wl_ratio', 'wl_hero_ratio', 'avg_kills', 'avg_deaths', 'avg_assists']
dire_suffixes = tuple(attr + '_dire' for attr in flip)
dire_candidates = [col for col in df.columns if isinstance(col, str) and col.endswith(dire_suffixes)]

# Pair every dire column with its radiant twin so the swap cannot misalign
dire_flip = []
radiant_flip = []
for dire_col in dire_candidates:
    radiant_col = dire_col[:-len('dire')] + 'radiant'
    if radiant_col in df.columns:
        dire_flip.append(dire_col)
        radiant_flip.append(radiant_col)

for dire_col, radiant_col in zip(dire_flip, radiant_flip):
    df_inv[radiant_col] = df[dire_col]
    df_inv[dire_col] = df[radiant_col]

# Flip heroes (1 <-> 2, 0 stays 0)
for col in heroes.values():
    if col in df.columns:
        df_inv[col] = [hero_selection(i) for i in df[col]]

df = pd.concat([df, df_inv], ignore_index=True)
df.shape

(844, 216)

In [56]:
# Write to csv, leaving out the columns that are not model features
df.drop(columns=["match_id", "human_players"], inplace=True)
df.to_csv('../data/dota2_matches_small.csv', index=False)

### Create an alternative dataset with one-hot encoding of heroes

In [None]:
# Restart from the preprocessed (not yet mirrored) data for the one-hot variant
df = data.copy()

In [None]:
# Replace every hero column (0 / 1 = dire / 2 = radiant) by two 0/1 indicator
# columns, one per team.
for hero in heroes.values():
    if hero not in df.columns:
        continue
    # pop() removes the original column from df and hands back its values
    picks = df.pop(hero)
    df['dire_' + hero] = [int(side == 1) for side in picks]
    df['radiant_' + hero] = [int(side == 2) for side in picks]

In [55]:
## Create inverse sets: every match seen from the opposite side
df_inv = df.copy()

# Flip the target class but keep its dtype. After imputation the column is
# numeric; assigning Python booleans would leave a mix of 1.0/0.0 and
# True/False in the concatenated frame and in the CSV.
target_dtype = df['radiant_win'].dtype
df_inv['radiant_win'] = (~df['radiant_win'].astype(bool)).astype(target_dtype)

# Flip all the team statistics for dire and radiant.
# The columns are named <stat>_<attribute>_<team> (e.g. max_wl_ratio_dire), so
# they are matched by suffix. Stripping only the team suffix and comparing with
# the attribute names never matched, because of the max_/min_/avg_/std_ prefix.
flip = ['solo_competitive_rank', 'leaderboard_rank', 'mmr_estimate', 'rank_tier', 'competitive_rank', 'wl_ratio', 'wl_hero_ratio', 'avg_kills', 'avg_deaths', 'avg_assists']
dire_suffixes = tuple(attr + '_dire' for attr in flip)
dire_candidates = [col for col in df.columns if isinstance(col, str) and col.endswith(dire_suffixes)]

# Pair every dire column with its radiant twin so the swap cannot misalign
dire_flip = []
radiant_flip = []
for dire_col in dire_candidates:
    radiant_col = dire_col[:-len('dire')] + 'radiant'
    if radiant_col in df.columns:
        dire_flip.append(dire_col)
        radiant_flip.append(radiant_col)

for dire_col, radiant_col in zip(dire_flip, radiant_flip):
    df_inv[radiant_col] = df[dire_col]
    df_inv[dire_col] = df[radiant_col]

# Swap the one-hot hero columns as well. Otherwise a mirrored row keeps the
# original draft but carries the opposite label.
for hero in heroes.values():
    dire_col = 'dire_' + hero
    radiant_col = 'radiant_' + hero
    if dire_col in df.columns and radiant_col in df.columns:
        df_inv[dire_col] = df[radiant_col]
        df_inv[radiant_col] = df[dire_col]

df = pd.concat([df, df_inv], ignore_index=True)
df.shape

(844, 216)

In [None]:
# Write to csv, leaving out the columns that are not model features
df.drop(columns=["match_id", "human_players"], inplace=True)
df.to_csv('../data/dota2_matches_small_encoded.csv', index=False)