In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import datetime
import pytz
from itertools import combinations
from scipy.sparse import hstack, csr_matrix
import os

# Sklearn stuff
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Directory containing the CSV files read below; the path is relative to the
# notebook's working directory.
PATH_TO_DATA = '../../data/dota_2/'
# Random seed. NOTE(review): not referenced by any cell visible in this part of
# the notebook -- presumably consumed by later CV/model cells; confirm.
SEED = 17

In [3]:
def _read_by_match(csv_name):
    '''Read one CSV from PATH_TO_DATA, using the match hash column as the index.'''
    return pd.read_csv(os.path.join(PATH_TO_DATA, csv_name), index_col='match_id_hash')


# Training split: per-match features and the matching targets
df_train_features = _read_by_match('train_features.csv')
df_train_targets = _read_by_match('train_targets.csv')

# Binary target: 1 when Radiant won the match, 0 otherwise
radiant_win_encoding = {True: 1, False: 0}
y_train = df_train_targets['radiant_win'].map(radiant_win_encoding)

# Test split: features only
df_test_features = _read_by_match('test_features.csv')

In [4]:
# Number of training rows; rows from this position onwards belong to the test set
idx_split = len(df_train_features)

# Stack train on top of test so both splits are encoded with the same columns
df_full_features = pd.concat([df_train_features, df_test_features])

# Hero-pick columns: r1..r5 (Radiant) followed by d1..d5 (Dire)
hero_id_columns = [
    f'{team}{slot}_hero_id'
    for team in ('r', 'd')
    for slot in range(1, 6)
]
heroes_df = df_full_features[hero_id_columns]

## Bag of heroes (bag-of-words encoding of the hero picks)

In [5]:
def bag_of_heroes(df, N=1, r_val=1, d_val=-1, r_d_val=0, return_as='csr'):
    '''
    Bag of Heroes. Returns a csr matrix (+ list of feature names) or a sparse dataframe where each
    column represents a hero ID (N = 1) or a combination of N hero IDs (N > 1) and each row
    represents a match.

    The value of a cell (i, j) in the returned matrix is:
        cell[i, j] = 0, if the hero or combination of heroes of the j-th column is not present in the i-th match
        cell[i, j] = r_val, if the hero (N = 1) or combination of heroes (N > 1, synergy) of the j-th column is within the Radiant team,
        cell[i, j] = d_val, if the hero (N = 1) or combination of heroes (N > 1, synergy) of the j-th column is within the Dire team,
        cell[i, j] = r_d_val, if the combination of heroes of the j-th column is between the Radiant and Dire teams (N>1, anti-synergy).

    Columns cover every N-combination of the hero IDs present anywhere in df, in alphabetical
    order of their names ('id001', 'id002', ... or 'id001_id002', ... for N > 1).

    Parameters:
    -----------
        df: dataframe with integer hero IDs, with columns ['r1_hero_id', ..., 'r5_hero_id', 'd1_hero_id', ..., 'd5_hero_id'].
            Columns whose name starts with 'r' are treated as Radiant, those starting with 'd' as Dire.
        N: integer 1 <= N <= 10 (more precisely, <= number of columns of df), for N heroes combinations
        r_val: value written for a hero / combination found entirely within the Radiant team
        d_val: value written for a hero / combination found entirely within the Dire team
        r_d_val: value written for a combination that mixes Radiant and Dire heroes (only possible for N > 1)
        return_as: 'csr' for scipy csr sparse matrix, 'df' for pandas dataframe

    Returns:
    --------
        return_as == 'csr': tuple (X, feature_names) with X a scipy csr matrix of shape
            (number of matches, number of features) and feature_names a list of column names.
        return_as == 'df': pandas dataframe with sparse columns, indexed like df, one column per feature.

    Raises:
    -------
        ValueError: if N is outside [1, number of columns of df] or return_as is not 'csr' / 'df'.
    '''
    if N < 1 or N > df.shape[1]:
        raise ValueError(f'The number N of hero-combinations should be 1 <= N <= {df.shape[1]}')
    # Fail before doing any work instead of silently returning None at the very end.
    if return_as not in ('csr', 'df'):
        raise ValueError(f"return_as should be 'csr' or 'df', got {return_as!r}")

    # Convert the integer IDs to strings of the form id{x}{x}{x}.
    # Column-wise Series.map is used because DataFrame.applymap is deprecated in recent pandas.
    def to_token(hero_id):
        return 'id' + '0' * (3 - len(hero_id)) + hero_id

    df = df.astype(str).apply(lambda col: col.map(to_token))

    # Sorted list of all hero IDs present in df
    hero_ids = np.unique(df.values).tolist()

    # One shared, explicit vocabulary: every N-combination of the known heroes, in
    # alphabetical order (the order CountVectorizer would give a fitted vocabulary).
    # Sharing it keeps the three count matrices below column-aligned without having
    # to append an artificial "all combinations" document to each corpus.
    feature_names = sorted('_'.join(c) for c in combinations(hero_ids, N))

    # Break df into teams Radiant (r) and Dire (d)
    df_r = df[[col for col in df.columns if col[0] == 'r']]
    df_d = df[[col for col in df.columns if col[0] == 'd']]

    def to_documents(frame):
        # One whitespace-separated "document" per match; each token is one sorted
        # N-combination of the heroes in that row, joined with '_'.
        return [' '.join('_'.join(c) for c in combinations(sorted(row), N))
                for row in frame.values.tolist()]

    # Tokens are split on whitespace and matched verbatim against the vocabulary.
    vectorizer = CountVectorizer(
        vocabulary={term: j for j, term in enumerate(feature_names)},
        token_pattern=r'\S+',
        lowercase=False,
    )

    X_all = vectorizer.transform(to_documents(df))
    X_r = vectorizer.transform(to_documents(df_r))
    X_d = vectorizer.transform(to_documents(df_d))

    # Combinations present in the match but in neither single team are cross-team ones
    X_r_d = X_all - (X_r + X_d)
    X = r_val * X_r + d_val * X_d + r_d_val * X_r_d

    if return_as == 'csr':
        return X, feature_names

    # return_as == 'df'
    if hasattr(pd.DataFrame, 'sparse'):
        # Builds sparse columns straight from the scipy matrix (no dense intermediate).
        return pd.DataFrame.sparse.from_spmatrix(X, index=df.index, columns=feature_names)
    # Very old pandas without the .sparse accessor: keep the original code path.
    return pd.DataFrame(X.toarray(), columns=feature_names, index=df.index).to_sparse(0)

In [9]:
# Encode train and test together so both share the same hero columns,
# then cut the result back into the two splits by row position.
boh = bag_of_heroes(
    heroes_df,
    N=1,
    r_val=1,
    d_val=-1,
    return_as='df',
)

X_heroes_train, X_heroes_test = boh.iloc[:idx_split], boh.iloc[idx_split:]

In [10]:
# Preview the first five encoded training matches
X_heroes_train.head()

Unnamed: 0_level_0,id001,id002,id003,id004,id005,id006,id007,id008,id009,id010,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6db558535151ea18ca70a6892197db41,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
b1b35ff97723d9b7ade1c9c3cf48f770,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Persist both encoded splits next to the notebook for reuse by later notebooks
X_heroes_train.to_pickle("./train_heroes_id.pkl")
X_heroes_test.to_pickle("./test_heroes_id.pkl")