In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test_matches.jsonl', 'train_features.csv', 'sample_submission.csv', 'train_matches.jsonl', 'test_features.csv', 'train_targets.csv']


In [2]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [3]:
train = pd.read_csv('../input/train_features.csv', index_col='match_id_hash')
target = pd.read_csv('../input/train_targets.csv', index_col='match_id_hash')
test = pd.read_csv('../input/test_features.csv', index_col='match_id_hash')

In [4]:
params_lgb = {'objective' : 'binary', 'metric' : 'auc', 'max_depth' : -1}

In [5]:
SEED = 17
n_fold = 5
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=SEED)

In [6]:
y_train = target['radiant_win']
y_train = y_train.map({True: 1, False: 0})

In [7]:
def train_and_predict_lgb(train, y_train, test):
    predict = 0
    for ind_trn, ind_test in skf.split(train, y_train):
        X_train_new = train.iloc[ind_trn]
        X_valid = train.iloc[ind_test]
        y_train_new = y_train.iloc[ind_trn]
        y_valid = y_train.iloc[ind_test]

        train_dataset = lgb.Dataset(X_train_new, y_train_new)
        eval_dataset = lgb.Dataset(X_valid, y_valid)

        model = lgb.train(params_lgb, train_dataset, valid_sets=eval_dataset, 
                          num_boost_round=20000,
                          verbose_eval=1000,
                          early_stopping_rounds = 200)

        pred = model.predict(test, num_iteration=model.best_iteration)/5
        predict += pred 
        
        return predict

In [8]:
def write_to_submission_file(prediction, out_file):
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['radiant_win_prob'] = prediction
    sub.to_csv(out_file, index=False)

In [9]:
import json as json
from tqdm import tqdm_notebook as tqdm

In [10]:
with open(os.path.join('../input/', 'train_matches.jsonl')) as fin:
    # read the 18-th line
    for i in range(18):
        line = fin.readline()
    
    # read JSON into a Python object 
    match = json.loads(line)

In [11]:
match.keys()

dict_keys(['game_time', 'match_id_hash', 'teamfights', 'objectives', 'chat', 'game_mode', 'lobby_type', 'players', 'targets'])

In [12]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm(fin, total=total_matches):
            yield json.loads(line)

In [13]:
import collections

MATCH_FEATURES = [
    ('game_time', lambda m: m['game_time']),
    ('game_mode', lambda m: m['game_mode']),
    ('lobby_type', lambda m: m['lobby_type']),
    ('objectives_len', lambda m: len(m['objectives'])),
    ('chat_len', lambda m: len(m['chat'])),
]

PLAYER_FIELDS = [
    'hero_id',
    
    'kills',
    'deaths',
    'assists',
    'denies',
    
    'gold',
    'lh',
    'xp',
    'health',
    'max_health',
    'max_mana',
    'level',

    'x',
    'y',
    
    'stuns',
    'creeps_stacked',
    'camps_stacked',
    'rune_pickups',
    'firstblood_claimed',
    'teamfight_participation',
    'towers_killed',
    'roshans_killed',
    'obs_placed',
    'sen_placed'
]

def extract_features_csv(match):
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]
    
    for field, f in MATCH_FEATURES:
        row.append((field, f(match)))
        
    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        for field in PLAYER_FIELDS:
            column_name = '%s_%s' % (player_name, field)
            row.append((column_name, player[field]))
        row.append((f'{player_name}_ability_level', len(player['ability_upgrades'])))
        row.append((f'{player_name}_max_hero_hit', player['max_hero_hit']['value']))
        row.append((f'{player_name}_purchase_count', len(player['purchase_log'])))
        row.append((f'{player_name}_count_ability_use', sum(player['ability_uses'].values())))
        row.append((f'{player_name}_damage_dealt', sum(player['damage'].values())))
        row.append((f'{player_name}_damage_received', sum(player['damage_taken'].values())))
        #row.append((f'{player_name}_hero_inventory', 
                    #' '.join(dct['id'].strip('item_') for dct in player['hero_inventory'])))
            
    return collections.OrderedDict(row)
    
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [14]:
%%time
PATH_TO_DATA = '../input/'
df_new_features = []
df_new_targets = []

for match in read_matches(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    targets = extract_targets_csv(match, match['targets'])
    
    df_new_features.append(features)
    df_new_targets.append(targets)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))


CPU times: user 1min 25s, sys: 4.16 s, total: 1min 29s
Wall time: 1min 29s


In [15]:
df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
df_new_targets = pd.DataFrame.from_records(df_new_targets).set_index('match_id_hash')

In [16]:
df_new_features.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,r1_gold,r1_lh,r1_xp,r1_health,r1_max_health,r1_max_mana,r1_level,r1_x,r1_y,r1_stuns,r1_creeps_stacked,r1_camps_stacked,r1_rune_pickups,r1_firstblood_claimed,r1_teamfight_participation,r1_towers_killed,r1_roshans_killed,r1_obs_placed,r1_sen_placed,r1_ability_level,r1_max_hero_hit,r1_purchase_count,r1_count_ability_use,r1_damage_dealt,r1_damage_received,r2_hero_id,r2_kills,r2_deaths,r2_assists,r2_denies,...,d4_towers_killed,d4_roshans_killed,d4_obs_placed,d4_sen_placed,d4_ability_level,d4_max_hero_hit,d4_purchase_count,d4_count_ability_use,d4_damage_dealt,d4_damage_received,d5_hero_id,d5_kills,d5_deaths,d5_assists,d5_denies,d5_gold,d5_lh,d5_xp,d5_health,d5_max_health,d5_max_mana,d5_level,d5_x,d5_y,d5_stuns,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed,d5_ability_level,d5_max_hero_hit,d5_purchase_count,d5_count_ability_use,d5_damage_dealt,d5_damage_received
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,0,543,7,533,358,600,350.938,2,116,122,0.0,0,0,1,0,0.0,0,0,0,0,0,136,6,3,1287,973,78,0,0,0,3,...,0,0,1,0,0,62,9,3,868,16,34,0,0,0,0,851,11,870,593,680,566.938,3,128,128,0.0,0,0,0,0,0.0,0,0,0,0,0,164,6,4,2332,681
b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,7,5257,52,3937,1160,1160,566.938,8,76,78,0.0,0,0,0,0,0.438,0,0,0,0,6,216,28,9,20882,4857,96,3,1,2,3,...,0,0,0,0,5,78,23,17,7437,5893,92,0,2,0,1,1423,8,1136,800,800,446.938,4,180,176,0.0,0,0,0,0,0.0,0,0,0,0,4,164,11,7,2308,2154
6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,0,176,0,0,680,680,506.938,1,118,118,0.0,0,0,0,0,0.0,0,0,0,0,0,0,4,0,0,0,51,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0,17,0,0,0,0,96,0,0,640,640,446.938,1,162,162,0.0,0,0,0,0,0.0,0,0,0,0,0,0,4,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,1,1613,0,1471,900,900,290.938,4,170,96,2.366,0,0,5,0,0.571,0,0,0,0,2,182,17,28,3350,3103,99,1,0,1,2,...,0,0,0,0,2,80,18,11,6010,4201,86,0,1,0,1,1333,2,1878,630,740,518.938,5,82,160,8.665,3,1,3,0,0.0,0,0,2,0,3,135,13,19,3955,3317
b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,0,1404,9,1351,1000,1000,338.938,4,80,164,9.931,0,0,4,0,0.5,0,0,0,0,2,60,8,12,4547,1972,69,1,0,0,0,...,0,0,0,0,2,133,9,14,8388,3160,1,0,1,1,8,2199,32,1919,692,740,302.938,5,104,162,0.0,2,1,2,0,0.25,0,0,0,0,2,67,7,9,10739,2785


In [17]:
test_new_features = []
for match in read_matches(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    
    test_new_features.append(features)
test_new_features = pd.DataFrame.from_records(test_new_features).set_index('match_id_hash')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [18]:
train_new_feat = pd.DataFrame()
test_new_feat = pd.DataFrame()

In [19]:
for c in ['kills', 'deaths', 'assists', 'denies', 'gold', 'lh', 'xp', 'health', 
          'max_health', 'max_mana', 'stuns', 'creeps_stacked', 
          'camps_stacked', 'rune_pickups', 'firstblood_claimed', 
          'teamfight_participation', 'towers_killed', 'roshans_killed', 
          'obs_placed', 'sen_placed', 'ability_level', 'max_hero_hit', 'purchase_count',
          'count_ability_use', 'damage_dealt', 'damage_received']:
    r_columns = [f'r{i}_{c}' for i in range(1, 6)]
    d_columns = [f'd{i}_{c}' for i in range(1, 6)]
    
    train_new_feat['r_total_' + c] = df_new_features[r_columns].sum(1).values
    train_new_feat['d_total_' + c] = df_new_features[d_columns].sum(1).values
    train_new_feat['total_' + c + '_ratio'] = (train_new_feat['r_total_' + c] / train_new_feat['d_total_' + c]).values
    
    test_new_feat['r_total_' + c] = test_new_features[r_columns].sum(1).values
    test_new_feat['d_total_' + c] = test_new_features[d_columns].sum(1).values
    test_new_feat['total_' + c + '_ratio'] = (test_new_feat['r_total_' + c] / test_new_feat['d_total_' + c]).values
    
    train_new_feat['r_std_' + c] = df_new_features[r_columns].std(1).values
    train_new_feat['d_std_' + c] = df_new_features[d_columns].std(1).values
    train_new_feat['std_' + c + '_ratio'] = (train_new_feat['r_std_' + c] / train_new_feat['d_std_' + c]).values
    
    test_new_feat['r_std_' + c] = test_new_features[r_columns].std(1).values
    test_new_feat['d_std_' + c] = test_new_features[d_columns].std(1).values
    test_new_feat['std_' + c + '_ratio'] = (test_new_feat['r_std_' + c] / test_new_feat['d_std_' + c]).values
    
    train_new_feat['r_mean_' + c] = df_new_features[r_columns].mean(1).values
    train_new_feat['d_mean_' + c] = df_new_features[d_columns].mean(1).values
    train_new_feat['mean_' + c + '_ratio'] = (train_new_feat['r_mean_' + c] / train_new_feat['d_mean_' + c]).values
    
    test_new_feat['r_mean_' + c] = test_new_features[r_columns].mean(1).values
    test_new_feat['d_mean_' + c] = test_new_features[d_columns].mean(1).values
    test_new_feat['mean_' + c + '_ratio'] = (test_new_feat['r_mean_' + c] / test_new_feat['d_mean_' + c]).values

In [20]:
for c in ['level', 'x', 'y']:
    r_columns = [f'r{i}_{c}' for i in range(1, 6)]
    d_columns = [f'd{i}_{c}' for i in range(1, 6)]
    
    train_new_feat['r_std_' + c] = df_new_features[r_columns].std(1).values
    train_new_feat['d_std_' + c] = df_new_features[d_columns].std(1).values
    train_new_feat['std_' + c + '_ratio'] = (train_new_feat['r_std_' + c] / train_new_feat['d_std_' + c]).values
    
    test_new_feat['r_std_' + c] = test_new_features[r_columns].std(1).values
    test_new_feat['d_std_' + c] = test_new_features[d_columns].std(1).values
    test_new_feat['std_' + c + '_ratio'] = (test_new_feat['r_std_' + c] / test_new_feat['d_std_' + c]).values
    
    train_new_feat['r_mean_' + c] = df_new_features[r_columns].mean(1).values
    train_new_feat['d_mean_' + c] = df_new_features[d_columns].mean(1).values
    train_new_feat['mean_' + c + '_ratio'] = (train_new_feat['r_mean_' + c] / train_new_feat['d_mean_' + c]).values
    
    test_new_feat['r_mean_' + c] = test_new_features[r_columns].mean(1).values
    test_new_feat['d_mean_' + c] = test_new_features[d_columns].mean(1).values
    test_new_feat['mean_' + c + '_ratio'] = (test_new_feat['r_mean_' + c] / test_new_feat['d_mean_' + c]).values

In [21]:
X_train = pd.DataFrame() 
X_train = train_new_feat.reset_index(drop=True)
X_test = pd.DataFrame()
X_test = test_new_feat.copy().reset_index(drop=True)

In [22]:
X_train['lobby_type'] = train['lobby_type'].fillna(0).astype(np.uint16)
X_test['lobby_type'] = test['lobby_type'].fillna(0).astype(np.uint16)

In [23]:
X_train['game_time'] = train['game_time'].values
X_test['game_time'] = test['game_time'].values

In [24]:
#X_train['game_mode'] = train['game_mode'].values
#X_test['game_mode'] = test['game_mode'].values

In [25]:
X_train['objectives_len'] = train['objectives_len'].values
X_test['objectives_len'] = test['objectives_len'].values

In [26]:
X_train = X_train.fillna(0)
X_train = X_train.replace(np.inf, 0)
X_test = X_test.fillna(0)
X_test = X_test.replace(np.inf, 0)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
def hero_dammies(X_train, X_test, let):
    r_cols = [let +'%s_hero_id' %i for i in range(1, 6)]
    X = pd.concat([X_train, X_test])
    X['herois'+ let] = X.apply(lambda row: ' '.join(row.loc[r_cols].map(int).map(str)), axis=1)
    cvv = CountVectorizer()
    heroes = pd.DataFrame(cvv.fit_transform(X['herois'+let]).todense(), 
                          columns=cvv.get_feature_names(), index=X.index)
    return heroes.loc[X_train.index], heroes.loc[X_test.index]

In [29]:
train_r, test_r = hero_dammies(df_new_features, test_new_features, 'r')
train_d, test_d = hero_dammies(df_new_features, test_new_features, 'd')

In [30]:
X_train = X_train.join(train_r.reset_index(drop=True), rsuffix='_r')
X_train = X_train.join(train_d.reset_index(drop=True), rsuffix='_d')

In [31]:
X_test = X_test.join(test_r.reset_index(drop=True), rsuffix='_r')
X_test = X_test.join(test_d.reset_index(drop=True), rsuffix='_d')

In [32]:
y_predict = train_and_predict_lgb(X_train, y_train, X_test)

Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[120]	valid_0's auc: 0.841772


In [33]:
write_to_submission_file(y_predict, 'baseline_11.csv') # Public LB - 0.84686