In [None]:
import os 
import pandas as pd
import numpy as np
import datetime
import json

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, KFold

%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Directory that holds the competition CSV / JSONL files.
PATH_TO_DATA = '../input'

# All three tables are loaded the same way: the `match_id_hash` column
# becomes the index of each frame.
df_train_features, df_train_targets, df_test_features = (
    pd.read_csv(os.path.join(PATH_TO_DATA, csv_name), index_col='match_id_hash')
    for csv_name in ('train_features.csv', 'train_targets.csv', 'test_features.csv')
)

In [None]:
# Report (rows, columns) for the training features and targets, one per line.
print(
    "train feature shape {0}".format(df_train_features.shape),
    "train target shape {0}".format(df_train_targets.shape),
    sep="\n",
)

In [None]:
# Preview the first rows of the feature table (head() defaults to 5 rows).
df_train_features.head()

In [None]:
# Preview the first rows of the target table (head() defaults to 5 rows).
df_train_targets.head()

In [None]:
# Feature matrix and the `radiant_win` target as plain arrays.
X, y = df_train_features.values, df_train_targets['radiant_win'].values

# Hold out 30% of the rows for validation; the seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

# Random Forest

In [None]:
%%time
# Baseline: a 100-tree random forest, using all cores, fit on the training split.
model = RandomForestClassifier(
    random_state=17,
    n_estimators=100,
    n_jobs=-1,
)
model.fit(X=X_train, y=y_train)

In [None]:
def fit_and_score(model, X=X_train, y=y_train, X_val=X_valid, y_val=y_valid):
    """Fit `model` and print its ROC-AUC and accuracy on a validation set.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict_proba`.
    X, y : features and labels passed to `model.fit`.
    X_val, y_val : features and labels the fitted model is scored on.

    The defaults are the train/validation arrays that exist when this cell
    is executed; they are bound once, at definition time, so re-run this
    cell after re-creating the split.

    Returns
    -------
    None. The two metrics are printed.
    """
    model.fit(X, y)
    # Score on the validation data that was passed in. Reading the global
    # X_valid / y_valid here would silently ignore the caller's arguments.
    y_pred = model.predict_proba(X_val)[:, 1]

    valid_score = roc_auc_score(y_val, y_pred)
    print('Validation ROC-AUC score:', valid_score)

    # Hard labels are obtained by thresholding the positive-class probability.
    valid_accuracy = accuracy_score(y_val, y_pred > 0.5)
    print('Validation accuracy of P>0.5 classifier:', valid_accuracy)

In [None]:
# Fit the baseline forest via fit_and_score (default arguments) and print its
# validation metrics.
fit_and_score(model)

# Submission

In [None]:
def write_submission_file(df_submission):
    """Save `df_submission` as a timestamped CSV and print the file name.

    The file is written relative to the current working directory and is
    named `submission_<YYYY-MM-DD_HH-MM-SS>.csv` using the current local time.
    Returns None.
    """
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    submission_filename = 'submission_{}.csv'.format(timestamp)

    df_submission.to_csv(submission_filename)
    print('Submission saved to {}'.format(submission_filename))

In [None]:
def predict_and_write_submission_file(model):
    """Predict on the test features with `model` and save a submission CSV.

    Reads the notebook-level `df_test_features`, takes column 1 of
    `model.predict_proba` as `radiant_win_prob`, keeps the test index, and
    hands the resulting frame to `write_submission_file`. Returns None.
    """
    win_probabilities = model.predict_proba(df_test_features.values)[:, 1]
    submission = pd.DataFrame(
        data={'radiant_win_prob': win_probabilities},
        index=df_test_features.index,
    )
    write_submission_file(submission)

In [None]:
# Score the test set with the baseline model and save a timestamped CSV.
predict_and_write_submission_file(model)

# Cross-validation

In [None]:
# 5 splits with 70%/30%
cv = ShuffleSplit(test_size=0.3, n_splits=5, random_state=17)


def get_cv_scores(model, X=X, y=y, cv=cv, scoring='roc_auc'):
    """Return the array of per-split cross-validation scores for `model`.

    Thin wrapper around `cross_val_score` that always runs with `n_jobs=-1`.
    The defaults (`X`, `y`, `cv`) are the notebook-level objects that exist
    when this cell is executed; they are bound once, at definition time.
    """
    per_split_scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    return per_split_scores

In [None]:
# %%time

# model_rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=None, random_state=17)

# # calculate ROC-AUC for each split
# cv_scores_rf1 = cross_val_score(model_rf1, X, y, cv=cv, scoring='roc_auc')

In [None]:
%%time

# Second forest: same settings as the baseline plus min_samples_leaf=3.
model_rf2 = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=3,
    n_jobs=-1,
    random_state=17,
)

# One score per split (get_cv_scores defaults to ROC-AUC); shown as cell output.
cv_scores_rf2 = get_cv_scores(model=model_rf2)
cv_scores_rf2

In [None]:
# Average the per-split scores into a single number for model 2.
print('Model 2 mean score:', np.mean(cv_scores_rf2))

In [None]:
# fit_and_score fits model_rf2 (training split by default) and prints its
# validation metrics; the submission that follows uses that fitted model.
fit_and_score(model_rf2)
predict_and_write_submission_file(model_rf2)

# Working with all available information on Dota games
> Raw data descriptions for all games are given in files `train_matches.jsonl` and `test_matches.jsonl`. Each file has one entry for each game in [JSON](https://en.wikipedia.org/wiki/JSON) format. You only need to know that it can be easily converted to Python objects via the `json.loads` method.

In [None]:
# Load a single raw match record so its JSON structure can be explored below.
with open(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')) as fin:
    # Read 18 lines; after the loop `line` holds the 18-th one.
    for i in range(18):
        line = fin.readline()
    
    # Parse that line's JSON text into a Python object.
    match = json.loads(line)

In [None]:
#match

In [None]:
# Take the entry at index 2 of the match's `players` list for a closer look.
player = match['players'][2]
#player

In [None]:
# Kills, deaths and assists recorded for the selected player, as a 3-tuple.
tuple(player[stat] for stat in ('kills', 'deaths', 'assists'))

In [None]:
# Show the `ability_uses` field of the selected player record.
player['ability_uses']

In [None]:
# Draw one `gold_t` curve against `times` for every player of the sampled match.
# The loop variable is deliberately not called `player`: that name holds the
# record selected in an earlier cell, and rebinding it here would silently
# change what those cells display when they are re-run.
for match_player in match['players']:
    plt.plot(match_player['times'], match_player['gold_t'])

plt.title('Gold change for all players')
# Axis labels name the plotted record fields so the figure stands on its own.
plt.xlabel('times')
plt.ylabel('gold_t');

In [None]:
#!pip install ujson
#!pip install tqdm

In [None]:
import os

# Optional speed-ups: fall back to the standard library / a pass-through
# wrapper when the extra packages are missing, so the notebook still runs.
try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print ('Please install ujson to read JSON oblects faster')
    
try:
    from tqdm import tqdm_notebook
except ImportError:
    # ImportError also covers a tqdm build that lacks `tqdm_notebook`.
    # The stand-in must accept tqdm's keyword arguments: read_matches calls
    # tqdm_notebook(fin, total=...), which a bare `lambda x: x` rejects
    # with a TypeError.
    def tqdm_notebook(iterable, *args, **kwargs):
        """Return `iterable` unchanged, ignoring any progress-bar options."""
        return iterable
    print ('Please install tqdm to track progress with Python loops')

def read_matches(matches_file):
    """Lazily yield one parsed match per line of a JSON-lines file.

    Parameters
    ----------
    matches_file : path of the `.jsonl` file to read.

    Yields
    ------
    The object produced by `json.loads` for each line, in file order.

    Iteration is wrapped in `tqdm_notebook`; for the two known file names the
    hard-coded line count is passed as the progress total, otherwise the total
    is None.
    """
    # Line counts used only to size the progress bar.
    known_totals = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    expected_total = known_totals.get(os.path.basename(matches_file))

    with open(matches_file) as fin:
        yield from map(json.loads, tqdm_notebook(fin, total=expected_total))

In [None]:
# for match in read_matches(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')):
#     match_id_hash = match['match_id_hash']
#     game_time = match['game_time']
    
#     # processing each game
    
#     for player in match['players']:
#         pass  # processing each player

# Feature engineering

In [None]:
def add_new_features(df_features, matches_file):
    """Add tower-kill features to `df_features` in place.

    For every match yielded by `read_matches(matches_file)`, the
    'CHAT_MESSAGE_TOWER_KILL' objectives are counted per `team` code and
    written to the row labelled by the match's `match_id_hash`:

    * `radiant_tower_kills` - number of such objectives with team == 2
    * `dire_tower_kills`    - number of such objectives with team == 3
    * `diff_tower_kills`    - the first count minus the second

    Parameters
    ----------
    df_features : DataFrame indexed by match hash; modified in place.
    matches_file : path of the JSON-lines file with raw match records.

    Returns None.
    """
    for match in read_matches(matches_file):
        row_label = match['match_id_hash']

        # Team codes of this match's tower-kill objectives.
        tower_kill_teams = [
            objective['team']
            for objective in match['objectives']
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL'
        ]
        # NOTE(review): code 2 feeds the radiant counter and 3 the dire one;
        # confirm against the data description which side each code denotes.
        radiant_tower_kills = tower_kill_teams.count(2)
        dire_tower_kills = tower_kill_teams.count(3)

        # Write the new features one cell at a time, in a fixed column order.
        new_values = (
            ('radiant_tower_kills', radiant_tower_kills),
            ('dire_tower_kills', dire_tower_kills),
            ('diff_tower_kills', radiant_tower_kills - dire_tower_kills),
        )
        for column, value in new_values:
            df_features.loc[row_label, column] = value

        # ... here you can add more features ...
        

In [None]:
# copy the dataframe with features
df_train_features_extended = df_train_features.copy()

# add new features
add_new_features(df_train_features_extended, 
                 os.path.join(PATH_TO_DATA, 
                              'train_matches.jsonl'))

In [None]:
# Preview the first five rows, now including the tower-kill columns.
df_train_features_extended.head(5)

# Feature evaluation

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier

# One estimator configuration for both runs, so only the feature set differs.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=3,
    n_jobs=4,
    random_state=17,
)

# get_cv_scores supplies the same y, cv, ROC-AUC scoring and n_jobs=-1 to both
# evaluations; its default X is the base feature matrix.
cv_scores_base = get_cv_scores(model)
cv_scores_extended = get_cv_scores(model, X=df_train_features_extended.values)

In [None]:
# Mean and per-split scores for each feature set, one line per set.
print(
    'Base features: mean={} scores={}'.format(cv_scores_base.mean(), cv_scores_base),
    'Extended features: mean={} scores={}'.format(cv_scores_extended.mean(), cv_scores_extended),
    sep='\n',
)

In [None]:
# Element-wise comparison: True for each split where the extended features
# scored higher than the base features.
cv_scores_extended > cv_scores_base

In [None]:
%%time
# Build the same features for the test set, again on a copy of the table.
df_test_features_extended = df_test_features.copy()
add_new_features(
    df_features=df_test_features_extended,
    matches_file=os.path.join(PATH_TO_DATA, 'test_matches.jsonl'),
)

In [None]:
# Fit on all training rows with the base features (`fit` returns the estimator).
# NOTE(review): no min_samples_leaf is set here, unlike the model used in the
# feature-evaluation cell - confirm this is intended.
model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=17).fit(X, y)

# Column 1 of predict_proba is written out as `radiant_win_prob`.
df_submission_base = pd.DataFrame(
    data={'radiant_win_prob': model.predict_proba(df_test_features.values)[:, 1]},
    index=df_test_features.index,
)
df_submission_base.to_csv('submission_base_rf.csv')

In [None]:
# Same estimator settings as the base submission, fit on the extended
# training features (`fit` returns the estimator).
model_extended = RandomForestClassifier(
    n_estimators=100, n_jobs=4, random_state=17
).fit(df_train_features_extended.values, y)

# Predictions come from the extended test features; the frame is indexed by
# the index of the original test feature table.
df_submission_extended = pd.DataFrame(
    data={
        'radiant_win_prob': model_extended.predict_proba(
            df_test_features_extended.values
        )[:, 1]
    },
    index=df_test_features.index,
)
df_submission_extended.to_csv('submission_extended_rf.csv')

In [None]:
# This one will be used as the final submission in this kernel: the shell
# `cp` copies the extended-feature predictions to submission.csv.
!cp submission_extended_rf.csv submission.csv