# Imports

In [198]:
from dota.persistence import load
from dota.data import json_to_dataframe

from functools import partial
import pandas as pd

In [199]:
def damped_mean(series, group_size=100, dump_value=0.5):
    """Return the mean of ``series`` shrunk towards ``dump_value``.

    The result is ``(sum(series) + group_size * dump_value) / (n + group_size)``
    where ``n`` is the number of non-missing values, i.e. the plain mean after
    adding ``group_size`` pseudo-observations that all equal ``dump_value``.

    Parameters
    ----------
    series : pandas.Series
        Numeric observations of one group.
    group_size : int or float, default 100
        Weight of the prior, expressed as a number of pseudo-observations.
    dump_value : float, default 0.5
        Value the mean is pulled towards.

    Returns
    -------
    float
        The damped mean; equals ``dump_value`` for an empty or all-NaN group
        (as long as ``group_size`` is non-zero).
    """
    # sum() and count() both skip NaN, so a missing value is neither counted
    # as an observation nor allowed to turn the whole result into NaN.
    n_obs = series.count()
    return (series.sum() + group_size * dump_value) / (n_obs + group_size)

# Data

In [250]:
# Read the stored dump and convert it to a DataFrame.
matches_raw = json_to_dataframe(load('data/18.08-3.09.pkl'))

# Discard rows whose `picks_0` entry is five empty strings.
has_picks = matches_raw['picks_0'].apply(lambda picks: picks != ['', '', '', '', ''])

# Keep only the columns used below, with a fresh 0..n-1 index.
df = (
    matches_raw[has_picks]
    .reset_index(drop=True)
    [['location', 'team_0', 'team_1', 'country_0', 'country_1', 'winner']]
)

In [251]:
# Build a mirrored copy of every row: the two sides swap places
# (team_0 <-> team_1, country_0 <-> country_1) and the winner label is flipped
# (winner -> 1 - winner); `location` is carried over unchanged.
# rename() applies the whole mapping at once, so the swap needs no temporary
# column, and the final [df.columns] restores the original column order.
df_aug = (
    df.rename(columns={
        'team_0': 'team_1',
        'team_1': 'team_0',
        'country_0': 'country_1',
        'country_1': 'country_0',
    })
    .assign(winner=lambda mirrored: 1 - mirrored['winner'])
    [df.columns]
)

# Append the mirrored rows below the original ones.
# NOTE: this overwrites `df`, so re-running the cell doubles the data again.
df = pd.concat([df, df_aug]).reset_index(drop=True)

In [260]:
# Number of missing values in each column.
df.isna().sum()

location     0
team_0       0
team_1       0
country_0    0
country_1    0
winner       0
dtype: int64

In [261]:
# Inspect the `team_1` value stored in row 55.
df.at[55, 'team_1']

'unk'

In [263]:
# Total number of 'unk' entries across both team columns.
df[['team_0', 'team_1']].eq('unk').sum().sum()

2

In [252]:
# (rows, columns) of the frame after the mirrored rows were appended.
df.shape

(918, 6)

---

# Dumb baseline

In [267]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [268]:
import numpy as np

In [269]:
# 10-fold stratified cross-validation of a random forest trained on
# mean-target-encoded categorical columns.
skf = StratifiedKFold(10, shuffle=True, random_state=123)

# `winner` stays in the feature frame only so the encoder can aggregate it;
# it is dropped again before the classifier sees the data.
feats = ['location','team_0','team_1','country_0','country_1','winner']
cols_to_encode = [x for x in feats if x != 'winner']

# NOTE(review): `df` holds every match twice (as recorded and with the sides
# swapped) and the folds are drawn row by row, so a match and its mirror can
# end up on opposite sides of a split. The validation scores may therefore be
# optimistic; a group-aware split would avoid that.
for idx_train, idx_valid in skf.split(df[feats], df['winner']):

    # split() yields positional indices, hence .iloc rather than .loc.
    # The explicit copies keep the encoding below from writing into `df`.
    X_train = df[feats].iloc[idx_train].copy()
    X_valid = df[feats].iloc[idx_valid].copy()
    y_train = df['winner'].iloc[idx_train]
    y_valid = df['winner'].iloc[idx_valid]

    # Alternative tried earlier: one-hot encode each column with
    # LabelBinarizer and fit a LogisticRegression on the result.

    # Mean target encoding: replace each category by the damped mean of
    # `winner` over the training rows of this fold.
    # NOTE(review): the priors are computed from the same rows the model is
    # then trained on, which may contribute to the train/valid gap.
    for col in cols_to_encode:
        priors = (X_train.groupby(col)
                         .agg({'winner': damped_mean})
                         .to_dict()['winner'])
        # Categories without a prior (unseen in this fold's training part, or
        # missing) fall back to 0.5, the default `dump_value` of damped_mean.
        X_train[col] = X_train[col].map(priors).fillna(0.5)
        X_valid[col] = X_valid[col].map(priors).fillna(0.5)

    X_train = X_train.drop('winner', axis=1)
    X_valid = X_valid.drop('winner', axis=1)

    # Fixed seed so the printed scores are reproducible on re-run.
    clf = RandomForestClassifier(n_estimators=40, random_state=123)
    clf.fit(X_train, y_train)
    y_train_hat = clf.predict(X_train)
    y_valid_hat = clf.predict(X_valid)

    print(f'Train: {accuracy_score(y_train, y_train_hat):.3f}; Valid: {accuracy_score(y_valid, y_valid_hat):.3f}' )


Train: 0.793; Valid: 0.663
Train: 0.809; Valid: 0.587
Train: 0.803; Valid: 0.565
Train: 0.799; Valid: 0.587
Train: 0.810; Valid: 0.543
Train: 0.803; Valid: 0.565
Train: 0.792; Valid: 0.707
Train: 0.806; Valid: 0.630
Train: 0.798; Valid: 0.615
Train: 0.802; Valid: 0.637
