In [22]:
import sys
import math

In [23]:
import pandas as pd
import numpy as np
from sklearn.grid_search import GridSearchCV

In [24]:
from sklearn.calibration import calibration_curve
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [25]:
# NOTE(review): machine-specific absolute path (a user's home directory) is appended to the
# module search path before importing xgboost -- presumably a locally built copy lives there.
# On any other machine this entry is meaningless and xgboost must already be importable.
sys.path.append('/home/scitator/Documents/Stuff/ML/xgboost/')
import xgboost as xgb

In [26]:
class XGBoostClassifier(object):
    """Thin scikit-learn style wrapper around ``xgb.train``.

    The booster objective is always forced to ``'multi:softprob'``; any
    ``objective`` passed by the caller is overwritten / ignored.

    Parameters
    ----------
    num_boost_round : int
        Default number of boosting rounds used by :meth:`fit`.
    **params
        Remaining keyword arguments are stored and handed to ``xgb.train``
        as its ``params`` dictionary.
    """

    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})

    def fit(self, X, y, num_boost_round=None):
        """Train the booster on ``X`` / ``y`` and return ``self``.

        Labels are mapped to consecutive integers in sorted order; the mapping
        is kept in ``self.label2num`` and reused by :meth:`predict` and
        :meth:`score`. ``num_boost_round`` overrides the instance default when
        it is truthy.
        """
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        # Returning self follows the scikit-learn estimator convention and allows chaining.
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the label with the highest predicted probability."""
        num2label = dict((i, label) for label, i in self.label2num.items())
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])

    def predict_proba(self, X):
        """Return the booster's raw prediction for ``X`` (wrapped in an ``xgb.DMatrix``)."""
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)

    def score(self, X, y):
        """Return the inverse log loss of the predictions for ``X`` against ``y``.

        Higher is better. The label -> column mapping learned in :meth:`fit`
        is used, so the score stays correct when ``y`` contains only a subset
        of the training labels. A loss of exactly zero yields ``inf`` instead
        of raising ``ZeroDivisionError``.
        """
        Y = self.predict_proba(X)
        loss = logloss(y, Y, labels=self.label2num)
        if loss == 0:
            return np.inf
        return 1 / loss

    def get_params(self, deep=True):
        """Return a copy of all constructor arguments, including ``num_boost_round``.

        A copy is returned so callers cannot mutate the estimator's state by
        accident, and ``XGBoostClassifier(**est.get_params())`` rebuilds an
        equivalent estimator. ``deep`` is accepted for API compatibility and
        is not used.
        """
        params = dict(self.params)
        params['num_boost_round'] = self.num_boost_round
        return params

    def set_params(self, **params):
        """Update the stored parameters and return ``self``.

        ``num_boost_round`` is stored on the instance rather than in the
        booster parameters; ``objective`` is silently ignored because the
        wrapper always uses ``'multi:softprob'``.
        """
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self


def logloss(y_true, Y_pred, labels=None):
    """Mean negative log-probability assigned to the true labels.

    Parameters
    ----------
    y_true : sequence
        True label of every sample.
    Y_pred : sequence of sequences
        ``Y_pred[i][k]`` is the predicted probability of the k-th label
        (labels taken in sorted order) for sample ``i``.
    labels : iterable, optional
        Full set of labels the columns of ``Y_pred`` correspond to. When
        omitted, the labels present in ``y_true`` are used (the historical
        behaviour), which is only correct if ``y_true`` contains every label.

    Returns
    -------
    float
        The log loss; ``inf`` if any true label received a probability <= 0.

    Raises
    ------
    ZeroDivisionError
        If ``Y_pred`` is empty.
    """
    if labels is None:
        labels = y_true
    label2num = dict((name, i) for i, name in enumerate(sorted(set(labels))))
    total = 0.0
    for probs, label in zip(Y_pred, y_true):
        p = probs[label2num[label]]
        # A non-positive probability contributes -inf, i.e. an infinite loss.
        total += math.log(p) if p > 0 else -np.inf
    return -total / len(Y_pred)


In [27]:
# Base estimator. `num_boost_round` and the tree parameters set here are only
# starting values: every key that also appears in `parameters` below is
# overridden by the grid search through `set_params`.
clf = XGBoostClassifier(
    eval_metric='auc',
    num_class=2,
    nthread=4,
    eta=0.1,
    num_boost_round=80,
    max_depth=12,
    subsample=0.5,
    colsample_bytree=1.0,
    silent=1,
)

# Search grid: 3 * 3 * 3 * 2 * 2 = 108 parameter combinations.
parameters = dict(
    num_boost_round=[100, 250, 500],
    eta=[0.05, 0.1, 0.3],
    max_depth=[6, 9, 12],
    subsample=[0.9, 1.0],
    colsample_bytree=[0.9, 1.0],
)

In [28]:
def fillMissingData(df, fillNumber):
    """Fill missing values in ``df`` column by column and return ``df``.

    Columns whose name contains ``'time'`` are filled with ``fillNumber``;
    every other column with missing values is filled with that column's mean.
    Columns without missing values are left untouched.

    The frame is modified in place (and also returned), matching the original
    contract. Filled columns are assigned back explicitly instead of calling
    ``fillna(inplace=True)`` on a column selection, which is not guaranteed to
    write through to the parent frame in newer pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; column names must be strings.
    fillNumber : scalar
        Replacement value for missing entries in ``'time'`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values filled.
    """
    for column in list(df.columns):
        series = df[column]
        if not series.isnull().any():
            continue
        if 'time' in column:
            fill_value = fillNumber
        else:
            fill_value = series.mean()
        df[column] = series.fillna(fill_value)
    return df

In [29]:
def prepareData(fillNumber=300, path='./features.csv'):
    """Load the match feature table and split it into features and labels.

    Parameters
    ----------
    fillNumber : scalar
        Passed to ``fillMissingData`` as the replacement for missing values in
        ``'time'`` columns.
    path : str
        Location of the CSV file; defaults to ``'./features.csv'``. The file
        must contain a ``match_id`` column (used as index), ``radiant_win``
        and the other columns dropped below.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The feature frame without the label and the columns listed in
        ``dropped`` below, and the ``radiant_win`` values.
    """
    df = pd.read_csv(path, index_col='match_id')
    # Take a real copy: slicing an ndarray with [:] only creates a view, which
    # would leave the labels aliased to the frame that is modified below.
    y = df['radiant_win'].values.copy()
    df = fillMissingData(df, fillNumber)
    dropped = [
        'duration',
        'radiant_win',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
    ]
    df = df.drop(dropped, axis=1)
    return (df, y)

In [30]:
def binaryHeroVectorize(data):
    """Replace the ten hero-id columns with a signed one-hot encoding.

    For every row a vector with one slot per hero id (1 .. largest id found in
    the hero columns) is built: ``+1`` for heroes in the ``r*_hero`` columns,
    ``-1`` for heroes in the ``d*_hero`` columns and ``0`` otherwise.

    Rows are addressed by position, so the result does not depend on the index
    being unique. ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``lobby_type`` and the columns ``r1_hero`` .. ``r5_hero``
        and ``d1_hero`` .. ``d5_hero`` holding hero ids.

    Returns
    -------
    numpy.ndarray
        The remaining columns of ``data`` (hero columns and ``lobby_type``
        removed) stacked horizontally with the hero encoding.
    """
    hero_columns = ['%s%d_hero' % (side, p) for side in ('r', 'd') for p in range(1, 6)]
    n_matches = data.shape[0]
    n_heroes = int(data[hero_columns].values.max())
    X_pick = np.zeros((n_matches, n_heroes))

    rows = np.arange(n_matches)
    # Same write order as a per-row loop over p (r_p first, then d_p), but
    # vectorised over all rows at once.
    for p in range(1, 6):
        X_pick[rows, data['r%d_hero' % p].values.astype(int) - 1] = 1
        X_pick[rows, data['d%d_hero' % p].values.astype(int) - 1] = -1

    res = data.drop(['lobby_type'] + hero_columns, axis=1)

    return np.hstack((res.values, X_pick))

In [31]:
def prepareData2(fillNumber=300, scale=True):
    """Build the model matrix: load the data, encode heroes, optionally standardise.

    Parameters
    ----------
    fillNumber : scalar
        Forwarded to ``prepareData``.
    scale : bool
        When true, the feature matrix is passed through
        ``StandardScaler().fit_transform``; otherwise it is returned as is.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature matrix and label vector.
    """
    train_data, train_labels = prepareData(fillNumber)
    X = binaryHeroVectorize(train_data)
    if not scale:
        return (X, train_labels)
    # The notebook imports StandardScaler directly; the `preprocessing` module
    # name is never imported, so referring to it raised NameError.
    X_scale = StandardScaler().fit_transform(X)
    return (X_scale, train_labels)

In [32]:
# Unscaled feature matrix and labels.
X, y = prepareData2(fillNumber=300, scale=False)

In [33]:
# Shuffled 5-fold split over all rows of X, seeded for repeatability.
kf = KFold(len(X), random_state=42, shuffle=True, n_folds=5)

In [34]:
# Exhaustive search over `parameters`, scored by ROC AUC on the folds in `kf`.
gs = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    cv=kf,
    scoring='roc_auc',
    n_jobs=1,
)

In [35]:
# Runs the full grid: 108 parameter combinations x 5 folds = 540 sequential fits
# (n_jobs=1), each with up to 500 boosting rounds. The recorded output of this cell
# is a KeyboardInterrupt, i.e. the run was stopped before completing.
gs.fit(X, y)

KeyboardInterrupt: 