In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb

from xgboost import XGBClassifier

from datetime import datetime,timedelta

import os
os.chdir('../Python')
from utilities import *
os.chdir('../development')

# Loading data

In [36]:
# `load` is not defined in this notebook; it is expected to come from the
# star import of the local `utilities` module.
# `data` is the per-match table (Date, Winner, Loser, Tournament are read from it below).
data = load("../Generated Data/atp_data_f")
# `features` holds two consecutive rows per match (see `nm=int(len(features)/2)` below).
features=load("../Generated Data/atp_data_features")
# `eval_odds` is indexed with the same row indices as `features` by `evalerror`.
eval_odds=load("../Generated Data/eval_odds")

In [523]:
def get_cals_to_drop(start_date,nb_players,nb_tournaments,
                     train_matches=10400,val_matches=300):
    """List the one-hot player/tournament columns that should be dropped.

    Only the `nb_players` most frequent players (as winner or loser) and the
    `nb_tournaments` most frequent tournaments of the training window keep
    their one-hot column; every other "player..." / "tournament..." column of
    `features` is returned for removal.

    Parameters
    ----------
    start_date : value comparable with `data.Date`
        First day of the testing set.
    nb_players, nb_tournaments : int
        How many of the most frequent players / tournaments to keep.
    train_matches : int, default 10400
        Number of matches in the training window.
        NOTE(review): `make_split` trains on 12122 matches by default, so the
        two windows differ unless this is overridden - confirm this is intended.
    val_matches : int, default 300
        Number of validation matches between the training window and the test set.

    Returns
    -------
    (to_drop_players, to_drop_tournaments) : tuple of two lists of column names.

    Raises
    ------
    ValueError
        If the training window would start before the first match.

    Reads the module-level `data` and `features` frames.
    """
    # Id of the first match of the testing set.
    # NOTE(review): the index label is used as a row position below, which
    # assumes `data` carries a default 0..n-1 index - confirm.
    beg_test=data[data.Date>=start_date].index[0]

    # Id of the first and last match of the training set
    beg_val=beg_test-val_matches
    end_train=beg_val-1
    beg_train=beg_val-train_matches
    if beg_train<0:
        # A negative start would silently wrap around to the end of `data`.
        raise ValueError("not enough matches before start_date: the training "
                         "window would start at match %d" % beg_train)

    # end_train is the id of the LAST training match, so it is included.
    train_data=data.iloc[beg_train:end_train+1]

    # Column names come from `features` itself rather than from a previously
    # built `xtrain`, so the function also works on a fresh kernel.
    columns=list(features.columns)

    # We limit the number of players and tournaments one-hot encoded : we'll keep only the
    # most frequent ones to avoid overfitting and make the process quicker
    # Biggest players :
    appearances=pd.concat([train_data.Winner,train_data.Loser],axis=0)
    biggest_players=set(appearances.value_counts().index[:nb_players])
    to_drop_players=[el for el in columns
                     if el.startswith("player") and el[7:] not in biggest_players]
    # Biggest tournaments :
    biggest_tournaments=set(train_data["Tournament"].value_counts().index[:nb_tournaments])
    to_drop_tournaments=[el for el in columns
                         if el.startswith("tournament") and el[11:] not in biggest_tournaments]

    return to_drop_players, to_drop_tournaments
    

In [524]:
def make_split(days_diff,to_drop_players, to_drop_tournaments, start_date,
               val_matches=300, train_matches=12122, test_matches=2000):
    """Split `features` chronologically into train / validation / test sets.

    The test set starts at the first match played on or after `start_date`.
    The validation window sits right before it and the training window right
    before the validation window. Every match owns two consecutive rows of
    `features`, labelled 1 then 0.

    Parameters
    ----------
    days_diff : int
        Added to `val_matches`. Despite its name it is a number of matches:
        it moves the train/validation boundary while the test set stays put.
    to_drop_players, to_drop_tournaments : list of str
        Column names removed from all three feature frames.
    start_date : value comparable with `data.Date`
        First day of the testing set.
    val_matches, train_matches, test_matches : int
        Base sizes of the three windows, in matches.

    Returns
    -------
    xtrain, ytrain, xval, yval, xtest, train_indices, val_indices, test_indices

    Raises
    ------
    ValueError
        If the validation window is empty or the training window would start
        before the first match.

    Side effects: `train_indices`, `val_indices`, `test_indices` and `xtest`
    are also published as module-level globals, because `evalerror` looks the
    index ranges up there.
    """
    global train_indices, val_indices, test_indices, xtest

    # Id of the first match of the testing set.
    # NOTE(review): the index label is used as a row position below, which
    # assumes `data` carries a default 0..n-1 index - confirm.
    beg_test=data[data.Date>=start_date].index[0]

    # Number of matches in our dataset (ie. nb. of outcomes divided by 2)
    nm=len(features)//2

    duration_val_matches=val_matches+days_diff
    if duration_val_matches<1:
        raise ValueError("validation window is empty: val_matches + days_diff = %d"
                         % duration_val_matches)

    # Id of the first and last match of the testing,validation,training set
    end_test=min(beg_test+test_matches-1,nm-1)
    end_val=min(beg_test-1,nm-1)
    beg_val=beg_test-duration_val_matches
    end_train=beg_val-1
    beg_train=beg_val-train_matches
    if beg_train<0:
        # A negative start would silently wrap around to the end of `features`.
        raise ValueError("not enough matches before start_date: the training "
                         "window would start at match %d" % beg_train)

    # Two feature rows per match, hence the factor 2 (end ids are inclusive).
    train_indices=range(2*beg_train,2*end_train+2)
    val_indices=range(2*beg_val,2*end_val+2)
    test_indices=range(2*beg_test,2*end_test+2)

    # We drop the smallest tournaments and players (lists computed by the caller).
    to_drop=list(to_drop_players)+list(to_drop_tournaments)

    # Split in train/validation/test
    xtrain=features.iloc[train_indices,:].reset_index(drop=True).drop(columns=to_drop)
    xval=features.iloc[val_indices,:].reset_index(drop=True).drop(columns=to_drop)
    xtest=features.iloc[test_indices,:].reset_index(drop=True).drop(columns=to_drop)

    # Labels alternate 1, 0 for the two rows of each match.
    ytrain=pd.Series([1,0]*(len(train_indices)//2))
    yval=pd.Series([1,0]*(len(val_indices)//2))

    return xtrain, ytrain, xval, yval, xtest, train_indices, val_indices, test_indices


In [14]:
# Number of feature rows in the training split (`train_indices` is set by `make_split`).
len(train_indices)

24244

In [7]:
# make_split needs the columns to drop and the first test day in addition to
# days_diff; calling it with days_diff alone raises a TypeError.
# NOTE(review): no one-hot column is dropped here, and 2016-02-01 is the date left
# in the commented-out default inside make_split - confirm both choices.
xtrain, ytrain, xval, yval, xtest, train_indices, val_indices, test_indices = make_split(5, [], [], datetime(2016,2,1))

In [39]:

## XGB parameters
# Each hyper-parameter is a list so that several values can be tried; the
# meshgrid below enumerates every combination, one row per combination.
learning_rate=[0.3]
max_depth=[19]
min_child_weight=[1]
gamma=[0.8]
csbt=[0.5]
lambd=[0]
alpha=[2]
num_rounds=[300]
early_stop=[10]
# `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
params=np.array(np.meshgrid(learning_rate,max_depth,min_child_weight,gamma,csbt,lambd,alpha,num_rounds,early_stop)).T.reshape(-1,9).astype(float)
# Row layout: eta, max_depth, min_child_weight, gamma, colsample_bytree,
# lambda, alpha, num_rounds, early_stop.
p=params[0]


def evalerror(preds, dtrain):
    """Custom evaluation metric: negated sum of `odds * prediction`.

    `dtrain` is accepted only because the metric callback signature requires
    it; it is never read. The split being scored is recognised purely by the
    length of `preds`, checked against the module-level `train_indices`,
    `val_indices` and `test_indices` in that order (the first length that
    matches wins).

    Returns ('neg_gain_error', value) for a recognised length and
    ('function_error', 0) otherwise.
    """
    global eval_odds, train_indices, val_indices, test_indices

    n_preds = len(preds)
    if n_preds == len(train_indices):
        selected = train_indices
    elif n_preds == len(val_indices):
        selected = val_indices
    elif n_preds == len(test_indices):
        selected = test_indices
    else:
        # Length matches none of the known splits.
        return 'function_error', 0

    return 'neg_gain_error', -(eval_odds[selected] * preds).sum()




# Train one booster with the custom odds-based metric instead of a built-in one.
# Relies on xtrain/ytrain/xval/yval from make_split and on `p` (hyper-parameter row).
dtrain=xgb.DMatrix(xtrain,label=ytrain)

# Not read by the training call below: the metric is supplied through `feval`.
the_metric = 'evalerror'

dval=xgb.DMatrix(xval,label=yval)
# Both sets are scored each round; the log below reports that the 'eval'
# entry is the one used for early stopping.
eval_set = [(dtrain,"train_loss"),(dval, 'eval')]

# `params` is rebound here from the hyper-parameter array to the dict handed to xgb.train.
# 'disable_default_eval_metric' is set to 1 so that only `evalerror` is reported.
params={"objective":"binary:logistic",'subsample':0.8,
        'min_child_weight':p[2],'alpha':p[6],'lambda':p[5],'max_depth':int(p[1]),
        'gamma':p[3],'eta':p[0],'colsample_bytree':p[4],  'disable_default_eval_metric' : 1}
# p[7] is the maximum number of boosting rounds, p[8] the early-stopping patience.
model=xgb.train(params, dtrain, int(p[7]),evals=eval_set,early_stopping_rounds=int(p[8]), feval = evalerror)


[0]	train_loss-neg_gain_error:-110.61	eval-neg_gain_error:15.8456
Multiple eval metrics have been passed: 'eval-neg_gain_error' will be used for early stopping.

Will train until eval-neg_gain_error hasn't improved in 10 rounds.
[1]	train_loss-neg_gain_error:-570.352	eval-neg_gain_error:12.8859
[2]	train_loss-neg_gain_error:-970.036	eval-neg_gain_error:13.3708
[3]	train_loss-neg_gain_error:-1366.11	eval-neg_gain_error:11.8475
[4]	train_loss-neg_gain_error:-1761.72	eval-neg_gain_error:12.0528
[5]	train_loss-neg_gain_error:-2126.39	eval-neg_gain_error:10.9817
[6]	train_loss-neg_gain_error:-2429.23	eval-neg_gain_error:12.5219
[7]	train_loss-neg_gain_error:-2700.23	eval-neg_gain_error:11.7474
[8]	train_loss-neg_gain_error:-2948.49	eval-neg_gain_error:10.5577
[9]	train_loss-neg_gain_error:-3167.83	eval-neg_gain_error:9.65078
[10]	train_loss-neg_gain_error:-3371.46	eval-neg_gain_error:8.85426
[11]	train_loss-neg_gain_error:-3588.64	eval-neg_gain_error:9.93691
[12]	train_loss-neg_gain_error:-

In [538]:
def get_trained_model(days_diff,to_drop_players, to_drop_tournaments,start_date):
    """Build one chronological split and train an XGBoost booster on it.

    Parameters
    ----------
    days_diff : int
        Passed to `make_split`, where it is added to the size of the
        validation window.
    to_drop_players, to_drop_tournaments : list of str
        One-hot columns removed from the features before training.
    start_date : value comparable with `data.Date`
        First day of the testing set.

    Returns
    -------
    model, train_indices, val_indices, test_indices, xtest
        The trained booster, the three row-index ranges of the split and the
        test features.

    The model is trained with log-loss as evaluation metric and early stopping
    on the validation set. `make_split` additionally stores the index ranges
    and `xtest` as module-level globals.
    """
    xtrain, ytrain, xval, yval, xtest, train_indices, val_indices, test_indices = make_split(days_diff,to_drop_players, to_drop_tournaments,start_date)

    ## XGB parameters
    num_rounds=300   # maximum number of boosting rounds
    early_stop=10    # rounds without validation improvement before stopping
    # Numeric values are kept as floats, exactly as the former one-point
    # hyper-parameter grid produced them (it relied on `np.float`, which was
    # removed in NumPy 1.24).
    params={'eval_metric':"logloss","objective":"binary:logistic",'subsample':0.8,
            'min_child_weight':1.0,'alpha':2.0,'lambda':0.0,'max_depth':19,
            'gamma':0.8,'eta':0.295,'colsample_bytree':0.5,'disable_default_eval_metric':0}

    dtrain=xgb.DMatrix(xtrain,label=ytrain)
    dval=xgb.DMatrix(xval,label=yval)
    # The validation set is listed last so that it drives early stopping.
    eval_set=[(dtrain,"train_loss"),(dval,'eval')]

    model=xgb.train(params,dtrain,num_rounds,evals=eval_set,early_stopping_rounds=early_stop)

    return model, train_indices, val_indices, test_indices , xtest

In [539]:
start_date=datetime(2015,2,1) #first day of testing set

to_drop_players, to_drop_tournaments= get_cals_to_drop(start_date,50,5)

# One model per offset of the train/validation boundary. The order is kept so
# that the split left in train_indices/val_indices afterwards is the one of
# the last offset.
days_diffs = [0, 10, -10, 30, -30, 45, -45, 17, -17]
clf_list = []
for days_diff in days_diffs:
    model, train_indices, val_indices, test_indices, xtest = get_trained_model(days_diff,to_drop_players, to_drop_tournaments,start_date)
    clf_list.append(model)

# Individual names are kept because other cells refer to model1 ... model9.
model1, model2, model3, model4, model5, model6, model7, model8, model9 = clf_list

# VotingClassifier is the custom class defined in another cell of this
# notebook; that cell has to be executed before this one.
classif = VotingClassifier(clf_list)
# predict() reads the odds column from the module-level `xtest_0`.
xtest_0=features.iloc[test_indices,:].reset_index(drop=True)
y_mean,  y_voting, y_mean_pr_conf, y_mean_conf, y_voting_conf = classif.predict(xtest, 0.5, vote_thr = (5/9), conf_thr = 0.5)

# evalerror never reads its second argument, so nothing is passed for it;
# this removes the dependency on a `dtrain` left over from another cell.
for ensemble_preds in (y_voting, y_mean, y_mean_conf, y_voting_conf):
    print(evalerror(ensemble_preds, None))

[0]	train_loss-logloss:0.608268	eval-logloss:0.6497
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 10 rounds.
[1]	train_loss-logloss:0.549762	eval-logloss:0.616655
[2]	train_loss-logloss:0.498343	eval-logloss:0.590843
[3]	train_loss-logloss:0.455137	eval-logloss:0.571737
[4]	train_loss-logloss:0.424434	eval-logloss:0.566318
[5]	train_loss-logloss:0.393896	eval-logloss:0.556175
[6]	train_loss-logloss:0.369101	eval-logloss:0.557205
[7]	train_loss-logloss:0.349339	eval-logloss:0.558828
[8]	train_loss-logloss:0.330947	eval-logloss:0.558322
[9]	train_loss-logloss:0.317442	eval-logloss:0.558456
[10]	train_loss-logloss:0.301404	eval-logloss:0.559939
[11]	train_loss-logloss:0.287966	eval-logloss:0.557166
[12]	train_loss-logloss:0.277834	eval-logloss:0.551912
[13]	train_loss-logloss:0.266238	eval-logloss:0.551181
[14]	train_loss-logloss:0.256173	eval-logloss:0.551316
[15]	train_loss-logloss:0.247767	eval-

[21]	train_loss-logloss:0.210692	eval-logloss:0.572759
[22]	train_loss-logloss:0.204902	eval-logloss:0.573062
[23]	train_loss-logloss:0.199576	eval-logloss:0.576331
[24]	train_loss-logloss:0.195328	eval-logloss:0.576335
[25]	train_loss-logloss:0.191885	eval-logloss:0.578269
[26]	train_loss-logloss:0.187285	eval-logloss:0.582708
[27]	train_loss-logloss:0.183713	eval-logloss:0.584545
[28]	train_loss-logloss:0.181204	eval-logloss:0.584324
Stopping. Best iteration:
[18]	train_loss-logloss:0.226885	eval-logloss:0.567597

[0]	train_loss-logloss:0.609662	eval-logloss:0.645366
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 10 rounds.
[1]	train_loss-logloss:0.551859	eval-logloss:0.619629
[2]	train_loss-logloss:0.500612	eval-logloss:0.59643
[3]	train_loss-logloss:0.457473	eval-logloss:0.582395
[4]	train_loss-logloss:0.426213	eval-logloss:0.57088
[5]	train_loss-logloss:0.396305	eval-logloss:0.565996
[6]	tra

In [98]:
# Predictions of the first model alone on the test features.
pred_test= model1.predict(xgb.DMatrix(xtest,label=None)) 

In [99]:
# Display the single-model test predictions.
pred_test

array([0.51694536, 0.33998728, 0.5996222 , ..., 0.10786606, 0.86088526,
       0.38840255], dtype=float32)

In [58]:
# Test labels: one (1, 0) pair per match, i.e. per two rows of the test split.
ytest = pd.Series([1, 0] * (len(test_indices) // 2))

In [55]:
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder

In [96]:

# Three-model ensemble; another cell rebuilds this list with all nine models.
clf_list = [model1, model2, model3]



In [499]:
class VotingClassifier(object):
    """Voting ensemble over already-trained binary classifiers.

    Every estimator contributes its probability of the positive class. Four
    decisions are derived from them: a mean-probability vote and a majority
    vote on the raw probabilities, and the same two votes on probabilities
    scaled by the confidence `1 / odds`.
    """

    def __init__(self, estimators):
        # Trained models: raw `xgboost.Booster` objects, or any object
        # exposing `predict_proba`.
        self.estimators = estimators

    def _positive_probabilities(self, X):
        """Return an (n_samples, n_estimators) float32 matrix of P(class 1)."""
        probs = np.zeros([X.shape[0], len(self.estimators)], dtype='float32')
        for col, clf in enumerate(self.estimators):
            if hasattr(clf, 'predict_proba'):
                proba = np.asarray(clf.predict_proba(X))
                if proba.ndim == 2:
                    # One column per class: keep the positive class (binary problem).
                    proba = proba[:, 1]
                probs[:, col] = proba
            else:
                # A raw Booster has no predict_proba and needs a DMatrix.
                probs[:, col] = np.asarray(clf.predict(xgb.DMatrix(X, label=None)))
        return probs

    @staticmethod
    def _majority_vote(probs, cutoff, vote_thr):
        """Return 1.0 where the share of estimators with prob >= cutoff reaches vote_thr."""
        votes = (probs >= cutoff).astype('int')
        share = votes.sum(axis=1) / probs.shape[1]
        return (share >= vote_thr).astype(float)

    def predict(self, X, thr = 0.5 ,vote_thr = 0.5, conf_thr = 0.5, odds = None):
        """Combine the estimators' predictions for the rows of `X`.

        Parameters
        ----------
        X : feature matrix handed to every estimator.
        thr : probability cutoff for the plain votes.
        vote_thr : minimal share of positive votes for a majority decision.
        conf_thr : cutoff applied to the confidence-scaled probabilities.
        odds : optional array-like with one odds value per row of `X`.
            When omitted, the `odds` column of the module-level `xtest_0`
            frame is used, as before. Missing odds are treated as 1.

        Returns
        -------
        y_mean : list of 0/1, mean probability >= thr
        y_voting : float array of 0/1, majority vote on raw probabilities
        y_mean_pr_conf : float32 array, mean confidence-scaled probability
        y_mean_conf : list of 0/1, y_mean_pr_conf >= conf_thr
        y_voting_conf : float array of 0/1, majority vote on scaled probabilities

        Raises
        ------
        ValueError
            If the number of odds differs from the number of rows of `X`.
        """
        Y = self._positive_probabilities(X)

        # odd confidence: the higher the odds, the lower the weight
        if odds is None:
            odds = xtest_0.odds
        odds = np.asarray(odds, dtype='float64')
        if odds.shape[0] != X.shape[0]:
            raise ValueError("got %d odds for %d rows" % (odds.shape[0], X.shape[0]))
        conf = 1.0 / np.where(np.isnan(odds), 1.0, odds)
        Y_conf = (Y * conf[:, None]).astype('float32')

        # odd conf mean proba voting
        y_mean_pr_conf = np.mean(Y_conf, axis=1)
        y_mean_conf = (y_mean_pr_conf >= conf_thr).astype(int).tolist()

        # majority conf voting
        y_voting_conf = self._majority_vote(Y_conf, conf_thr, vote_thr)

        # mean proba voting
        y_mean = (np.mean(Y, axis=1) >= thr).astype(int).tolist()

        # majority voting
        y_voting = self._majority_vote(Y, thr, vote_thr)

        return  y_mean,  y_voting, y_mean_pr_conf, y_mean_conf, y_voting_conf

In [531]:
# Rebuild the nine-model ensemble and score every voting scheme on the test split.
clf_list = [
    model1, model2, model3,
    model4, model5, model6,
    model7, model8, model9,
]
classif = VotingClassifier(clf_list)

# predict() reads the odds column from the module-level `xtest_0`.
xtest_0 = features.iloc[test_indices, :].reset_index(drop=True)
(y_mean, y_voting, y_mean_pr_conf,
 y_mean_conf, y_voting_conf) = classif.predict(xtest, 0.5, vote_thr=5 / 9, conf_thr=0.5)

# One (metric name, value) tuple per line, in the order:
# majority vote, mean vote, confidence mean vote, confidence majority vote.
print(*(evalerror(scheme_preds, dtrain)
        for scheme_preds in (y_voting, y_mean, y_mean_conf, y_voting_conf)),
      sep="\n")

('test_gain_error', 29.220000000000013)
('test_gain_error', 0.6899999999999977)
('test_gain_error', 9.769999999999996)
('test_gain_error', 7.559999999999999)


In [490]:
# Display the mean confidence-scaled probabilities returned by classif.predict.
y_mean_pr_conf

array([0.38350666, 0.16743562, 0.30840665, ..., 0.46068716, 0.6384262 ,
       0.04607432], dtype=float32)

In [249]:
from sklearn.metrics import confusion_matrix

In [466]:
# Confusion matrix of the majority-vote predictions against the alternating test labels.
cnf_matrix = confusion_matrix(ytest, y_voting)
cnf_matrix

array([[1840,  160],
       [1318,  682]])

In [351]:
# Score the majority-vote predictions; evalerror does not read its second argument.
evalerror(y_voting, dtrain)

('test_gain_error', -5.280000000000001)

In [None]:
class VotingClassifier(object):
    """Voting ensemble over already-trained binary classifiers (debug variant).

    NOTE(review): this cell was never executed and shares its name with the
    class defined earlier in the notebook; running the notebook top to bottom
    makes this definition replace the earlier one. It returns intermediate
    arrays for inspection and has no odds-based votes.
    """

    def __init__(self, estimators):
        # Trained models: raw `xgboost.Booster` objects, or any object
        # exposing `predict_proba`.
        self.estimators = estimators

    def predict(self, X, thr = 0.5 ,vote_thr = 0.5):
        """Combine the estimators' predictions for the rows of `X`.

        Parameters
        ----------
        X : feature matrix handed to every estimator.
        thr : probability cutoff turning a probability into a 0/1 vote.
        vote_thr : minimal share of positive votes for a majority decision.

        Returns
        -------
        Y : (n_samples, n_estimators) float32 probabilities of the positive class
        K : 0/1 votes of the last estimator
        KT : transpose of K (see the note where it is assigned)
        y_mean_pr : mean probability per sample
        y_mean : list of 0/1, y_mean_pr >= thr
        Y_voting : (n_samples, n_estimators) int matrix of 0/1 votes
        y_voting : float array of 0/1 majority decisions
        deci : share of positive votes of the last sample (None without samples)
        """
        n_samples, n_estimators = X.shape[0], len(self.estimators)

        # get values
        Y = np.zeros([n_samples, n_estimators], dtype='float32')
        for col, clf in enumerate(self.estimators):
            if hasattr(clf, 'predict_proba'):
                proba = np.asarray(clf.predict_proba(X))
                if proba.ndim == 2:
                    # One column per class: keep the positive class (binary problem).
                    proba = proba[:, 1]
                Y[:, col] = proba
            else:
                # A raw Booster has no predict_proba and needs a DMatrix.
                Y[:, col] = np.asarray(clf.predict(xgb.DMatrix(X, label=None)))

        # mean proba voting
        y_mean_pr = np.mean(Y, axis=1)
        y_mean = (y_mean_pr >= thr).astype(int).tolist()

        # majority voting
        Y_voting = (Y >= thr).astype('int')
        share = Y_voting.sum(axis=1) / n_estimators
        y_voting = (share >= vote_thr).astype(float)

        # Debug values of the former loops: votes of the last estimator and
        # vote share of the last sample.
        K = Y_voting[:, -1].copy() if n_estimators else np.zeros(n_samples, dtype='int')
        # NOTE(review): `KT` was returned without ever being assigned, so every
        # call raised NameError. The transpose of K is a guess at the intended
        # value - confirm or drop it from the return.
        KT = K.T
        deci = share[-1] if n_samples else None

        return Y, K, KT, y_mean_pr, y_mean, Y_voting, y_voting, deci

In [310]:
def evalerror(preds, dtrain):
    """Score predictions as the negated sum of `odds * prediction`.

    `dtrain` is accepted only to match the metric callback signature; it is
    never read. The split being scored is recognised purely by the length of
    `preds`, checked against the module-level `train_indices`, `val_indices`
    and `test_indices` in that order (the first length that matches wins).

    Returns ('<split>_gain_error', value) with <split> in train / val / test,
    or ('function_error', 0) when the length matches no split.
    """
    global eval_odds, train_indices, val_indices, test_indices

    n_preds = len(preds)
    if n_preds == len(train_indices):
        label, selected = 'train_gain_error', train_indices
    elif n_preds == len(val_indices):
        label, selected = 'val_gain_error', val_indices
    elif n_preds == len(test_indices):
        label, selected = 'test_gain_error', test_indices
    else:
        # Length matches none of the known splits.
        return 'function_error', 0

    return label, -(eval_odds[selected] * preds).sum()