In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc

from NNClassifier import Net, NNClassifier
from util import MyUtil

from sklearn.model_selection import train_test_split, ShuffleSplit,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import warnings

#https://github.com/scikit-learn/scikit-learn/issues/10449
#https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
#remove when fix becomes available
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

util = MyUtil()

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Load the semicolon-separated match table, then round the values to two decimals.
data = pd.read_csv("data/final.csv", sep=";", index_col=False)
data = data.round(2)

# Preprocessing

In [None]:
# Missing past results are imputed with 0; the id columns are not used as features.
data = (
    data
    .fillna({"past_resultA": 0, "past_resultB": 0})
    .drop(["gameid", "teamidA", "teamidB"], axis=1)
)

In [None]:
# Encode gametype
# Ordinal code per tournament round: 1 is the final, 6 is the group phase.
rounds = {
    "Finale": 1,
    "Spiel um Platz Drei": 2,
    "Halbfinale": 3,
    "Viertelfinale": 4,
    "Achtelfinale": 5,
    "Gruppenphase": 6,
}


def map_to_round(x):
    """Translate a game type label into its numeric round code.

    Any label that starts with "Gruppe" is treated as a group phase game;
    every other label must be a key of ``rounds``.

    Raises:
        KeyError: if the label is neither a group label nor a key of ``rounds``.
    """
    key = "Gruppenphase" if x.startswith("Gruppe") else x
    return rounds[key]
data["gametype"]=data["gametype"].apply(map_to_round)

In [None]:
data.dtypes

In [None]:
# Combine the raw team columns into match-up features.
# Total market value per team = offensive value + defensive value.
data["mvA"] = data["teamA_off_val"].add(data["teamA_def_val"])
data["mvB"] = data["teamB_off_val"].add(data["teamB_def_val"])
# NOTE: despite the "AminB" suffix, the market-value and frag features are
# ratios A / B; only the age feature is a difference A - B.
data["mvAminB"] = data["mvA"].div(data["mvB"])
data["ageAminB"] = data["teamA_age"].sub(data["teamB_age"])
data["fragAminB"] = data["teamA_frag"].div(data["teamB_frag"])

In [None]:
# Subsets used below. Rows are selected with boolean masks; gametype 6 is the
# group phase and addinfo 'n.E.' marks a game decided by penalties.

# penalty shootouts only
data_pen = data[data["addinfo"] == 'n.E.'].copy()

# knockout stage, with and without the games decided by penalties
data_ko = data[data["gametype"] != 6].copy()
data_ko_nopen = data_ko[data_ko["addinfo"] != 'n.E.'].copy()

# group stage
data_group = data[data["gametype"] == 6].copy()

# all games except those decided by penalties
data_nopen = data[data["addinfo"] != 'n.E.'].copy()

# Modelling
A single negative binomial regression is not powerful enough to predict the correct result. Evaluating different metrics shows that the optimal hyperparameters for the negative binomial regression often conflict, depending on which metric is to be optimized. This is because information on the game outcome is needed. TODO

## Classifier
Our first goal is to create a classifier that can correctly predict the tendency of a game. The three categories are win/draw/loss. We will test the classifier on five subsets:
* the full game set
* the knockout stage without penalty shootouts
* all games excluding games with penalty shootouts
* the group stage
* penalty shootouts

In [None]:
def scale(data_train, data_test, columns=("mvAminB", "ageAminB", "fragAminB")):
    """Standardize feature columns in place using training-set statistics.

    For every column the mean and the standard deviation (pandas' ``Series.std``)
    are computed on ``data_train`` only and then applied to both frames, so no
    information from the test set leaks into the scaling.

    Args:
        data_train: DataFrame the statistics are computed from; modified in place.
        data_test: DataFrame scaled with the training statistics; modified in place.
        columns: names of the columns to standardize. Defaults to the three
            combined features used in this notebook.

    Returns:
        None. Both frames are changed in place.
    """
    for col in columns:
        mean = data_train[col].mean()
        std = data_train[col].std()
        # A constant column (std == 0) or a single-row training set (std is NaN)
        # would turn the feature into NaN/inf; only center the column in that case.
        if not std > 0:
            std = 1.0
        data_train[col] = (data_train[col] - mean) / std
        data_test[col] = (data_test[col] - mean) / std


In [None]:
# Split every subset into a training part (used with cross validation below)
# and a held-out test part.
col2 = ["gametype", "mvAminB", "ageAminB", "fragAminB", "past_resultA", "past_resultB"]
target = ["resultA", "resultB"]
# Fixed seed: without it each run draws a different split, so the accuracies
# quoted in this notebook could not be reproduced.
SPLIT_SEED = 0


def _split_scale_select(frame, suffix=""):
    """Split ``frame`` 80/20, scale both parts and pick features and targets.

    Args:
        frame: DataFrame containing the ``col2`` feature columns and the
            ``target`` columns. It is copied first and not modified.
        suffix: text appended to the variable names in the printed shape report.

    Returns:
        Tuple ``(train, test, X_train, y_train, X_test, y_test)`` where
        ``train``/``test`` are the scaled frames and the remaining items are
        their ``col2`` / ``target`` column selections.
    """
    train, test = train_test_split(frame.copy(), test_size=0.2, random_state=SPLIT_SEED)
    # scale() assigns columns in place, so work on explicit copies of the split parts.
    train, test = train.copy(), test.copy()
    scale(train, test)

    X_part_train, y_part_train = train[col2], train[target]
    X_part_test, y_part_test = test[col2], test[target]
    print("Shape of X_test" + suffix, X_part_test.shape)
    print("Shape of y_test" + suffix, y_part_test.shape)
    print("Shape of X_train" + suffix, X_part_train.shape)
    print("Shape of y_train" + suffix, y_part_train.shape)
    return train, test, X_part_train, y_part_train, X_part_test, y_part_test


# full data set
data_train, data_test, X_train, y_train, X_test, y_test = _split_scale_select(data)

# ko stage w/o penalty shootouts
(data_train_ko, data_test_ko,
 X_train_ko, y_train_ko, X_test_ko, y_test_ko) = _split_scale_select(data_ko_nopen, "_ko")

# group stage
(data_train_gr, data_test_gr,
 X_train_gr, y_train_gr, X_test_gr, y_test_gr) = _split_scale_select(data_group, "_gr")

# penalty shootout
(data_train_pen, data_test_pen,
 X_train_pen, y_train_pen, X_test_pen, y_test_pen) = _split_scale_select(data_pen, "_pen")


We will use an ensemble of different classifiers.

In [None]:
# Base estimators, created with their default settings; hyperparameters are
# tuned per subset with grid search in the sections below.
clf_gbt = GradientBoostingClassifier()
clf_lr = LogisticRegression()
clf_rft = RandomForestClassifier()
# The network receives the number of feature columns as its only argument.
clf_nn = NNClassifier(X_train.shape[1])

# Ensemble of the four estimators using soft voting.
clf_vot = VotingClassifier(
    estimators=[("lr", clf_lr), ("rft", clf_rft), ("gbt", clf_gbt), ("nn", clf_nn)],
    voting="soft",
)

### Full data set

The accuracy is around 50%.

In [None]:
# Encode every (resultA, resultB) pair of the training targets as one label.
# The transposed value array unpacks into the two result columns; 0, 1, 2 are
# forwarded to util.encode_tendency (presumably the class labels; TODO confirm).
y_tr = np.vectorize(util.encode_tendency)(*y_train.values.T, 0, 1, 2)

In [None]:
# Grid for the gradient boosted trees on the full data set.
param = dict(
    n_estimators=[50],
    learning_rate=[0.03, 0.1, 0.3],
    max_depth=[1, 2, 4, 8],
    random_state=[0],
    min_samples_leaf=[1, 2, 3],
)
# 5-fold cross-validated search, scored by accuracy.
cv_gbt = GridSearchCV(clf_gbt, param, scoring="accuracy", cv=5)
cv_gbt.fit(X_train.values, y_tr)

In [None]:
cv_gbt.best_score_ , cv_gbt.best_params_

In [None]:
# For manual inspection of the previous search:
#   cv_gbt.best_estimator_.predict(X_test)
#   cv_gbt.cv_results_

# Logistic regression on the full data set: search over the regularization parameter C.
param = dict(C=[0.1, 0.3, 1])
cv_lr = GridSearchCV(clf_lr, param, scoring="accuracy", cv=5)
cv_lr.fit(X_train.values, y_tr)
(cv_lr.best_score_, cv_lr.best_params_)
# C really makes no difference...

In [None]:
# Neural network on the full data set: vary learning rate, alpha and momentum
# while dropout and the iteration count stay fixed.
param = dict(
    dropout=[0.2],
    iter=[400],
    lr=[0.01, 0.05, 0.1],
    alpha=[0.99, 0.8],
    momentum=[0, 0.5],
)
cv_nn = GridSearchCV(clf_nn, param, scoring="accuracy", cv=5)
cv_nn.fit(X_train.values, y_tr)

In [None]:
cv_nn.best_score_ , cv_nn.best_params_

In [None]:
# Voting ensemble on the full data set. Keys are "<estimator name>__<parameter>",
# using the names given to the VotingClassifier.
param_grid = dict(
    nn__dropout=[0.2],
    nn__iter=[400],
    nn__lr=[0.01],
    nn__alpha=[0.8],
    nn__momentum=[0.5],
    lr__C=[0.1, 1],
    gbt__n_estimators=[50],
    gbt__learning_rate=[0.1, 0.3],
    gbt__max_depth=[2, 4],
    gbt__min_samples_leaf=[1, 2, 4],
    gbt__random_state=[0],
    rft__max_depth=[2, 4, 8, 10],
)
cv_clf_vot = GridSearchCV(estimator=clf_vot, param_grid=param_grid, cv=5)
cv_clf_vot.fit(X_train.values, y_tr)
(cv_clf_vot.best_params_, cv_clf_vot.best_score_)

### KO stage without penalty shootouts
The accuracy is around 70%.

In [None]:
# Tendency labels for the knockout training set (without penalty shootouts).
# The transposed value array unpacks into the two result columns; 0, 1, 2 are
# forwarded to util.encode_tendency (presumably the class labels; TODO confirm).
y_tr_ko = np.vectorize(util.encode_tendency)(*y_train_ko.values.T, 0, 1, 2)

In [None]:
# Gradient boosted trees on the knockout subset.
param = dict(
    n_estimators=[50],
    learning_rate=[0.03, 0.1, 0.3],
    max_depth=[1, 2, 4],
    random_state=[0],
)
cv_gbt = GridSearchCV(estimator=clf_gbt, param_grid=param, cv=5)
cv_gbt.fit(X_train_ko, y_tr_ko)
(cv_gbt.best_score_, cv_gbt.best_params_)

In [None]:
#cv_gbt.best_estimator_.predict(X_test_ko)

In [None]:
# Logistic regression on the knockout subset.
param = dict(C=[0.1, 0.3, 1])
cv_lr = GridSearchCV(clf_lr, param, scoring="accuracy", cv=5)
cv_lr.fit(X_train_ko, y_tr_ko)
(cv_lr.best_score_, cv_lr.best_params_)

In [None]:
#cv_lr.best_estimator_.predict_proba(X_test_ko)

In [None]:
# For manual inspection: np.sum(y_tr_nopen==2)/y_tr_nopen.shape[0]

# Random forest on the knockout subset: search over the tree depth only.
param_grid = dict(max_depth=[2, 4, 8, 16])
cv_clf_rft = GridSearchCV(estimator=clf_rft, param_grid=param_grid, cv=5)
cv_clf_rft.fit(X_train_ko.values, y_tr_ko)
(cv_clf_rft.best_score_, cv_clf_rft.best_params_)

In [None]:
#cv_clf_rft.best_estimator_.predict_proba(X_test_ko)

In [None]:
# Neural network on the knockout subset.
param_grid = dict(
    dropout=[0.2],
    iter=[300],
    lr=[0.01, 0.05, 0.1],
    alpha=[0.99, 0.8],
    momentum=[0, 0.5],
)
cv_nn = GridSearchCV(clf_nn, param_grid, scoring="accuracy", cv=5)
cv_nn.fit(X_train_ko.values, y_tr_ko)
(cv_nn.best_score_, cv_nn.best_params_)

In [None]:
#cv_nn.predict_proba(X_test_ko.values)

In [None]:
# Voting ensemble on the knockout subset.
param_grid = dict(
    # set explicitly, otherwise GridSearchCV will try to broadcast it to (19,2)
    nn__classes=[2],
    nn__dropout=[0.2],
    nn__iter=[400],
    nn__lr=[0.01],
    nn__alpha=[0.8],
    nn__momentum=[0.5],
    lr__C=[0.1, 1],
    gbt__n_estimators=[50],
    gbt__learning_rate=[0.1, 0.3],
    gbt__max_depth=[2, 4],
    gbt__min_samples_leaf=[1, 2, 4],
    gbt__random_state=[0],
    rft__max_depth=[2, 4, 8, 10],
)
cv_clf_vot = GridSearchCV(estimator=clf_vot, param_grid=param_grid, cv=5)
cv_clf_vot.fit(X_train_ko.values, y_tr_ko)
(cv_clf_vot.best_params_, cv_clf_vot.best_score_)

In [None]:
clf_nn.get_params().keys()

### Group Stage
The accuracy is around 50%.

In [None]:
# Tendency labels for the group stage training set.
# The transposed value array unpacks into the two result columns; 0, 1, 2 are
# forwarded to util.encode_tendency (presumably the class labels; TODO confirm).
y_tr_gr = np.vectorize(util.encode_tendency)(*y_train_gr.values.T, 0, 1, 2)

In [None]:
# Gradient boosted trees on the group stage subset.
param_grid = dict(
    n_estimators=[100, 150],
    learning_rate=[0.03, 0.1, 0.3],
    max_depth=[1, 2, 4],
    random_state=[0],
)
cv_gbt = GridSearchCV(estimator=clf_gbt, param_grid=param_grid, cv=5)
cv_gbt.fit(X_train_gr, y_tr_gr)
(cv_gbt.best_estimator_, cv_gbt.best_score_, cv_gbt.best_params_)

In [None]:
# Logistic regression on the group stage subset.
param_grid = dict(C=[0.1, 10])
cv_clf_lr = GridSearchCV(estimator=clf_lr, param_grid=param_grid, cv=5)
cv_clf_lr.fit(X_train_gr.values, y_tr_gr)
(cv_clf_lr.best_score_, cv_clf_lr.best_params_)

In [None]:
#cv_lr.best_estimator_.predict(X_test_gr)

In [None]:
# Neural network on the group stage subset.
param = dict(
    dropout=[0.2],
    iter=[300],
    lr=[0.01, 0.05, 0.1],
    alpha=[0.99, 0.8],
    momentum=[0, 0.5],
)
cv_nn = GridSearchCV(clf_nn, param, scoring="accuracy", cv=5)
cv_nn.fit(X_train_gr.values, y_tr_gr)
(cv_nn.best_score_, cv_nn.best_params_)

In [None]:
# Voting ensemble on the group stage subset.
param_grid = dict(
    nn__dropout=[0.2],
    nn__iter=[300],
    nn__lr=[0.01],
    nn__alpha=[0.99],
    nn__momentum=[0.5],
    lr__C=[0.1, 1],
    gbt__n_estimators=[50, 100],
    gbt__learning_rate=[0.1, 0.3],
    gbt__max_depth=[2, 4],
    gbt__min_samples_leaf=[1, 2, 4],
    gbt__random_state=[0],
    rft__max_depth=[2, 4, 8, 10],
)
cv_clf_vot = GridSearchCV(estimator=clf_vot, param_grid=param_grid, cv=5)
cv_clf_vot.fit(X_train_gr.values, y_tr_gr)
(cv_clf_vot.best_params_, cv_clf_vot.best_score_)

### Penalty shootouts
The accuracy is around 60%.

In [None]:
# Tendency labels for the penalty shootout training set.
# The transposed value array unpacks into the two result columns; 0, 1, 2 are
# forwarded to util.encode_tendency (presumably the class labels; TODO confirm).
y_tr_pen = np.vectorize(util.encode_tendency)(*y_train_pen.values.T, 0, 1, 2)

In [None]:
# Gradient boosted trees on the penalty shootout subset.
param_grid = dict(
    n_estimators=[100, 150],
    learning_rate=[0.03, 0.1, 0.3],
    max_depth=[1, 2, 4],
    random_state=[0],
)
cv_gbt = GridSearchCV(estimator=clf_gbt, param_grid=param_grid, cv=5)
cv_gbt.fit(X_train_pen, y_tr_pen)
(cv_gbt.best_estimator_, cv_gbt.best_score_, cv_gbt.best_params_)

In [None]:
# Logistic regression on the penalty shootout subset.
param_grid = dict(C=[0.1, 10])
cv_clf_lr = GridSearchCV(estimator=clf_lr, param_grid=param_grid, cv=5)
cv_clf_lr.fit(X_train_pen.values, y_tr_pen)
(cv_clf_lr.best_score_, cv_clf_lr.best_params_)

In [None]:
# Neural network on the penalty shootout subset.
param = dict(
    dropout=[0.2],
    iter=[300],
    lr=[0.01, 0.05, 0.1],
    alpha=[0.99, 0.8],
    momentum=[0, 0.5],
)
cv_nn = GridSearchCV(clf_nn, param, scoring="accuracy", cv=5)
cv_nn.fit(X_train_pen.values, y_tr_pen)
(cv_nn.best_score_, cv_nn.best_params_)

In [None]:
# Voting ensemble on the penalty shootout subset; the network's class count is
# set to 2 here, as in the knockout ensemble.
param_grid = dict(
    nn__classes=[2],
    nn__dropout=[0.2],
    nn__iter=[300],
    nn__lr=[0.01],
    nn__alpha=[0.99],
    nn__momentum=[0.5],
    lr__C=[0.1, 1],
    gbt__n_estimators=[50],
    gbt__learning_rate=[0.1, 0.3],
    gbt__max_depth=[2, 4],
    gbt__min_samples_leaf=[1, 2, 4],
    gbt__random_state=[0],
    rft__max_depth=[2, 4, 8, 10],
)
cv_clf_vot = GridSearchCV(estimator=clf_vot, param_grid=param_grid, cv=5)
cv_clf_vot.fit(X_train_pen.values, y_tr_pen)
(cv_clf_vot.best_params_, cv_clf_vot.best_score_)

## Summary
We were able to predict the correct tendency in 70% of the games during the knockout stages.
For the group stage, only an accuracy between 50% and 60% can be achieved.
Penalty shootouts can also be classified, but it will be difficult to find a model that gets the number of goals correct both in a penalty shootout and in a game decided in normal time. We will therefore exclude penalty shootout results from our set for count prediction.