In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from skopt import BayesSearchCV 
from skopt.space import Real, Integer

In [17]:
data=pd.read_csv("./data/raw.csv")

In [18]:
data.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,match_outcome
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,2
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,1
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,2
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,1


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47598 entries, 0 to 47597
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           47598 non-null  object
 1   home_team      47598 non-null  object
 2   away_team      47598 non-null  object
 3   home_score     47598 non-null  int64 
 4   away_score     47598 non-null  int64 
 5   tournament     47598 non-null  object
 6   city           47598 non-null  object
 7   country        47598 non-null  object
 8   neutral        47598 non-null  bool  
 9   match_outcome  47598 non-null  int64 
dtypes: bool(1), int64(3), object(6)
memory usage: 3.3+ MB


In [20]:
data["match_outcome"].value_counts()

match_outcome
1    23330
0    13439
2    10829
Name: count, dtype: int64

In [21]:
data.drop(columns=["date", "city", "country","neutral"])

Unnamed: 0,home_team,away_team,home_score,away_score,tournament,match_outcome
0,Scotland,England,0,0,Friendly,2
1,England,Scotland,4,2,Friendly,1
2,Scotland,England,2,1,Friendly,1
3,England,Scotland,2,2,Friendly,2
4,Scotland,England,3,0,Friendly,1
...,...,...,...,...,...,...
47593,Kuwait,Iraq,0,0,FIFA World Cup qualification,2
47594,Palestine,Jordan,1,3,FIFA World Cup qualification,0
47595,Bahrain,Japan,0,5,FIFA World Cup qualification,0
47596,Indonesia,Australia,0,0,FIFA World Cup qualification,2


In [22]:
X = data.drop(labels="match_outcome",axis=1)
Y = data["match_outcome"]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=500, stratify=Y)

In [23]:
pipeline = Pipeline([
    ("Encoder", TargetEncoder()), 
    ("clf", XGBClassifier(random_state=8, enable_categorical=True)) 
])

In [24]:
search_space = {
    'clf__max_depth': Integer(2, 8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0), 
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0)
}


opt = BayesSearchCV(pipeline, search_space, cv=3, n_iter=18, scoring='roc_auc_ovr', random_state=8)


In [25]:
opt.fit(x_train, y_train)

In [26]:
opt.best_estimator_

In [27]:
opt.best_score_

np.float64(0.9999599767040706)

In [28]:
opt.score(x_test, y_test)

np.float64(0.999995793980711)

In [29]:
opt.predict(x_test)

array([1, 2, 0, ..., 1, 1, 1])

In [30]:
opt.predict_proba(x_test)

array([[5.9939112e-04, 9.9833906e-01, 1.0615154e-03],
       [3.4772360e-03, 3.6615930e-03, 9.9286121e-01],
       [9.9802423e-01, 3.2907998e-04, 1.6467548e-03],
       ...,
       [1.4090665e-04, 9.9954200e-01, 3.1712645e-04],
       [7.8984041e-05, 9.9953616e-01, 3.8486221e-04],
       [1.1194838e-03, 9.9541771e-01, 3.4628620e-03]], dtype=float32)