In [2]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

import pickle

# Import dataset

In [3]:
# Location of the prepared match dataset, relative to this notebook.
DATA_PATH = "../DataFormating/compressed_final.csv"
data = pd.read_csv(DATA_PATH)

# Set `X` and `y`

In [4]:
# Peek at the first ten column names of the loaded dataset.
data.columns[:10]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [5]:
# Feature matrix: drop the score columns (the target below is derived from
# the full-time goals) and the team-initials columns.
X = data.drop(["Home Team Goals", "Away Team Goals",
               "Half-time Home Goals", "Half-time Away Goals",
               "Home Team Initials", "Away Team Initials"], axis=1)

# Target, one label per match:
#   1 -> home team scored more, 2 -> away team scored more, 0 -> otherwise.
# Computed column-wise instead of row-by-row with `iloc`; comparisons that
# are False on both sides (draws, or missing goal values) fall through to 0,
# exactly as the final `else` branch of a per-row loop would.
home_goals = data["Home Team Goals"]
away_goals = data["Away Team Goals"]

y = np.select(
    [(home_goals > away_goals).values, (home_goals < away_goals).values],
    [1, 2],
    default=0,
).tolist()  # keep `y` a plain Python list of ints

In [6]:
# Sanity check: there must be exactly one label per feature row.
assert X.shape[0] == len(y)

### Encode textual features from the `X` dataset

In [7]:
# The 32 teams of the 2018 World Cup, grouped by confederation.
# "Korea Republic" (South Korea) qualified; "Korea DPR" did not, so it does
# not belong in this list.
word_cup_teams = [
    # CAF
    "Egypt",
    "Morocco",
    "Nigeria",
    "Senegal",
    "Tunisia",
    # AFC
    "Australia",
    "IR Iran",
    "Japan",
    "Korea Republic",
    "Saudi Arabia",
    # UEFA
    "Belgium",
    "Croatia",
    "Denmark",
    "England",
    "France",
    "Germany",
    "Iceland",
    "Poland",
    "Portugal",
    "Russia",
    "Serbia",
    "Spain",
    "Sweden",
    "Switzerland",
    # CONCACAF
    "Costa Rica",
    "Mexico",
    "Panama",
    # CONMEBOL
    "Argentina",
    "Brazil",
    "Colombia",
    "Peru",
    "Uruguay"
]

# Vocabulary for the team-name encoder: every name seen in the dataset plus
# every World Cup participant, so a participant absent from the training data
# can still be encoded. Duplicates are harmless for LabelEncoder.
# "Korea DPR" is appended only so that this name remains encodable.
team_names = (
    list(data["Home Team Name"].unique())
    + list(data["Away Team Name"].unique())
    + word_cup_teams
    + ["Korea DPR"]
)

In [8]:
# Fit the stage encoder on the untouched `data` column rather than on
# `X["Stage"]`: `X` is overwritten with integer codes in a later cell, so
# fitting on it would silently learn integers if this cell were re-run.
stage_encoder = LabelEncoder().fit(data["Stage"])
# One shared encoder for home and away team names.
team_name_encoder = LabelEncoder().fit(team_names)

In [9]:
# Encode the text columns of X. The raw strings are read from `data`
# (which is never modified) instead of from X itself, so re-executing this
# cell does not feed already-encoded integers back into the encoders.
X["Stage"] = stage_encoder.transform(data["Stage"])
X["Home Team Name"] = team_name_encoder.transform(data["Home Team Name"])
X["Away Team Name"] = team_name_encoder.transform(data["Away Team Name"])

### Feature Selection

In [10]:
# Substrings identifying the columns to keep as model inputs.
feature_names = [
    "Stage", "Home Team Name", "Away Team Name",
    "Attendance", "Overall",
    "Mean Home Team Goals", "Mean Away Team Goals"
]

# Keep, in X's own column order, every column whose name contains at least
# one of the substrings above.
COLUMNS = [
    name
    for name in X.columns
    if any(fragment in name for fragment in feature_names)
]

X = X[COLUMNS]

In [11]:
# Show the column names that were selected as model inputs.
COLUMNS

['Stage',
 'Home Team Name',
 'Away Team Name',
 'Attendance',
 'Player 1 Overall Diff',
 'Player 2 Overall Diff',
 'Player 3 Overall Diff',
 'Player 4 Overall Diff',
 'Player 5 Overall Diff',
 'Player 6 Overall Diff',
 'Player 7 Overall Diff',
 'Player 8 Overall Diff',
 'Player 9 Overall Diff',
 'Player 10 Overall Diff',
 'Player 11 Overall Diff',
 'Mean Home Team Goals',
 'Mean Away Team Goals']

# Training Session

In [12]:
# Hyper-parameters for the boosted-tree classifier.
xgb_params = {
    "n_estimators": 4000,
    "max_depth": 20,
    "learning_rate": 0.03,
    "booster": "gbtree",
    "n_jobs": -1,
}

xgb_model = XGBClassifier(**xgb_params)

In [13]:
# Train on all rows of X / y; no held-out split is made in the cells shown here.
xgb_model.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=20, min_child_weight=1, missing=None, n_estimators=4000,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

# Save model and encoders

In [14]:
# Persist the trained model and both encoders next to the notebook,
# one pickle file per object, written in the order listed.
artifacts = {
    "xgb_model.b": xgb_model,
    "stage_encoder.b": stage_encoder,
    "team_name_encoder.b": team_name_encoder,
}

for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

In [15]:
# Preview the encoded feature matrix; show only the first rows instead of
# rendering the whole frame.
X.head(10)

Unnamed: 0,Stage,Home Team Name,Away Team Name,Attendance,Player 1 Overall Diff,Player 2 Overall Diff,Player 3 Overall Diff,Player 4 Overall Diff,Player 5 Overall Diff,Player 6 Overall Diff,Player 7 Overall Diff,Player 8 Overall Diff,Player 9 Overall Diff,Player 10 Overall Diff,Player 11 Overall Diff,Mean Home Team Goals,Mean Away Team Goals
0,1,42,26,84490.0,-8,-4,7,-18,16,16,7,12,12,-12,-20,1.000000,1.000000
1,1,48,15,64100.0,-1,24,23,10,-5,-12,3,5,-6,3,14,0.500000,0.000000
2,2,25,18,31513.0,-4,-2,-3,-17,-9,-21,0,-19,4,-6,-21,2.000000,0.000000
3,2,1,30,55686.0,-6,-6,-18,12,-1,21,0,-14,10,-15,-8,2.000000,1.666667
4,3,14,47,38646.0,15,0,-14,7,11,-1,-11,-9,-15,14,-1,1.000000,1.000000
5,3,0,41,30325.0,8,-4,25,-23,-8,25,18,3,-12,-10,-14,1.000000,0.500000
6,4,39,17,38833.0,-3,-2,-9,-3,-2,6,-9,-9,-2,6,-9,0.000000,1.000000
7,4,16,2,62660.0,-3,-14,6,-15,-10,-11,-3,-11,-6,-5,-6,2.333333,1.333333
8,5,28,11,83465.0,12,6,-5,8,10,2,-13,22,-13,18,6,1.000000,0.500000
9,5,23,5,30620.0,20,-4,-5,-14,0,-9,20,-11,2,-23,21,1.000000,0.000000
