In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

import pickle

# Import dataset

In [2]:
# Load the match dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../DataFormating/compressed_final.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,Stage,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Attendance,Half-time Home Goals,Half-time Away Goals,Home Team Initials,Away Team Initials,...,Player 11 Short passing Diff,Player 11 Shot power Diff,Player 11 Sliding tackle Diff,Player 11 Sprint speed Diff,Player 11 Stamina Diff,Player 11 Standing tackle Diff,Player 11 Strength Diff,Player 11 Vision Diff,Player 11 Volleys Diff,Avg Goals / Rank Diff
0,Group A,South Africa,1.0,1.0,Mexico,84490.0,0.0,0.0,RSA,MEX,...,-42,-29,-15,-9,-31,-17,3,-43,-24,0.0
1,Group A,Uruguay,0.0,0.0,France,64100.0,0.0,0.0,URU,FRA,...,-14,-44,-9,-19,-26,-8,-4,9,-58,0.5
2,Group B,Korea Republic,2.0,0.0,Greece,31513.0,1.0,0.0,KOR,GRE,...,23,-2,39,7,29,40,-10,-16,14,2.0
3,Group B,Argentina,1.0,0.0,Nigeria,55686.0,1.0,0.0,ARG,NGA,...,-1,-25,35,-14,7,41,-12,-11,-55,0.333333
4,Group C,England,1.0,1.0,USA,38646.0,1.0,1.0,ENG,USA,...,-4,-4,-2,17,-2,-4,-18,-7,-5,0.0


# Set `X` and `y`

In [4]:
# Show the first ten column names of the loaded dataset.
data.columns[:10]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [5]:
# Feature frame: drop the full-time and half-time score columns (the full-time
# score defines the target below) and the team-initials columns.
X = data.drop(
    [
        "Home Team Goals", "Away Team Goals",
        "Half-time Home Goals", "Half-time Away Goals",
        "Home Team Initials", "Away Team Initials",
    ],
    axis=1,
)

# Target, from the home side's point of view:
#   1 = home team scored more, 2 = away team scored more, 0 = otherwise (draw).
# Computed column-wise instead of with a per-row `data.iloc[i][...]` loop.
# Rows where neither comparison holds fall through to 0, exactly as the
# if/elif/else chain did. `.tolist()` keeps `y` a plain list of Python ints.
home_goals = data["Home Team Goals"].values
away_goals = data["Away Team Goals"].values
y = np.select(
    [home_goals > away_goals, home_goals < away_goals],
    [1, 2],
    default=0,
).tolist()

In [6]:
# Sanity check: there must be exactly one target label per feature row.
assert len(y) == len(X)

### Encode textual features from the `X` dataset

In [7]:
# Extra team names appended to the names found in the dataset before the
# team-name encoder is fitted, so the encoder also knows these labels.
# NOTE(review): "Korea DPR" is listed here while the dataset preview shows
# "Korea Republic" — confirm this is the intended team.
word_cup_teams = [
    "Egypt", "Morocco", "Nigeria", "Senegal", "Tunisia",
    "Australia", "IR Iran", "Japan", "Korea DPR", "Saudi Arabia",
    "Belgium", "Croatia", "Denmark", "England", "France", "Germany", "Iceland",
    "Poland", "Portugal", "Russia", "Serbia", "Spain", "Sweden", "Switzerland",
    "Costa Rica", "Mexico", "Panama",
    "Argentina", "Brazil", "Colombia", "Peru", "Uruguay",
]

# Every label the team-name encoder must know: home names, then away names,
# then the extra list. Duplicates are left in place.
team_names = [
    *data["Home Team Name"].unique(),
    *data["Away Team Name"].unique(),
    *word_cup_teams,
]

In [8]:
# Label encoder for the tournament stage, fitted on the stages present in X.
stage_encoder = LabelEncoder()
stage_encoder.fit(X["Stage"])

# Label encoder for team names, fitted on the combined name list.
team_name_encoder = LabelEncoder()
team_name_encoder.fit(team_names)

In [9]:
# Replace the three text columns of X with their integer codes.
# The raw strings are read from `data` (which is never modified) rather than
# from `X` itself, so this cell is safe to re-run: transforming the
# already-encoded integers in `X` a second time would fail, because the
# encoders were fitted on strings. `X` was built from `data` by dropping
# columns only, so the rows line up one-to-one.
X["Stage"] = stage_encoder.transform(data["Stage"])
X["Home Team Name"] = team_name_encoder.transform(data["Home Team Name"])
X["Away Team Name"] = team_name_encoder.transform(data["Away Team Name"])

### Feature Selection

In [10]:
# Substrings that mark a column of X as a model input.
feature_names = [
    "Stage",
    "Home Team Name",
    "Away Team Name",
    "Overall",
    "Avg Goals / Rank Diff",
]

# Keep, in their existing order, the columns whose name contains at least one
# of the substrings above; each column is listed at most once.
COLUMNS = [
    col
    for col in X.columns
    if any(fragment in col for fragment in feature_names)
]

X = X[COLUMNS]

In [11]:
# Display the selected feature columns.
COLUMNS

['Stage',
 'Home Team Name',
 'Away Team Name',
 'Player 1 Overall Diff',
 'Player 2 Overall Diff',
 'Player 3 Overall Diff',
 'Player 4 Overall Diff',
 'Player 5 Overall Diff',
 'Player 6 Overall Diff',
 'Player 7 Overall Diff',
 'Player 8 Overall Diff',
 'Player 9 Overall Diff',
 'Player 10 Overall Diff',
 'Player 11 Overall Diff',
 'Avg Goals / Rank Diff']

# Training Session

In [12]:
# Check dtypes and non-null counts of the final feature frame before training.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 15 columns):
Stage                     142 non-null int64
Home Team Name            142 non-null int64
Away Team Name            142 non-null int64
Player 1 Overall Diff     142 non-null int64
Player 2 Overall Diff     142 non-null int64
Player 3 Overall Diff     142 non-null int64
Player 4 Overall Diff     142 non-null int64
Player 5 Overall Diff     142 non-null int64
Player 6 Overall Diff     142 non-null int64
Player 7 Overall Diff     142 non-null int64
Player 8 Overall Diff     142 non-null int64
Player 9 Overall Diff     142 non-null int64
Player 10 Overall Diff    142 non-null int64
Player 11 Overall Diff    142 non-null int64
Avg Goals / Rank Diff     142 non-null float64
dtypes: float64(1), int64(14)
memory usage: 16.7 KB


In [13]:
# Summary statistics for each feature column.
X.describe()

Unnamed: 0,Stage,Home Team Name,Away Team Name,Player 1 Overall Diff,Player 2 Overall Diff,Player 3 Overall Diff,Player 4 Overall Diff,Player 5 Overall Diff,Player 6 Overall Diff,Player 7 Overall Diff,Player 8 Overall Diff,Player 9 Overall Diff,Player 10 Overall Diff,Player 11 Overall Diff,Avg Goals / Rank Diff
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,6.584507,20.330986,21.894366,0.15493,0.43662,-0.704225,0.661972,-0.478873,0.535211,1.225352,0.859155,-0.330986,0.84507,1.190141,-0.101174
std,3.955861,15.265834,14.721388,9.454104,8.707594,12.864702,11.056199,10.035199,11.404032,9.894761,9.855402,12.336144,11.766219,12.565253,7.042092
min,0.0,0.0,0.0,-23.0,-20.0,-30.0,-28.0,-28.0,-26.0,-21.0,-19.0,-29.0,-33.0,-32.0,-65.0
25%,3.0,5.25,9.25,-6.0,-5.75,-9.0,-6.0,-7.75,-8.0,-5.0,-7.0,-9.75,-6.0,-7.0,-0.916667
50%,6.0,17.0,19.0,-1.0,0.0,-2.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0
75%,11.0,30.0,31.5,7.0,6.0,8.75,9.0,6.0,8.0,7.75,8.0,8.0,8.75,9.0,1.0
max,13.0,49.0,49.0,25.0,24.0,28.0,26.0,28.0,26.0,28.0,22.0,32.0,30.0,38.0,49.0


In [14]:
# Tree-booster XGBoost classifier for the 0/1/2 match-outcome target.
# NOTE(review): 4000 trees of depth 20 is a lot of capacity for the 142 rows
# reported by X.info(), and this notebook has no hold-out evaluation —
# confirm the settings against a validation split.
xgb_model = XGBClassifier(
    booster="gbtree",
    learning_rate=0.03,
    max_depth=20,
    n_estimators=4000,
    n_jobs=-1,
)

In [15]:
# Train on the full feature frame; the fitted estimator is echoed as the cell output.
xgb_model.fit(X=X, y=y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=20, min_child_weight=1, missing=None, n_estimators=4000,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

# Save model and encoders

In [16]:
# Pickle the trained model and both fitted encoders next to the notebook,
# one file per object, in the same order as before.
for filename, artifact in (
    ("xgb_model.b", xgb_model),
    ("stage_encoder.b", stage_encoder),
    ("team_name_encoder.b", team_name_encoder),
):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)