In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder

import pickle

# Import dataset

In [2]:
data = pd.read_csv("../DataFormating/compressed_final.csv")

In [3]:
data.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Player 1 Overall Diff,Player 2 Overall Diff,Player 3 Overall Diff,Player 4 Overall Diff,Player 5 Overall Diff,Player 6 Overall Diff,Player 7 Overall Diff,Player 8 Overall Diff,Player 9 Overall Diff,Player 10 Overall Diff,Player 11 Overall Diff,Avg Goals Diff,FIFA Rank Diff
0,0.0,France,0.0,Uruguay,7,4,6,6,6,4,5,5,5,5,5,0.333333,10.0
1,0.0,Nigeria,1.0,Argentina,0,0,-2,-2,-1,0,0,0,-1,0,-1,0.5,-42.0
2,0.0,Australia,4.0,Germany,3,2,3,2,2,3,0,0,2,1,2,1.75,-39.0
3,1.0,Switzerland,0.0,Spain,6,5,3,2,3,6,5,5,5,5,4,-1.0,2.0
4,1.0,Korea Republic,4.0,Argentina,-4,-4,-4,-3,0,1,2,2,2,2,-3,3.0,-56.0


# Set `X` and `y`

In [4]:
data.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Player 1 Overall Diff', 'Player 2 Overall Diff',
       'Player 3 Overall Diff', 'Player 4 Overall Diff',
       'Player 5 Overall Diff', 'Player 6 Overall Diff',
       'Player 7 Overall Diff', 'Player 8 Overall Diff',
       'Player 9 Overall Diff', 'Player 10 Overall Diff',
       'Player 11 Overall Diff', 'Avg Goals Diff', 'FIFA Rank Diff'],
      dtype='object')

In [5]:
X = data.drop(["Away Team Goals", "Home Team Goals"], axis=1)

y = []
for i in range(len(data)):
    home_team_goals = data["Home Team Goals"][i]
    away_team_goals = data["Away Team Goals"][i]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [6]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [7]:
word_cup_teams = [
    "Egypt",
    "Morocco",
    "Nigeria",
    "Senegal",
    "Tunisia",
    "Australia",
    "Iran",
    "Japan",
    "Korea DPR",
    "Saudi Arabia",
    "Belgium",
    "Croatia",
    "Denmark",
    "England",
    "France",
    "Germany",
    "Iceland",
    "Poland",
    "Portugal",
    "Russia",
    "Serbia",
    "Spain",
    "Sweden",
    "Switzerland",
    "Costa Rica",
    "Mexico",
    "Panama",
    "Argentina",
    "Brazil",
    "Colombia",
    "Peru",
    "Uruguay"
]

team_names = list(data["Home Team Name"].unique()) + list(data["Away Team Name"].unique()) + word_cup_teams

In [8]:
team_name_encoder = LabelEncoder().fit(team_names)

In [9]:
X["Home Team Name"] = team_name_encoder.transform(X["Home Team Name"])
X["Away Team Name"] = team_name_encoder.transform(X["Away Team Name"])

### Feature Selection

In [10]:
feature_names = [
    "Away Team Name",
    "Home Team Name",
#     "Player 1 Overall Diff",
#     "Player 2 Overall Diff",
#     "Player 3 Overall Diff",
#     "Player 4 Overall Diff",
#     "Player 5 Overall Diff",
#     "Player 6 Overall Diff",
#     "Player 7 Overall Diff",
#     "Player 8 Overall Diff",
#     "Player 9 Overall Diff",
#     "Player 10 Overall Diff",
#     "Player 11 Overall Diff",
    "Avg Goals Diff",
#     "FIFA Rank Diff"
]

COLUMNS = []
for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break

X = X[COLUMNS]

In [11]:
COLUMNS

['Away Team Name', 'Home Team Name', 'Avg Goals Diff']

# Traning Session

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 603 entries, 0 to 602
Data columns (total 3 columns):
Away Team Name    603 non-null int64
Home Team Name    603 non-null int64
Avg Goals Diff    603 non-null float64
dtypes: float64(1), int64(2)
memory usage: 14.2 KB


In [13]:
X.describe()

Unnamed: 0,Away Team Name,Home Team Name,Avg Goals Diff
count,603.0,603.0,603.0
mean,15.126036,14.271973,0.301824
std,10.036679,9.749129,1.518678
min,0.0,0.0,-6.0
25%,5.0,6.0,-0.5
50%,16.0,13.0,0.4
75%,23.0,23.0,1.0
max,32.0,32.0,6.0


In [14]:
extra_model = ExtraTreesClassifier(n_estimators=1100, max_depth=10, bootstrap=True, n_jobs=-1)

In [15]:
extra_model.fit(X, y)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=10, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# Save model and encoders

In [16]:
with open("extra_model.b", "wb") as f:
    pickle.dump(extra_model, f)

with open("team_name_encoder.b", "wb") as f:
    pickle.dump(team_name_encoder, f)