In [1]:
import pickle
from copy import deepcopy

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Import dataset

In [2]:
data = pd.read_csv("../DataFormating/compressed_final.csv")

# Set `X` and `y`

In [3]:
data.columns[:10]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [4]:
X = data.drop(["Home Team Goals", "Away Team Goals",
               "Half-time Home Goals", "Half-time Away Goals", 
               "Home Team Initials", "Away Team Initials"], axis=1)

y = []
for i in range(len(data)):
    home_team_goals = data.iloc[i]["Home Team Goals"]
    away_team_goals = data.iloc[i]["Away Team Goals"]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [5]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [6]:
word_cup_teams = [
    "Egypt",
    "Morocco",
    "Nigeria",
    "Senegal",
    "Tunisia",
    "Australia",
    "IR Iran",
    "Japan",
    "Korea DPR",
    "Saudi Arabia",
    "Belgium",
    "Croatia",
    "Denmark",
    "England",
    "France",
    "Germany",
    "Iceland",
    "Poland",
    "Portugal",
    "Russia",
    "Serbia",
    "Spain",
    "Sweden",
    "Switzerland",
    "Costa Rica",
    "Mexico",
    "Panama",
    "Argentina",
    "Brazil",
    "Colombia",
    "Peru",
    "Uruguay"
]

team_names = list(data["Home Team Name"].unique()) + list(data["Away Team Name"].unique()) + word_cup_teams

In [7]:
stage_encoder = LabelEncoder().fit(X["Stage"])
team_name_encoder = LabelEncoder().fit(team_names)

In [8]:
X["Stage"] = stage_encoder.transform(X["Stage"])
X["Home Team Name"] = team_name_encoder.transform(X["Home Team Name"])
X["Away Team Name"] = team_name_encoder.transform(X["Away Team Name"])

### Feature Selection

In [9]:
feature_names = [
    "Stage", "Home Team Name", "Away Team Name",
    "Attendance", "Overall",
    "Mean Home Team Goals", "Mean Away Team Goals"
]

COLUMNS = []

for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break

X = X[COLUMNS]

In [10]:
COLUMNS

['Stage',
 'Home Team Name',
 'Away Team Name',
 'Attendance',
 'Player 1 Overall Diff',
 'Player 2 Overall Diff',
 'Player 3 Overall Diff',
 'Player 4 Overall Diff',
 'Player 5 Overall Diff',
 'Player 6 Overall Diff',
 'Player 7 Overall Diff',
 'Player 8 Overall Diff',
 'Player 9 Overall Diff',
 'Player 10 Overall Diff',
 'Player 11 Overall Diff',
 'Mean Home Team Goals',
 'Mean Away Team Goals']

# Traning / Evaluation Session

In [11]:
class ModelStacking(object):
    def __init__(self, base_model, *stacked_models):
        self.base_model = base_model
        self.stacked_models = stacked_models
        
    def fit(self, X, y):
        data = X.copy()
        for i in range(len(self.stacked_models)):
            self.stacked_models[i].fit(X, y)
            pred = self.stacked_models[i].predict(X)
            data["Model-"+str(i)] = pred

        self.base_model.fit(data, y)
        return self
    
    def predict(self, X):
        data = X.copy()
        for i in range(len(self.stacked_models)):
            pred = self.stacked_models[i].predict(X)
            data["Model-"+str(i)] = pred

        return self.base_model.predict(data)

In [21]:
def test_model(model, X, y, cv=10):
    score = 0
    for _ in range(cv):
        _model = deepcopy(model)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        _model.fit(X_train, y_train)
        y_pred = _model.predict(X_test)
        score += accuracy_score(y_test, y_pred)

        if cv == 1:
            print(classification_report(y_test, y_pred))

    print("Test set accuracy score: ", score/cv, "\n")

In [29]:
model = ModelStacking(
    RandomForestClassifier(n_estimators=10, max_depth=3, bootstrap=True, n_jobs=-1),

    ExtraTreesClassifier(n_estimators=1000, max_depth=10, bootstrap=True, n_jobs=-1),

    XGBClassifier(n_estimators=4000, max_depth=20, learning_rate=0.03, n_jobs=-1)
)

In [31]:
test_model(model, X, y, cv=10)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Test set accuracy score:  0.651063829787234 



  if diff:
