In [1]:
import pickle
from copy import deepcopy

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Import dataset

In [2]:
# Load the prepared match dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("../DataFormating/final.csv")

# Set `X` and `y`

In [3]:
# List the available columns before choosing the features and the target.
data.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Year', 'home_rank', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'away_rank', 'away_total_points',
       'away_previous_points', 'away_rank_change', 'away_cur_year_avg',
       'away_cur_year_avg_weighted', 'away_last_year_avg',
       'away_last_year_avg_weighted', 'away_two_year_ago_avg',
       'away_two_year_ago_weighted', 'away_three_year_ago_avg',
       'away_three_year_ago_weighted', 'Home Avg Goals', 'Away Avg Goals'],
      dtype='object')

In [4]:
# Features: every column except the two goal counts, which define the target.
X = data.drop(["Away Team Goals", "Home Team Goals"], axis=1)

# Target: match outcome seen from the home side,
#   1 = home win, 2 = away win, 0 = draw.
# A vectorised column comparison replaces the per-row `.iloc` loop. Rows where
# neither comparison holds (draws, or missing goal values) fall through to 0,
# exactly like the `else` branch of the loop did.
home_goals = data["Home Team Goals"]
away_goals = data["Away Team Goals"]
y = np.select(
    [(home_goals > away_goals).values, (home_goals < away_goals).values],
    [1, 2],
    default=0,
).tolist()  # keep `y` a plain list of ints, as before

In [5]:
# Sanity check: there must be exactly one label per feature row.
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [6]:
# Extra team names that the encoder must know even if they never occur in the
# training data. The order looks grouped by confederation (one row per group
# below) — presumably the World Cup participants; verify against the caller.
# NOTE(review): "Korea DPR" is listed rather than "Korea Republic" — confirm
# this is the intended team.
word_cup_teams = [
    "Egypt", "Morocco", "Nigeria", "Senegal", "Tunisia",
    "Australia", "IR Iran", "Japan", "Korea DPR", "Saudi Arabia",
    "Belgium", "Croatia", "Denmark", "England", "France", "Germany", "Iceland",
    "Poland", "Portugal", "Russia", "Serbia", "Spain", "Sweden", "Switzerland",
    "Costa Rica", "Mexico", "Panama",
    "Argentina", "Brazil", "Colombia", "Peru", "Uruguay",
]

# Every name the encoder has to handle: home teams, away teams, then the extra
# list above. Duplicates are left in place.
team_names = [
    *data["Home Team Name"].unique(),
    *data["Away Team Name"].unique(),
    *word_cup_teams,
]

In [7]:
# Fit one encoder on all collected team names so a single name -> integer mapping is available.
team_name_encoder = LabelEncoder().fit(team_names)

In [8]:
# Encode from the untouched `data` frame rather than from `X` itself. Reading
# the names back from `X` made this cell non-idempotent: on a second run the
# columns already hold integers, which the encoder rejects as unseen labels.
for team_column in ("Home Team Name", "Away Team Name"):
    X[team_column] = team_name_encoder.transform(data[team_column])

### Feature Selection

In [9]:
# Hand-picked feature subset: both team identifiers, the current ranking
# figures for each side, and the average-goals columns.
# (The previous substring-matching loop over an empty `feature_names` list
# never selected anything and its result was overwritten, so it is removed.)
COLUMNS = [
    'Away Team Name',
    'Home Team Name',

    'home_rank',
    'home_total_points',
    'home_cur_year_avg',
    'home_cur_year_avg_weighted',

    'away_rank',
    'away_total_points',
    'away_cur_year_avg',
    'away_cur_year_avg_weighted',

    'Home Avg Goals',
    'Away Avg Goals'
]

# Keep only the selected columns, in the order listed above.
X = X[COLUMNS]

In [10]:
# Display the selected feature columns.
COLUMNS

['Away Team Name',
 'Home Team Name',
 'home_rank',
 'home_total_points',
 'home_cur_year_avg',
 'home_cur_year_avg_weighted',
 'away_rank',
 'away_total_points',
 'away_cur_year_avg',
 'away_cur_year_avg_weighted',
 'Home Avg Goals',
 'Away Avg Goals']

# Training / Evaluation Session

In [11]:
class ModelStacking(object):
    """Two-level stacking ensemble.

    Each stacked model is trained on the raw features. Its predictions are then
    appended to the feature frame as an extra column named ``"Model-<index>"``,
    and the base model is trained on that widened frame.

    Note: the prediction columns used for training the base model are produced
    on the same rows the stacked models were fitted on (in-sample predictions).
    """

    def __init__(self, base_model, *stacked_models):
        """Store the final estimator and the first-level estimators.

        Parameters
        ----------
        base_model : object with ``fit(X, y)`` / ``predict(X)``
            Final estimator, trained on features plus stacked predictions.
        *stacked_models : objects with ``fit(X, y)`` / ``predict(X)``
            First-level estimators whose predictions become extra features.
        """
        self.base_model = base_model
        self.stacked_models = stacked_models

    def fit(self, X, y):
        """Fit every stacked model, then the base model; return ``self``.

        ``X`` must support ``.copy()`` and column assignment by name
        (e.g. a pandas DataFrame). ``X`` itself is not modified.
        """
        augmented = X.copy()
        for position, estimator in enumerate(self.stacked_models):
            estimator.fit(X, y)
            # Each first-level prediction becomes one additional feature column.
            augmented["Model-" + str(position)] = estimator.predict(X)

        self.base_model.fit(augmented, y)
        return self

    def predict(self, X):
        """Return the base model's predictions for ``X``.

        The stacked models' predictions are appended to a copy of ``X`` under
        the same column names used in ``fit`` before the base model is called.
        """
        augmented = X.copy()
        for position, estimator in enumerate(self.stacked_models):
            augmented["Model-" + str(position)] = estimator.predict(X)

        return self.base_model.predict(augmented)

In [12]:
def test_model(model, X, y, cv=10, random_state=None):
    """Estimate test accuracy by repeated random hold-out evaluation.

    For each of ``cv`` repetitions a fresh deep copy of ``model`` is trained on
    a random 67% of the rows and scored on the remaining 33%. The mean accuracy
    is printed and returned. When ``cv == 1`` a full classification report for
    that single split is printed as well.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` / ``predict(X)``
        Never fitted directly; each repetition works on a deep copy.
    X, y : features and labels accepted by ``train_test_split``.
    cv : int, default 10
        Number of independent random splits (not k-fold cross-validation).
    random_state : int or None, default None
        ``None`` keeps the previous behaviour (a different split on every
        call). An integer makes the splits reproducible: repetition ``k`` is
        split with seed ``random_state + k``. Randomness inside the model
        itself is not controlled by this argument.

    Returns
    -------
    float
        Mean accuracy over the ``cv`` test splits.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1.
    """
    if cv < 1:
        raise ValueError("cv must be a positive integer, got {!r}".format(cv))

    scores = []
    for repetition in range(cv):
        # Work on a copy so the caller's model is left unfitted and every
        # repetition starts from the same initial state.
        candidate = deepcopy(model)

        # A distinct seed per repetition, otherwise every split would be identical.
        split_seed = None if random_state is None else random_state + repetition
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=split_seed
        )
        candidate.fit(X_train, y_train)
        y_pred = candidate.predict(X_test)
        scores.append(accuracy_score(y_test, y_pred))

        if cv == 1:
            print(classification_report(y_test, y_pred))

    mean_score = sum(scores) / cv
    print("Test set accuracy score: ", mean_score, "\n")
    return mean_score

In [15]:
# First-level estimators; their predictions become extra input features for
# the final estimator.
level_one_estimators = [
    LogisticRegression(),
    MLPClassifier(
        hidden_layer_sizes=(60, 50),
        activation="logistic",
        solver="adam",
        alpha=10 ** -9,
    ),
    XGBClassifier(n_estimators=300, max_depth=3, learning_rate=0.001, n_jobs=-1),
]

# The final estimator is a logistic regression placed on top of the stack.
model = ModelStacking(LogisticRegression(), *level_one_estimators)

In [16]:
# Evaluate the stacked model: mean accuracy over 10 random train/test splits.
test_model(model, X, y, cv=10)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Test set accuracy score:  0.4902046783625731 



  if diff:
  if diff:
