In [1]:
import numpy as np
import pandas as pd

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import (
    cross_validate, train_test_split, GridSearchCV, learning_curve, validation_curve
)
from sklearn.metrics import classification_report, accuracy_score

# Import dataset

In [2]:
compressed_final = pd.read_csv("../DataFormating/compressed_final.csv")
final = pd.read_csv("../DataFormating/final.csv")

In [3]:
compressed_final.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Player 1 Overall Diff,Player 2 Overall Diff,Player 3 Overall Diff,Player 4 Overall Diff,Player 5 Overall Diff,Player 6 Overall Diff,Player 7 Overall Diff,Player 8 Overall Diff,Player 9 Overall Diff,Player 10 Overall Diff,Player 11 Overall Diff,Avg Goals Diff,FIFA Rank Diff
0,0.0,France,0.0,Uruguay,7,4,6,6,6,4,5,5,5,5,5,0.333333,10.0
1,0.0,Nigeria,1.0,Argentina,0,0,-2,-2,-1,0,0,0,-1,0,-1,0.5,-42.0
2,0.0,Australia,4.0,Germany,3,2,3,2,2,3,0,0,2,1,2,1.75,-39.0
3,1.0,Switzerland,0.0,Spain,6,5,3,2,3,6,5,5,5,5,4,-1.0,2.0
4,1.0,Korea Republic,4.0,Argentina,-4,-4,-4,-3,0,1,2,2,2,2,-3,3.0,-56.0


# Setup `X`, `y` data for training / testing

In [5]:
compressed_final.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Player 1 Overall Diff', 'Player 2 Overall Diff',
       'Player 3 Overall Diff', 'Player 4 Overall Diff',
       'Player 5 Overall Diff', 'Player 6 Overall Diff',
       'Player 7 Overall Diff', 'Player 8 Overall Diff',
       'Player 9 Overall Diff', 'Player 10 Overall Diff',
       'Player 11 Overall Diff', 'Avg Goals Diff', 'FIFA Rank Diff'],
      dtype='object')

In [6]:
X = compressed_final.drop(["Away Team Goals", "Home Team Goals"], axis=1)

y = []
for i in range(len(compressed_final)):
    home_team_goals = compressed_final["Home Team Goals"][i]
    away_team_goals = compressed_final["Away Team Goals"][i]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [7]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [8]:
X["Home Team Name"] = LabelEncoder().fit_transform(X["Home Team Name"])
X["Away Team Name"] = LabelEncoder().fit_transform(X["Away Team Name"])

### Feature Selection

In [10]:
feature_names = [
    "Home Team Name",
    "Away Team Name",
    "Player 1 Overall Diff",
    "Player 2 Overall Diff",
    "Player 3 Overall Diff",
    "Player 4 Overall Diff",
    "Player 5 Overall Diff",
    "Player 6 Overall Diff",
    "Player 7 Overall Diff",
    "Player 8 Overall Diff",
    "Player 9 Overall Diff",
    "Player 10 Overall Diff",
    "Player 11 Overall Diff",
    "Avg Goals Diff",
    "FIFA Rank Diff",
]

COLUMNS = []
for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break

X = X[COLUMNS]

In [11]:
COLUMNS

['Away Team Name',
 'Home Team Name',
 'Player 1 Overall Diff',
 'Player 2 Overall Diff',
 'Player 3 Overall Diff',
 'Player 4 Overall Diff',
 'Player 5 Overall Diff',
 'Player 6 Overall Diff',
 'Player 7 Overall Diff',
 'Player 8 Overall Diff',
 'Player 9 Overall Diff',
 'Player 10 Overall Diff',
 'Player 11 Overall Diff',
 'Avg Goals Diff',
 'FIFA Rank Diff']

### Split `X` and `y` into train / test sets

In [22]:
len(X)

603

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Fast testing

In [24]:
def test_model(model, cv=10):
    cv_scores = cross_validate(model, X, y, cv=cv)
    
    mean_train_acc = mean(cv_scores["train_score"]) 
    mean_test_acc = mean(cv_scores["test_score"])
    
    print()
    print("Train Accuracy: ", mean_train_acc)
    print("Test Accuracy: ", mean_test_acc)
    print()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print()
    print("2014 accuracy score: ", accuracy_score(y_test, y_pred))

### K-Nearest Neighbors

In [25]:
test_model(KNeighborsClassifier(n_neighbors=3))


Train Accuracy:  0.6920922441268612
Test Accuracy:  0.4926775956284153

             precision    recall  f1-score   support

          0       0.24      0.37      0.29        43
          1       0.51      0.54      0.52        85
          2       0.50      0.30      0.37        71

avg / total       0.45      0.42      0.42       199


2014 accuracy score:  0.41708542713567837




### Random Forests

In [26]:
model = RandomForestClassifier(n_estimators=1100, max_depth=10, bootstrap=True, n_jobs=-1)

test_model(model)




Train Accuracy:  0.8249485229658926
Test Accuracy:  0.7013661202185792

             precision    recall  f1-score   support

          0       0.44      0.33      0.37        43
          1       0.67      0.73      0.70        85
          2       0.69      0.73      0.71        71

avg / total       0.63      0.64      0.63       199


2014 accuracy score:  0.6432160804020101


### Support Vector Machines

In [27]:
test_model(SVC(C=1.0, kernel="rbf", gamma="auto"))


Train Accuracy:  0.8249485229658926
Test Accuracy:  0.4840163934426229

             precision    recall  f1-score   support

          0       0.21      0.09      0.13        43
          1       0.45      0.76      0.57        85
          2       0.54      0.27      0.36        71

avg / total       0.43      0.44      0.40       199


2014 accuracy score:  0.44221105527638194




### Extremely Randomized Trees

In [17]:
test_model(
    ExtraTreesClassifier(n_estimators=1100, max_depth=10, bootstrap=True, n_jobs=-1)
)




Train Accuracy:  0.822737219084898
Test Accuracy:  0.645

             precision    recall  f1-score   support

          0       0.46      0.27      0.34        45
          1       0.71      0.80      0.75        93
          2       0.64      0.72      0.68        61

avg / total       0.63      0.65      0.64       199


2014 accuracy score:  0.6532663316582915


### Gradient Boosting Machines

In [28]:
model = GradientBoostingClassifier(n_estimators=1000, max_depth=3, learning_rate=0.1)

test_model(model)




Train Accuracy:  0.8249485229658926
Test Accuracy:  0.6998633879781421

             precision    recall  f1-score   support

          0       0.41      0.33      0.36        43
          1       0.67      0.73      0.70        85
          2       0.71      0.72      0.71        71

avg / total       0.63      0.64      0.63       199


2014 accuracy score:  0.6381909547738693


### XGBoost (81.53%)

In [None]:
# best_model = XGBClassifier(n_estimators=4000, max_depth=20, learning_rate=0.03, n_jobs=-1)

model = XGBClassifier(n_estimators=4000, max_depth=20, learning_rate=0.03, n_jobs=-1)

test_model(model)

  if diff:
  if diff:
  if diff:
  if diff:


### AdaBoost with Decision Tree

In [21]:
tree = DecisionTreeClassifier()
ada = AdaBoostClassifier(tree, n_estimators=10**5, learning_rate=0.1)

test_model(ada)

KeyboardInterrupt: 

### Neural Network

In [None]:
test_model(
    MLPClassifier(
        hidden_layer_sizes=(20, 40, 60, 100, 200, 300, 500, 500, 300, 200, 100, 60, 40, 20),
        activation="logistic"
    )
)

# Build up a Random Forest Classifier with Grid Search

In [22]:
model = RandomForestClassifier()

grid_search = GridSearchCV(
    model,
    param_grid={
        "n_estimators": [100, 200, 300, 500, 700, 1000],
        "max_depth": [1, 2, 3, 5, 10],
    },
    scoring="accuracy",
    cv=3,
    verbose=True
)

grid_search.fit(X, y)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   43.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 300, 500, 700, 1000], 'max_depth': [1, 2, 3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_