In [1]:
import numpy as np
import pandas as pd

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import (
    cross_validate, train_test_split, GridSearchCV, learning_curve, validation_curve
)
from sklearn.metrics import classification_report, accuracy_score

# Import dataset

In [2]:
compressed_final = pd.read_csv("../DataFormating/compressed_final.csv")
final = pd.read_csv("../DataFormating/final.csv")

In [3]:
compressed_final.head()

Unnamed: 0,Stage,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Attendance,Half-time Home Goals,Half-time Away Goals,Home Team Initials,Away Team Initials,...,Player 11 Shot power Diff,Player 11 Sliding tackle Diff,Player 11 Sprint speed Diff,Player 11 Stamina Diff,Player 11 Standing tackle Diff,Player 11 Strength Diff,Player 11 Vision Diff,Player 11 Volleys Diff,Mean Home Team Goals,Mean Away Team Goals
0,Group A,South Africa,1.0,1.0,Mexico,84490.0,0.0,0.0,RSA,MEX,...,-29,-15,-9,-31,-17,3,-43,-24,1.0,1.0
1,Group A,Uruguay,0.0,0.0,France,64100.0,0.0,0.0,URU,FRA,...,-44,-9,-19,-26,-8,-4,9,-58,0.5,0.0
2,Group B,Korea Republic,2.0,0.0,Greece,31513.0,1.0,0.0,KOR,GRE,...,-2,39,7,29,40,-10,-16,14,2.0,0.0
3,Group B,Argentina,1.0,0.0,Nigeria,55686.0,1.0,0.0,ARG,NGA,...,-25,35,-14,7,41,-12,-11,-55,2.0,1.666667
4,Group C,England,1.0,1.0,USA,38646.0,1.0,1.0,ENG,USA,...,-4,-2,17,-2,-4,-18,-7,-5,1.0,1.0


# Setup `X`, `y` data for training / testing

In [4]:
compressed_final.columns[:10]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [5]:
X = compressed_final.drop(["Home Team Goals", "Away Team Goals", 
                           "Half-time Home Goals", "Half-time Away Goals", 
                           "Home Team Initials", "Away Team Initials"], axis=1)

y = []
for i in range(len(compressed_final)):
    home_team_goals = compressed_final.iloc[i]["Home Team Goals"]
    away_team_goals = compressed_final.iloc[i]["Away Team Goals"]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [6]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [7]:
X["Stage"] = LabelEncoder().fit_transform(X["Stage"])
X["Home Team Name"] = LabelEncoder().fit_transform(X["Home Team Name"])
X["Away Team Name"] = LabelEncoder().fit_transform(X["Away Team Name"])

In [8]:
len(X.columns)

413

### Feature Selection

In [9]:
X.columns[4:]

Index(['Player 1 Age Diff', 'Player 1 Overall Diff', 'Player 1 Potential Diff',
       'Player 1 Acceleration Diff', 'Player 1 Aggression Diff',
       'Player 1 Agility Diff', 'Player 1 Balance Diff',
       'Player 1 Ball control Diff', 'Player 1 Composure Diff',
       'Player 1 Crossing Diff',
       ...
       'Player 11 Shot power Diff', 'Player 11 Sliding tackle Diff',
       'Player 11 Sprint speed Diff', 'Player 11 Stamina Diff',
       'Player 11 Standing tackle Diff', 'Player 11 Strength Diff',
       'Player 11 Vision Diff', 'Player 11 Volleys Diff',
       'Mean Home Team Goals', 'Mean Away Team Goals'],
      dtype='object', length=409)

In [10]:
# selection = SelectKBest(score_func=f_classif, k=10)
# selection.fit(X, y)

# X = selection.transform(X)

In [11]:
feature_names = [
    "Stage", "Home Team Name", "Away Team Name",
    "Attendance", "Overall",
    "Mean Home Team Goals", "Mean Away Team Goals"
]

COLUMNS = []

for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break

X = X[COLUMNS]

### Split `X` and `y` into train / test sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Fast testing

In [13]:
def test_model(model, cv=10):
    cv_scores = cross_validate(model, X, y, cv=cv)
    
    mean_train_acc = mean(cv_scores["train_score"]) 
    mean_test_acc = mean(cv_scores["test_score"])
    
    print()
    print("Train Accuracy: ", mean_train_acc)
    print("Test Accuracy: ", mean_test_acc)
    print()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

### K-Nearest Neighbors

In [14]:
test_model(KNeighborsClassifier(n_neighbors=3))


Train Accuracy:  0.7097997538643426
Test Accuracy:  0.3898809523809524

             precision    recall  f1-score   support

          0       0.30      0.30      0.30        10
          1       0.58      0.71      0.64        21
          2       0.45      0.31      0.37        16

avg / total       0.48      0.49      0.48        47





### Random Forests

In [15]:
model = RandomForestClassifier(n_estimators=1200, max_depth=10, bootstrap=True, n_jobs=-1)

test_model(model)




Train Accuracy:  1.0
Test Accuracy:  0.7426694139194139

             precision    recall  f1-score   support

          0       0.43      0.30      0.35        10
          1       0.75      0.71      0.73        21
          2       0.75      0.94      0.83        16

avg / total       0.68      0.70      0.69        47



### Support Vector Machines

In [16]:
test_model(SVC(C=1.0, kernel="rbf", gamma="auto"))


Train Accuracy:  1.0
Test Accuracy:  0.4087087912087912

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.45      1.00      0.62        21
          2       0.00      0.00      0.00        16

avg / total       0.20      0.45      0.28        47



  'precision', 'predicted', average, warn_for)


### Extremely Randomized Trees

In [17]:
test_model(
    ExtraTreesClassifier(n_estimators=1100, max_depth=10, bootstrap=True, n_jobs=-1)
)




Train Accuracy:  1.0
Test Accuracy:  0.7233058608058608

             precision    recall  f1-score   support

          0       0.50      0.20      0.29        10
          1       0.67      0.76      0.71        21
          2       0.68      0.81      0.74        16

avg / total       0.64      0.66      0.63        47



### Gradient Boosting Machines

In [18]:
model = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)

test_model(model)


Train Accuracy:  1.0
Test Accuracy:  0.7272985347985348

             precision    recall  f1-score   support

          0       0.50      0.30      0.37        10
          1       0.62      0.76      0.68        21
          2       0.80      0.75      0.77        16

avg / total       0.65      0.66      0.65        47





### XGBoost (Best for now with 78.53%)

In [19]:
# best_model = XGBClassifier(n_estimators=4000, max_depth=20, learning_rate=0.03, n_jobs=-1)

model = XGBClassifier(n_estimators=4000, max_depth=20, learning_rate=0.03, n_jobs=-1)

test_model(model)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:



Train Accuracy:  1.0
Test Accuracy:  0.7853296703296703

             precision    recall  f1-score   support

          0       0.30      0.30      0.30        10
          1       0.76      0.76      0.76        21
          2       0.75      0.75      0.75        16

avg / total       0.66      0.66      0.66        47



  if diff:


### AdaBoost with Decision Tree

In [20]:
tree = DecisionTreeClassifier()
ada = AdaBoostClassifier(tree, n_estimators=10**7, learning_rate=0.03)

test_model(ada)


Train Accuracy:  1.0
Test Accuracy:  0.6629395604395605

             precision    recall  f1-score   support

          0       0.29      0.40      0.33        10
          1       0.72      0.62      0.67        21
          2       0.53      0.50      0.52        16

avg / total       0.57      0.53      0.54        47





### Neural Network

In [21]:
test_model(
    MLPClassifier(
        hidden_layer_sizes=(20, 40, 60, 100, 200, 300, 500, 500, 300, 200, 100, 60, 40, 20),
        activation="logistic"
    )
)




Train Accuracy:  0.4084551476850277
Test Accuracy:  0.4087087912087912

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.00      0.00      0.00        21
          2       0.34      1.00      0.51        16

avg / total       0.12      0.34      0.17        47



  'precision', 'predicted', average, warn_for)


# Build up a Random Forest Classifier with Grid Search

In [22]:
model = RandomForestClassifier()

grid_search = GridSearchCV(
    model,
    param_grid={
        "n_estimators": [100, 200, 300, 500, 700, 1000],
        "max_depth": [1, 2, 3, 5, 10],
    },
    scoring="accuracy",
    cv=3,
    verbose=True
)

grid_search.fit(X, y)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   43.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 300, 500, 700, 1000], 'max_depth': [1, 2, 3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_