In [9]:
import numpy as np
import pandas as pd

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import (
    cross_validate, train_test_split, GridSearchCV, learning_curve, validation_curve
)
from sklearn.metrics import classification_report, accuracy_score

# Import dataset

In [10]:
# Load the prepared match table from the sibling DataFormating directory
# (path is relative to this notebook's working directory).
final = pd.read_csv("../DataFormating/final.csv")

In [11]:
# Preview the first few rows of the loaded frame.
final.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Year,Home Avg Goals,Away Avg Goals,Home FIFA Points,Away FIFA Points
0,1.0,Mexico,4.0,France,1930.0,0.0,0.0,1166.0,1008.0
1,1.0,Spain,0.0,Portugal,1930.0,0.0,0.0,1306.0,1162.0
2,3.0,Poland,0.0,Sweden,1930.0,0.0,0.0,889.0,1118.0
3,2.0,Sweden,2.0,Belgium,1930.0,0.0,0.0,1346.0,889.0
4,3.0,Germany,6.0,Denmark,1930.0,0.0,0.0,1054.0,1533.0


# Set up `X`, `y` data for training / testing

In [12]:
# List the available columns before choosing features and the target.
final.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Year', 'Home Avg Goals', 'Away Avg Goals',
       'Home FIFA Points', 'Away FIFA Points'],
      dtype='object')

In [13]:
# Features: every column except the two goal counts, which are only used to
# derive the target below.
X = final.drop(columns=["Away Team Goals", "Home Team Goals"])

# Target encoding (computed column-wise instead of looping over row labels,
# so it no longer depends on `final` having a default 0..n-1 index):
#   1 -> home team scored more goals
#   2 -> away team scored more goals
#   0 -> anything else (equal scores, or values that compare neither way)
home_team_goals = final["Home Team Goals"]
away_team_goals = final["Away Team Goals"]

y = np.select(
    [home_team_goals > away_team_goals, home_team_goals < away_team_goals],
    [1, 2],
    default=0,
).tolist()  # keep `y` a plain list of ints, as before

In [14]:
# Sanity check: one target label per feature row.
assert len(y) == len(X)

### Encode textual features from the `X` dataset

In [15]:
# Fit a single encoder on the names from both columns so a given team maps to
# the same integer whether it appears as the home or the away side.
all_team_names = list(X["Home Team Name"]) + list(X["Away Team Name"])
team_name_encoder = LabelEncoder().fit(all_team_names)

# Replace the textual team names with their integer codes.
for team_column in ("Home Team Name", "Away Team Name"):
    X[team_column] = team_name_encoder.transform(X[team_column])

### Feature Selection

In [16]:
# Substrings used to choose feature columns: a column of `X` is kept when its
# name contains at least one of these. Uncomment "FIFA Points" to also keep
# the columns whose names contain that substring.
feature_names = [
    "Away Team Name",
    "Home Team Name",
    "Avg Goals",
#     "FIFA Points"
]

# Keep the original column order of `X`; each column is listed at most once.
COLUMNS = [
    column_name
    for column_name in X.columns
    if any(feature_name in column_name for feature_name in feature_names)
]

X = X[COLUMNS]

In [17]:
# Show which columns were selected as features.
COLUMNS

['Away Team Name', 'Home Team Name', 'Home Avg Goals', 'Away Avg Goals']

### Split `X` and `y` into train / test sets

In [18]:
# Number of samples available before splitting.
len(X)

3951

In [19]:
# Hold out 33% of the rows for testing. The seed is fixed so the split, and
# every hold-out score computed from it, is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fast testing

In [20]:
def test_model(model, cv=10):
    """Print cross-validation and hold-out accuracy for ``model``.

    The estimator is first scored with cross-validation on the notebook-level
    ``X`` / ``y``. It is then fitted on ``X_train`` / ``y_train`` and evaluated
    on ``X_test`` / ``y_test``, printing a classification report and the plain
    accuracy.

    Parameters
    ----------
    model : estimator
        Classifier handed to ``cross_validate`` and afterwards fitted in place
        on the training split.
    cv : int, default 10
        Forwarded unchanged as the ``cv`` argument of ``cross_validate``.

    Returns
    -------
    None
        All results are printed.
    """
    # Request train scores explicitly: the "train_score" key read below is
    # only present when `return_train_score` is enabled, and relying on the
    # library default either warns or raises KeyError depending on version.
    cv_scores = cross_validate(model, X, y, cv=cv, return_train_score=True)

    mean_train_acc = mean(cv_scores["train_score"])
    mean_test_acc = mean(cv_scores["test_score"])

    print()
    print("Train Accuracy: ", mean_train_acc)
    print("Test Accuracy: ", mean_test_acc)
    print()

    # Second opinion on the single train/test split defined at notebook level.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print()
    print("Normal split accuracy score: ", accuracy_score(y_test, y_pred))

### Logistic Regression

In [21]:
# Linear baseline, scored with the shared evaluation helper.
log_reg = LogisticRegression(C=1.0, solver="newton-cg")
test_model(log_reg)


Train Accuracy:  0.48080665330167927
Test Accuracy:  0.4806423521211607

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       346
          1       0.48      0.91      0.63       598
          2       0.39      0.17      0.24       360

avg / total       0.33      0.47      0.35      1304


Normal split accuracy score:  0.46549079754601225


  'precision', 'predicted', average, warn_for)


### K-Nearest Neighbors

In [22]:
# Distance-based classifier voting over the 5 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=5)
test_model(knn)


Train Accuracy:  0.6286178848964958
Test Accuracy:  0.43104593922713774

             precision    recall  f1-score   support

          0       0.29      0.31      0.30       346
          1       0.53      0.64      0.58       598
          2       0.39      0.23      0.29       360

avg / total       0.43      0.44      0.42      1304


Normal split accuracy score:  0.4378834355828221




### Random Forests

In [23]:
# Shallow bagged forest: 100 trees of depth 3, trained on all available cores.
forest_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "bootstrap": True,
    "n_jobs": -1,
}
model = RandomForestClassifier(**forest_params)

test_model(model)




Train Accuracy:  0.49320850634362784
Test Accuracy:  0.48570888836249615

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       346
          1       0.48      0.93      0.63       598
          2       0.47      0.20      0.28       360

avg / total       0.35      0.48      0.37      1304


Normal split accuracy score:  0.48006134969325154


  'precision', 'predicted', average, warn_for)


### Support Vector Machines

In [24]:
# RBF-kernel support vector classifier.
svc = SVC(C=1.0, kernel="rbf", gamma="auto")
test_model(svc)




Train Accuracy:  0.6148373115090557
Test Accuracy:  0.48749444972267675

             precision    recall  f1-score   support

          0       0.38      0.15      0.22       346
          1       0.53      0.82      0.64       598
          2       0.47      0.31      0.37       360

avg / total       0.47      0.50      0.46      1304


Normal split accuracy score:  0.5015337423312883


### Extremely Randomized Trees

In [25]:
# Extremely randomized trees: 300 shallow (depth 3) bootstrapped trees.
extra_trees = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=3,
    bootstrap=True,
    n_jobs=-1,
)
test_model(extra_trees)




Train Accuracy:  0.46975463964139874
Test Accuracy:  0.4659592915974203

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       346
          1       0.46      0.99      0.63       598
          2       0.73      0.03      0.06       360

avg / total       0.41      0.46      0.31      1304


Normal split accuracy score:  0.4647239263803681


  'precision', 'predicted', average, warn_for)


### Gradient Boosting Machines

In [26]:
# Gradient boosting with 200 depth-3 stages.
gbm_params = {
    "n_estimators": 200,
    "max_depth": 3,
    "learning_rate": 0.1,
}
model = GradientBoostingClassifier(**gbm_params)

test_model(model)




Train Accuracy:  0.6233016025127135
Test Accuracy:  0.4788631742414078

             precision    recall  f1-score   support

          0       0.35      0.16      0.22       346
          1       0.54      0.78      0.64       598
          2       0.45      0.36      0.40       360

avg / total       0.47      0.50      0.46      1304


Normal split accuracy score:  0.49923312883435583


### XGBoost (58.77%)

In [27]:
# NOTE(review): an earlier variant of this cell, labelled `best_model`, used
# the same settings with learning_rate=0.01 - confirm which one should be kept.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 3,
    "learning_rate": 0.03,
    "n_jobs": -1,
}
model = XGBClassifier(**xgb_params)

test_model(model)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:



Train Accuracy:  0.5720921672385185
Test Accuracy:  0.48925449348523936

             precision    recall  f1-score   support

          0       0.32      0.10      0.15       346
          1       0.52      0.82      0.64       598
          2       0.48      0.34      0.40       360

avg / total       0.46      0.50      0.44      1304


Normal split accuracy score:  0.4976993865030675


  if diff:


### AdaBoost with Decision Tree

In [28]:
# Boost an unconstrained decision tree (no depth limit is set on it here).
tree = DecisionTreeClassifier()
ada_params = {"n_estimators": 100, "learning_rate": 0.1}
ada = AdaBoostClassifier(tree, **ada_params)

test_model(ada)




Train Accuracy:  0.9553987031124916
Test Accuracy:  0.4242142899753188

             precision    recall  f1-score   support

          0       0.32      0.31      0.31       346
          1       0.56      0.60      0.58       598
          2       0.41      0.38      0.40       360

avg / total       0.46      0.46      0.46      1304


Normal split accuracy score:  0.46088957055214724


### Neural Network

In [29]:
# Five hidden layers (20-40-60-40-20 units) with logistic activations,
# optimised with L-BFGS.
mlp = MLPClassifier(
    hidden_layer_sizes=(20, 40, 60, 40, 20),
    activation="logistic",
    solver="lbfgs",
)
test_model(mlp)


Train Accuracy:  0.46418629236704856
Test Accuracy:  0.4641871234972081

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       346
          1       0.46      1.00      0.63       598
          2       0.00      0.00      0.00       360

avg / total       0.21      0.46      0.29      1304


Normal split accuracy score:  0.45858895705521474


  'precision', 'predicted', average, warn_for)


# Build up a Random Forest Classifier with Grid Search

In [22]:
# Exhaustive search over forest size and tree depth (6 x 5 = 30 candidates),
# each scored by 3-fold cross-validated accuracy on the full X / y.
param_grid = {
    "n_estimators": [100, 200, 300, 500, 700, 1000],
    "max_depth": [1, 2, 3, 5, 10],
}

model = RandomForestClassifier()
grid_search = GridSearchCV(
    model, param_grid=param_grid, scoring="accuracy", cv=3, verbose=True
)

grid_search.fit(X, y)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   43.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 300, 500, 700, 1000], 'max_depth': [1, 2, 3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [None]:
# Best cross-validated score found by the grid search.
grid_search.best_score_

In [None]:
# Hyper-parameter combination that produced the best cross-validated score.
grid_search.best_params_