In [16]:
import numpy as np
import pandas as pd

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import (
    cross_validate, train_test_split, GridSearchCV, learning_curve, validation_curve
)
from sklearn.metrics import classification_report, accuracy_score

# Import dataset

In [17]:
final = pd.read_csv("../DataFormating/final.csv")

In [18]:
final.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Year,Home Avg Goals,Away Avg Goals,Home FIFA Points,Away FIFA Points
0,1.0,Mexico,4.0,France,1930.0,0.0,0.0,1166.0,1008.0
1,1.0,Spain,0.0,Portugal,1930.0,0.0,0.0,1306.0,1162.0
2,3.0,Poland,0.0,Sweden,1930.0,0.0,0.0,889.0,1118.0
3,2.0,Sweden,2.0,Belgium,1930.0,0.0,0.0,1346.0,889.0
4,3.0,Germany,6.0,Denmark,1930.0,0.0,0.0,1054.0,1533.0


# Setup `X`, `y` data for training / testing

In [19]:
final.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Year', 'Home Avg Goals', 'Away Avg Goals',
       'Home FIFA Points', 'Away FIFA Points'],
      dtype='object')

In [20]:
X = final.drop(["Away Team Goals", "Home Team Goals"], axis=1)

y = []
for i in range(len(final)):
    home_team_goals = final["Home Team Goals"][i]
    away_team_goals = final["Away Team Goals"][i]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [21]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [22]:
# team_name_encoder = LabelEncoder().fit(
#     list(X["Home Team Name"]) + list(X["Away Team Name"])
# )

# X["Home Team Name"] = team_name_encoder.transform(X["Home Team Name"])
# X["Away Team Name"] = team_name_encoder.transform(X["Away Team Name"])

In [23]:
dummy_cols = pd.get_dummies(X[["Away Team Name", "Home Team Name"]])
X[dummy_cols.columns] = dummy_cols

### Feature Selection

In [24]:
feature_names = [
    "Away Team Name_",
    "Home Team Name_",
    "Avg Goals",
    "FIFA Points"
]

COLUMNS = []
for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break

X = X[COLUMNS]

In [25]:
COLUMNS

['Home Avg Goals',
 'Away Avg Goals',
 'Home FIFA Points',
 'Away FIFA Points',
 'Away Team Name_Argentina',
 'Away Team Name_Australia',
 'Away Team Name_Belgium',
 'Away Team Name_Brazil',
 'Away Team Name_Colombia',
 'Away Team Name_Costa Rica',
 'Away Team Name_Croatia',
 'Away Team Name_Denmark',
 'Away Team Name_Egypt',
 'Away Team Name_England',
 'Away Team Name_France',
 'Away Team Name_Germany',
 'Away Team Name_Iceland',
 'Away Team Name_Iran',
 'Away Team Name_Japan',
 'Away Team Name_Korea Republic',
 'Away Team Name_Mexico',
 'Away Team Name_Morocco',
 'Away Team Name_Nigeria',
 'Away Team Name_Panama',
 'Away Team Name_Peru',
 'Away Team Name_Poland',
 'Away Team Name_Portugal',
 'Away Team Name_Russia',
 'Away Team Name_Saudi Arabia',
 'Away Team Name_Senegal',
 'Away Team Name_Serbia',
 'Away Team Name_Spain',
 'Away Team Name_Sweden',
 'Away Team Name_Switzerland',
 'Away Team Name_Tunisia',
 'Away Team Name_Uruguay',
 'Home Team Name_Argentina',
 'Home Team Name_Austr

### Split `X` and `y` into train / test sets

In [26]:
len(X)

3951

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Fast testing

In [28]:
def test_model(model, cv=10):
    cv_scores = cross_validate(model, X, y, cv=cv)
    
    mean_train_acc = mean(cv_scores["train_score"]) 
    mean_test_acc = mean(cv_scores["test_score"])
    
    print()
    print("Train Accuracy: ", mean_train_acc)
    print("Test Accuracy: ", mean_test_acc)
    print()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print()
    print("Normal split accuracy score: ", accuracy_score(y_test, y_pred))

### Logistic Regression

In [31]:
test_model(LogisticRegression(C=1.0, solver="newton-cg"))




Train Accuracy:  0.5398910860370726
Test Accuracy:  0.5072229264551856

             precision    recall  f1-score   support

          0       0.34      0.10      0.16       347
          1       0.54      0.81      0.64       580
          2       0.47      0.41      0.44       377

avg / total       0.47      0.50      0.45      1304


Normal split accuracy score:  0.5038343558282209


### K-Nearest Neighbors

In [33]:
test_model(KNeighborsClassifier(n_neighbors=5))


Train Accuracy:  0.6390510359833105
Test Accuracy:  0.4505545409359516

             precision    recall  f1-score   support

          0       0.31      0.33      0.32       347
          1       0.52      0.65      0.58       580
          2       0.46      0.28      0.34       377

avg / total       0.45      0.45      0.44      1304


Normal split accuracy score:  0.45475460122699385




### Random Forests

In [35]:
model = RandomForestClassifier(n_estimators=100, max_depth=3, bootstrap=True, n_jobs=-1)

test_model(model)




Train Accuracy:  0.4845749644511817
Test Accuracy:  0.4811409997236283

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       347
          1       0.45      0.99      0.62       580
          2       0.67      0.04      0.07       377

avg / total       0.39      0.45      0.30      1304


Normal split accuracy score:  0.4532208588957055


  'precision', 'predicted', average, warn_for)


### Support Vector Machines

In [36]:
test_model(SVC(C=1.0, kernel="rbf", gamma="auto"))




Train Accuracy:  0.6174811347302289
Test Accuracy:  0.4980642051216717

             precision    recall  f1-score   support

          0       0.36      0.18      0.24       347
          1       0.50      0.77      0.61       580
          2       0.46      0.30      0.36       377

avg / total       0.45      0.47      0.44      1304


Normal split accuracy score:  0.4746932515337423


### Extremely Randomized Trees

In [34]:
test_model(
    ExtraTreesClassifier(n_estimators=300, max_depth=3, bootstrap=True, n_jobs=-1)
)




Train Accuracy:  0.46781425697122575
Test Accuracy:  0.46570484194118433

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       328
          1       0.48      1.00      0.65       626
          2       0.76      0.04      0.07       350

avg / total       0.44      0.49      0.33      1304


Normal split accuracy score:  0.48849693251533743


  'precision', 'predicted', average, warn_for)


### Gradient Boosting Machines

In [35]:
model = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1)

test_model(model)




Train Accuracy:  0.632526344337378
Test Accuracy:  0.4801385900096301

             precision    recall  f1-score   support

          0       0.26      0.12      0.17       328
          1       0.55      0.74      0.63       626
          2       0.47      0.42      0.44       350

avg / total       0.46      0.50      0.46      1304


Normal split accuracy score:  0.4976993865030675


### XGBoost (58.77%)

In [43]:
# best_model = XGBClassifier(n_estimators=500, max_depth=3, learning_rate=0.01, n_jobs=-1)

model = XGBClassifier(n_estimators=500, max_depth=3, learning_rate=0.03, n_jobs=-1)

test_model(model)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:



Train Accuracy:  0.5803880221072579
Test Accuracy:  0.5062320274172037

             precision    recall  f1-score   support

          0       0.35      0.13      0.19       347
          1       0.53      0.82      0.64       580
          2       0.48      0.36      0.41       377

avg / total       0.47      0.50      0.45      1304


Normal split accuracy score:  0.5015337423312883


  if diff:


### AdaBoost with Decision Tree

In [42]:
tree = DecisionTreeClassifier()
ada = AdaBoostClassifier(tree, n_estimators=100, learning_rate=0.1)

test_model(ada)




Train Accuracy:  0.9555112048770042
Test Accuracy:  0.43284563021345396

             precision    recall  f1-score   support

          0       0.37      0.30      0.33       347
          1       0.55      0.60      0.57       580
          2       0.42      0.43      0.42       377

avg / total       0.46      0.47      0.46      1304


Normal split accuracy score:  0.4700920245398773


### Neural Network

In [41]:
test_model(
    MLPClassifier(
        hidden_layer_sizes=(20, 40, 60, 40, 20),
        activation="logistic",
        solver="lbfgs"
    )
)




Train Accuracy:  0.47059913077709714
Test Accuracy:  0.4756013785669248

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       347
          1       0.47      0.90      0.62       580
          2       0.42      0.22      0.29       377

avg / total       0.33      0.46      0.36      1304


Normal split accuracy score:  0.4647239263803681


  'precision', 'predicted', average, warn_for)


# Build up a Random Forest Classifier with Grid Search

In [22]:
model = RandomForestClassifier()

grid_search = GridSearchCV(
    model,
    param_grid={
        "n_estimators": [100, 200, 300, 500, 700, 1000],
        "max_depth": [1, 2, 3, 5, 10],
    },
    scoring="accuracy",
    cv=3,
    verbose=True
)

grid_search.fit(X, y)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   43.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 300, 500, 700, 1000], 'max_depth': [1, 2, 3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_