In [63]:
import numpy as np
import pandas as pd

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Import dataset

In [64]:
compressed_final = pd.read_csv("../DataFormating/compressed_final.csv")
final = pd.read_csv("../DataFormating/final.csv")

# Setup `X`, `y` data for training / testing

In [65]:
compressed_final.columns[:10]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [66]:
X = compressed_final.drop(["Home Team Goals", "Away Team Goals", 
                           "Half-time Home Goals", "Half-time Away Goals", 
                           "Home Team Initials", "Away Team Initials"], axis=1)
y = []

In [67]:
for i in range(len(compressed_final)):
    home_team_goals = compressed_final.iloc[i]["Home Team Goals"]
    away_team_goals = compressed_final.iloc[i]["Away Team Goals"]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [68]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [69]:
X["Stage"] = LabelEncoder().fit_transform(X["Stage"])
X["Home Team Name"] = LabelEncoder().fit_transform(X["Home Team Name"])
X["Away Team Name"] = LabelEncoder().fit_transform(X["Away Team Name"])

In [70]:
len(X.columns)

413

### Feature Selection

In [71]:
X.columns[4:]

Index(['Player 1 Age Diff', 'Player 1 Overall Diff', 'Player 1 Potential Diff',
       'Player 1 Acceleration Diff', 'Player 1 Aggression Diff',
       'Player 1 Agility Diff', 'Player 1 Balance Diff',
       'Player 1 Ball control Diff', 'Player 1 Composure Diff',
       'Player 1 Crossing Diff',
       ...
       'Player 11 Shot power Diff', 'Player 11 Sliding tackle Diff',
       'Player 11 Sprint speed Diff', 'Player 11 Stamina Diff',
       'Player 11 Standing tackle Diff', 'Player 11 Strength Diff',
       'Player 11 Vision Diff', 'Player 11 Volleys Diff',
       'Mean Home Team Goals', 'Mean Away Team Goals'],
      dtype='object', length=409)

In [72]:
# selection = SelectKBest(score_func=f_classif, k=5)
# selection.fit(X, y)

# X = selection.transform(X)

In [73]:
feature_names = [
    "Stage", "Home Team Name", "Away Team Name", "Attendance",
    "Overall", # "Potential",
    "Mean Home Team Goals", "Mean Away Team Goals"
]

COLUMNS = []

for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break


In [74]:
X = X[COLUMNS]

### Split `X` and `y` into train / test sets

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Fast testing

In [76]:
def test_model(model):
    mean_train_acc = mean(cross_validate(model, X, y, cv=50)["train_score"]) 
    mean_test_acc = mean(cross_validate(model, X, y, cv=50)["test_score"])
    
    print("Train Accuracy: ", mean_train_acc)
    print("Test Accuracy: ", mean_test_acc)

### Random Forests

In [77]:
test_model(RandomForestClassifier(n_estimators=100))



Train Accuracy:  1.0
Test Accuracy:  0.7396666666666667


### Support Vector Machines

In [78]:
test_model(SVC(C=1.0, kernel="rbf", gamma="auto"))



Train Accuracy:  1.0
Test Accuracy:  0.4146666666666667


### Extremely Randomized Trees

In [79]:
test_model(ExtraTreesClassifier(n_estimators=100))



Train Accuracy:  1.0
Test Accuracy:  0.7663333333333333


### AdaBoost with Random Forests

In [81]:
test_model(AdaBoostClassifier(RandomForestClassifier(n_estimators=100), n_estimators=100))



Train Accuracy:  1.0
Test Accuracy:  0.7496666666666667


# Build up a Random Forest Classifier with Grid Search

In [62]:
model = RandomForestClassifier()

grid_search = GridSearchCV(
    model,
    param_grid={
        "n_estimators": range(1000, 1100)
    },
    scoring="accuracy",
    cv=3,
    verbose=True
)

grid_search.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
grid_search.best_score_