In [24]:
import pandas
import time
import numpy as np
import cProfile
import pstats

from pathlib import Path

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

Load data:

In [2]:
data_table = pandas.read_csv(Path("../data/all_match_data.csv"))
label_table = pandas.read_csv(Path("../data/all_match_result_data.csv"))

data = data_table.to_numpy()[:, 1:]
labels = label_table.to_numpy()[:, 1:].flatten().astype(bool)
print("Data loaded.")

Data loaded.


Use GridSearch with Cross Validation to search for the best hyperparameters.

In [13]:
parameter_space = {
    'hidden_layer_sizes': [(16,16,16), (16, 16, 16, 16), (32,16,16), (32,32), (16,16,8,8)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0005, 0.00075, 0.00025, 0.0006, 0.0004],
    'learning_rate': ['constant','adaptive'],
}
mlp = MLPClassifier(max_iter=100)
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(data, labels)
best_params = clf.best_params_
print("Grid search finished.")

Grid search finished.


Use KFold Cross validation to test a model trained with the previously given hyperparameters.
Test speed in both training and evaluation and test accuracy.

In [11]:
kf = KFold(n_splits=10)

scores = []
training_times = []
execution_times = []

for train_index, test_index in kf.split(data):
    train_data, test_data = data[train_index], data[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]

    classifier = MLPClassifier(**best_params)
    start_time = time.time()
    classifier.fit(train_data, train_labels)
    training_times.append(time.time() - start_time)

    start_time = time.time()
    scores.append(clf.score(test_data, test_labels))
    execution_times.append(time.time() - start_time)



avg_score = sum(scores) / len(scores)
avg_train_time = sum(training_times) / len(training_times)
avg_exec_time = sum(execution_times) / len(execution_times)
print("KFold finished.")
print("Avg score: {} ; Avg Train time: {} ; Avg exec time: {}".format(avg_score, avg_train_time, avg_exec_time))


KFold finished.
Avg score: 0.937740545059517 ; Avg Train time: 32.62016935348511 ; Avg exec time: 0.004850435256958008


Profile execution stats across all data using model trained via GridSearchCV.

In [28]:
stats_path = Path('../data/stats/mlp_exec_stats')
cProfile.run('clf.predict(data)', stats_path)

p = pstats.Stats(stats_path.as_posix())

p.strip_dirs().sort_stats(-1).print_stats()

Wed Nov 18 08:25:12 2020    ../data/stats/mlp_exec_stats

         404 function calls (396 primitive calls) in 0.052 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        4    0.000    0.000    0.011    0.003 <__array_function__ internals>:2(clip)
        8    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(ndim)
        1    0.000    0.000    0.002    0.002 <__array_function__ internals>:2(sum)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:389(parent)
        1    0.000    0.000    0.052    0.052 <string>:1(<module>)
        6    0.000    0.000    0.000    0.000 _asarray.py:14(asarray)
        1    0.000    0.000    0.000    0.000 _asarray.py:86(asanyarray)
        1    0.002    0.002    0.002    0.002 _base.py:30(logistic)
        4    0.000    0.000    0.011    0.003 _base.py:62(relu)
        1    0.000    0.000    0.000    0.000 _config.py:14(get_config)
        1    0.0

<pstats.Stats at 0x7ff6cb38d340>