In [12]:
import pandas
import time
import numpy as np
import cProfile
import pstats

from pathlib import Path

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold

Load data:

In [13]:
# Locations of the feature table and the match-result table.
features_csv = Path("../data/all_match_data.csv")
results_csv = Path("../data/all_match_result_data.csv")

data_table = pandas.read_csv(features_csv)
label_table = pandas.read_csv(results_csv)

# Convert each table to an array and discard its first column.
data = data_table.to_numpy()[:, 1:]

# Flatten what remains of the result table into a 1-D boolean vector.
label_values = label_table.to_numpy()[:, 1:]
labels = label_values.flatten().astype(bool)

print("Data loaded.")

Data loaded.


Use 10-fold cross-validation to evaluate a Gaussian Naive Bayes model with default hyperparameters.
For each fold, measure training time, evaluation time, and accuracy, then report the averages.

In [14]:
# 10-fold cross-validation of a default GaussianNB. For every fold we record
# the accuracy on the held-out part, the time spent fitting, and the time
# spent scoring; the averages over all folds are printed at the end.
kf = KFold(n_splits=10)

scores = []
training_times = []
execution_times = []

for train_index, test_index in kf.split(data):
    train_data, test_data = data[train_index], data[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]

    # A fresh estimator per fold so no state carries over between folds.
    gnb = GaussianNB()

    # time.perf_counter() is used instead of time.time(): it is monotonic and
    # has the highest available resolution, which matters for intervals in
    # the millisecond range like these.
    start_time = time.perf_counter()
    gnb.fit(train_data, train_labels)
    training_times.append(time.perf_counter() - start_time)

    # The evaluation timing covers the whole score() call.
    start_time = time.perf_counter()
    scores.append(gnb.score(test_data, test_labels))
    execution_times.append(time.perf_counter() - start_time)

avg_score = sum(scores) / len(scores)
avg_train_time = sum(training_times) / len(training_times)
avg_exec_time = sum(execution_times) / len(execution_times)
print("KFold finished.")
print("Avg score: {} ; Avg Train time: {} ; Avg exec time: {}".format(avg_score, avg_train_time, avg_exec_time))

KFold finished.
Avg score: 0.9322213415333781 ; Avg Train time: 0.03225941658020019 ; Avg exec time: 0.003827667236328125


Train the model on all of the data.

In [15]:
# Fit the final classifier on the complete data set. fit() returns the
# estimator itself, so construction and training can be chained.
gnb = GaussianNB().fit(data, labels)
print("Finished training.")

Finished training.


Profile prediction over all of the data using the Gaussian Naive Bayes model trained in the previous cell.

In [16]:
# Profile a single predict() call over the full data set, dump the raw
# profiler stats to disk, then load and print them.
stats_path = Path('../data/stats/nb_exec_stats')

# Make sure the output directory exists so the cell also runs on a fresh
# checkout; cProfile does not create missing directories itself.
stats_path.parent.mkdir(parents=True, exist_ok=True)

# Use one string form of the path for both writing and reading the stats.
stats_file = stats_path.as_posix()
cProfile.run('gnb.predict(data)', stats_file)

p = pstats.Stats(stats_file)

# "stdname" is the named equivalent of the legacy numeric sort key -1.
p.strip_dirs().sort_stats("stdname").print_stats()

Wed Nov 18 08:42:23 2020    ../data/stats/nb_exec_stats

         154 function calls in 0.025 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.001    0.001 <__array_function__ internals>:2(argmax)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(size)
        5    0.000    0.000    0.005    0.001 <__array_function__ internals>:2(sum)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:389(parent)
        1    0.000    0.000    0.025    0.025 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 _asarray.py:14(asarray)
        1    0.000    0.000    0.000    0.000 _asarray.py:86(asanyarray)
        1    0.000    0.000    0.000    0.000 _config.py:14(get_config)
        1    0.000    0.000    0.000    0.000 abc.py:100(__subclasscheck__)
        1    0.000    0.000    0.000    0.000 abc.py:96(__instancecheck__)
        1    0.000

<pstats.Stats at 0x7f036c662d30>