In [1]:
from collections import namedtuple

import numpy as np
import matplotlib.pyplot as plt
from forecast.games_dataset_features import *

# Immutable bundle of the train/test feature matrices and their matching targets.
Dataset = namedtuple('Dataset', 'train_x train_y test_x test_y')

In [24]:
dataset_file = 'games_10_3'

# Features and targets are stored as two sibling .npy files sharing the dataset name.
x_path = './server/forecast/dataset/{}_x.npy'.format(dataset_file)
y_path = './server/forecast/dataset/{}_y.npy'.format(dataset_file)
x_origin = np.load(x_path)
y = np.load(y_path)

print(x_origin.shape)
print(y.shape)

# y2 is a target for win/lose only: label 1 is folded into 0, labels 2 and 3
# are folded into 1, every other label is kept as loaded.
y2 = y.copy()
y2[y == 1] = 0
y2[(y == 2) | (y == 3)] = 1
print(y2.shape)


(6615, 107)
(6615,)
(6615,)


In [None]:
# Allocate roughly half as many columns as x_origin: one per home/away pair
# plus the columns in front of them.
paired_width = (x_origin.shape[1] + 1) // 2
x_data = np.zeros((x_origin.shape[0], paired_width), dtype=x_origin.dtype)
print(x_data.shape)

# The FEATURE_IS_REGULAR column is copied over unchanged.
x_data[:, FEATURE_IS_REGULAR] = x_origin[:, FEATURE_IS_REGULAR]

# Everything from FEATURE_HOME_DAYS_SINCE_LAST_GAME on becomes "home minus away".
# NOTE(review): assumes the home block and the away block are contiguous, equally
# wide and in the same feature order -- confirm against games_dataset_features.
home_block = x_origin[:, FEATURE_HOME_DAYS_SINCE_LAST_GAME:FEATURE_AWAY_DAYS_SINCE_LAST_GAME]
away_block = x_origin[:, FEATURE_AWAY_DAYS_SINCE_LAST_GAME:]
x_data[:, FEATURE_HOME_DAYS_SINCE_LAST_GAME:] = home_block - away_block


[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106]


In [21]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter on the raw features; fit() returns the selector itself.
sel = VarianceThreshold(threshold=0.1).fit(x_origin)

# Columns the selector keeps versus every column index; the difference is
# the set of low-variance features that would be dropped.
idx = set(sel.get_support(indices=True))
all_idx = set(range(FEATURE_COUNT))
print(all_idx - idx)


{0, 66, 67, 36, 100, 101, 72, 106, 13, 14, 47, 48, 19, 83, 53, 84, 89, 58, 30, 31}


In [3]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler


def split_dataset(features, target, test_data_percentage=0.25, norm=False, random_state=None):
    """Split features/target into a train and a test part, optionally standardised.

    Args:
        features: feature matrix, one row per sample.
        target: target values aligned with the rows of ``features``.
        test_data_percentage: forwarded to ``train_test_split`` as ``test_size``.
        norm: when true, standardise the features with a ``StandardScaler``
            fitted on the training part only.
        random_state: seed forwarded to ``train_test_split`` so a split (and the
            accuracies reported from it) can be reproduced. The default ``None``
            keeps the previous behaviour of a different split on every call.

    Returns:
        Dataset: ``(train_x, train_y, test_x, test_y)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_data_percentage, random_state=random_state)
    if norm:
        # Fit on the training part only so no test-set statistics leak into the scaling.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
    return Dataset(x_train, y_train, x_test, y_test)


def evaluate_model(model, data):
    """Print train accuracy, test accuracy and the test confusion matrix.

    Args:
        model: fitted estimator exposing ``predict``.
        data: Dataset providing ``train_x``/``train_y`` and ``test_x``/``test_y``.
    """
    test_pred = model.predict(data.test_x)
    train_pred = model.predict(data.train_x)
    print("Train Accuracy: ", accuracy_score(data.train_y, train_pred))
    print("Test Accuracy: ", accuracy_score(data.test_y, test_pred))
    print("Confusion matrix\n", confusion_matrix(data.test_y, test_pred))


def hyperparameters_tune(model, features, target, params, iter_num=20, cv=3):
    """Run a randomised hyper-parameter search and return the fitted search object.

    Args:
        model: estimator to tune.
        features: training feature matrix.
        target: training targets.
        params: mapping of parameter name to candidate values/distribution.
        iter_num: number of parameter settings to sample.
        cv: cross-validation setting passed to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` instance.
    """
    # n_jobs=-1 lets the search use every available core.
    search_options = {'n_iter': iter_num, 'cv': cv, 'n_jobs': -1}
    search = RandomizedSearchCV(estimator=model, param_distributions=params, **search_options)
    search.fit(features, target)
    return search


def feature_importance(clf):
    """Print the feature importances of a fitted classifier, largest first.

    Each line shows the feature's position in ``clf.feature_importances_`` and
    its importance rounded to two decimals.

    Args:
        clf: fitted estimator exposing ``feature_importances_``.
    """
    # Number the features from the importances themselves rather than from a
    # global feature count, so nothing is dropped when the model has more
    # features than that constant.
    ranked = sorted(
        ((index, round(importance, 2))
         for index, importance in enumerate(clf.feature_importances_)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Plain loop: printing is a side effect, not something to collect in a list.
    for pair in ranked:
        print('Variable: {:20} Importance: {}'.format(*pair))


In [4]:
# Named column selections applied to x_origin; None stands for "use every column".
MASKS = dict([
    ('all', None),
    ('L_H2H', FEATURES_L_H2H),
    ('S_H2H', FEATURES_S_H2H),
    ('H2H', FEATURES_H2H),
])

# Named target vectors: y as loaded and y2, the two-class variant derived from it.
# Both are captured when this cell runs, so it must be re-run after reloading the data.
TARGETS = dict([
    ('4Classes', y),
    ('2Classes', y2),
])


def test_feature_sets(train_func, data=None, test_data_percentage=0.25, norm=False):
    """Run ``train_func`` on a fresh split for every configured target.

    When ``data`` is given it is used as the feature matrix for each entry of
    ``TARGETS``. Otherwise every feature mask in ``MASKS`` is applied to
    ``x_origin`` and combined with every target.

    Args:
        train_func: callable taking a single Dataset.
        data: optional feature matrix overriding the ``MASKS`` loop.
        test_data_percentage: share of samples put into the test part.
        norm: whether ``split_dataset`` should standardise the features.
    """
    def run(features, target_name, banner):
        # Announce the combination, then split and train on it.
        print(banner)
        train_func(split_dataset(features, TARGETS[target_name],
                                 test_data_percentage=test_data_percentage, norm=norm))

    if data is not None:
        for target_name in TARGETS:
            run(data, target_name, 'Target {}'.format(target_name))
        return

    for mask_name, mask in MASKS.items():
        # A None mask keeps the full feature matrix.
        features = x_origin if mask is None else x_origin[:, mask]
        for target_name in TARGETS:
            run(features, target_name,
                'Features {}, target {}'.format(mask_name, target_name))


In [10]:
from sklearn.ensemble import RandomForestClassifier


def train_random_forest(data):
    """Tune a random forest with a randomised search and report its performance.

    Prints the best parameter combination found, then the metrics produced by
    ``evaluate_model`` for the refitted best estimator.

    Args:
        data: Dataset; the train part drives the search, the whole Dataset is
            handed to ``evaluate_model``.
    """
    rf_param_grid = {
        # Number of trees in random forest
        'n_estimators': [300, 400],
        # Number of features to consider at every split. 'sqrt' is what the
        # former 'auto' meant for classifiers; 'auto' was deprecated in
        # scikit-learn 1.1 and removed in 1.3, where it fails validation.
        'max_features': ['sqrt', 20, 30],
        # Maximum number of levels in tree
        'max_depth': [None, 5, 10, 20],
        # Minimum number of samples required to split a node
        'min_samples_split': [10],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [4],
        # Method of selecting samples for training each tree
        'bootstrap': [True, False],
    }

    rf_model = RandomForestClassifier()
    rand_cv = hyperparameters_tune(rf_model, data.train_x, data.train_y, rf_param_grid, iter_num=20, cv=5)

    print(rand_cv.best_params_)
    evaluate_model(rand_cv.best_estimator_, data)


In [6]:
from sklearn.ensemble import AdaBoostClassifier


def train_ada_boost(data):
    """Tune an AdaBoost classifier with a randomised search and report its performance.

    Args:
        data: Dataset; the train part drives the search, the whole Dataset is
            handed to ``evaluate_model``.
    """
    search_space = {
        'n_estimators': [50, 100, 400, 600],
        'learning_rate': [0.1, 0.5, 1.0, 1.3, 1.5],
    }

    search = hyperparameters_tune(AdaBoostClassifier(), data.train_x, data.train_y,
                                  search_space, iter_num=20, cv=5)

    print(search.best_params_)
    evaluate_model(search.best_estimator_, data)


In [7]:
from sklearn.naive_bayes import GaussianNB


def train_naive_bayes(data):
    """Fit a Gaussian naive Bayes model through the shared search helper and report it.

    The search space holds a single candidate (``priors=None``), so the search
    only contributes cross-validation and the final refit.

    Args:
        data: Dataset; the train part drives the search, the whole Dataset is
            handed to ``evaluate_model``.
    """
    search_space = {'priors': [None]}

    search = hyperparameters_tune(GaussianNB(), data.train_x, data.train_y,
                                  search_space, iter_num=1, cv=5)
    best_model = search.best_estimator_

    print(search.best_params_)
    # Class priors estimated by the refitted model.
    print(best_model.class_prior_)
    evaluate_model(best_model, data)


In [18]:
from sklearn.neural_network import MLPClassifier


def train_mlp(data):
    """Tune an MLP classifier with a randomised search and report its performance.

    Args:
        data: Dataset; the train part drives the search, the whole Dataset is
            handed to ``evaluate_model``.
    """
    search_space = {
        'hidden_layer_sizes': [(5,), (25,), (30, 10), (25, 10), (20, 5)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.00001, 0.0001, 1.0, 1.5],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [1200, 1500],
    }

    search = hyperparameters_tune(MLPClassifier(), data.train_x, data.train_y,
                                  search_space, iter_num=80, cv=5)

    print(search.best_params_)
    evaluate_model(search.best_estimator_, data)


In [26]:
# Evaluate AdaBoost on every feature mask / target combination, without scaling.
test_feature_sets(train_func=train_ada_boost, norm=False)



Features all, target 4Classes


{'n_estimators': 400, 'learning_rate': 0.1}


Train Accuracy:  0.4696633743196936
Test Accuracy:  0.4347037484885127
Confusion matrix
 [[571   0   3 131]
 [157   0   0  45]
 [135   0   1  53]
 [411   0   0 147]]
Features all, target 2Classes


{'n_estimators': 100, 'learning_rate': 0.1}
Train Accuracy:  0.5908083047772626
Test Accuracy:  0.5749697702539298
Confusion matrix
 [[738 172]
 [531 213]]
Features L_H2H, target 4Classes


{'n_estimators': 600, 'learning_rate': 0.1}


Train Accuracy:  0.468655513001411
Test Accuracy:  0.4480048367593712
Confusion matrix
 [[567   1   0 124]
 [146   0   0  50]
 [122   0   0  60]
 [410   0   0 174]]
Features L_H2H, target 2Classes


{'n_estimators': 100, 'learning_rate': 0.1}
Train Accuracy:  0.589397298931667
Test Accuracy:  0.5640870616686819
Confusion matrix
 [[761 151]
 [570 172]]
Features S_H2H, target 4Classes


{'n_estimators': 400, 'learning_rate': 0.1}


Train Accuracy:  0.4611973392461197
Test Accuracy:  0.43893591293833134
Confusion matrix
 [[581   1   0 129]
 [158   0   0  43]
 [142   0   0  38]
 [417   0   0 145]]
Features S_H2H, target 2Classes


{'n_estimators': 50, 'learning_rate': 0.5}
Train Accuracy:  0.614190687361419
Test Accuracy:  0.5423216444981862
Confusion matrix
 [[635 251]
 [506 262]]
Features H2H, target 4Classes


{'n_estimators': 400, 'learning_rate': 0.1}


Train Accuracy:  0.4464825639991937
Test Accuracy:  0.44316807738814995
Confusion matrix
 [[605   2   0  89]
 [158   0   0  38]
 [147   0   0  23]
 [464   0   0 128]]
Features H2H, target 2Classes


{'n_estimators': 50, 'learning_rate': 0.5}
Train Accuracy:  0.5928240274138279
Test Accuracy:  0.5532043530834341
Confusion matrix
 [[700 207]
 [532 215]]


In [95]:
# Two-class target on the home-minus-away features, standardised before training.
dataset = split_dataset(x_data, y2, norm=True)

# Hand-picked settings for one small, strongly regularised network.
# NOTE(review): per the scikit-learn docs the 'learning_rate' schedule is only
# used by the 'sgd' solver, so "adaptive" likely has no effect with 'adam' -- confirm.
mlp_settings = {
    'hidden_layer_sizes': (10, ),
    'activation': "relu",
    'solver': 'adam',
    'alpha': 2.0,
    'learning_rate': "adaptive",
    'learning_rate_init': 0.0001,
    'max_iter': 800,
}
mlp = MLPClassifier(**mlp_settings)

mlp.fit(dataset.train_x, dataset.train_y)
evaluate_model(mlp, dataset)


Train Accuracy:  0.5982664785325539
Test Accuracy:  0.5761789600967352
Confusion matrix
 [[693 207]
 [494 260]]


In [76]:
# show loss curve: one loss value per training iteration of the fitted MLP
x_axis = range(0, len(mlp.loss_curve_))
plt.plot(x_axis, mlp.loss_curve_, 'ko-')
# Label the figure so it can be read without the surrounding code.
plt.xlabel('Iteration')
plt.ylabel('Training loss')
plt.title('MLP loss curve')
plt.show()
