In [1]:
import os
import sys
import warnings
import pandas as pd
from pathlib import Path

In [2]:
from split_start import *
from split_block import *
from prepare_dataset import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

In [3]:
# Parent of the current working directory; the notebook expects a sibling
# "Preprocessing" directory there that holds the project's helper modules.
working_dir = Path(os.getcwd()).parent
# Build the path with pathlib instead of a hard-coded Windows separator
# ('Preprocessing\\'), which on POSIX produced a non-existent directory whose
# name ended in a literal backslash.
sys.path.append(str(working_dir / 'Preprocessing'))

from data_aggregator import *
from season import *
from parameters import *
from team import *

## Przygotowanie danych - pobranie przetworzonych danych do pamięci podręcznej

In [4]:
# Forwarded to Parameters(no_last_matches=...); presumably the number of most
# recent matches aggregated per team — confirm in the parameters module.
no_last_matches = 3

data_aggregator = DataAggregator()

# Seasons requested from the aggregator, in the order they are passed on.
seasons_to_load = [
    Season.y2010,
    Season.y2011,
    Season.y2012,
    Season.y2013,
    Season.y2014,
    Season.y2015,
    Season.y2016,
]
aggregation_params = Parameters(no_last_matches=no_last_matches)

# The aggregator returns a pair: the per-match frame and a second object that
# is later handed to prepare_dataset as `all_data_past`.
all_seasons, all_data_past = data_aggregator.get_data_for_seasons(
    seasons_to_load,
    aggregation_params,
)

## Przygotowanie podstawowych wartości

In [5]:
# Convert the 'match_date' column to pandas datetime values, overwriting the
# column on the existing frame (re-running this cell is harmless).
all_seasons['match_date'] = pd.to_datetime(all_seasons['match_date'])

In [6]:
# Column names handed to prepare_dataset as the feature list.
# The order is kept exactly as originally written.
list_of_parameters = [
    # scores
    'home_team_score',
    'away_team_score',
    # club history
    'home_team_seasons_played',
    'away_team_seasons_played',
    'home_team_last_season_points',
    'away_team_last_season_points',
    # squad attributes
    'home_players_avg_age',
    'away_players_avg_age',
    'home_players_avg_rating',
    'away_players_avg_rating',
    # Elo ratings
    'home_elo_rating',
    'away_elo_rating',
    # bookmaker odds
    'avg_home_win_odds',
    'avg_draw_odds',
    'avg_away_win_odds',
    # corners and shots
    'home_avg_corners',
    'away_avg_corners',
    'home_avg_shots',
    'away_avg_shots',
    # win / draw / loss tallies
    'home_won_games',
    'away_won_games',
    'home_tied_games',
    'away_tied_games',
    'home_lost_games',
    'away_lost_games',
    # goals
    'home_scored_goals',
    'away_scored_goals',
]

In [32]:
# Build the modelling dataset. The result is indexed later with the keys
# 'X_train', 'y_train', 'X_test' and 'y_test'.
# NOTE(review): train_size/test_size look like a 90/10 split and `avg=3`
# matches no_last_matches above — confirm both against prepare_dataset.
dataset = prepare_dataset(
    all_seasons,
    list_of_parameters,
    all_data_past,
    add_direct=True,
    avg=3,
    train_size=0.9,
    test_size=0.1,
    undersample=False,
    globalCS=False,
)

In [33]:
# Tally how many training samples carry each label value.
# NOTE(review): Counter is not imported explicitly in the cells above —
# presumably it arrives through one of the star imports; an explicit
# `from collections import Counter` would make the dependency visible.
Counter(dataset['y_train'])

Counter({1: 923, 0: 522, 2: 597})

## Algorytm Random Forest

In [34]:
# Short aliases for the training and test parts of the prepared dataset.
Xtrain, Ytrain = dataset['X_train'], dataset['y_train']
Xtest, Ytest = dataset['X_test'], dataset['y_test']

In [35]:
# Candidate values sampled by the randomized hyper-parameter search below.
# NOTE(review): 'random_state' is part of the search space, so the seed itself
# is being tuned like a hyper-parameter — confirm this is intentional.
parameters = dict(
    n_estimators=[10, 20, 40, 60, 80, 100, 130, 170],
    criterion=['gini', 'entropy'],
    max_depth=[2, 5, 7, 10, 15, 20, 30, 40],
    bootstrap=[True, False],
    min_samples_split=[2, 4, 6, 8, 10, 15, 20],
    class_weight=['balanced', None],
    random_state=[10, 20, 40, 41, 42, 46, 50, 51, 56, 60, 70, 80],
)

In [36]:
# Randomized search over `parameters` for a random forest, scored by accuracy.
# n_jobs=-1 asks for all available cores; random_state=42 fixes which
# candidates are sampled. Iteration count and CV folds are left at the
# library defaults because they are not passed here.
randomForest = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=parameters,
    n_jobs=-1,
    random_state=42,
    scoring='accuracy',
)

In [37]:
# Run the search on the training split, then report the winning combination.
randomForest.fit(Xtrain, Ytrain)
best_params = randomForest.best_params_
print("Najlepsze znalezione parametry:", best_params)

Najlepsze znalezione parametry: {'random_state': 40, 'n_estimators': 80, 'min_samples_split': 2, 'max_depth': 5, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': True}


In [38]:
# Predict labels for the held-out test features through the fitted search
# object (with the default refit setting this uses the best estimator found).
Ypred = randomForest.predict(Xtest)

In [39]:
# Overall accuracy on the test split.
print('\n##################################\nAccuracy: ')
accuracy = metrics.accuracy_score(Ytest, Ypred)
print(accuracy)

# Macro-averaged precision / recall / F-score; the tuple's last slot (support)
# is None when an average is requested, as seen in the recorded output.
print('\n##################################\nPrecision, Recall and fscore:: ')
macro_scores = metrics.precision_recall_fscore_support(Ytest, Ypred, average='macro')
print(macro_scores)


##################################
Accuracy: 
0.5066079295154186

##################################
Precision, Recall and fscore:: 
(0.520952380952381, 0.45537022501308216, 0.41019906776381104, None)


In [40]:
# Per-class count of test samples whose prediction matches the true label,
# listed in ascending label order.
print("Poprawnie sklasyfikowane przykłady:")
correct_mask = Ypred == Ytest
sorted(Counter(Ytest[correct_mask]).items(), key=lambda pair: pair[0])

Poprawnie sklasyfikowane przykłady:


[(0, 4), (1, 78), (2, 33)]

In [41]:
# Per-class count of test samples whose prediction differs from the true
# label, listed in ascending label order.
print("Błędnie sklasyfikowane przykłady:")
wrong_mask = Ypred != Ytest
sorted(Counter(Ytest[wrong_mask]).items(), key=lambda pair: pair[0])

Błędnie sklasyfikowane przykłady:


[(0, 60), (1, 20), (2, 32)]

In [42]:
# Confusion matrix of true labels (first argument) against predictions.
confusion = metrics.confusion_matrix(Ytest, Ypred)
print("Macierz pomyłek:", confusion, sep="\n")

Macierz pomyłek:
[[ 4 42 18]
 [ 1 78 19]
 [ 2 30 33]]


In [43]:
# Feature importances of the best forest, labelled with the training column
# names and ordered from most to least important.
best_forest = randomForest.best_estimator_
importance_by_feature = pd.Series(
    best_forest.feature_importances_,
    index=dataset['X_train'].columns,
)
forestFeatureImportances = importance_by_feature.sort_values(ascending=False)
forestFeatureImportances

avg_home_win_odds               0.132054
avg_away_win_odds               0.128795
home_players_avg_rating         0.085001
home_elo_rating                 0.082156
avg_draw_odds                   0.076478
home_team_last_season_points    0.057551
away_players_avg_rating         0.053931
away_elo_rating                 0.049289
away_team_last_season_points    0.033046
away_players_avg_age            0.029554
home_avg_shots                  0.026627
home_avg_corners                0.026218
away_avg_shots                  0.021186
away_direct_wins                0.020088
home_players_avg_age            0.020038
home_scored_goals               0.018703
away_team_score                 0.017095
home_team_score                 0.016206
away_avg_corners                0.015183
away_scored_goals               0.013919
home_direct_wins                0.013499
home_team_seasons_played        0.011218
away_team_seasons_played        0.009209
home_tied_games                 0.007502
direct_draws    

## Nauczenie modelu na całym dostępnym zbiorze danych i jego zapis w celu wykorzystania w interaktywnym notebooku

In [44]:
import pickle

from sklearn.base import clone

# Take an unfitted copy carrying the best hyper-parameters. Refitting
# `randomForest.best_estimator_` directly would mutate the search object, so
# any later (or re-run) call to randomForest.predict / feature_importances_
# would silently use a model trained on the full data, test rows included.
finalModel = clone(randomForest.best_estimator_)

# Rebuild the dataset with every row assigned to the training part.
# NOTE(review): undersample=True here, whereas the search above ran with
# undersample=False — confirm the tuned parameters are meant to transfer.
dataset_final = prepare_dataset(
    all_seasons,
    list_of_parameters,
    all_data_past,
    add_direct=True,
    avg=3,
    train_size=1.0,
    test_size=0.0,
    undersample=True,
    globalCS=False,
)

finalModel.fit(dataset_final['X_train'], dataset_final['y_train'])

# Persist the fitted model; the context manager guarantees the file is flushed
# and closed even if pickling fails (the original left the handle open).
filename = 'randomForest.pkl'
with open(filename, "wb") as model_file:
    pickle.dump(finalModel, model_file)