In [8]:
import os
import sys
import pandas as pd
from pathlib import Path

In [9]:
from split_start import *
from split_block import *
from prepare_dataset import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

In [12]:
working_dir = Path(os.getcwd()).parent
sys.path.append(os.path.join(working_dir, 'Preprocessing\\'))

from data_aggregator import *
from season import *
from parameters import *
from team import *

## Przygotowanie danych - pobranie przetworzonych danych do pamięci podręcznej

In [13]:
no_last_matches = 3

data_aggregator = DataAggregator()

all_seasons, all_data_past =\
                    data_aggregator.get_data_for_seasons([Season.y2010, Season.y2011,
                                                         Season.y2012, Season.y2013,
                                                         Season.y2014, Season.y2015, Season.y2016], 
                                                         Parameters(no_last_matches=no_last_matches))

## Przygotowanie podstawowych wartości

In [14]:
all_seasons['match_date'] = pd.to_datetime(all_seasons['match_date'])

In [15]:
list_of_parameters = ['home_team_score',
                        'away_team_score', 
                        'home_team_seasons_played',
                        'away_team_seasons_played', 
                        'home_team_last_season_points',
                        'away_team_last_season_points', 
                        'home_players_avg_age',
                        'away_players_avg_age', 
                        'home_players_avg_rating',
                        'away_players_avg_rating', 
                        'home_elo_rating', 
                        'away_elo_rating',
                        'avg_home_win_odds', 
                        'avg_draw_odds', 
                        'avg_away_win_odds',
                        'home_avg_corners', 
                        'away_avg_corners', 
                        'home_avg_shots',
                        'away_avg_shots', 
                        'home_won_games', 
                        'away_won_games', 
                        'home_tied_games',
                        'away_tied_games', 
                        'home_lost_games', 
                        'away_lost_games',
                        'home_scored_goals', 
                        'away_scored_goals'
                     ]

In [17]:
dataset = prepare_dataset(all_seasons, list_of_parameters, all_data_past, add_direct = False, avg = 3, train_size = 0.9,
                          test_size = 0.1, undersample = True, globalCS = False)

In [18]:
Counter(dataset['y_train'])

Counter({1: 603, 0: 521, 2: 595})

## Algorytm Logistic Regression

In [19]:
Xtrain = dataset['X_train']
Ytrain = dataset['y_train']

Xtest = dataset['X_test']
Ytest = dataset['y_test']

In [20]:
parameters = {
    'random_state': [10, 20, 40, 41, 42, 46, 50, 51, 56, 60, 70, 80],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.2, 0.3, 0.7, 1.0, 1.5, 2.0],
    'fit_intercept': [True, False],
    'class_weight': ['balanced'],
    'warm_start': [True, False],
    'multi_class': ['auto'],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [2000]
}

In [21]:
logisticRegression = RandomizedSearchCV(estimator = LogisticRegression(), param_distributions = parameters, \
                                        n_jobs = -1, random_state = 42, scoring = 'accuracy')

In [22]:
logisticRegression.fit(Xtrain, Ytrain)
print("Najlepsze znalezione parametry:", logisticRegression.best_params_)

Najlepsze znalezione parametry: {'warm_start': True, 'solver': 'saga', 'random_state': 46, 'penalty': 'l2', 'multi_class': 'auto', 'max_iter': 2000, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 0.7}


In [23]:
Ypred = logisticRegression.predict(Xtest)

In [24]:
print('\n##################################\nAccuracy: ')

print(metrics.accuracy_score(Ytest, Ypred))

print('\n##################################\nPrecision, Recall and fscore:: ')

print(metrics.precision_recall_fscore_support(Ytest, Ypred, average='macro'))


##################################
Accuracy: 
0.5340314136125655

##################################
Precision, Recall and fscore:: 
(0.5280712050078247, 0.5379672692599584, 0.5278717801796683, None)


In [28]:
print("Poprawnie sklasyfikowane przykłady:")
sorted(Counter(Ytest[Ypred == Ytest]).items(), key = lambda el : el[0])

Poprawnie sklasyfikowane przykłady:


[(0, 23), (1, 40), (2, 39)]

In [29]:
print("Błędnie sklasyfikowane przykłady:")
sorted(Counter(Ytest[Ypred != Ytest]).items(), key = lambda el : el[0])

Błędnie sklasyfikowane przykłady:


[(0, 42), (1, 19), (2, 28)]

In [30]:
confusion = metrics.confusion_matrix(Ytest, Ypred)
print("Macierz pomyłek:")
print(confusion)

Macierz pomyłek:
[[23 18 24]
 [10 40  9]
 [15 13 39]]


## Nauczenie modelu na całym dostępnym zbiorze danych i jego zapis w celu wykorzystania w interaktywnym notebooku

In [32]:
import pickle

finalModel = logisticRegression.best_estimator_
dataset_final = prepare_dataset(all_seasons, list_of_parameters, all_data_past, add_direct = False, avg = 3, train_size = 1.0,
                          test_size = 0.0, undersample = True, globalCS = False)

finalModel.fit(dataset_final['X_train'], dataset_final['y_train'])

filename = 'logisticRegression.pkl'
pickle.dump(finalModel, open(filename, "wb"))