In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from features.data_provider import get_feature_columns, get_whole_dataset, set_feature_columns
from simulation.predictor import MaxProbabilityScorePredictor
from models.score_model import get_model
from simulation.analyse import get_win_probabilities, get_simulations
from simulation.simulation import run_actual_tournament_simulation
from db.simulation_table import get_simulation_results, delete_all
from notebook_helpers import plot_bank_and_bets, run_unit_strategy, run_kelly_strategy

In [2]:
# Every feature column currently configured in the data provider.
all_features = get_feature_columns()

# Player-attribute feature columns. The strings are passed verbatim to
# set_feature_columns(), so the existing spelling (e.g.
# 'internationl_repuatiotion_diff') is kept as-is.
# NOTE(review): presumably it matches the upstream column name - confirm before correcting.
player_features = [
    # general
    'rating_diff', 'potential_diff', 'height_diff', 'weight_diff', 'age_diff',
    'weak_foot_diff', 'internationl_repuatiotion_diff',
    # attacking / passing
    'crossing_diff', 'finishing_diff', 'heading_accuracy_diff', 'short_passing_diff',
    'dribbling_diff', 'fk_accuracy_diff', 'long_passing_diff', 'ball_control_diff',
    # physical
    'acceleration_diff', 'sprint_speed_diff', 'reactions_diff', 'shot_power_diff',
    'stamina_diff', 'strength_diff', 'long_shots_diff',
    # mentality / defending
    'aggression_diff', 'penalties_diff', 'marking_diff', 'standing_tackle_diff',
    # goalkeeping
    'gk_diving_diff', 'gk_handling_diff', 'gk_kicking_diff', 'gk_reflexes_diff',
]

# Remaining (non player-attribute) feature columns.
other_features = [
    'elo_diff',
    'away_goal_mean',
    'away_goals_with_home',
    'goal_diff_with_away',
    'home_goal_mean',
    'home_goals_with_away',
]

# Sanity check: the two groups together must account for every configured column.
assert len(all_features) == len(player_features) + len(other_features)

In [10]:
def simulate_betting_strategies(features, match_template_file, bet_file, filter_start=None, filter_end=None,
                                interval=None, n_runs=10, n_estimators=2000):
    """Repeatedly train the score model and back-test two betting strategies.

    For each run a fresh model is fitted on the combined home/away score
    datasets, the tournament described by ``match_template_file`` is simulated,
    and the predicted outcomes are scored against the real results and fed to
    the unit and Kelly staking helpers.

    Parameters
    ----------
    features : list of str
        Feature columns to activate via ``set_feature_columns``.
    match_template_file : str
        Path of the CSV passed (as a DataFrame) to ``run_actual_tournament_simulation``.
    bet_file : str
        Path of the CSV holding the odds; the columns ``"1"``, ``"X"`` and ``"2"``
        are read. Rows are assumed to be in the same order as the simulation
        results (TODO confirm - no alignment check is done here).
    filter_start, filter_end, interval :
        Forwarded unchanged to ``get_whole_dataset``.
    n_runs : int, default 10
        Number of independent train/simulate repetitions.
    n_estimators : int, default 2000
        Forwarded to ``get_model``.

    Returns
    -------
    tuple of (list, list, list)
        ``(accuracies, unit_banks, kelly_banks)`` with one entry per run.
    """
    unit_banks = []
    kelly_banks = []
    accuracies = []

    set_feature_columns(features)

    home = get_whole_dataset("home_score", filter_start=filter_start, filter_end=filter_end, interval=interval)
    away = get_whole_dataset("away_score", filter_start=filter_start, filter_end=filter_end, interval=interval)
    X = pd.concat([home[0], away[0]])
    y = pd.concat([home[1], away[1]])

    print(X.shape)

    # The input files are identical for every run, so read them once up front.
    # This also fails fast on a bad path instead of after an expensive model fit.
    match_template = pd.read_csv(match_template_file)
    match_bets = pd.read_csv(bet_file)

    for _ in range(n_runs):
        model = get_model(X=X, y=y, n_estimators=n_estimators)
        predictor = MaxProbabilityScorePredictor(model)

        try:
            # Hand over a copy so every run starts from a pristine template,
            # exactly as re-reading the file did.
            run_actual_tournament_simulation(match_template.copy(), predictor)
            tournament_simulation = get_simulation_results()
        finally:
            # Always clear the stored results, even if the simulation or the
            # fetch fails, so a later call never picks up stale rows.
            delete_all()

        tournament_simulation["true_outcome"] = np.sign(
            tournament_simulation["home_score"] - tournament_simulation["away_score"])

        hits = tournament_simulation["outcome"] == tournament_simulation["true_outcome"]
        accuracies.append(sum(hits) / tournament_simulation.shape[0])

        y_pred = tournament_simulation["outcome"].values
        y_true = tournament_simulation["true_outcome"].values
        # Re-extract per run so each strategy call gets a fresh array.
        odds = match_bets[["1", "X", "2"]].values
        unit_banks.append(run_unit_strategy(y_pred, y_true, odds))

        probabilities = tournament_simulation[["home_win_prob", "draw_prob", "away_win_prob"]].values
        kelly_banks.append(run_kelly_strategy(y_true, odds, probabilities))

    return accuracies, unit_banks, kelly_banks

def print_report(accuracy, unit, kelly):
    """Print the mean and standard deviation of each per-run result series.

    Parameters
    ----------
    accuracy, unit, kelly : sequence of numbers
        Per-run accuracies, unit-strategy banks and Kelly-strategy banks.
    """
    report_rows = (
        ("AVG Accuracy: ", accuracy),
        ("AVG Unit bank: ", unit),
        ("AVG Kelly bank: ", kelly),
    )
    for label, values in report_rows:
        print(label, np.mean(values), np.std(values))

In [None]:
# ---- World Cup 2018 ----
# Input files for the back-test: the match template CSV and the betting CSV.
match_template = "data/original/wc_2018_games_real.csv"
betting_file = "data/original/wc_2018_bets.csv"

In [None]:
# Back-test with every feature column enabled.
acc, unit, kelly = simulate_betting_strategies(
    features=all_features,
    match_template_file=match_template,
    bet_file=betting_file,
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# Back-test with the non player-attribute columns only.
acc, unit, kelly = simulate_betting_strategies(
    features=other_features,
    match_template_file=match_template,
    bet_file=betting_file,
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# Back-test with the player-attribute columns only.
acc, unit, kelly = simulate_betting_strategies(
    features=player_features,
    match_template_file=match_template,
    bet_file=betting_file,
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# ---- World Cup 2014 ----
# Input files for the back-test: the match template CSV and the betting CSV.
match_template = "data/original/wc_2014_games_real.csv"
betting_file = "data/original/wc_2014_bets.csv"

In [None]:
# Back-test with every feature column enabled.
# NOTE(review): filter_start is forwarded to get_whole_dataset; its exact meaning is not
# visible here. Presumably it removes matches from this date onward so the training data
# ends before the tournament - confirm, otherwise the model sees post-tournament matches.
acc, unit, kelly = simulate_betting_strategies(
    features=all_features,
    match_template_file=match_template,
    bet_file=betting_file,
    filter_start="2014-06-12",
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# Back-test with the non player-attribute columns only.
# NOTE(review): filter_start semantics live in get_whole_dataset - confirm the direction of the cut.
acc, unit, kelly = simulate_betting_strategies(
    features=other_features,
    match_template_file=match_template,
    bet_file=betting_file,
    filter_start="2014-06-12",
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# Back-test with the player-attribute columns only.
# NOTE(review): filter_start semantics live in get_whole_dataset - confirm the direction of the cut.
acc, unit, kelly = simulate_betting_strategies(
    features=player_features,
    match_template_file=match_template,
    bet_file=betting_file,
    filter_start="2014-06-12",
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# ---- World Cup 2010 ----
# Input files for the back-test: the match template CSV and the betting CSV.
match_template = "data/original/wc_2010_games_real.csv"
betting_file = "data/original/wc_2010_bets.csv"

In [None]:
# Back-test with every feature column enabled.
# NOTE(review): filter_start is forwarded to get_whole_dataset; its exact meaning is not
# visible here. Presumably it removes matches from this date onward so the training data
# ends before the tournament - confirm, otherwise the model sees post-tournament matches.
acc, unit, kelly = simulate_betting_strategies(
    features=all_features,
    match_template_file=match_template,
    bet_file=betting_file,
    filter_start="2010-06-12",
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# Back-test with the non player-attribute columns only.
# NOTE(review): filter_start semantics live in get_whole_dataset - confirm the direction of the cut.
acc, unit, kelly = simulate_betting_strategies(
    features=other_features,
    match_template_file=match_template,
    bet_file=betting_file,
    filter_start="2010-06-12",
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
# Back-test with the player-attribute columns only.
# NOTE(review): filter_start semantics live in get_whole_dataset - confirm the direction of the cut.
acc, unit, kelly = simulate_betting_strategies(
    features=player_features,
    match_template_file=match_template,
    bet_file=betting_file,
    filter_start="2010-06-12",
)
print_report(accuracy=acc, unit=unit, kelly=kelly)

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out check per feature group: fit on 60% of the combined home/away score
# rows and report the share of held-out rows whose rounded prediction equals
# the actual score. One "Score:" line is printed per group, in the order
# all -> other -> player.
for features_group in (all_features, other_features, player_features):
    set_feature_columns(features_group)
    home = get_whole_dataset("home_score")
    away = get_whole_dataset("away_score")
    X = pd.concat([home[0], away[0]])
    y = pd.concat([home[1], away[1]])

    # Fixed random_state keeps the split identical across the three groups.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    model = get_model(X=X_train, y=y_train, n_estimators=500)

    y_pred_mu = model.predict(X_test)
    # Round the model output to the nearest whole number of goals.
    y_pred = np.around(y_pred_mu)
    print("Score: ", (y_pred == y_test).sum() / len(X_test))

In [11]:
# ---- World Cup 2018: effect of the training-data interval ----
match_template = "data/original/wc_2018_games_real.csv"
betting_file = "data/original/wc_2018_bets.csv"

# Each tuple is forwarded as `interval` to get_whole_dataset (via
# simulate_betting_strategies); None leaves that side of the tuple unset.
# NOTE(review): in the recorded output the last two intervals report the same
# dataset shape (16248, 36) and identical accuracy / unit-bank figures - verify
# that the interval filter really selects different rows for them.
for interval in [
    (None, "2010-06-12"),
    ("2010-06-12", "2014-06-12"),
    ("2014-06-12", "2018-06-14"),
]:
    acc, unit, kelly = simulate_betting_strategies(
        features=all_features,
        match_template_file=match_template,
        bet_file=betting_file,
        interval=interval,
    )
    print_report(accuracy=acc, unit=unit, kelly=kelly)

(4238, 36)
AVG Accuracy:  0.578125 0.00698771242969
AVG Unit bank:  70.364 1.39791416045
AVG Kelly bank:  56.9552719695 2.40650181396
(16248, 36)
AVG Accuracy:  0.5875 0.0076546554462
AVG Unit bank:  71.47 1.44519894824
AVG Kelly bank:  71.3111380075 2.21825663259
(16248, 36)
AVG Accuracy:  0.5875 0.0076546554462
AVG Unit bank:  71.47 1.44519894824
AVG Kelly bank:  69.9056815772 2.99508925063
