In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

from features.data_provider import get_feature_columns, get_whole_dataset, set_feature_columns
from simulation.predictor import MaxProbabilityOutcomePredictor
from models.outcome_model import get_model
from simulation.analyse import get_win_probabilities, get_simulations
from simulation.simulation import run_actual_tournament_simulation
from db.simulation_table import get_simulation_results, delete_all
from notebook_helpers import plot_bank_and_bets, run_unit_strategy, run_kelly_strategy

In [2]:
# Every feature column currently configured in the data provider.
all_features = get_feature_columns()

# Feature names describing differences in player attributes.
# NOTE(review): 'internationl_repuatiotion_diff' looks misspelled, but it
# presumably has to match the column name in the dataset -- verify before
# correcting it.
player_features = [
    'rating_diff',
    'potential_diff',
    'height_diff',
    'weight_diff',
    'age_diff',
    'weak_foot_diff',
    'internationl_repuatiotion_diff',
    'crossing_diff',
    'finishing_diff',
    'heading_accuracy_diff',
    'short_passing_diff',
    'dribbling_diff',
    'fk_accuracy_diff',
    'long_passing_diff',
    'ball_control_diff',
    'acceleration_diff',
    'sprint_speed_diff',
    'reactions_diff',
    'shot_power_diff',
    'stamina_diff',
    'strength_diff',
    'long_shots_diff',
    'aggression_diff',
    'penalties_diff',
    'marking_diff',
    'standing_tackle_diff',
    'gk_diving_diff',
    'gk_handling_diff',
    'gk_kicking_diff',
    'gk_reflexes_diff',
]

# Remaining (non-player) feature names.
other_features = [
    'elo_diff',
    'away_goal_mean',
    'away_goals_with_home',
    'goal_diff_with_away',
    'home_goal_mean',
    'home_goals_with_away',
]

# The two groups together must account for every configured column
# (only the counts are compared, not the names).
assert len(all_features) == len(player_features) + len(other_features)

In [3]:
#### WC 2018: simulation of the real tournament fixtures (all features vs. non-player features only)

In [4]:
# Load the full dataset as features X and target y.
# NOTE(review): "home_win" presumably selects the target column -- confirm in get_whole_dataset.
X, y = get_whole_dataset("home_win")

In [5]:
# WC 2018 simulation using every feature column.
# The experiment is repeated 10 times and the metrics are averaged
# (presumably because model training is stochastic -- confirm in get_model).
unit_banks = []
kelly_banks = []
accuracies = []

set_feature_columns(all_features)
print(X.shape)

# The bookmaker odds file is identical for every run, so read it once.
# NOTE(review): odds rows are paired with simulation rows purely by position --
# confirm both CSV files list the matches in the same order.
match_bets = pd.read_csv('data/original/wc_2018_bets.csv')

for run in range(10):
    model = get_model(X=X, y=y, n_estimators=500)
    predictor = MaxProbabilityOutcomePredictor(model)

    # Re-read the template on every run: the simulation may modify the frame
    # it receives (not visible from this notebook).
    match_template = pd.read_csv('data/original/wc_2018_games_real.csv')
    try:
        run_actual_tournament_simulation(match_template, predictor)
        tournament_simulation = get_simulation_results()
    finally:
        # Always clear the stored simulation results, even when the simulation
        # fails part-way, so leftover rows cannot leak into a later run.
        delete_all()

    # Sign of the goal difference: 1, 0 or -1.
    tournament_simulation["true_outcome"] = np.sign(
        tournament_simulation["home_score"] - tournament_simulation["away_score"]
    )

    # Fraction of matches whose predicted outcome equals the true outcome.
    accuracy = (tournament_simulation["outcome"] == tournament_simulation["true_outcome"]).mean()
    accuracies.append(accuracy)

    y_pred = tournament_simulation["outcome"].values
    y_true = tournament_simulation["true_outcome"].values
    # Extracted per run so each strategy call receives a fresh array.
    odds = match_bets[["1", "X", "2"]].values
    unit_bank = run_unit_strategy(y_pred, y_true, odds)

    probabilities = tournament_simulation[["home_win_prob", "draw_prob", "away_win_prob"]].values
    kelly_bank = run_kelly_strategy(y_pred, y_true, odds, probabilities)

    unit_banks.append(unit_bank)
    kelly_banks.append(kelly_bank)

# Averages over all runs.
print(np.mean(accuracies))
print(np.mean(unit_banks))
print(np.mean(kelly_banks))

(8124, 36)
0.5734375
68.179
91.7368271917


In [6]:
# WC 2018 simulation restricted to the non-player features (other_features).
# The experiment is repeated 10 times and the metrics are averaged
# (presumably because model training is stochastic -- confirm in get_model).
unit_banks = []
kelly_banks = []
accuracies = []

# Switch the active feature columns to the reduced set and train on the
# matching columns of X.
set_feature_columns(other_features)
subset_of_features = other_features
Xsub = X[subset_of_features]
print(Xsub.shape)

# The bookmaker odds file is identical for every run, so read it once.
# NOTE(review): odds rows are paired with simulation rows purely by position --
# confirm both CSV files list the matches in the same order.
match_bets = pd.read_csv('data/original/wc_2018_bets.csv')

for run in range(10):
    model = get_model(X=Xsub, y=y, n_estimators=500)
    predictor = MaxProbabilityOutcomePredictor(model)

    # Re-read the template on every run: the simulation may modify the frame
    # it receives (not visible from this notebook).
    match_template = pd.read_csv('data/original/wc_2018_games_real.csv')
    try:
        run_actual_tournament_simulation(match_template, predictor)
        tournament_simulation = get_simulation_results()
    finally:
        # Always clear the stored simulation results, even when the simulation
        # fails part-way, so leftover rows cannot leak into a later run.
        delete_all()

    # Sign of the goal difference: 1, 0 or -1.
    tournament_simulation["true_outcome"] = np.sign(
        tournament_simulation["home_score"] - tournament_simulation["away_score"]
    )

    # Fraction of matches whose predicted outcome equals the true outcome.
    accuracy = (tournament_simulation["outcome"] == tournament_simulation["true_outcome"]).mean()
    accuracies.append(accuracy)

    y_pred = tournament_simulation["outcome"].values
    y_true = tournament_simulation["true_outcome"].values
    # Extracted per run so each strategy call receives a fresh array.
    odds = match_bets[["1", "X", "2"]].values
    unit_bank = run_unit_strategy(y_pred, y_true, odds)

    probabilities = tournament_simulation[["home_win_prob", "draw_prob", "away_win_prob"]].values
    kelly_bank = run_kelly_strategy(y_pred, y_true, odds, probabilities)

    unit_banks.append(unit_bank)
    kelly_banks.append(kelly_bank)

# Averages over all runs.
print(np.mean(accuracies))
print(np.mean(unit_banks))
print(np.mean(kelly_banks))

(8124, 6)
0.5234375
62.05
38.5146511152


In [7]:
#### CROSS VALIDATION: 10-fold score with all features vs. non-player features only

In [8]:
# Switch the active feature columns back to the full set (an earlier cell
# changed them to other_features) and reload the dataset.
set_feature_columns(all_features)
X, y = get_whole_dataset("home_win")

In [9]:
# Mean score over 10 cross-validation folds, using every column of X.
model = get_model(n_estimators=500)
score = cross_val_score(estimator=model, X=X, y=y, cv=10)
score.mean()

0.54024251169770565

In [10]:
# Display the shape of the feature matrix used in the cross-validation above.
X.shape

(8124, 36)

In [11]:
# Keep only the columns listed in other_features (the non-player features).
subset_of_features = other_features
Xsub = X[subset_of_features]
# Display the shape of the reduced matrix as a sanity check.
Xsub.shape

(8124, 6)

In [12]:
# Mean score over 10 cross-validation folds, using only the columns in Xsub.
model = get_model(n_estimators=500)
score = cross_val_score(estimator=model, X=Xsub, y=y, cv=10)
score.mean()

0.5457883213485617

In [13]:
#### WC 2014: simulation of the real tournament fixtures (all features vs. non-player features only)

In [14]:
# Use the full feature set and reload the dataset for the WC 2014 experiment.
# NOTE(review): filter_start="2014-06-12" presumably drops matches from the
# start of WC 2014 onward, so the model is not trained on the tournament it is
# evaluated on -- confirm in get_whole_dataset.
set_feature_columns(all_features)
X, y = get_whole_dataset("home_win", filter_start="2014-06-12")

In [15]:
# WC 2014 simulation using every column of X.
# The experiment is repeated 10 times and the metrics are averaged
# (presumably because model training is stochastic -- confirm in get_model).
unit_banks = []
kelly_banks = []
accuracies = []

print(X.shape)

# The bookmaker odds file is identical for every run, so read it once.
# NOTE(review): odds rows are paired with simulation rows purely by position --
# confirm both CSV files list the matches in the same order.
match_bets = pd.read_csv('data/original/wc_2014_bets.csv')

for run in range(10):
    model = get_model(X=X, y=y, n_estimators=500)
    predictor = MaxProbabilityOutcomePredictor(model)

    # Re-read the template on every run: the simulation may modify the frame
    # it receives (not visible from this notebook).
    match_template = pd.read_csv('data/original/wc_2014_games_real.csv')
    try:
        run_actual_tournament_simulation(match_template, predictor)
        tournament_simulation = get_simulation_results()
    finally:
        # Always clear the stored simulation results, even when the simulation
        # fails part-way, so leftover rows cannot leak into a later run.
        delete_all()

    # Sign of the goal difference: 1, 0 or -1.
    tournament_simulation["true_outcome"] = np.sign(
        tournament_simulation["home_score"] - tournament_simulation["away_score"]
    )

    # Fraction of matches whose predicted outcome equals the true outcome.
    accuracy = (tournament_simulation["outcome"] == tournament_simulation["true_outcome"]).mean()
    accuracies.append(accuracy)

    y_pred = tournament_simulation["outcome"].values
    y_true = tournament_simulation["true_outcome"].values
    # Extracted per run so each strategy call receives a fresh array.
    odds = match_bets[["1", "X", "2"]].values
    unit_bank = run_unit_strategy(y_pred, y_true, odds)

    probabilities = tournament_simulation[["home_win_prob", "draw_prob", "away_win_prob"]].values
    kelly_bank = run_kelly_strategy(y_pred, y_true, odds, probabilities)

    unit_banks.append(unit_bank)
    kelly_banks.append(kelly_bank)

# Averages over all runs.
print(np.mean(accuracies))
print(np.mean(unit_banks))
print(np.mean(kelly_banks))

(5153, 36)
0.6203125
76.536
131.146662296


In [16]:
# WC 2014 simulation restricted to the non-player features (other_features).
# The experiment is repeated 10 times and the metrics are averaged
# (presumably because model training is stochastic -- confirm in get_model).
unit_banks = []
kelly_banks = []
accuracies = []

# Switch the active feature columns to the reduced set and train on the
# matching columns of X.
set_feature_columns(other_features)
subset_of_features = other_features
Xsub = X[subset_of_features]
print(Xsub.shape)

# The bookmaker odds file is identical for every run, so read it once.
# NOTE(review): odds rows are paired with simulation rows purely by position --
# confirm both CSV files list the matches in the same order.
match_bets = pd.read_csv('data/original/wc_2014_bets.csv')

for run in range(10):
    model = get_model(X=Xsub, y=y, n_estimators=500)
    predictor = MaxProbabilityOutcomePredictor(model)

    # Re-read the template on every run: the simulation may modify the frame
    # it receives (not visible from this notebook).
    match_template = pd.read_csv('data/original/wc_2014_games_real.csv')
    try:
        run_actual_tournament_simulation(match_template, predictor)
        tournament_simulation = get_simulation_results()
    finally:
        # Always clear the stored simulation results, even when the simulation
        # fails part-way, so leftover rows cannot leak into a later run.
        delete_all()

    # Sign of the goal difference: 1, 0 or -1.
    tournament_simulation["true_outcome"] = np.sign(
        tournament_simulation["home_score"] - tournament_simulation["away_score"]
    )

    # Fraction of matches whose predicted outcome equals the true outcome.
    accuracy = (tournament_simulation["outcome"] == tournament_simulation["true_outcome"]).mean()
    accuracies.append(accuracy)

    y_pred = tournament_simulation["outcome"].values
    y_true = tournament_simulation["true_outcome"].values
    # Extracted per run so each strategy call receives a fresh array.
    odds = match_bets[["1", "X", "2"]].values
    unit_bank = run_unit_strategy(y_pred, y_true, odds)

    probabilities = tournament_simulation[["home_win_prob", "draw_prob", "away_win_prob"]].values
    kelly_bank = run_kelly_strategy(y_pred, y_true, odds, probabilities)

    unit_banks.append(unit_bank)
    kelly_banks.append(kelly_bank)

# Averages over all runs.
print(np.mean(accuracies))
print(np.mean(unit_banks))
print(np.mean(kelly_banks))

(5153, 6)
0.5703125
68.698
202.253328729
