In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import explained_variance_score, classification_report, r2_score, accuracy_score
from itertools import combinations
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = pd.read_csv('dataset_player_rating_augmented.csv')
dataset = dataset.drop(['Unnamed: 0'], axis=1)
dataset.head()

Unnamed: 0,year,away_team,home_team,date,home_score,away_score,tournament,city,country,neutral,...,away_dribbling,away_defending,away_physical,rating_diff,pace_diff,shooting_diff,passing_diff,dribbling_diff,defending_diff,physical_diff
0,1993,Argentina,Australia,1993-10-31,1,1,FIFA World Cup qualification,Sydney,Australia,False,...,74.861821,72.777893,73.46154,-1.815437,-3.596078,-1.948306,-1.267271,-2.342418,-2.758941,-3.430231
1,1993,Argentina,Colombia,1993-08-15,2,1,FIFA World Cup qualification,Barranquilla,Colombia,False,...,80.257301,77.39582,77.814123,-6.027294,-7.260922,-6.977287,-6.297929,-7.443481,-6.399032,-6.420082
2,1993,Argentina,Paraguay,1993-08-08,1,3,FIFA World Cup qualification,Asunci<U+00F3>n,Paraguay,False,...,81.051861,77.935734,78.354805,-17.497554,-21.684425,-20.413301,-18.266553,-21.585661,-18.887825,-18.674328
3,1993,Australia,Argentina,1993-11-17,1,0,FIFA World Cup qualification,Buenos Aires,Argentina,False,...,71.675809,69.646914,69.733326,3.322779,5.316932,3.900924,3.122804,4.269153,4.158559,4.482709
4,1993,Australia,Korea Republic,1993-09-24,1,1,Friendly,Seoul,Korea Republic,False,...,72.194969,70.084934,70.095494,-2.193589,-1.953892,-2.314932,-2.40583,-2.302152,-2.210416,-1.470006


In [3]:
dataset.columns

Index(['year', 'away_team', 'home_team', 'date', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'home_elo', 'away_elo',
       'elo_diff', 'score_difference', 'home_score_difference_lag',
       'home_avg_goal_diff', 'away_score_difference_lag', 'away_avg_goal_diff',
       'avg_goals_received', 'home_win', 'wc_home_wins', 'wc_away_wins',
       'home_rank', 'home_country_abrv', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'home_confederation', 'away_rank',
       'away_country_abrv', 'away_total_points', 'away_previous_points',
       'away_rank_change', 'away_cur_year_avg', 'away_cur_year_avg_weighted',
       'away_last_year_avg', 'away_last_year_avg_weighted',
       'away_two_year_

In [5]:
def get_previous_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    
    home_team_goals = pd.concat([data[(data['home_team'] == home)]["home_score"], 
                                 data[(data['away_team'] == home)]["away_score"]])
    away_team_goals = pd.concat([data[(data['home_team'] == away)]["home_score"], 
                                 data[(data['away_team'] == away)]["away_score"]])
    
    home_goals = home_team_goals.sum()
    away_goals = away_team_goals.sum()
    
    home_avg = home_team_goals.mean()
    away_avg = away_team_goals.mean()
    if np.isnan(home_avg):
        home_avg = 0
    if np.isnan(away_avg):
        away_avg = 0
    return pd.Series({
        'goal_history_with_opponent': home_goals - away_goals, 
        'goal_history_with_opponent_home_avg': home_avg,
        'goal_history_with_opponent_away_avg': away_avg
    }) 

dataset = pd.concat([dataset, dataset.apply(get_previous_goals, axis=1)], axis=1)

In [6]:
def get_median_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    
    home_data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) &
            (dataset['date'] < date)
           ]
    
    home_goal_median = pd.concat([home_data[(home_data['home_team'] == home)]["home_score"], 
                                 home_data[(home_data['away_team'] == home)]["away_score"]]).mean()
    
    
    
    away_data = dataset[((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    away_goal_median = pd.concat([away_data[(away_data['home_team'] == away)]["home_score"], 
                                 away_data[(away_data['away_team'] == away)]["away_score"]]).mean()
    
    if np.isnan(home_goal_median):
        home_goal_median = 0
    if np.isnan(away_goal_median):
        away_goal_median = 0
    return pd.Series({
        'home_goal_mean': home_goal_median, 
        'away_goal_mean': away_goal_median
    })
dataset = pd.concat([dataset, dataset.apply(get_median_goals, axis=1)], axis=1)

In [7]:
dataset.to_csv("dataset_pd.csv")

In [8]:
important_columns = ["elo_diff", "rank_diff", "home_score", "away_score", "home_win",
                     "date", "tournament", "home_rank_change", "away_rank_change", 
                     "goal_history_with_opponent",
                     "goal_history_with_opponent_home_avg", "goal_history_with_opponent_away_avg",
                    "home_goal_mean", "away_goal_mean",
                    'rating_diff', 'pace_diff', 'shooting_diff',
                    'passing_diff', 'dribbling_diff', 'defending_diff', 'physical_diff']

In [46]:
rankings = dataset[important_columns]

In [10]:
feature_columns = ["elo_diff", "rank_diff", "home_rank_change",
                   "away_rank_change", "goal_history_with_opponent",
                  "goal_history_with_opponent_home_avg", "goal_history_with_opponent_away_avg",
                  "home_goal_mean", "away_goal_mean",
                  'rating_diff', 'pace_diff', 'shooting_diff',
                   'passing_diff', 'dribbling_diff', 'defending_diff', 'physical_diff']

In [11]:
rankings = rankings[(rankings['tournament'] != "Friendly")]
print("ALL: ", len(rankings))

test_period_start = "2014-01-01"

# Master data TRAIN
train_master = rankings[(rankings['date'] < test_period_start)]

# Data for classification
only_wc = train_master[train_master['tournament'].isin(
    ["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])]

ytrain_home_win = train_master["home_win"]
ytrain_home_win_wc = only_wc["home_win"]

Xtrain = train_master[feature_columns]
Xtrain_wc = only_wc[feature_columns]

# Data for regression
ytrain_home_score = train_master["home_score"]
ytrain_away_score = train_master["away_score"]
ytrain_home_score_wc = only_wc["home_score"]
ytrain_away_score_wc = only_wc["away_score"]

Xtrain_tie = train_master[train_master['home_win'] == 0]
ytrain_tie = Xtrain_tie["away_score"]
Xtrain_tie = Xtrain_tie[feature_columns]

# Data for classification WIN/LOSE
Xtrain_wl = train_master[train_master["home_win"] != 0]
ytrain_wl_home_win = Xtrain_wl["home_win"]
Xtrain_wl = Xtrain_wl[feature_columns]

Xtrain_wl_wc = only_wc[only_wc["home_win"] != 0]
ytrain_wl_home_win_wc = Xtrain_wl_wc["home_win"]
Xtrain_wl_wc = Xtrain_wl_wc[feature_columns]


# Master data World Cup TEST
test_master = rankings[(rankings['tournament'].isin(["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])) & (rankings['date'] >= test_period_start)]

# Data for classification
ytest_home_win = test_master["home_win"]
Xtest = test_master[feature_columns]

# Data for regression
ytest_home_score = test_master["home_score"]
ytest_away_score = test_master["away_score"]

Xtest_tie = test_master[test_master['home_win'] == 0]
ytest_tie = Xtest_tie["away_score"]
Xtest_tie = Xtest_tie[feature_columns]

# Data for classification WIN/LOSE
Xtest_wl = test_master[test_master["home_win"] != 0]
ytest_wl_home_win = Xtest_wl["home_win"]
Xtest_wl = Xtest_wl[feature_columns]

# WORLD CUP CROSS VALIDATION DATA
CV_train = rankings[rankings['tournament'].isin(["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])]
CV_train_wl = CV_train[CV_train["home_win"] != 0]
CV_ytrain = CV_train["home_win"]
CV_ytrain_wl = CV_train_wl["home_win"]
CV_ytrain_home_score = CV_train["home_score"]
CV_ytrain_away_score = CV_train["away_score"]
CV_train = CV_train[feature_columns]
CV_train_wl = CV_train_wl[feature_columns]


# Final train
Xfinal = rankings[feature_columns]
yfinal_home_win = rankings["home_win"]
yfinal_home_goals = rankings["home_score"]
yfinal_away_goals = rankings["away_score"]

Xfinal_wl = rankings[rankings["home_win"] != 0]
yfinal_wl_home_win = Xfinal_wl["home_win"]
Xfinal_wl = Xfinal_wl[feature_columns]

Xfinal_tie = rankings[rankings['home_win'] == 0]
yfinal_tie = Xfinal_tie["home_score"]
Xfinal_tie = Xfinal_tie[feature_columns]

print("TRAIN: ", len(Xtrain))
print("TRAIN WC: ", len(Xtrain_wc))
print("WC TEST: ", len(Xtest))
print("WC CV TEST: ", len(CV_train))
print("FINAL TRAIN: ", len(Xfinal))
print("FINAL TIE TRAIN: ", len(Xfinal_tie))

ALL:  11170
TRAIN:  9251
TRAIN WC:  3878
WC TEST:  764
WC CV TEST:  4642
FINAL TRAIN:  11170
FINAL TIE TRAIN:  2471


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def get_wlt_model(X, y, X_cv, y_cv, X_test, y_test):
    model = KNeighborsClassifier(n_neighbors=60)

    print("CV score", cross_val_score(model, CV_train, CV_ytrain, cv=5))
    model.fit(X, y) 
   
    y_true, y_pred = y_test, model.predict(X_test)
    print(classification_report(y_true, y_pred))

    y_hat = model.predict(X_test)
    print(sum(y_hat == y_test) / len(X_test))
    return model

wlt_model = get_wlt_model(Xtrain, ytrain_home_win, CV_train, CV_ytrain, Xtest, ytest_home_win)

CV score [ 0.70721206  0.71474704  0.71905274  0.68857759  0.7206041 ]
             precision    recall  f1-score   support

         -1       0.68      0.87      0.76       224
          0       0.45      0.13      0.20       161
          1       0.79      0.90      0.84       379

avg / total       0.69      0.73      0.68       764

0.7277486910994765


In [13]:
def get_wl_model(X, y, X_cv, y_cv, X_test, y_test):
    model = KNeighborsClassifier(n_neighbors=60)

    print("CV score", cross_val_score(model, X, y, cv=5))
    model.fit(X, y) 

    y_true, y_pred = y_test, model.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(sum(y_pred == y_test) / len(X_test))
    return model

wl_model = get_wl_model(Xtrain_wl, ytrain_wl_home_win, CV_train_wl, CV_ytrain_wl, Xtest_wl, ytest_wl_home_win)

CV score [ 0.86791148  0.85961272  0.87759336  0.87128028  0.86495845]
             precision    recall  f1-score   support

         -1       0.88      0.91      0.89       224
          1       0.95      0.92      0.93       379

avg / total       0.92      0.92      0.92       603

0.9187396351575456


In [14]:
import statsmodels.api as sm
lr = sm.OLS(ytrain_home_score, Xtrain)
results = lr.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             home_score   R-squared:                       0.590
Model:                            OLS   Adj. R-squared:                  0.589
Method:                 Least Squares   F-statistic:                     830.4
Date:                Thu, 14 Jun 2018   Prob (F-statistic):               0.00
Time:                        13:40:14   Log-Likelihood:                -17308.
No. Observations:                9251   AIC:                         3.465e+04
Df Residuals:                    9235   BIC:                         3.476e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
el

In [15]:
def get_home_goal_model(X, y, X_cv, y_cv, X_test, y_test):
    model = KNeighborsClassifier(n_neighbors=50)

    print("CV score", cross_val_score(model, CV_train, CV_ytrain, cv=5))
    model.fit(X, y) 

    y_true, y_pred = y_test, model.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(sum(y_pred == y_test) / len(X_test))
    return model

home_goal_model = get_home_goal_model(Xtrain, ytrain_home_score, CV_train, CV_ytrain_home_score, Xtest, ytest_home_score)

CV score [ 0.70182992  0.71367061  0.71474704  0.68965517  0.72384035]
             precision    recall  f1-score   support

          0       0.54      0.71      0.61       207
          1       0.36      0.40      0.38       210
          2       0.32      0.35      0.34       178
          3       0.23      0.14      0.17        88
          4       0.24      0.09      0.13        44
          5       0.00      0.00      0.00        16
          6       0.00      0.00      0.00         8
          7       0.00      0.00      0.00         5
          8       0.00      0.00      0.00         5
          9       0.00      0.00      0.00         1
         10       0.00      0.00      0.00         1
         15       0.00      0.00      0.00         1

avg / total       0.36      0.40      0.38       764

0.40445026178010474


  'precision', 'predicted', average, warn_for)


In [16]:
def get_away_goal_model(X, y, X_cv, y_cv, X_test, y_test):
    model = KNeighborsClassifier(n_neighbors=70)
    print("CV score", cross_val_score(model, CV_train, CV_ytrain, cv=5))
    model.fit(X, y) 

    y_true, y_pred = y_test, model.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(sum(y_pred == y_test) / len(X_test))
    return model


away_goal_model = get_away_goal_model(Xtrain, ytrain_away_score, CV_train, CV_ytrain_away_score, Xtest, ytest_away_score)

CV score [ 0.70721206  0.71582347  0.72228202  0.68965517  0.7227616 ]
             precision    recall  f1-score   support

          0       0.63      0.75      0.68       320
          1       0.32      0.35      0.33       216
          2       0.24      0.29      0.26       122
          3       0.00      0.00      0.00        60
          4       0.00      0.00      0.00        22
          5       0.00      0.00      0.00         8
          6       0.00      0.00      0.00        12
          7       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         3

avg / total       0.39      0.46      0.42       764

0.4581151832460733


  'precision', 'predicted', average, warn_for)


In [17]:
def get_tie_model(X, y, X_cv, y_cv, X_test, y_test):
    model = linear_model.LinearRegression()
    print("CV score", cross_val_score(model, CV_train, CV_ytrain, cv=5))
    model.fit(X, y) 

    y_hat = model.predict(X_test)
    print(sum(np.around(y_hat) == y_test) / len(X_test))
    return model
    
tie_goal_model = get_tie_model(Xtrain_tie, ytrain_tie, Xtrain_tie, ytrain_tie, Xtest_tie, ytest_tie)

CV score [ 0.51182757  0.50169094  0.54417006  0.52480131  0.52766074]
0.391304347826087


In [32]:
def predict_score(data_row, sign_model, home_goal_model, away_goal_model, tie_model):
    game_winner = sign_model.predict(data_row)[0]
    game_winner_prob = sign_model.predict_proba(data_row)[0]
    
    if game_winner == 0:
        goals = int(tie_goal_model.predict(data_row)[0])
        home_goals, away_goals = goals, goals
    else:        
        home_goals_prob = home_goal_model.predict_proba(data_row)[0]
        away_goals_prob = away_goal_model.predict_proba(data_row)[0]
        
        N = len(home_goals_prob)
        M = len(away_goals_prob)
        min_shape = min(N, M)

        goal_matrix = np.outer(home_goals_prob, away_goals_prob)

        # make square
        goal_matrix = goal_matrix[:min_shape, :min_shape]

        if game_winner == 1:
            a = np.tril(goal_matrix, -1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            assert(home_goals > away_goals)
        elif game_winner == 0:
            a = np.diag(goal_matrix)
            home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
            assert(home_goals == away_goals)
        else:
            a = np.triu(goal_matrix, 1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            assert(home_goals < away_goals)

    return home_goals, away_goals, game_winner, home_goals_prob, away_goals_prob

In [19]:
game_predictions = []

for index in range(len(Xtest)):
    row = pd.DataFrame(Xtest.iloc[index]).T
    home_goals, away_goals, winner = predict_score(row, wlt_model, home_goal_model, away_goal_model, tie_goal_model)
    game_predictions.append([home_goals, away_goals, winner])
    
predictions = pd.DataFrame(game_predictions, columns=["home_pred", "away_pred", "winner_pred"])
correct = test_master[["home_score", "away_score"]].reset_index()
result_df = pd.concat([correct, predictions], axis=1)
result_df["home_win"] = np.sign(result_df["home_score"] - result_df["away_score"])

total_score = 0
max_score = 0
total_games_correct = 0
total_home_score_correct = 0
total_score_correct = 0

for index, game_row in result_df.iterrows():
    row_score = 0
    if game_row["home_win"] == game_row["winner_pred"]:
        row_score += 2
        total_games_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) or (game_row["away_score"] == game_row["away_pred"]):
        row_score += 1
        total_home_score_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) and (game_row["away_score"] == game_row["away_pred"]):
        row_score += 2
        total_score_correct += 1
    
    total_score += row_score
    max_score += 5
    
N = len(result_df)
print(f"How many points available: {total_score/max_score}")
print(f"{total_games_correct/N}")
print(f"{total_home_score_correct/N}")
print(f"{total_score_correct/N}")

How many points available: 0.4887434554973822
0.7277486910994765
0.6950261780104712
0.14659685863874344


In [20]:
feature_columns

['elo_diff',
 'rank_diff',
 'home_rank_change',
 'away_rank_change',
 'goal_history_with_opponent',
 'goal_history_with_opponent_home_avg',
 'goal_history_with_opponent_away_avg',
 'home_goal_mean',
 'away_goal_mean',
 'rating_diff',
 'pace_diff',
 'shooting_diff',
 'passing_diff',
 'dribbling_diff',
 'defending_diff',
 'physical_diff']

In [21]:
def get_prediction_row(home, away):
    row = pd.DataFrame(np.array([[np.nan] * len(feature_columns)]), columns=feature_columns)
    
    home_row = dataset[(dataset["home_team"] == home) | (dataset["away_team"] == home)].tail(1)
    away_row = dataset[(dataset["home_team"] == away) | (dataset["away_team"] == away)].tail(1)

    home_rank = world_cup[world_cup["Team"] == home]["Current FIFA rank"].item()
    away_rank = world_cup[world_cup["Team"] == away]["Current FIFA rank"].item()
    
    home_elo = world_cup[world_cup["Team"] == home]["elo_ranking"].item()
    away_elo = world_cup[world_cup["Team"] == away]["elo_ranking"].item()
    
    row["rank_diff"] = home_rank - away_rank
    row["elo_diff"] = home_elo - away_elo

    row["home_rank_change"] = world_cup[world_cup["Team"] == home]["Rank Change"].item()
    row["away_rank_change"] = world_cup[world_cup["Team"] == away]["Rank Change"].item()
    
    goal_diff = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away))].tail(1)
    
    if len(goal_diff) == 0:
        row["goal_history_with_opponent"] = 0
        row["goal_history_with_opponent_home_avg"] = 0
        row["goal_history_with_opponent_away_avg"] = 0
        
    elif goal_diff["home_team"].item() == home:
        row["goal_history_with_opponent"] = goal_diff["goal_history_with_opponent"].item()
        row["goal_history_with_opponent_home_avg"] = goal_diff["goal_history_with_opponent_home_avg"].item()
    else:
        row["goal_history_with_opponent"] = -goal_diff["goal_history_with_opponent"].item()
        row["goal_history_with_opponent_away_avg"] = goal_diff["goal_history_with_opponent_away_avg"].item()
        
    home_data = dataset[(dataset['home_team'] == home)].tail(1)
    if len(home_data) > 0:
        home_goal_mean = home_data["home_goal_mean"].item()
    else:
        home_goal_mean = 0
    away_data = dataset[(dataset['away_team'] == away)].tail(1)
    if len(    away_data) > 0:
        away_goal_mean = away_data["away_goal_mean"].item()
    else:
        away_goal_mean = 0
    
    row["home_goal_mean"] = home_goal_mean
    row["away_goal_mean"] = away_goal_mean
    
    row["rating_diff"] = home_row["home_rating"].item() - away_row["away_rating"].item()
    row["pace_diff"] = home_row["home_pace"].item() - away_row["away_pace"].item()
    row["shooting_diff"] = home_row["home_shooting"].item() - away_row["away_shooting"].item()
    row["passing_diff"] = home_row["home_passing"].item() - away_row["away_passing"].item()
    row["dribbling_diff"] = home_row["home_dribbling"].item() - away_row["away_dribbling"].item()
    row["defending_diff"] = home_row["home_defending"].item() - away_row["away_defending"].item()
    row["physical_diff"] = home_row["home_physical"].item() - away_row["away_physical"].item()
    
    # Make sure that there is no nan values
    row.fillna(0, inplace=True)
    
    return row

In [22]:
wlt_model = get_wlt_model(Xfinal, yfinal_home_win, Xfinal, yfinal_home_win, Xfinal, yfinal_home_win)
wl_model = get_wl_model(Xfinal_wl, yfinal_wl_home_win, Xfinal_wl, yfinal_wl_home_win, Xfinal_wl, yfinal_wl_home_win)
home_goal_model = get_home_goal_model(Xfinal, yfinal_home_goals, Xfinal, yfinal_home_goals, Xfinal, yfinal_home_goals)
away_goal_model = get_away_goal_model(Xfinal, yfinal_away_goals, Xfinal, yfinal_away_goals, Xfinal, yfinal_away_goals)
tie_goal_model = get_tie_model(Xfinal_tie, yfinal_tie, Xfinal_tie, yfinal_tie, Xfinal_tie, yfinal_tie)

CV score [ 0.70721206  0.71474704  0.71905274  0.68857759  0.7206041 ]
             precision    recall  f1-score   support

         -1       0.66      0.77      0.71      3184
          0       0.41      0.16      0.23      2471
          1       0.75      0.87      0.80      5515

avg / total       0.65      0.69      0.65     11170

0.6867502238137869
CV score [ 0.86781609  0.86436782  0.88218391  0.86666667  0.86831512]
             precision    recall  f1-score   support

         -1       0.83      0.81      0.82      3184
          1       0.89      0.91      0.90      5515

avg / total       0.87      0.87      0.87      8699

0.8725140820783999
CV score [ 0.70182992  0.71367061  0.71474704  0.68965517  0.72384035]


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.52      0.62      0.57      2935
          1       0.37      0.43      0.40      3243
          2       0.32      0.40      0.36      2453
          3       0.27      0.13      0.17      1252
          4       0.27      0.06      0.10       649
          5       0.13      0.01      0.03       278
          6       0.00      0.00      0.00       162
          7       0.00      0.00      0.00        76
          8       0.00      0.00      0.00        47
          9       0.00      0.00      0.00        27
         10       0.00      0.00      0.00        15
         11       0.00      0.00      0.00         9
         12       0.00      0.00      0.00         7
         13       0.00      0.00      0.00         4
         14       0.00      0.00      0.00         3
         15       0.00      0.00      0.00         2
         16       0.00      0.00      0.00         3
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


In [23]:
all_games = pd.DataFrame()
world_cup = pd.read_csv('world_cup_2018_dataset.csv')

In [33]:
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

world_cup['points'] = 0
world_cup['total_prob'] = 0
world_cup['goal_diff'] = 0
world_cup['goals_scored'] = 0

for group in set(world_cup['Group']):
    print('___Starting group {}:___'.format(group))
    for home, away in combinations(world_cup.query('Group == "{}"'.format(group)).Team, 2):
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner, hg_prob, ag_prob = predict_score(row, 
                                                            wlt_model, 
                                                            home_goal_model, 
                                                            away_goal_model, 
                                                            tie_goal_model)
        import pdb; pdb.set_trace()
        if game_winner == 1:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 3
            
        elif game_winner == 0:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 1
            world_cup.loc[world_cup["Team"] == away, 'points'] += 1
        else:
            world_cup.loc[world_cup["Team"] == away, 'points'] += 3
            
        world_cup.loc[world_cup["Team"] == home, 'goal_diff'] += (home_goals - away_goals)
        world_cup.loc[world_cup["Team"] == away, 'goal_diff'] += (away_goals - home_goals)
        world_cup.loc[world_cup["Team"] == home, 'goals_scored'] += home_goals
        world_cup.loc[world_cup["Team"] == away, 'goals_scored'] += away_goals
        if home < away:
            world_cup[f"{home}-{away}"] = home_goals - away_goals
        else:
            world_cup[f"{away}-{home}"] = away_goals - home_goals
        
        tmp_df = row
        tmp_df["home_team"] = home
        tmp_df["away_team"] = away
        tmp_df["home_score"] = home_goals
        tmp_df["away_score"] = away_goals
        
        all_games = pd.concat([all_games, tmp_df])
        print(f"{home_goals} - {away_goals}")

___Starting group C:___
France vs. Australia: > <ipython-input-33-c42dcd0fc923>(20)<module>()
-> if game_winner == 1:
(Pdb) hg_prob
array([ 0.08,  0.32,  0.22,  0.2 ,  0.12,  0.02,  0.02,  0.  ,  0.  ,
        0.02,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ])
(Pdb) ag_prob
array([ 0.42857143,  0.42857143,  0.11428571,  0.02857143,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ])
(Pdb) np.outer(hg_prob, ag_prob)
array([[ 0.03428571,  0.03428571,  0.00914286,  0.00228571,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.13714286,  0.13714286,  0.03657143,  0.00914286,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  

BdbQuit: 

In [25]:
def get_better_team(first_team, second_team):
    if first_team["points"].item() > second_team["points"].item():
        return -1
    elif first_team["points"].item() < second_team["points"].item():
        return 1
    
    if first_team["goal_diff"].item() > second_team["goal_diff"].item():
        return -1
    elif first_team["goal_diff"].item() < second_team["goal_diff"].item():
        return 1
    
    if first_team["goals_scored"].item() > second_team["goals_scored"].item():
        return -1
    elif first_team["goals_scored"].item() < second_team["goals_scored"].item():
        return 1
    
    f_name = first_team["Team"]
    s_name = second_team["Team"]
    if (f_name < s_name):
        if (first_team[f"{f_name}-{s_name}"] > 0):
            return -1
        else:
            return 1
    else:
        if (first_team[f"{s_name}-{f_name}"] < 0):
            return -1
        else:
            return 1

In [26]:
import functools
# Get best two
sort_wc = world_cup.sort_values(by=['Group', 'points'], ascending=False).reset_index()
next_round_wc = sort_wc.groupby('Group').nth([0, 1])

groups = np.unique(sort_wc["Group"])

next_round = pd.DataFrame()
for group in groups:
    group_df = sort_wc[sort_wc["Group"] == group]
    team = [group_df.iloc[0], group_df.iloc[1], group_df.iloc[2], group_df.iloc[3]]
    shorted_standing = sorted(team, key=functools.cmp_to_key(get_better_team))
    first = shorted_standing[0]
    second = shorted_standing[1]
    next_round = pd.concat([next_round, first.to_frame().T, second.to_frame().T])

In [27]:
 world_cup[["Team", "Group", "points", "goals_scored"]].sort_values(by=['Group', 'points'], ascending=False).reset_index()

Unnamed: 0,index,Team,Group,points,goals_scored
0,30,Colombia,H,9,5
1,28,Poland,H,6,2
2,31,Japan,H,3,1
3,29,Senegal,H,0,1
4,24,Belgium,G,7,6
5,27,England,G,7,3
6,25,Panama,G,3,1
7,26,Tunisia,G,0,0
8,20,Germany,F,9,6
9,21,Mexico,F,6,3


In [28]:
all_games[(all_games["home_team"] == "Japan") | (all_games["away_team"] == "Japan")]

Unnamed: 0,elo_diff,rank_diff,home_rank_change,away_rank_change,goal_history_with_opponent,goal_history_with_opponent_home_avg,goal_history_with_opponent_away_avg,home_goal_mean,away_goal_mean,rating_diff,pace_diff,shooting_diff,passing_diff,dribbling_diff,defending_diff,physical_diff,home_team,away_team,home_score,away_score
0,54.023,-53,2,-1,-0.0,0.0,0.0,1.642857,1.942029,23.955871,32.359866,33.204824,25.762144,31.825586,30.096318,32.513389,Poland,Japan,1,0
0,20.997,-34,1,-1,0.0,0.0,0.0,1.542254,1.942029,24.434132,34.026533,30.204824,22.462144,34.025586,29.496318,33.138389,Senegal,Japan,0,1
0,133.295,-45,0,-1,1.0,0.0,0.5,1.247253,1.942029,26.99935,37.1932,32.604824,25.662144,34.225586,27.496318,28.638389,Colombia,Japan,1,0


In [29]:
next_round

Unnamed: 0,index,Team,Group,Previous appearances,Previous titles,Previous finals,Previous semifinals,Current FIFA rank,Rank Change,elo_ranking,...,Argentina-Nigeria,Croatia-Iceland,Iceland-Nigeria,Croatia-Nigeria,Belgium-Panama,Belgium-Tunisia,Belgium-England,Panama-Tunisia,England-Panama,England-Tunisia
29,3,Uruguay,A,12,2,2,5,14,3,1918.32,...,1,0,1,1,3,3,0,1,1,2
28,0,Russia,A,10,0,0,1,70,-4,1766.75,...,1,0,1,1,3,3,0,1,1,2
24,5,Spain,B,14,1,1,2,10,-2,2085.24,...,1,0,1,1,3,3,0,1,1,2
25,4,Portugal,B,6,0,0,2,4,0,2017.86,...,1,0,1,1,3,3,0,1,1,2
20,8,France,C,14,1,2,5,7,0,2052.37,...,1,0,1,1,3,3,0,1,1,2
21,10,Peru,C,4,0,0,0,11,0,1981.94,...,1,0,1,1,3,3,0,1,1,2
16,12,Argentina,D,16,2,5,5,5,0,1983.36,...,1,0,1,1,3,3,0,1,1,2
18,14,Croatia,D,4,0,0,1,20,-2,1871.75,...,1,0,1,1,3,3,0,1,1,2
12,16,Brazil,E,20,5,7,11,2,0,2179.67,...,1,0,1,1,3,3,0,1,1,2
13,17,Switzerland,E,10,0,0,0,6,0,1921.65,...,1,0,1,1,3,3,0,1,1,2


In [30]:
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

next_round_wc = next_round
next_round_wc = next_round_wc.reset_index()
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']


for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round_wc) / 2)
    winners = []

    for i in range(iterations):
        home = next_round_wc.index[i*2]
        away = next_round_wc.index[i*2+1]
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, wl_model,
                                                            home_goal_model, 
                                                            away_goal_model,
                                                           tie_goal_model)
        if game_winner == 1:
            winners.append(home)
        else:
            winners.append(away)
            
        tmp_df = row
        tmp_df["home_team"] = home
        tmp_df["away_team"] = away
        tmp_df["home_score"] = home_goals
        tmp_df["away_score"] = away_goals
        
        all_games = pd.concat([all_games, tmp_df])
        
        print(f"{home_goals} - {away_goals}")
        
    next_round_wc = next_round_wc.loc[winners]
    print("\n")

___Starting of the round_of_16___
Uruguay vs. Portugal: 0 - 1
France vs. Croatia: 2 - 0
Brazil vs. Mexico: 3 - 0
Belgium vs. Poland: 1 - 0
Russia vs. Spain: 0 - 2
Peru vs. Argentina: 2 - 1
Switzerland vs. Germany: 0 - 1
England vs. Colombia: 2 - 0


___Starting of the quarterfinal___
Portugal vs. France: 0 - 1
Brazil vs. Belgium: 2 - 0
Spain vs. Peru: 1 - 0
Germany vs. England: 2 - 0


___Starting of the semifinal___
France vs. Brazil: 1 - 2
Spain vs. Germany: 2 - 1


___Starting of the final___
Brazil vs. Spain: 1 - 0




In [31]:
all_games

Unnamed: 0,elo_diff,rank_diff,home_rank_change,away_rank_change,goal_history_with_opponent,goal_history_with_opponent_home_avg,goal_history_with_opponent_away_avg,home_goal_mean,away_goal_mean,rating_diff,pace_diff,shooting_diff,passing_diff,dribbling_diff,defending_diff,physical_diff,home_team,away_team,home_score,away_score
0,200.080,-29,0,4,1.0,0.000000,1.000000,1.821656,2.718121,26.643893,27.368454,31.505135,29.535193,31.026123,26.722574,29.090677,France,Australia,1,0
0,70.432,-4,0,0,0.0,0.000000,0.000000,1.821656,1.062893,8.695652,6.166667,8.800000,12.000000,7.400000,4.000000,6.250000,France,Peru,2,1
0,184.538,-5,0,0,4.0,0.000000,2.500000,1.821656,1.734266,17.478261,14.666667,21.400000,22.000000,20.000000,14.200000,19.250000,France,Denmark,2,0
0,-129.648,25,4,0,0.0,0.000000,0.000000,2.750000,1.062893,-8.217391,-3.166667,-7.000000,-7.900000,-9.000000,-13.400000,-7.250000,Australia,Peru,0,1
0,-15.542,24,4,0,0.0,0.000000,0.000000,2.750000,1.734266,0.565217,5.333333,5.600000,2.100000,3.600000,-3.200000,5.750000,Australia,Denmark,2,0
0,114.106,-1,0,0,0.0,0.000000,0.000000,1.050955,1.734266,-11.831304,-18.773333,-14.240000,-10.240000,-16.040000,-16.240000,-13.815000,Peru,Denmark,1,0
0,258.015,-4,0,0,0.0,0.000000,0.000000,2.110000,1.635659,11.782609,11.333333,13.400000,14.200000,12.600000,14.800000,10.125000,Brazil,Switzerland,2,0
0,335.940,-21,0,2,8.0,5.000000,0.000000,2.110000,1.551402,30.068835,31.449601,32.316969,32.942887,33.432711,34.170210,30.128685,Brazil,Costa Rica,3,0
0,401.928,-32,0,1,0.0,0.000000,0.000000,2.110000,1.483333,14.869565,13.833333,15.200000,17.900000,12.400000,16.400000,10.250000,Brazil,Serbia,2,0
0,77.925,-17,0,2,0.0,0.000000,0.000000,1.618321,1.551402,27.460139,34.616268,29.916969,30.242887,33.432711,27.770210,27.628685,Switzerland,Costa Rica,1,0


In [56]:
# Simulate world cup 2014

wc2014 = dataset[(dataset['tournament'] == "FIFA World Cup") & 
                    (dataset['date'] > "2013-01-01") &
                   (dataset['date'] < "2015-01-01")]
print(len(wc2014))
wc2014[["home_team", "away_team", "date", "home_score", "away_score"]].sort_values(by='date')

58


Unnamed: 0,home_team,away_team,date,home_score,away_score
15097,Brazil,Croatia,2014-06-12,3,1
15385,Spain,Netherlands,2014-06-13,1,5
15002,Chile,Australia,2014-06-13,3,1
15056,Mexico,Cameroon,2014-06-13,1,0
15237,England,Italy,2014-06-14,1,2
15088,Uruguay,Costa Rica,2014-06-14,1,3
15187,Colombia,Greece,2014-06-14,3,0
15202,France,Honduras,2014-06-15,3,0
15119,Switzerland,Ecuador,2014-06-15,2,1
15464,Germany,Portugal,2014-06-16,4,0


array(['ABCS Tournament', 'AFC Asian Cup', 'AFC Asian Cup qualification',
       'AFC Challenge Cup', 'AFC Challenge Cup qualification',
       'AFF Championship', 'African Cup of Nations',
       'African Cup of Nations qualification',
       'African Nations Championship', 'Am<U+00ED>lcar Cabral Cup',
       'Baltic Cup', 'CECAFA Cup', 'CFU Caribbean Cup',
       'CFU Caribbean Cup qualification', 'COSAFA Cup',
       'Confederations Cup', 'Copa Am<U+00E9>rica', 'Copa Paz del Chaco',
       'Copa del Pac<U+00ED>fico', 'Cyprus International Tournament',
       'Dragon Cup', 'Dunhill Cup', 'Dynasty Cup', 'EAFF Championship',
       'FIFA World Cup', 'FIFA World Cup qualification', 'Gold Cup',
       'Gold Cup qualification', 'Gulf Cup', 'King Hassan II Tournament',
       "King's Cup", 'Kirin Cup', 'Korea Cup', 'Lunar New Year Cup',
       'Malta International Tournament', 'Merdeka Tournament',
       'Millennium Cup', 'Nations Cup', 'Nehru Cup',
       'Nile Basin Tournament', 'Nordic