In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import explained_variance_score, classification_report, r2_score, accuracy_score
from itertools import combinations
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
dataset = pd.read_csv('dataset_player_rating_augmented.csv')
dataset = dataset.drop(['Unnamed: 0'], axis=1)
dataset.head()

Unnamed: 0,year,away_team,home_team,date,home_score,away_score,tournament,city,country,neutral,...,away_dribbling,away_defending,away_physical,rating_diff,pace_diff,shooting_diff,passing_diff,dribbling_diff,defending_diff,physical_diff
0,1993,Argentina,Australia,1993-10-31,1,1,FIFA World Cup qualification,Sydney,Australia,False,...,74.558755,72.204447,73.033112,-1.92715,-4.486182,-2.184369,-1.476209,-2.929558,-2.800414,-3.477045
1,1993,Argentina,Colombia,1993-08-15,2,1,FIFA World Cup qualification,Barranquilla,Colombia,False,...,80.346823,77.506495,77.928084,-7.086274,-8.701378,-8.174451,-7.519782,-8.817914,-7.528667,-7.595503
2,1993,Argentina,Paraguay,1993-08-08,1,3,FIFA World Cup qualification,Asunci<U+00F3>n,Paraguay,False,...,81.169743,77.924507,78.34298,-18.106692,-22.826033,-21.239485,-19.018341,-22.644583,-19.526578,-19.3252
3,1993,Australia,Argentina,1993-11-17,1,0,FIFA World Cup qualification,Buenos Aires,Argentina,False,...,71.089991,69.17151,69.24892,2.79267,5.056751,3.422862,2.524203,3.867557,3.547839,4.038441
4,1993,Austria,Bulgaria,1993-10-13,4,1,FIFA World Cup qualification,Sofia,Bulgaria,False,...,65.413139,64.429416,65.272919,4.98758,5.316824,5.968414,5.626184,5.7731,5.225,4.79924


In [46]:
dataset.columns

Index(['year', 'away_team', 'home_team', 'date', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'home_elo', 'away_elo',
       'elo_diff', 'score_difference', 'home_score_difference_lag',
       'home_avg_goal_diff', 'away_score_difference_lag', 'away_avg_goal_diff',
       'avg_goals_received', 'home_win', 'wc_home_wins', 'wc_away_wins',
       'home_rank', 'home_country_abrv', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'home_confederation', 'away_rank',
       'away_country_abrv', 'away_total_points', 'away_previous_points',
       'away_rank_change', 'away_cur_year_avg', 'away_cur_year_avg_weighted',
       'away_last_year_avg', 'away_last_year_avg_weighted',
       'away_two_year_

In [47]:
def get_previous_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    
    home_team_goals = pd.concat([data[(data['home_team'] == home)]["home_score"], 
                                 data[(data['away_team'] == home)]["away_score"]])
    away_team_goals = pd.concat([data[(data['home_team'] == away)]["home_score"], 
                                 data[(data['away_team'] == away)]["away_score"]])
    
    home_goals = home_team_goals.sum()
    away_goals = away_team_goals.sum()
    
    home_avg = home_team_goals.mean()
    away_avg = away_team_goals.mean()
    if np.isnan(home_avg):
        home_avg = 0
    if np.isnan(away_avg):
        away_avg = 0
    return pd.Series({
        'goal_history_with_opponent': home_goals - away_goals, 
        'goal_history_with_opponent_home_avg': home_avg,
        'goal_history_with_opponent_away_avg': away_avg
    }) 

dataset = pd.concat([dataset, dataset.apply(get_previous_goals, axis=1)], axis=1)

In [48]:
def get_median_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    
    home_data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) &
            (dataset['date'] < date)
           ]
    
    home_goal_median = pd.concat([home_data[(home_data['home_team'] == home)]["home_score"], 
                                 home_data[(home_data['away_team'] == home)]["away_score"]]).mean()
    
    
    
    away_data = dataset[((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    away_goal_median = pd.concat([away_data[(away_data['home_team'] == away)]["home_score"], 
                                 away_data[(away_data['away_team'] == away)]["away_score"]]).mean()
    
    if np.isnan(home_goal_median):
        home_goal_median = 0
    if np.isnan(away_goal_median):
        away_goal_median = 0
    return pd.Series({
        'home_goal_mean': home_goal_median, 
        'away_goal_mean': away_goal_median
    })
dataset = pd.concat([dataset, dataset.apply(get_median_goals, axis=1)], axis=1)

In [49]:
dataset["rank_diff_year_ago"] = dataset["home_last_year_avg"] - dataset["away_last_year_avg"]
dataset["rank_diff_two_year_ago"] = dataset["home_two_year_ago_avg"] - dataset["away_two_year_ago_avg"]
dataset["rank_diff_three_year_ago"] = dataset["home_three_year_ago_avg"] - dataset["away_three_year_ago_avg"]

In [50]:
important_columns = ["elo_diff", "rank_diff", "home_score", "away_score", "home_win",
                     "date", "tournament", "home_rank_change", "away_rank_change", 
                     "goal_history_with_opponent",
                     "goal_history_with_opponent_home_avg", "goal_history_with_opponent_away_avg",
                    "home_goal_mean", "away_goal_mean",
                    'rating_diff', 'pace_diff', 'shooting_diff',
                    'passing_diff', 'dribbling_diff', 'defending_diff', 'physical_diff',
                    "rank_diff_year_ago", "rank_diff_two_year_ago", "rank_diff_three_year_ago"]

In [51]:
rankings = dataset[important_columns]

In [72]:
feature_columns = ["elo_diff", "rank_diff", "home_rank_change",
                   "away_rank_change", "goal_history_with_opponent",
                  "goal_history_with_opponent_home_avg", "goal_history_with_opponent_away_avg",
                  "home_goal_mean", "away_goal_mean",
                  'rating_diff', 'pace_diff', 'shooting_diff',
                   'passing_diff', 'dribbling_diff', 'defending_diff', 'physical_diff']

In [73]:
test_period_start = "2014-01-01"

# Master data TRAIN
train_master = rankings[(rankings['date'] < test_period_start)]

# Data for classification
only_wc = train_master[train_master['tournament'].isin(
    ["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])]

ytrain_home_win = train_master["home_win"]
ytrain_home_win_wc = only_wc["home_win"]

Xtrain = train_master[feature_columns]
Xtrain_wc = only_wc[feature_columns]

# Data for regression
ytrain_home_score = train_master["home_score"]
ytrain_away_score = train_master["away_score"]
ytrain_home_score_wc = only_wc["home_score"]
ytrain_away_score_wc = only_wc["away_score"]

Xtrain_tie = train_master[train_master['home_win'] == 0]
ytrain_tie = Xtrain_tie["away_score"]
Xtrain_tie = Xtrain_tie[feature_columns]

# Data for classification WIN/LOSE
Xtrain_wl = train_master[train_master["home_win"] != 0]
ytrain_wl_home_win = Xtrain_wl["home_win"]
Xtrain_wl = Xtrain_wl[feature_columns]

Xtrain_wl_wc = only_wc[only_wc["home_win"] != 0]
ytrain_wl_home_win_wc = Xtrain_wl_wc["home_win"]
Xtrain_wl_wc = Xtrain_wl_wc[feature_columns]


# Master data World Cup TEST
test_master = rankings[(rankings['tournament'].isin(["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])) & (rankings['date'] >= test_period_start)]

# Data for classification
ytest_home_win = test_master["home_win"]
Xtest = test_master[feature_columns]

# Data for regression
ytest_home_score = test_master["home_score"]
ytest_away_score = test_master["away_score"]

Xtest_tie = test_master[test_master['home_win'] == 0]
ytest_tie = Xtest_tie["away_score"]
Xtest_tie = Xtest_tie[feature_columns]

# Data for classification WIN/LOSE
Xtest_wl = test_master[test_master["home_win"] != 0]
ytest_wl_home_win = Xtest_wl["home_win"]
Xtest_wl = Xtest_wl[feature_columns]

# WORLD CUP CROSS VALIDATION DATA
CV_train = rankings[rankings['tournament'].isin(["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])]
CV_train_wl = CV_train[CV_train["home_win"] != 0]
CV_ytrain = CV_train["home_win"]
CV_ytrain_wl = CV_train_wl["home_win"]
CV_ytrain_home_score = CV_train["home_score"]
CV_ytrain_away_score = CV_train["away_score"]
CV_train = CV_train[feature_columns]
CV_train_wl = CV_train_wl[feature_columns]


print("TRAIN: ", len(Xtrain))
print("TRAIN WC: ", len(Xtrain_wc))
print("WC TEST: ", len(Xtest))
print("WC CV TEST: ", len(CV_train))

TRAIN:  9251
TRAIN WC:  3878
WC TEST:  764
WC CV TEST:  4642


In [104]:
wlt_model = KNeighborsClassifier(n_neighbors=55)

print("CV score", cross_val_score(wlt_model, CV_train, CV_ytrain, cv=5))
wlt_model.fit(Xtrain, ytrain_home_win) 

y_hat = wlt_model.predict(Xtest)
print(sum(y_hat == ytest_home_win) / len(Xtest))

CV score [ 0.7050592   0.71259419  0.72012917  0.68965517  0.71952535]
0.7264397905759162


In [124]:
wl_model = KNeighborsClassifier(n_neighbors=20)

print("CV score", cross_val_score(wl_model, CV_train_wl, CV_ytrain_wl, cv=10))
wl_model.fit(Xtrain_wl, ytrain_wl_home_win) 

y_hat = wl_model.predict(Xtest_wl)
print(sum(y_hat == ytest_wl_home_win) / len(Xtest_wl))

CV score [ 0.91232877  0.91780822  0.90136986  0.87362637  0.90934066  0.91735537
  0.88980716  0.90358127  0.91735537  0.91460055]
0.9071310116086235


In [107]:
import statsmodels.api as sm
lr = sm.OLS(ytrain_home_score, Xtrain)
results = lr.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             home_score   R-squared:                       0.590
Model:                            OLS   Adj. R-squared:                  0.590
Method:                 Least Squares   F-statistic:                     831.8
Date:                Thu, 14 Jun 2018   Prob (F-statistic):               0.00
Time:                        08:56:27   Log-Likelihood:                -17303.
No. Observations:                9251   AIC:                         3.464e+04
Df Residuals:                    9235   BIC:                         3.475e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
el

In [125]:
home_goal_model = KNeighborsClassifier(n_neighbors=50)
print("CV score home", cross_val_score(home_goal_model, CV_train, CV_ytrain_home_score, cv=5))
home_goal_model.fit(Xtrain, ytrain_home_score)
y_hat = home_goal_model.predict(Xtest)
print("HOME", sum(np.around(y_hat) == ytest_home_score) / len(Xtest))

CV score home [ 0.35820896  0.35085837  0.35167206  0.3517316   0.38219327]
HOME 0.3913612565445026




In [126]:
away_goal_model = KNeighborsClassifier(n_neighbors=70)
print("CV score away", cross_val_score(away_goal_model, CV_train, CV_ytrain_away_score, cv=5))
away_goal_model.fit(Xtrain, ytrain_away_score)
y_hat = away_goal_model.predict(Xtest)

print("AWAY", sum(np.around(y_hat) == ytest_away_score) / len(Xtest))

CV score away [ 0.47639485  0.49085038  0.46824543  0.49461207  0.47727273]
AWAY 0.4607329842931937




In [127]:
tie_goal_model = linear_model.LinearRegression()
tie_goal_model.fit(Xtrain_tie, ytrain_tie)
y_hat = tie_goal_model.predict(Xtest_tie)
print("TIE", sum(np.around(y_hat) == ytest_tie) / len(Xtest_tie))

TIE 0.391304347826087


In [128]:
def predict_score(data_row, sign_model, home_goal_model, away_goal_model, tie_model):
    game_winner = sign_model.predict(data_row)[0]
    game_winner_prob = sign_model.predict_proba(data_row)[0]
    
    if game_winner == 0:
        goals = int(tie_goal_model.predict(data_row)[0])
        home_goals, away_goals = goals, goals
    else:        
        home_goals = home_goal_model.predict_proba(data_row)[0]
        away_goals = away_goal_model.predict_proba(data_row)[0]
        
        N = len(home_goals)
        M = len(away_goals)
        min_shape = min(N, M)

        goal_matrix = np.outer(home_goals, away_goals)

        # make square
        goal_matrix = goal_matrix[:min_shape, :min_shape]

        if game_winner == 1:
            a = np.tril(goal_matrix, -1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            assert(home_goals > away_goals)
        elif game_winner == 0:
            a = np.diag(goal_matrix)
            home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
            assert(home_goals == away_goals)
        else:
            a = np.triu(goal_matrix, 1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            assert(home_goals < away_goals)

    return home_goals, away_goals, game_winner

In [129]:
game_predictions = []

for index in range(len(Xtest)):
    row = pd.DataFrame(Xtest.iloc[index]).T
    home_goals, away_goals, winner = predict_score(row, wlt_model, home_goal_model, away_goal_model, tie_goal_model)
    game_predictions.append([home_goals, away_goals, winner])
    
predictions = pd.DataFrame(game_predictions, columns=["home_pred", "away_pred", "winner_pred"])
correct = test_master[["home_score", "away_score"]].reset_index()
result_df = pd.concat([correct, predictions], axis=1)
result_df["home_win"] = np.sign(result_df["home_score"] - result_df["away_score"])

total_score = 0
max_score = 0
total_games_correct = 0
total_home_score_correct = 0
total_score_correct = 0

for index, game_row in result_df.iterrows():
    row_score = 0
    if game_row["home_win"] == game_row["winner_pred"]:
        row_score += 2
        total_games_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) or (game_row["away_score"] == game_row["away_pred"]):
        row_score += 1
        total_home_score_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) and (game_row["away_score"] == game_row["away_pred"]):
        row_score += 2
        total_score_correct += 1
    
    total_score += row_score
    max_score += 5
    
N = len(result_df)
print(f"How many points available: {total_score/max_score}")
print(f"{total_games_correct/N}")
print(f"{total_home_score_correct/N}")
print(f"{total_score_correct/N}")

How many points available: 0.48795811518324606
0.7264397905759162
0.693717277486911
0.14659685863874344


In [None]:
feature_columns

In [135]:
def get_prediction_row(home, away):
    row = pd.DataFrame(np.array([[np.nan] * len(feature_columns)]), columns=feature_columns)
    
    home_row = dataset[(dataset["home_team"] == home) | (dataset["away_team"] == home)].tail(1)
    if home_row["home_team"].item() == home:
        home_elo = home_row["home_elo"].item()
    else: 
        home_elo = home_row["away_elo"].item()

    away_row = dataset[(dataset["home_team"] == away) | (dataset["away_team"] == away)].tail(1)
    if away_row["home_team"].item() == away:
        away_elo = away_row["home_elo"].item()
    else: 
        away_elo = away_row["away_elo"].item()
    row["elo_diff"] = home_elo - away_elo
    
    home_rank = world_cup[world_cup["Team"] == home]["Current FIFA rank"].item()
    away_rank = world_cup[world_cup["Team"] == away]["Current FIFA rank"].item()
    
    row["rank_diff"] = home_rank - away_rank

    row["home_rank_change"] = world_cup[world_cup["Team"] == home]["Rank Change"].item()
    row["away_rank_change"] = world_cup[world_cup["Team"] == away]["Rank Change"].item()
    
    goal_diff = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away))].tail(1)
    
    if len(goal_diff) == 0:
        row["goal_history_with_opponent"] = 0
        row["goal_history_with_opponent_home_avg"] = 0
        row["goal_history_with_opponent_away_avg"] = 0
        
    elif goal_diff["home_team"].item() == home:
        row["goal_history_with_opponent"] = goal_diff["goal_history_with_opponent"].item()
        row["goal_history_with_opponent_home_avg"] = goal_diff["goal_history_with_opponent_home_avg"].item()
    else:
        row["goal_history_with_opponent"] = -goal_diff["goal_history_with_opponent"].item()
        row["goal_history_with_opponent_away_avg"] = goal_diff["goal_history_with_opponent_away_avg"].item()
        
    home_data = dataset[(dataset['home_team'] == home)].tail(1)
    if len(home_data) > 0:
        home_goal_mean = home_data["home_goal_mean"].item()
    else:
        home_goal_mean = 0
    away_data = dataset[(dataset['away_team'] == away)].tail(1)
    if len(    away_data) > 0:
        away_goal_mean = away_data["away_goal_mean"].item()
    else:
        away_goal_mean = 0
    
    row["home_goal_mean"] = home_goal_mean
    row["away_goal_mean"] = away_goal_mean
    
    row["rating_diff"] = home_row["home_rating"].item() - away_row["away_rating"].item()
    row["pace_diff"] = home_row["home_pace"].item() - away_row["away_pace"].item()
    row["shooting_diff"] = home_row["home_shooting"].item() - away_row["away_shooting"].item()
    row["passing_diff"] = home_row["home_passing"].item() - away_row["away_passing"].item()
    row["dribbling_diff"] = home_row["home_dribbling"].item() - away_row["away_dribbling"].item()
    row["defending_diff"] = home_row["home_defending"].item() - away_row["away_defending"].item()
    row["physical_diff"] = home_row["home_physical"].item() - away_row["away_physical"].item()
    
    # Make sure that there is no nan values
    row.fillna(0, inplace=True)
    
    return row

In [136]:
all_games = pd.DataFrame()
world_cup = pd.read_csv('world_cup_2018_dataset.csv')

In [137]:
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

world_cup['points'] = 0
world_cup['total_prob'] = 0
world_cup['goal_diff'] = 0
world_cup['goals_scored'] = 0

for group in set(world_cup['Group']):
    print('___Starting group {}:___'.format(group))
    for home, away in combinations(world_cup.query('Group == "{}"'.format(group)).Team, 2):
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, 
                                                            wlt_model, 
                                                            home_goal_model, 
                                                            away_goal_model, 
                                                            tie_goal_model)
        if game_winner == 1:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 3
            
        elif game_winner == 0:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 1
            world_cup.loc[world_cup["Team"] == away, 'points'] += 1
        else:
            world_cup.loc[world_cup["Team"] == away, 'points'] += 3
            
        world_cup.loc[world_cup["Team"] == home, 'goal_diff'] += (home_goals - away_goals)
        world_cup.loc[world_cup["Team"] == away, 'goal_diff'] += (away_goals - home_goals)
        world_cup.loc[world_cup["Team"] == home, 'goals_scored'] += home_goals
        world_cup.loc[world_cup["Team"] == away, 'goals_scored'] += away_goals
        if home < away:
            world_cup[f"{home}-{away}"] = home_goals - away_goals
        else:
            world_cup[f"{away}-{home}"] = away_goals - home_goals
        
        tmp_df = row
        tmp_df["home_team"] = home
        tmp_df["away_team"] = away
        tmp_df["home_score"] = home_goals
        tmp_df["away_score"] = away_goals
        
        all_games = pd.concat([all_games, tmp_df])
        print(f"{home_goals} - {away_goals}")

___Starting group B:___
Portugal vs. Spain: 0 - 0
Portugal vs. Morocco: 3 - 0
Portugal vs. Iran: 2 - 1
Spain vs. Morocco: 2 - 0
Spain vs. Iran: 1 - 0
Morocco vs. Iran: 0 - 3
___Starting group H:___
Poland vs. Senegal: 1 - 0
Poland vs. Colombia: 1 - 2
Poland vs. Japan: 1 - 2
Senegal vs. Colombia: 0 - 2
Senegal vs. Japan: 1 - 2
Colombia vs. Japan: 1 - 0
___Starting group F:___
Germany vs. Mexico: 2 - 1
Germany vs. Sweden: 2 - 0
Germany vs. Korea Republic: 1 - 0
Mexico vs. Sweden: 2 - 0
Mexico vs. Korea Republic: 2 - 1
Sweden vs. Korea Republic: 1 - 2
___Starting group A:___
Russia vs. Saudi Arabia: 1 - 1
Russia vs. Egypt: 1 - 0
Russia vs. Uruguay: 1 - 0
Saudi Arabia vs. Egypt: 2 - 1
Saudi Arabia vs. Uruguay: 0 - 0
Egypt vs. Uruguay: 0 - 1
___Starting group G:___
Belgium vs. Panama: 1 - 0
Belgium vs. Tunisia: 1 - 0
Belgium vs. England: 2 - 1
Panama vs. Tunisia: 1 - 0
Panama vs. England: 0 - 1
Tunisia vs. England: 0 - 2
___Starting group D:___
Argentina vs. Iceland: 3 - 0
Argentina vs. Cro

In [116]:
def get_better_team(first_team, second_team):
    if first_team["points"].item() > second_team["points"].item():
        return -1
    elif first_team["points"].item() < second_team["points"].item():
        return 1
    
    if first_team["goal_diff"].item() > second_team["goal_diff"].item():
        return -1
    elif first_team["goal_diff"].item() < second_team["goal_diff"].item():
        return 1
    
    if first_team["goals_scored"].item() > second_team["goals_scored"].item():
        return -1
    elif first_team["goals_scored"].item() < second_team["goals_scored"].item():
        return 1
    
    f_name = first_team["Team"]
    s_name = second_team["Team"]
    if (f_name < s_name):
        if (first_team[f"{f_name}-{s_name}"] > 0):
            return -1
        else:
            return 1
    else:
        if (first_team[f"{s_name}-{f_name}"] < 0):
            return -1
        else:
            return 1

In [132]:
import functools
# Get best two
sort_wc = world_cup.sort_values(by=['Group', 'points'], ascending=False).reset_index()
next_round_wc = sort_wc.groupby('Group').nth([0, 1])

groups = np.unique(sort_wc["Group"])

next_round = pd.DataFrame()
for group in groups:
    group_df = sort_wc[sort_wc["Group"] == group]
    team = [group_df.iloc[0], group_df.iloc[1], group_df.iloc[2], group_df.iloc[3]]
    shorted_standing = sorted(team, key=functools.cmp_to_key(get_better_team))
    first = shorted_standing[0]
    second = shorted_standing[1]
    next_round = pd.concat([next_round, first.to_frame().T, second.to_frame().T])

In [118]:
 world_cup[["Team", "Group", "points"]].sort_values(by=['Group', 'points'], ascending=False).reset_index()

Unnamed: 0,index,Team,Group,points
0,29,Senegal,H,6
1,31,Japan,H,6
2,28,Poland,H,3
3,30,Colombia,H,3
4,25,Panama,G,9
5,24,Belgium,G,4
6,26,Tunisia,G,3
7,27,England,G,1
8,23,Korea Republic,F,7
9,22,Sweden,F,5


In [139]:
all_games[(all_games["home_team"] == "Japan") | (all_games["away_team"] == "Japan")]

Unnamed: 0,elo_diff,rank_diff,home_rank_change,away_rank_change,goal_history_with_opponent,goal_history_with_opponent_home_avg,goal_history_with_opponent_away_avg,home_goal_mean,away_goal_mean,rating_diff,pace_diff,shooting_diff,passing_diff,dribbling_diff,defending_diff,physical_diff,home_team,away_team,home_score,away_score
0,4.635278,-53,2,-1,-0.0,0.0,0.0,1.642857,1.931707,24.886876,33.42834,33.937898,26.668978,32.814685,31.207834,33.465434,Poland,Japan,1,2
0,-76.797978,-34,1,-1,0.0,0.0,0.0,1.542254,1.931707,25.365137,35.095007,30.937898,23.368978,35.014685,30.607834,34.090434,Senegal,Japan,1,2
0,36.652001,-45,0,-1,1.0,0.0,0.5,1.247253,1.931707,27.930354,38.261674,33.337898,26.568978,35.214685,28.607834,29.590434,Colombia,Japan,1,0


In [69]:
next_round

Unnamed: 0,index,Team,Group,Previous appearances,Previous titles,Previous finals,Previous semifinals,Current FIFA rank,Rank Change,First match against,...,Brazil-Serbia,Costa Rica-Switzerland,Serbia-Switzerland,Costa Rica-Serbia,Australia-France,France-Peru,Denmark-France,Australia-Peru,Australia-Denmark,Denmark-Peru
28,0,Russia,A,10,0,0,1,70,-4,Saudi Arabia,...,-1,0,1,-1,0,-1,0,0,0,-2
29,1,Saudi Arabia,A,4,0,0,0,67,0,Russia,...,-1,0,1,-1,0,-1,0,0,0,-2
24,6,Morocco,B,4,0,0,0,41,1,Iran,...,-1,0,1,-1,0,-1,0,0,0,-2
25,4,Portugal,B,6,0,0,2,4,0,Spain,...,-1,0,1,-1,0,-1,0,0,0,-2
20,10,Peru,C,4,0,0,0,11,0,Denmark,...,-1,0,1,-1,0,-1,0,0,0,-2
21,9,Australia,C,4,0,0,0,36,4,France,...,-1,0,1,-1,0,-1,0,0,0,-2
16,13,Iceland,D,0,0,0,0,22,0,Argentina,...,-1,0,1,-1,0,-1,0,0,0,-2
17,15,Nigeria,D,5,0,0,0,48,-1,Croatia,...,-1,0,1,-1,0,-1,0,0,0,-2
12,19,Serbia,E,11,0,0,2,34,1,Costa Rica,...,-1,0,1,-1,0,-1,0,0,0,-2
13,16,Brazil,E,20,5,7,11,2,0,Switzerland,...,-1,0,1,-1,0,-1,0,0,0,-2


In [133]:
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

next_round_wc = next_round
next_round_wc = next_round_wc.reset_index()
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']


for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round_wc) / 2)
    winners = []

    for i in range(iterations):
        home = next_round_wc.index[i*2]
        away = next_round_wc.index[i*2+1]
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, wl_model,
                                                            home_goal_model, 
                                                            away_goal_model,
                                                           tie_goal_model)
        if game_winner == 1:
            winners.append(home)
        else:
            winners.append(away)
            
        tmp_df = row
        tmp_df["home_team"] = home
        tmp_df["away_team"] = away
        tmp_df["home_score"] = home_goals
        tmp_df["away_score"] = away_goals
        
        all_games = pd.concat([all_games, tmp_df])
        
        print(f"{home_goals} - {away_goals}")
        
    next_round_wc = next_round_wc.loc[winners]
    print("\n")

___Starting of the round_of_16___
Russia vs. Portugal: 1 - 0
Australia vs. Iceland: 1 - 0
Switzerland vs. Sweden: 0 - 1
Panama vs. Senegal: 2 - 0
Saudi Arabia vs. Morocco: 1 - 0
France vs. Nigeria: 1 - 0
Serbia vs. Korea Republic: 1 - 2
Belgium vs. Japan: 2 - 1


___Starting of the quarterfinal___
Russia vs. Australia: 1 - 0
Sweden vs. Panama: 0 - 1
Saudi Arabia vs. France: 1 - 0
Korea Republic vs. Belgium: 1 - 0


___Starting of the semifinal___
Russia vs. Panama: 1 - 0
Saudi Arabia vs. Korea Republic: 2 - 1


___Starting of the final___
Russia vs. Saudi Arabia: 2 - 1




In [138]:
all_games

Unnamed: 0,elo_diff,rank_diff,home_rank_change,away_rank_change,goal_history_with_opponent,goal_history_with_opponent_home_avg,goal_history_with_opponent_away_avg,home_goal_mean,away_goal_mean,rating_diff,pace_diff,shooting_diff,passing_diff,dribbling_diff,defending_diff,physical_diff,home_team,away_team,home_score,away_score
0,-25.00127,-6,0,-2,0.0,0.5,0.0,2.109756,2.395062,-3.347826,6.0,-4.4,-3.3,-1.2,-5.6,-2.75,Portugal,Spain,0,0
0,340.836048,-37,0,1,0.0,0.0,0.0,2.109756,1.48951,29.055901,38.47619,32.742857,31.742857,36.742857,30.142857,30.267857,Portugal,Morocco,3,0
0,112.923005,-33,0,-1,0.0,0.0,0.0,2.109756,2.153005,25.913043,35.333333,29.6,28.6,33.6,27.0,27.125,Portugal,Iran,2,1
0,365.837319,-31,-2,1,0.0,0.0,0.0,2.352201,1.48951,-0.92381,-0.92381,-0.92381,-0.92381,-0.92381,-0.92381,-0.92381,Spain,Morocco,2,0
0,137.924276,-27,-2,-1,0.0,0.0,0.0,2.352201,2.153005,-4.066667,-4.066667,-4.066667,-4.066667,-4.066667,-4.066667,-4.066667,Spain,Iran,1,0
0,-227.913043,4,1,-1,0.0,0.0,0.0,1.489362,2.153005,16.391304,22.666667,17.4,18.7,24.6,17.6,18.625,Morocco,Iran,0,3
0,81.433256,-19,2,1,0.0,0.0,0.0,1.642857,1.537415,21.982609,31.366667,31.4,23.8,30.2,28.2,30.575,Poland,Senegal,1,0
0,-32.016724,-8,2,0,0.0,0.0,0.0,1.642857,1.245902,4.130435,3.5,8.4,8.2,6.0,10.8,10.375,Poland,Colombia,1,2
0,4.635278,-53,2,-1,-0.0,0.0,0.0,1.642857,1.931707,24.886876,33.42834,33.937898,26.668978,32.814685,31.207834,33.465434,Poland,Japan,1,2
0,-113.449979,11,1,0,0.0,0.0,0.0,1.542254,1.245902,4.608696,5.166667,5.4,4.9,8.2,10.2,11.0,Senegal,Colombia,0,2
