In [56]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import datasets, linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import explained_variance_score, classification_report, r2_score, accuracy_score
from itertools import combinations

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
rankings = pd.read_csv('dataset.csv')
rankings = rankings.drop(['Unnamed: 0'], axis=1)
rankings.head()
print(rankings.columns)

Index(['date', 'away_team', 'home_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'home_elo', 'away_elo',
       'elo_diff', 'score_difference', 'home_score_difference_lag',
       'home_avg_goal_diff', 'away_score_difference_lag', 'away_avg_goal_diff',
       'avg_goals_received', 'home_win', 'wc_home_wins', 'wc_away_wins',
       'home_rank', 'home_country_abrv', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'home_confederation', 'away_rank',
       'away_country_abrv', 'away_total_points', 'away_previous_points',
       'away_rank_change', 'away_cur_year_avg', 'away_cur_year_avg_weighted',
       'away_last_year_avg', 'away_last_year_avg_weighted',
       'away_two_year_ago_avg'

In [3]:
feature_columns = ["home_score_difference_lag", "away_score_difference_lag",
                    "home_avg_goal_diff", "away_avg_goal_diff", "rank_diff", "elo_diff", "avg_goals_received"]

test_period_start = "2010-01-01"

In [43]:
# Data for classification
Xtrain = rankings[(rankings['date'] < test_period_start)]
Xtrain_wl = Xtrain[Xtrain["home_win"] != 0]
ytrain = Xtrain["home_win"]
ytrain_wl = Xtrain_wl["home_win"]
Xtrain = Xtrain[feature_columns]
Xtrain_wl = Xtrain_wl[feature_columns]

Xtest = rankings[(rankings['tournament'] == "FIFA World Cup") & (rankings['date'] >= test_period_start)]
Xtest_wl = Xtest[Xtest["home_win"] != 0]
ytest = Xtest["home_win"]
ytest_wl = Xtest_wl["home_win"]
Xtest = Xtest[feature_columns]
Xtest_wl = Xtest_wl[feature_columns]

CV_train = rankings[(rankings['tournament'] == "FIFA World Cup")]
CV_train_wl = CV_train[CV_train["home_win"] != 0]
CV_ytrain = CV_train["home_win"]
CV_ytrain_wl = CV_train_wl["home_win"]
CV_train = CV_train[feature_columns]
CV_train_wl = CV_train_wl[feature_columns]

print(len(Xtrain))
print(len(Xtest))
print(len(CV_train))

7289
119
345


In [5]:
rfc = RandomForestClassifier(max_depth=9, random_state=0)
print("CV score", cross_val_score(rfc, CV_train, CV_ytest, cv=10).mean())
rfc.fit(Xtrain, ytrain)
print(rfc.feature_importances_)

y_rfc = rfc.predict(Xtest)
print(sum(y_rfc == ytest) / len(Xtest))

CV score 0.629778456837
[ 0.03637527  0.03108259  0.07278769  0.07393808  0.187375    0.50391618
  0.09452519]
0.6974789915966386


In [45]:
knn = KNeighborsClassifier(n_neighbors=100)

print("CV score", cross_val_score(knn, CV_train, CV_ytrain, cv=10).mean())
knn.fit(Xtrain, ytrain) 

y_knn = knn.predict(Xtest)
y_knn_prob = knn.predict_proba(Xtest)
print(sum(y_knn == ytest) / len(Xtest))

CV score 0.68798955946
0.7058823529411765


In [51]:
knn_wl = KNeighborsClassifier(n_neighbors=40)

print("CV score", cross_val_score(knn, CV_train_wl, CV_ytrain_wl, cv=10).mean())
knn_wl.fit(Xtrain_wl, ytrain_wl) 

y_knn_wl = knn_wl.predict(Xtest_wl)
y_knn_prob_wl = knn_wl.predict_proba(Xtest_wl)
print(sum(y_knn_wl == ytest_wl) / len(Xtest_wl))

CV score 0.87792022792
0.9340659340659341


In [7]:
# Regression data
Xtrain = rankings[(rankings['date'] < test_period_start)]
# FIT only for country
ytrain = Xtrain["home_score"]
ytrain_away = Xtrain["away_score"]
Xtrain = Xtrain[feature_columns]


# Test with last world cup
Xtest = rankings[(rankings['tournament'] == "FIFA World Cup") & (rankings['date'] >= "2010-01-01")]
ytest_home = Xtest["home_score"]
ytest_away = Xtest["away_score"]
Xtest = Xtest[feature_columns]

print(len(Xtrain))
print(len(Xtest))

7289
119


In [8]:
regr_home = linear_model.LinearRegression()
regr_home.fit(Xtrain, ytrain)
lr_home = regr_home.predict(Xtest)


regr_away = linear_model.LinearRegression()
regr_away.fit(Xtrain, ytrain_away)
lr_away = regr_away.predict(Xtest)


print("HOME", sum(np.around(lr_home) == ytest_home) / len(Xtest))
print("AWAY", sum(np.around(lr_away) == ytest_away) / len(Xtest))

HOME 0.29411764705882354
AWAY 0.5126050420168067


In [9]:
feature_columns_statsmodels = feature_columns + ["home_score", "away_score"]

# Regression data
Xtrain = rankings[(rankings['date'] < test_period_start)]
# FIT only for country
Xtrain = Xtrain[feature_columns_statsmodels]
ytrain = Xtrain["home_score"]

# Test with last world cup
Xtest = rankings[(rankings['tournament'] == "FIFA World Cup") & (rankings['date'] >= "2010-01-01")]
ytest_home = Xtest["home_score"]
ytest_away = Xtest["away_score"]
Xtest = Xtest[feature_columns_statsmodels]

print(len(Xtrain))
print(len(Xtest))

7289
119


In [10]:
# importing the tools required for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import poisson


poisson_model_home = smf.glm(formula="home_score ~ home_score_difference_lag + away_score_difference_lag + home_avg_goal_diff + away_avg_goal_diff + rank_diff + elo_diff + avg_goals_received",
                        data=Xtrain,
                        family=sm.families.Poisson()).fit()

y_poisson_home = poisson_model_home.predict(Xtest)
print("Accuracy home: ", sum(np.around(y_poisson_home) == ytest_home) / len(Xtest))


poisson_model_away = smf.glm(formula="away_score ~ home_score_difference_lag + away_score_difference_lag + home_avg_goal_diff + away_avg_goal_diff + rank_diff + elo_diff + avg_goals_received",
                        data=Xtrain,
                        family=sm.families.Poisson()).fit()

y_poisson_away = poisson_model_away.predict(Xtest)
print("Accuracy away: ", sum(np.around(y_poisson_away) == ytest_away) / len(Xtest))

Accuracy home:  0.23529411764705882
Accuracy away:  0.47058823529411764


In [11]:
game_winner = knn.predict(Xtest.drop(['home_score', 'away_score'], axis=1))
game_predictions = []

i = 0
for index, row in Xtest.iterrows():
    home_goals = poisson_model_home.predict(row)
    away_goals = poisson_model_away.predict(row)
    team_pred = [[poisson.pmf(idx, team_avg) for idx in range(0, 7)] for team_avg in [np.around(home_goals), np.round(away_goals)]]
    goal_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
    if game_winner[i] == 1:
        a = np.tril(goal_matrix, -1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
    elif game_winner[i] == 0:
        a = np.diag(goal_matrix)
        home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
    else:
        a = np.triu(goal_matrix, 1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)

    game_predictions.append([home_goals, away_goals])
    i += 1


In [12]:
predictions = pd.DataFrame(game_predictions, columns=["home_pred", "away_pred"])
correct = Xtest[["home_score", "away_score"]].reset_index()
result_df = pd.concat([correct, predictions], axis=1)
result_df["home_win"] = np.sign(result_df["home_score"] - result_df["away_score"])
result_df["home_win_pred"] = np.sign(result_df["home_pred"] - result_df["away_pred"])

total_score = 0
for index, game_row in result_df.iterrows():
    row_score = 0
    if game_row["home_win"] == game_row["home_win_pred"]:
        row_score += 2
    
    if (game_row["home_score"] == game_row["home_pred"]) or (game_row["away_score"] == game_row["away_pred"]):
        row_score += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) and (game_row["away_score"] == game_row["away_pred"]):
        row_score += 2
    
    total_score += row_score
    
print(total_score)

298


In [30]:
world_cup = pd.read_csv('world_cup_2018_dataset.csv')

feature_columns = ["home_score_difference_lag", "away_score_difference_lag",
                    "home_avg_goal_diff", "away_avg_goal_diff", "rank_diff", "elo_diff", "avg_goals_received"]

In [33]:
def get_prediction_row(home, away):
    row = pd.DataFrame(np.array([[np.nan] * len(feature_columns)]), columns=feature_columns)
    
    home_row = rankings[(rankings["home_team"] == home) | (rankings["away_team"] == home)].tail(1)
    if home_row["home_team"].item() == home:
        home_rank = home_row["home_rank"].item()
        home_elo = home_row["home_elo"].item()

    else: 
        home_rank = home_row["away_rank"].item()
        home_elo = home_row["away_elo"].item()

    away_row = rankings[(rankings["home_team"] == away) | (rankings["away_team"] == away)].tail(1)
    if away_row["home_team"].item() == away:
        away_rank = away_row["home_rank"].item()
        away_elo = away_row["home_elo"].item()
    else: 
        away_rank = away_row["away_rank"].item()
        away_elo = away_row["away_elo"].item()

    row["rank_diff"] = home_rank - away_rank
    row["elo_diff"] = home_elo - away_elo

    home_row = rankings[(rankings["home_team"] == home)].tail(1)
    row["home_score_difference_lag"] = home_row["home_score_difference_lag"].item()
    row["home_avg_goal_diff"] = home_row["home_avg_goal_diff"].item()
    row["avg_goals_received"] = home_row["avg_goals_received"].item()

    away_row = rankings[(rankings["away_team"] == away)].tail(1)
    row["away_score_difference_lag"] = away_row["away_score_difference_lag"].item()
    row["away_avg_goal_diff"] = away_row["away_avg_goal_diff"].item()
    return row

In [61]:
all_games = pd.DataFrame()

In [63]:
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

world_cup['points'] = 0
world_cup['total_prob'] = 0

for group in set(world_cup['Group']):
    print('___Starting group {}:___'.format(group))
    for home, away in combinations(world_cup.query('Group == "{}"'.format(group)).Team, 2):
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        # Game Winner
        game_winner = knn.predict(row)[0]
        game_winner_prob = knn.predict_proba(row)
        
        # Goals
        home_goals = poisson_model_home.predict(row)
        away_goals = poisson_model_away.predict(row)
        
        team_pred = [[poisson.pmf(idx, team_avg) for idx in range(0, 7)] for team_avg in [np.around(home_goals), np.round(away_goals)]]
        goal_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
        if game_winner == 1:
            a = np.tril(goal_matrix, -1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            world_cup.loc[world_cup["Team"] == home, 'points'] += 3
        elif game_winner == 0:
            a = np.diag(goal_matrix)
            home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
            
            world_cup.loc[world_cup["Team"] == home, 'points'] += 1
            world_cup.loc[world_cup["Team"] == away, 'points'] += 1
        else:
            a = np.triu(goal_matrix, 1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            world_cup.loc[world_cup["Team"] == away, 'points'] += 3
        
        all_games = pd.concat([all_games, row])
        print(f"Game {home} - {away} end result {home_goals} - {away_goals}")

___Starting group G:___
Belgium vs. Panama: Game Belgium - Panama end result 1 - 0
Belgium vs. Tunisia: Game Belgium - Tunisia end result 2 - 0
Belgium vs. England: Game Belgium - England end result 1 - 1
Panama vs. Tunisia: Game Panama - Tunisia end result 1 - 0
Panama vs. England: Game Panama - England end result 0 - 1
Tunisia vs. England: Game Tunisia - England end result 0 - 1
___Starting group A:___
Russia vs. Saudi Arabia: Game Russia - Saudi Arabia end result 1 - 1
Russia vs. Egypt: Game Russia - Egypt end result 1 - 0
Russia vs. Uruguay: Game Russia - Uruguay end result 0 - 1
Saudi Arabia vs. Egypt: Game Saudi Arabia - Egypt end result 1 - 0
Saudi Arabia vs. Uruguay: Game Saudi Arabia - Uruguay end result 0 - 1
Egypt vs. Uruguay: Game Egypt - Uruguay end result 0 - 1
___Starting group E:___
Brazil vs. Switzerland: Game Brazil - Switzerland end result 2 - 0
Brazil vs. Costa Rica: Game Brazil - Costa Rica end result 2 - 0
Brazil vs. Serbia: Game Brazil - Serbia end result 2 - 0
S

Unnamed: 0,home_score_difference_lag,away_score_difference_lag,home_avg_goal_diff,away_avg_goal_diff,rank_diff,elo_diff,avg_goals_received
0,9,1,3.0,1.1,-55,181.339584,0.6
0,9,0,3.0,-1.0,-23,257.987746,0.6
0,9,-4,3.0,-1.8,-10,11.863072,0.6
0,3,0,1.2,-1.0,32,76.648162,0.6
0,3,-4,1.2,-1.8,45,-169.476512,0.6
0,1,-4,0.4,-1.8,13,-246.124674,1.0
0,-1,1,0.3,-1.6,4,-19.650917,1.0
0,-1,1,0.3,-0.6,33,-15.259639,1.0
0,-1,-1,0.3,0.7,47,-157.447045,1.0
0,1,1,1.4,-0.6,29,4.391278,0.6


In [55]:
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

sort_wc = world_cup.sort_values(by=['Group', 'points'], ascending=False).reset_index()
next_round_wc = sort_wc.groupby('Group').nth([0, 1])


next_round_wc = next_round_wc.reset_index()
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']


for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round_wc) / 2)
    winners = []

    for i in range(iterations):
        home = next_round_wc.index[i*2]
        away = next_round_wc.index[i*2+1]
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        # Game Winner
        game_winner = knn_wl.predict(row)[0]
        game_winner_prob = knn_wl.predict_proba(row)
        
        # Goals
        home_goals = poisson_model_home.predict(row)
        away_goals = poisson_model_away.predict(row)
        
        team_pred = [[poisson.pmf(idx, team_avg) for idx in range(0, 7)] for team_avg in [np.around(home_goals), np.round(away_goals)]]
        goal_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
        if game_winner == 1:
            a = np.tril(goal_matrix, -1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            winners.append(home)
        else:
            a = np.triu(goal_matrix, 1)
            home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
            winners.append(away)
        
        print(f"Game {home} - {away} end result {home_goals} - {away_goals}")
        
    next_round_wc = next_round_wc.loc[winners]
    print("\n")

___Starting of the round_of_16___
Uruguay vs. Portugal: Game Uruguay - Portugal end result 0 - 1
France vs. Iceland: Game France - Iceland end result 1 - 0
Brazil vs. Mexico: Game Brazil - Mexico end result 2 - 0
Belgium vs. Poland: Game Belgium - Poland end result 1 - 0
Russia vs. Spain: Game Russia - Spain end result 0 - 1
Australia vs. Argentina: Game Australia - Argentina end result 0 - 1
Switzerland vs. Germany: Game Switzerland - Germany end result 0 - 1
England vs. Colombia: Game England - Colombia end result 1 - 0


___Starting of the quarterfinal___
Portugal vs. France: Game Portugal - France end result 1 - 0
Brazil vs. Belgium: Game Brazil - Belgium end result 1 - 0
Spain vs. Argentina: Game Spain - Argentina end result 1 - 0
Germany vs. England: Game Germany - England end result 1 - 0


___Starting of the semifinal___
Portugal vs. Brazil: Game Portugal - Brazil end result 0 - 1
Spain vs. Germany: Game Spain - Germany end result 0 - 1


___Starting of the final___
Brazil vs. 