In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import datasets, linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import explained_variance_score, classification_report, r2_score, accuracy_score
from itertools import combinations
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [162]:
dataset = pd.read_csv('dataset.csv')
dataset = dataset.drop(['Unnamed: 0'], axis=1)
dataset.head()
print(dataset.columns)

Index(['date', 'away_team', 'home_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'home_elo', 'away_elo',
       'elo_diff', 'score_difference', 'home_score_difference_lag',
       'home_avg_goal_diff', 'away_score_difference_lag', 'away_avg_goal_diff',
       'avg_goals_received', 'home_win', 'wc_home_wins', 'wc_away_wins',
       'home_rank', 'home_country_abrv', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'home_confederation', 'away_rank',
       'away_country_abrv', 'away_total_points', 'away_previous_points',
       'away_rank_change', 'away_cur_year_avg', 'away_cur_year_avg_weighted',
       'away_last_year_avg', 'away_last_year_avg_weighted',
       'away_two_year_ago_avg'

In [163]:
def get_previous_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    home_goals = data[(data['home_team'] == home)]["home_score"].sum() +  data[(data['away_team'] == home)]["away_score"].sum()
    away_goals = data[(data['home_team'] == away)]["away_score"].sum() +  data[(data['away_team'] == away)]["away_score"].sum()
    return home_goals - away_goals
    
dataset["goal_history_with_opponent"] = dataset.apply(get_previous_goals, axis=1)

In [164]:
def get_boundries(elo_diff, k=10):
    lower_bound = elo_diff * 0.75
    upper_bound = elo_diff * 1.25
    
    if elo_diff > 0:
        if abs(elo_diff) < 15:
            lower_bound = min(lower_bound - k, 0)
            upper_bound = upper_bound + k
    else:
        if abs(elo_diff) < 15:
            upper_bound = min(lower_bound + k, 0)
            lower_bound = upper_bound - k
    return lower_bound, upper_bound

In [165]:
def get_avg_goal_for_rank(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    elo_diff = row["elo_diff"]
    
    lower_bound, upper_bound = get_boundries(elo_diff)
    
    data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
                       (dataset['date'] < date) &
                       (dataset['elo_diff'].between(lower_bound, upper_bound))
                      ]
    home_avg = pd.concat([data[(data['home_team'] == home)]["home_score"], data[(data['away_team'] == home)]["away_score"]]).mean()
    opponent_avg = pd.concat([data[(data['home_team'] != home)]["away_score"], data[(data['away_team'] != home)]["away_score"]]).mean()
    
    if np.isnan(home_avg):
        home_avg = 0
    if np.isnan(opponent_avg):
        opponent_avg = 0
    return pd.Series({'home_goal_avg_rank': np.round(home_avg, 1), 'opponent_goal_avg_rank': np.round(opponent_avg,1)}) 
#dataset = pd.concat([dataset, dataset.apply(get_avg_goal_for_rank, axis=1)], axis=1)

In [166]:
important_columns = ["elo_diff", "rank_diff", "home_score", "away_score", "home_win",
                     "date", "tournament", "home_rank_change", "away_rank_change", 
                     "goal_history_with_opponent", "home_goal_avg_rank", "opponent_goal_avg_rank", "wc_home_wins", "wc_away_wins"]

In [167]:
rankings = dataset[important_columns]
rankings.head()

Unnamed: 0,elo_diff,rank_diff,home_score,away_score,home_win,date,tournament,home_rank_change,away_rank_change,goal_history_with_opponent,home_goal_avg_rank,opponent_goal_avg_rank,wc_home_wins,wc_away_wins
0,-354.537246,62,1,3,-1,1993-08-08,FIFA World Cup qualification,1,5,0,0.0,0.0,1,-1
1,-272.114466,51,0,1,-1,1993-08-08,FIFA World Cup qualification,8,16,0,0.0,0.0,0,-1
2,-25.898346,37,3,1,1,1993-08-08,FIFA World Cup qualification,28,-6,0,0.0,0.0,0,-3
3,394.718807,-59,5,0,1,1993-08-08,FIFA World Cup qualification,30,31,0,0.0,0.0,0,0
4,-111.634052,14,2,1,1,1993-08-15,FIFA World Cup qualification,16,5,0,0.0,0.0,0,-1


In [197]:
feature_columns = ["elo_diff", "rank_diff", "home_rank_change",
                   "away_rank_change", "goal_history_with_opponent"]

In [255]:
test_period_start = "2010-01-01"

# Data for classification
Xtrain = rankings[(rankings['date'] < test_period_start)]
Xtrain_wl = Xtrain[Xtrain["home_win"] != 0]
ytrain = Xtrain["home_win"]
ytrain_wl = Xtrain_wl["home_win"]
Xtrain = Xtrain[feature_columns]
Xtrain_wl = Xtrain_wl[feature_columns]

Xtest = rankings[(rankings['tournament'] == "FIFA World Cup") & (rankings['date'] >= test_period_start)]
Xtest_wl = Xtest[Xtest["home_win"] != 0]
ytest = Xtest["home_win"]
ytest_wl = Xtest_wl["home_win"]
Xtest = Xtest[feature_columns]
Xtest_wl = Xtest_wl[feature_columns]

CV_train = rankings[(rankings['tournament'] == "FIFA World Cup")]
CV_train_wl = CV_train[CV_train["home_win"] != 0]
CV_ytrain = CV_train["home_win"]
CV_ytrain_wl = CV_train_wl["home_win"]
CV_train = CV_train[feature_columns]
CV_train_wl = CV_train_wl[feature_columns]

print(len(Xtrain))
print(len(Xtest))
print(len(CV_train))

7289
119
345


In [256]:
knn = KNeighborsClassifier(n_neighbors=20)

print("CV score", cross_val_score(knn, CV_train, CV_ytrain, cv=10).mean())
knn.fit(Xtrain, ytrain) 

y_knn = knn.predict(Xtest)
y_knn_prob = knn.predict_proba(Xtest)
print(sum(y_knn == ytest) / len(Xtest))

CV score 0.719857397504
0.6890756302521008


In [257]:
knn_wl = KNeighborsClassifier(n_neighbors=20)

print("CV score", cross_val_score(knn, CV_train_wl, CV_ytrain_wl, cv=10).mean())
knn_wl.fit(Xtrain_wl, ytrain_wl) 

y_knn_wl = knn_wl.predict(Xtest_wl)
y_knn_prob_wl = knn_wl.predict_proba(Xtest_wl)
print(sum(y_knn_wl == ytest_wl) / len(Xtest_wl))

CV score 0.881458689459
0.9010989010989011


In [258]:
# Regression data
Xtrain = rankings[(rankings['date'] < test_period_start)]
# FIT only for country
ytrain = Xtrain["home_score"]
ytrain_away = Xtrain["away_score"]
Xtrain = Xtrain[feature_columns]


# Test with last world cup
Xtest = rankings[(rankings['tournament'] == "FIFA World Cup") & (rankings['date'] >= test_period_start)]
ytest_home = Xtest["home_score"]
ytest_away = Xtest["away_score"]
Xtest = Xtest[feature_columns]

print(len(Xtrain))
print(len(Xtest))

7289
119


In [268]:
rfr_home = RandomForestClassifier(n_estimators=)
rfr_home.fit(Xtrain, ytrain)
y_home = rfr_home.predict(Xtest)

rfr_away = KNeighborsClassifier(n_neighbors=50)
rfr_away.fit(Xtrain, ytrain_away)
y_away = rfr_away.predict(Xtest)


print("HOME", sum(np.around(y_home) == ytest_home) / len(Xtest))
print("AWAY", sum(np.around(y_away) == ytest_away) / len(Xtest))

HOME 0.35294117647058826
AWAY 0.36134453781512604


In [269]:
# Test with last world cup
Xtest_master = rankings[(rankings['tournament'] == "FIFA World Cup") & (rankings['date'] >= test_period_start)]
ytest_home = Xtest_master["home_score"]
ytest_away = Xtest_master["away_score"]
Xtest = Xtest[feature_columns]

print(len(Xtrain))
print(len(Xtest))

7289
119


In [248]:
def predict_score(data_row, sign_model, home_goal_model, away_goal_model):
    game_winner = sign_model.predict(data_row)[0]
    game_winner_prob = sign_model.predict_proba(data_row)[0]
    home_goals = home_goal_model.predict_proba(data_row)[0]
    away_goals = away_goal_model.predict_proba(data_row)[0]
    N = len(home_goals)
    M = len(away_goals)
    min_shape = min(N, M)

    goal_matrix = np.outer(home_goals, away_goals)
    
    # make square
    goal_matrix = goal_matrix[:min_shape, :min_shape]
    
    if game_winner == 1:
        a = np.tril(goal_matrix, -1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
        assert(home_goals > away_goals)
    elif game_winner == 0:
        a = np.diag(goal_matrix)
        home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
        assert(home_goals == away_goals)
    else:
        a = np.triu(goal_matrix, 1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
        assert(home_goals < away_goals)

    return home_goals, away_goals, game_winner

In [270]:
game_predictions = []

for index in range(len(Xtest)):
    row = pd.DataFrame(Xtest.iloc[index]).T
    home_goals, away_goals, winner = predict_score(row, knn, rfr_home, rfr_away)
    game_predictions.append([home_goals, away_goals, winner])
    
predictions = pd.DataFrame(game_predictions, columns=["home_pred", "away_pred", "winner_pred"])
correct = Xtest_master[["home_score", "away_score"]].reset_index()
result_df = pd.concat([correct, predictions], axis=1)
result_df["home_win"] = np.sign(result_df["home_score"] - result_df["away_score"])

total_score = 0
max_score = 0
total_games_correct = 0
total_home_score_correct = 0
total_score_correct = 0

for index, game_row in result_df.iterrows():
    row_score = 0
    if game_row["home_win"] == game_row["winner_pred"]:
        row_score += 2
        total_games_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) or (game_row["away_score"] == game_row["away_pred"]):
        row_score += 1
        total_home_score_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) and (game_row["away_score"] == game_row["away_pred"]):
        row_score += 2
        total_score_correct += 1
    
    total_score += row_score
    max_score += 5
    
N = len(result_df)
print(f"{total_score} / {max_score}")
print(f"{total_games_correct} / {N}")
print(f"{total_home_score_correct} / {N}")
print(f"{total_score_correct} / {N}")

269 / 595
82 / 119
71 / 119
17 / 119


In [251]:
lost_games = result_df[result_df["home_win"] == 1]

(lost_games["away_pred"] - lost_games["away_score"]).mean()

-0.2608695652173913

In [149]:
world_cup = pd.read_csv('world_cup_2018_dataset.csv')
world_cup.head()

Unnamed: 0,Team,Group,Previous appearances,Previous titles,Previous finals,Previous semifinals,Current FIFA rank,Rank Change,First match against,Match index,history with first opponent W-L,history with first opponent goals,Second match against,Match index.1,history with second opponent W-L,history with second opponent goals,Third match against,Match index.2,history with third opponent W-L,history with third opponent goals
0,Russia,A,10,0,0,1,70,-4,Saudi Arabia,1,-1.0,-2.0,Egypt,17,,,Uruguay,33,0.0,0.0
1,Saudi Arabia,A,4,0,0,0,67,0,Russia,1,1.0,2.0,Uruguay,18,1.0,1.0,Egypt,34,-5.0,-5.0
2,Egypt,A,2,0,0,0,45,1,Uruguay,2,-1.0,-2.0,Russia,17,,,Saudi Arabia,34,5.0,5.0
3,Uruguay,A,12,2,2,5,14,3,Egypt,2,1.0,2.0,Saudi Arabia,18,-1.0,-1.0,Russia,33,0.0,0.0
4,Portugal,B,6,0,0,2,4,0,Spain,3,-12.0,-31.0,Morocco,19,-1.0,-2.0,Iran,35,2.0,5.0


In [150]:
def get_prediction_row(home, away):
    row = pd.DataFrame(np.array([[np.nan] * len(feature_columns)]), columns=feature_columns)
    
    home_row = dataset[(dataset["home_team"] == home) | (dataset["away_team"] == home)].tail(1)
    if home_row["home_team"].item() == home:
        home_elo = home_row["home_elo"].item()
        home_rank = home_row["home_rank"].item()
    else: 
        home_elo = home_row["away_elo"].item()
        home_rank = home_row["away_rank"].item()

    away_row = dataset[(dataset["home_team"] == away) | (dataset["away_team"] == away)].tail(1)
    if away_row["home_team"].item() == away:
        away_elo = away_row["home_elo"].item()
        away_rank = away_row["home_rank"].item()
    else: 
        away_elo = away_row["away_elo"].item()
        away_rank = away_row["away_rank"].item()
    row["elo_diff"] = home_elo - away_elo
    
    home_rank = world_cup[world_cup["Team"] == home]["Current FIFA rank"].item()
    away_rank = world_cup[world_cup["Team"] == away]["Current FIFA rank"].item()
    
    row["rank_diff"] = home_rank - away_rank

    row["home_rank_change"] = world_cup[world_cup["Team"] == home]["Rank Change"].item()
    row["away_rank_change"] = world_cup[world_cup["Team"] == away]["Rank Change"].item()
    
    goal_diff = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away))].tail(1)
    
    if len(goal_diff) == 0:
        row["goal_history_with_opponent"] = 0
    elif goal_diff["home_team"].item() == home:
        row["goal_history_with_opponent"] = goal_diff["goal_history_with_opponent"].item()
    else:
        row["goal_history_with_opponent"] = -goal_diff["goal_history_with_opponent"].item()
    
    lb, ub = get_boundries(row["elo_diff"].item())
    avg_goal_home = dataset[((dataset['home_team'] == home) & 
                            dataset['elo_diff'].between(lb, ub))].tail(1)
    
    if len(avg_goal_home) > 0:
        row["home_goal_avg_rank"] = avg_goal_home["home_goal_avg_rank"].item()
        row["opponent_goal_avg_rank"] = avg_goal_home["opponent_goal_avg_rank"].item()
    else:
        row["home_goal_avg_rank"] = 0
        row["opponent_goal_avg_rank"] = 0
    return row

In [86]:
all_games = pd.DataFrame()

In [160]:
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

world_cup['points'] = 0
world_cup['total_prob'] = 0

for group in set(world_cup['Group']):
    print('___Starting group {}:___'.format(group))
    for home, away in combinations(world_cup.query('Group == "{}"'.format(group)).Team, 2):
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, knn, rfr_home, rfr_away)
        if game_winner == 1:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 3
        elif game_winner == 0:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 1
            world_cup.loc[world_cup["Team"] == away, 'points'] += 1
        else:
            world_cup.loc[world_cup["Team"] == away, 'points'] += 3
        
        tmp_df = pd.DataFrame([{
            "home_team": home, 
            "away_team": away,
            "home_score": home_goals,
            "away_score": away_goals,
            "rank_diff": row["rank_diff"].item(),
            "elo_diff": row["elo_diff"].item(),
            "goal_history_with_opponent": row["goal_history_with_opponent"].item(),
            "home_goal_avg_rank": row["home_goal_avg_rank"].item(),
            "opponent_goal_avg_rank": row["opponent_goal_avg_rank"].item()
            
        }])
        all_games = pd.concat([all_games, tmp_df])
        print(f"Game {home} - {away} end result {home_goals} - {away_goals}")

___Starting group E:___
Brazil vs. Switzerland: Game Brazil - Switzerland end result 2 - 0
Brazil vs. Costa Rica: Game Brazil - Costa Rica end result 2 - 0
Brazil vs. Serbia: Game Brazil - Serbia end result 2 - 0
Switzerland vs. Costa Rica: Game Switzerland - Costa Rica end result 2 - 2
Switzerland vs. Serbia: Game Switzerland - Serbia end result 2 - 0
Costa Rica vs. Serbia: Game Costa Rica - Serbia end result 1 - 0
___Starting group A:___
Russia vs. Saudi Arabia: Game Russia - Saudi Arabia end result 2 - 2
Russia vs. Egypt: Game Russia - Egypt end result 1 - 0
Russia vs. Uruguay: Game Russia - Uruguay end result 0 - 1
Saudi Arabia vs. Egypt: Game Saudi Arabia - Egypt end result 2 - 0
Saudi Arabia vs. Uruguay: Game Saudi Arabia - Uruguay end result 1 - 1
Egypt vs. Uruguay: Game Egypt - Uruguay end result 0 - 1
___Starting group D:___
Argentina vs. Iceland: Game Argentina - Iceland end result 2 - 0
Argentina vs. Croatia: Game Argentina - Croatia end result 2 - 0
Argentina vs. Nigeria: G

In [161]:
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

sort_wc = world_cup.sort_values(by=['Group', 'points'], ascending=False).reset_index()
next_round_wc = sort_wc.groupby('Group').nth([0, 1])


next_round_wc = next_round_wc.reset_index()
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']


for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round_wc) / 2)
    winners = []

    for i in range(iterations):
        home = next_round_wc.index[i*2]
        away = next_round_wc.index[i*2+1]
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, knn_wl, rfr_home, rfr_away)
        if game_winner == 1:
            winners.append(home)
        else:
            winners.append(away)
            
        tmp_df = pd.DataFrame([{
            "home_team": home, 
            "away_team": away,
            "home_score": home_goals,
            "away_score": away_goals,
            "rank_diff": row["rank_diff"].item(),
            "elo_diff": row["elo_diff"].item(),
            "goal_history_with_opponent": row["goal_history_with_opponent"].item(),
            "home_goal_avg_rank": row["home_goal_avg_rank"].item(),
            "opponent_goal_avg_rank": row["opponent_goal_avg_rank"].item()
            
        }])
        all_games = pd.concat([all_games, tmp_df])
        
        print(f"{home_goals} - {away_goals}")
        
    next_round_wc = next_round_wc.loc[winners]
    print("\n")

___Starting of the round_of_16___
Uruguay vs. Portugal: 0 - 2
France vs. Croatia: 2 - 0
Brazil vs. Korea Republic: 2 - 0
Belgium vs. Poland: 2 - 0
Saudi Arabia vs. Spain: 0 - 1
Australia vs. Argentina: 0 - 1
Switzerland vs. Germany: 0 - 2
England vs. Colombia: 2 - 1


___Starting of the quarterfinal___
Portugal vs. France: 2 - 1
Brazil vs. Belgium: 2 - 0
Spain vs. Argentina: 2 - 0
Germany vs. England: 2 - 0


___Starting of the semifinal___
Portugal vs. Brazil: 0 - 1
Spain vs. Germany: 0 - 1


___Starting of the final___
Brazil vs. Germany: 2 - 1




In [91]:
all_games[(all_games["home_team"] == "Croatia") | (all_games["away_team"] == "Croatia")]

Unnamed: 0,away_score,away_team,elo_diff,goal_history_with_opponent,home_goal_avg_rank,home_score,home_team,opponent_goal_avg_rank,rank_diff
0,0,Croatia,126.560682,0,1.1,2,Argentina,0.8,-15
0,2,Croatia,-4.102698,-2,0.0,2,Iceland,0.0,2
0,0,Nigeria,153.98101,0,1.6,1,Croatia,0.6,-28
0,0,Croatia,159.535443,0,1.8,1,France,0.7,-13
