In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import datasets, linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import explained_variance_score, classification_report, r2_score, accuracy_score
from itertools import combinations
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
dataset = pd.read_csv('dataset.csv')
dataset = dataset.drop(['Unnamed: 0'], axis=1)
dataset.head()
print(dataset.columns)

Index(['date', 'away_team', 'home_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'home_elo', 'away_elo',
       'elo_diff', 'score_difference', 'home_score_difference_lag',
       'home_avg_goal_diff', 'away_score_difference_lag', 'away_avg_goal_diff',
       'avg_goals_received', 'home_win', 'wc_home_wins', 'wc_away_wins',
       'home_rank', 'home_country_abrv', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'home_confederation', 'away_rank',
       'away_country_abrv', 'away_total_points', 'away_previous_points',
       'away_rank_change', 'away_cur_year_avg', 'away_cur_year_avg_weighted',
       'away_last_year_avg', 'away_last_year_avg_weighted',
       'away_two_year_ago_avg'

In [28]:
def get_previous_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    
    home_team_goals = pd.concat([data[(data['home_team'] == home)]["home_score"], 
                                 data[(data['away_team'] == home)]["away_score"]])
    away_team_goals = pd.concat([data[(data['home_team'] == away)]["home_score"], 
                                 data[(data['away_team'] == away)]["away_score"]])
    
    home_goals = home_team_goals.sum()
    away_goals = away_team_goals.sum()
    
    home_avg = home_team_goals.mean()
    away_avg = away_team_goals.mean()
    if np.isnan(home_avg):
        home_avg = 0
    if np.isnan(away_avg):
        away_avg = 0
    return pd.Series({
        'goal_history_with_opponent': home_goals - away_goals, 
        'goal_history_with_opponent_home_avg': home_avg,
        'goal_history_with_opponent_away_avg': away_avg
    }) 

dataset = pd.concat([dataset, dataset.apply(get_previous_goals, axis=1)], axis=1)

In [29]:
def get_median_goals(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    
    home_data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) &
            (dataset['date'] < date)
           ]
    
    home_goal_median = pd.concat([home_data[(home_data['home_team'] == home)]["home_score"], 
                                 home_data[(home_data['away_team'] == home)]["away_score"]]).mean()
    
    
    
    away_data = dataset[((dataset['home_team'] == away) | (dataset['away_team'] == away)) &
            (dataset['date'] < date)
           ]
    away_goal_median = pd.concat([away_data[(away_data['home_team'] == away)]["home_score"], 
                                 away_data[(away_data['away_team'] == away)]["away_score"]]).mean()
    
    if np.isnan(home_goal_median):
        home_goal_median = 0
    if np.isnan(away_goal_median):
        away_goal_median = 0
    return pd.Series({
        'home_goal_mean': home_goal_median, 
        'away_goal_mean': away_goal_median
    })
dataset = pd.concat([dataset, dataset.apply(get_median_goals, axis=1)], axis=1)

In [None]:
def get_boundries(elo_diff, k=10):
    lower_bound = elo_diff * 0.75
    upper_bound = elo_diff * 1.25
    
    if elo_diff > 0:
        if abs(elo_diff) < 15:
            lower_bound = min(lower_bound - k, 0)
            upper_bound = upper_bound + k
    else:
        if abs(elo_diff) < 15:
            upper_bound = min(lower_bound + k, 0)
            lower_bound = upper_bound - k
    return lower_bound, upper_bound

In [None]:
def get_avg_goal_for_rank(row):
    home = row["home_team"]
    away = row["away_team"]
    date = row["date"]
    elo_diff = row["elo_diff"]
    
    lower_bound, upper_bound = get_boundries(elo_diff)
    
    data = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
                       (dataset['date'] < date) &
                       (dataset['elo_diff'].between(lower_bound, upper_bound))
                      ]
    home_avg = pd.concat([data[(data['home_team'] == home)]["home_score"], data[(data['away_team'] == home)]["away_score"]]).mean()
    opponent_avg = pd.concat([data[(data['home_team'] != home)]["away_score"], data[(data['away_team'] != home)]["away_score"]]).mean()
    
    if np.isnan(home_avg):
        home_avg = 0
    if np.isnan(opponent_avg):
        opponent_avg = 0
    return pd.Series({'home_goal_avg_rank': np.round(home_avg, 1), 'opponent_goal_avg_rank': np.round(opponent_avg,1)}) 
#dataset = pd.concat([dataset, dataset.apply(get_avg_goal_for_rank, axis=1)], axis=1)

In [147]:
np.unique(dataset["tournament"])

array(['ABCS Tournament', 'AFC Asian Cup', 'AFC Asian Cup qualification',
       'AFC Challenge Cup', 'AFC Challenge Cup qualification',
       'AFF Championship', 'African Cup of Nations',
       'African Cup of Nations qualification',
       'African Nations Championship', 'Am<U+00ED>lcar Cabral Cup',
       'Baltic Cup', 'CECAFA Cup', 'CFU Caribbean Cup',
       'CFU Caribbean Cup qualification', 'COSAFA Cup',
       'Confederations Cup', 'Copa Am<U+00E9>rica', 'Copa Paz del Chaco',
       'Copa del Pac<U+00ED>fico', 'Cyprus International Tournament',
       'Dragon Cup', 'Dunhill Cup', 'Dynasty Cup', 'EAFF Championship',
       'FIFA World Cup', 'FIFA World Cup qualification', 'Gold Cup',
       'Gold Cup qualification', 'Gulf Cup', 'King Hassan II Tournament',
       "King's Cup", 'Kirin Cup', 'Korea Cup', 'Lunar New Year Cup',
       'Malta International Tournament', 'Merdeka Tournament',
       'Millennium Cup', 'Nations Cup', 'Nehru Cup',
       'Nile Basin Tournament', 'Nordic

In [33]:
important_columns = ["elo_diff", "rank_diff", "home_score", "away_score", "home_win",
                     "date", "tournament", "home_rank_change", "away_rank_change", 
                     "goal_history_with_opponent",
                     "goal_history_with_opponent_home_avg", "goal_history_with_opponent_away_avg",
                    "home_goal_mean", "away_goal_mean"]

In [34]:
rankings = dataset[important_columns]

In [35]:
feature_columns = ["elo_diff", "rank_diff", "home_rank_change",
                   "away_rank_change", "goal_history_with_opponent",
                  "goal_history_with_opponent_home_avg", "goal_history_with_opponent_away_avg",
                  "home_goal_mean", "away_goal_mean"]

In [148]:
test_period_start = "2010-01-01"

# Master data TRAIN
train_master = rankings[(rankings['date'] < test_period_start)]

# Data for classification
ytrain_home_win = train_master["home_win"]
Xtrain = train_master[feature_columns]

# Data for regression
ytrain_home_score = train_master["home_score"]
ytrain_away_score = train_master["away_score"]

# Data for classification WIN/LOSE
Xtrain_wl = train_master[train_master["home_win"] != 0]
ytrain_wl_home_win = Xtrain_wl["home_win"]
Xtrain_wl = Xtrain_wl[feature_columns]


# Master data World Cup TEST
test_master = rankings[(rankings['tournament'].isin(["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])) & (rankings['date'] >= test_period_start)]

# Data for classification
ytest_home_win = test_master["home_win"]
Xtest = test_master[feature_columns]

# Data for regression
ytest_home_score = test_master["home_score"]
ytest_away_score = test_master["away_score"]

# Data for classification WIN/LOSE
Xtest_wl = test_master[test_master["home_win"] != 0]
ytest_wl_home_win = Xtest_wl["home_win"]
Xtest_wl = Xtest_wl[feature_columns]

# WORLD CUP CROSS VALIDATION DATA
CV_train = rankings[rankings['tournament'].isin(["FIFA World Cup", "Confederations Cup", "FIFA World Cup qualification"])]
CV_train_wl = CV_train[CV_train["home_win"] != 0]
CV_ytrain = CV_train["home_win"]
CV_ytrain_wl = CV_train_wl["home_win"]
CV_ytrain_home_score = CV_train["home_score"]
CV_ytrain_away_score = CV_train["away_score"]
CV_train = CV_train[feature_columns]
CV_train_wl = CV_train_wl[feature_columns]


print("TRAIN: ", len(Xtrain))
print("WC TEST: ", len(Xtest))
print("WC CV TEST: ", len(CV_train))

TRAIN:  7289
WC TEST:  1544
WC CV TEST:  4642


In [149]:
knn = KNeighborsClassifier(n_neighbors=20)

print("CV score", cross_val_score(knn, CV_train, CV_ytrain, cv=10).mean())
knn.fit(Xtrain, ytrain_home_win) 

y_knn = knn.predict(Xtest)
print(sum(y_knn == ytest_home_win) / len(Xtest))

CV score 0.698863638811
0.7053108808290155


In [150]:
knn_wl = KNeighborsClassifier(n_neighbors=10)

print("CV score", cross_val_score(knn, CV_train_wl, CV_ytrain_wl, cv=10).mean())
knn_wl.fit(Xtrain_wl, ytrain_wl_home_win) 

y_knn_wl = knn_wl.predict(Xtest_wl)
print(sum(y_knn_wl == ytest_wl_home_win) / len(Xtest_wl))

CV score 0.906822291449
0.9050791007493755


In [155]:
rfr_home = KNeighborsClassifier(n_neighbors=40)
print("CV score home", cross_val_score(rfr_home, CV_train, CV_ytrain_home_score, cv=5).mean())
rfr_home.fit(Xtrain, ytrain_home_score)
y_home = rfr_home.predict(Xtest)

rfr_away = KNeighborsClassifier(n_neighbors=40)
print("CV score away", cross_val_score(rfr_away, CV_train, CV_ytrain_away_score, cv=5).mean())
rfr_away.fit(Xtrain, ytrain_away_score)
y_away = rfr_away.predict(Xtest)


print("HOME", sum(np.around(y_home) == ytest_home_score) / len(Xtest))
print("AWAY", sum(np.around(y_away) == ytest_away_score) / len(Xtest))



CV score home 0.35372761581
CV score away 0.479322163082
HOME 0.3898963730569948
AWAY 0.46696891191709844


In [156]:
def predict_score(data_row, sign_model, home_goal_model, away_goal_model):
    game_winner = sign_model.predict(data_row)[0]
    game_winner_prob = sign_model.predict_proba(data_row)[0]
    home_goals = home_goal_model.predict_proba(data_row)[0]
    away_goals = away_goal_model.predict_proba(data_row)[0]
    N = len(home_goals)
    M = len(away_goals)
    min_shape = min(N, M)

    goal_matrix = np.outer(home_goals, away_goals)
    
    # make square
    goal_matrix = goal_matrix[:min_shape, :min_shape]
    
    if game_winner == 1:
        a = np.tril(goal_matrix, -1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
        assert(home_goals > away_goals)
    elif game_winner == 0:
        a = np.diag(goal_matrix)
        home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
        assert(home_goals == away_goals)
    else:
        a = np.triu(goal_matrix, 1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
        assert(home_goals < away_goals)

    return home_goals, away_goals, game_winner

In [161]:
game_predictions = []

for index in range(len(Xtest)):
    row = pd.DataFrame(Xtest.iloc[index]).T
    home_goals, away_goals, winner = predict_score(row, knn, rfr_home, rfr_away)
    game_predictions.append([home_goals, away_goals, winner])
    
predictions = pd.DataFrame(game_predictions, columns=["home_pred", "away_pred", "winner_pred"])
correct = test_master[["home_score", "away_score"]].reset_index()
result_df = pd.concat([correct, predictions], axis=1)
result_df["home_win"] = np.sign(result_df["home_score"] - result_df["away_score"])

total_score = 0
max_score = 0
total_games_correct = 0
total_home_score_correct = 0
total_score_correct = 0

for index, game_row in result_df.iterrows():
    row_score = 0
    if game_row["home_win"] == game_row["winner_pred"]:
        row_score += 2
        total_games_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) or (game_row["away_score"] == game_row["away_pred"]):
        row_score += 1
        total_home_score_correct += 1
    
    if (game_row["home_score"] == game_row["home_pred"]) and (game_row["away_score"] == game_row["away_pred"]):
        row_score += 2
        total_score_correct += 1
    
    total_score += row_score
    max_score += 5
    
N = len(result_df)
print(f"How many points available: {total_score/max_score}")
print(f"{total_games_correct/N}")
print(f"{total_home_score_correct/N}")
print(f"{total_score_correct/N}")

How many points available: 0.47461139896373056
0.7053108808290155
0.667098445595855
0.14766839378238342


In [139]:
world_cup = pd.read_csv('world_cup_2018_dataset.csv')
world_cup.head()

Unnamed: 0,Team,Group,Previous appearances,Previous titles,Previous finals,Previous semifinals,Current FIFA rank,Rank Change,First match against,Match index,history with first opponent W-L,history with first opponent goals,Second match against,Match index.1,history with second opponent W-L,history with second opponent goals,Third match against,Match index.2,history with third opponent W-L,history with third opponent goals
0,Russia,A,10,0,0,1,70,-4,Saudi Arabia,1,-1.0,-2.0,Egypt,17,,,Uruguay,33,0.0,0.0
1,Saudi Arabia,A,4,0,0,0,67,0,Russia,1,1.0,2.0,Uruguay,18,1.0,1.0,Egypt,34,-5.0,-5.0
2,Egypt,A,2,0,0,0,45,1,Uruguay,2,-1.0,-2.0,Russia,17,,,Saudi Arabia,34,5.0,5.0
3,Uruguay,A,12,2,2,5,14,3,Egypt,2,1.0,2.0,Saudi Arabia,18,-1.0,-1.0,Russia,33,0.0,0.0
4,Portugal,B,6,0,0,2,4,0,Spain,3,-12.0,-31.0,Morocco,19,-1.0,-2.0,Iran,35,2.0,5.0


In [140]:
def get_prediction_row(home, away):
    row = pd.DataFrame(np.array([[np.nan] * len(feature_columns)]), columns=feature_columns)
    
    home_row = dataset[(dataset["home_team"] == home) | (dataset["away_team"] == home)].tail(1)
    if home_row["home_team"].item() == home:
        home_elo = home_row["home_elo"].item()
        home_rank = home_row["home_rank"].item()
    else: 
        home_elo = home_row["away_elo"].item()
        home_rank = home_row["away_rank"].item()

    away_row = dataset[(dataset["home_team"] == away) | (dataset["away_team"] == away)].tail(1)
    if away_row["home_team"].item() == away:
        away_elo = away_row["home_elo"].item()
        away_rank = away_row["home_rank"].item()
    else: 
        away_elo = away_row["away_elo"].item()
        away_rank = away_row["away_rank"].item()
    row["elo_diff"] = home_elo - away_elo
    
    home_rank = world_cup[world_cup["Team"] == home]["Current FIFA rank"].item()
    away_rank = world_cup[world_cup["Team"] == away]["Current FIFA rank"].item()
    
    row["rank_diff"] = home_rank - away_rank

    row["home_rank_change"] = world_cup[world_cup["Team"] == home]["Rank Change"].item()
    row["away_rank_change"] = world_cup[world_cup["Team"] == away]["Rank Change"].item()
    
    goal_diff = dataset[((dataset['home_team'] == home) | (dataset['away_team'] == home)) & 
            ((dataset['home_team'] == away) | (dataset['away_team'] == away))].tail(1)
    
    if len(goal_diff) == 0:
        row["goal_history_with_opponent"] = 0
    elif goal_diff["home_team"].item() == home:
        row["goal_history_with_opponent"] = goal_diff["goal_history_with_opponent"].item()
    else:
        row["goal_history_with_opponent"] = -goal_diff["goal_history_with_opponent"].item()
    
    lb, ub = get_boundries(row["elo_diff"].item())
    avg_goal_home = dataset[((dataset['home_team'] == home) & 
                            dataset['elo_diff'].between(lb, ub))].tail(1)
    
    if len(avg_goal_home) > 0:
        row["home_goal_avg_rank"] = avg_goal_home["home_goal_avg_rank"].item()
        row["opponent_goal_avg_rank"] = avg_goal_home["opponent_goal_avg_rank"].item()
    else:
        row["home_goal_avg_rank"] = 0
        row["opponent_goal_avg_rank"] = 0
    return row

In [None]:
all_games = pd.DataFrame()

In [None]:
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

world_cup['points'] = 0
world_cup['total_prob'] = 0

for group in set(world_cup['Group']):
    print('___Starting group {}:___'.format(group))
    for home, away in combinations(world_cup.query('Group == "{}"'.format(group)).Team, 2):
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, knn, rfr_home, rfr_away)
        if game_winner == 1:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 3
        elif game_winner == 0:
            world_cup.loc[world_cup["Team"] == home, 'points'] += 1
            world_cup.loc[world_cup["Team"] == away, 'points'] += 1
        else:
            world_cup.loc[world_cup["Team"] == away, 'points'] += 3
        
        tmp_df = pd.DataFrame([{
            "home_team": home, 
            "away_team": away,
            "home_score": home_goals,
            "away_score": away_goals,
            "rank_diff": row["rank_diff"].item(),
            "elo_diff": row["elo_diff"].item(),
            "goal_history_with_opponent": row["goal_history_with_opponent"].item(),
            "home_goal_avg_rank": row["home_goal_avg_rank"].item(),
            "opponent_goal_avg_rank": row["opponent_goal_avg_rank"].item()
            
        }])
        all_games = pd.concat([all_games, tmp_df])
        print(f"Game {home} - {away} end result {home_goals} - {away_goals}")

In [None]:
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

sort_wc = world_cup.sort_values(by=['Group', 'points'], ascending=False).reset_index()
next_round_wc = sort_wc.groupby('Group').nth([0, 1])


next_round_wc = next_round_wc.reset_index()
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']


for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round_wc) / 2)
    winners = []

    for i in range(iterations):
        home = next_round_wc.index[i*2]
        away = next_round_wc.index[i*2+1]
        print("{} vs. {}: ".format(home, away), end='')
        
        row = get_prediction_row(home, away)
        home_goals, away_goals, game_winner = predict_score(row, knn_wl, rfr_home, rfr_away)
        if game_winner == 1:
            winners.append(home)
        else:
            winners.append(away)
            
        tmp_df = pd.DataFrame([{
            "home_team": home, 
            "away_team": away,
            "home_score": home_goals,
            "away_score": away_goals,
            "rank_diff": row["rank_diff"].item(),
            "elo_diff": row["elo_diff"].item(),
            "goal_history_with_opponent": row["goal_history_with_opponent"].item(),
            "home_goal_avg_rank": row["home_goal_avg_rank"].item(),
            "opponent_goal_avg_rank": row["opponent_goal_avg_rank"].item()
            
        }])
        all_games = pd.concat([all_games, tmp_df])
        
        print(f"{home_goals} - {away_goals}")
        
    next_round_wc = next_round_wc.loc[winners]
    print("\n")

In [None]:
all_games[(all_games["home_team"] == "Croatia") | (all_games["away_team"] == "Croatia")]