In [None]:
import pandas as pd
import numpy as np
import scipy

from tqdm.notebook import tqdm
link_results = "../input/international-football-results-from-1872-to-2017/results.csv"
link_fifa_rank = "../input/fifaworldranking/fifa_ranking-2022-10-06.csv"

# The Project

The goal of this project is to create machine learning models to simulate the score of the FIFA 2022 World Cup games. In order to do so, we will use the following datasets: [International football results from 1872 to 2022](https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017) and [FIFA World Ranking 1992-2022](https://www.kaggle.com/datasets/cashncarry/fifaworldranking).

We will predict the scores using a Poisson distribution to model the number of goals scored by each team. This in turn allows us to compute game-win probabilities for each team and forecast the whole tournament using Monte Carlo simulations. For example, to compute the probability of winning the World Cup, we simulate all the games 10000 times and count how many times a team wins in the simulation.

In [None]:
# Team's Name Abbreviation: need to get the link for the flag from the FIFA website.
abbreviations = {'Argentina': 'ARG', 'Australia': 'AUS', 'Belgium': 'BEL', 'Brazil': 'BRA',
                 'Cameroon': 'CMR', 'Canada': 'CAN', 'Colombia': 'COL', 'Costa Rica': 'CRC', 'Croatia': 'CRO',
                 'Denmark': 'DEN', 'Egypt': 'EGY', 'Ecuador': 'ECU', 'England': 'ENG', 'France': 'FRA',
                 'Germany': 'GER','Ghana': 'GHA', 'Iceland': 'ISL', 'Iran': 'IRN','Japan': 'JPN', 'South Korea': 'KOR',
                 'Mexico': 'MEX', 'Morocco': 'MAR', 'Netherlands': 'NED', 'Nigeria': 'NGA', 'Panama': 'PAN','Peru': 'PER', 'Poland': 'POL', 'Portugal': 'POR',
                 'Qatar': 'QAT', 'Saudi Arabia': 'KSA', 'Russia': 'RUS','Senegal': 'SEN', 'Serbia': 'SRB', 'Spain': 'ESP',
                 'Sweden': 'SWE', 'Switzerland': 'SUI','Tunisia': 'TUN', 'United States': 'USA','Uruguay': 'URU', 'Wales': 'WAL'}

Since our collection of games dates back to 1872, we will want to exclude older games. In addition, there is typically a squad renovation after each world cup, so we will develop our models based only on the games since the previous world cup.

In [None]:
def select_cycle(df, year, key):
    if year == 2018:
        df = df[np.logical_and(df[key] >= "2014-7-14",df[key] <= "2018-06-12")].reset_index(drop=True)
    elif year == 2022:
        df = df[(df[key] >= "2018-8-1")].reset_index(drop=True)
    return df

Auxiliary function to print the predicted group standings

In [None]:
def print_prediction(prediction, world_cup):
    for letter in prediction:
        print(f'Group {letter}')
        ordered_teams = np.array(world_cup['groups'][letter])[np.argsort(prediction[letter])]
        to_print = ''
        for pos in range(1,5):
            to_print += f'{pos}. {ordered_teams[pos-1]}\n'
        print(to_print)

Auxiliary function to print the predicted probabilities of the final standing in the group

In [None]:
def print_probs(probs):
    for letter in probs:
        print(f'Group {letter}')
        for team in probs[letter]:
            to_print = f'{team}\n'
            for pos in range(1,5):
                to_print += f'{pos}. {probs[letter][team][pos]:.2f}\n'
            print(to_print)

Auxiliary function to plot the probability of each team reaching the different knockout stages

In [None]:
def plot_knockout_stats(knockout_stats, world_cup):
    df = pd.DataFrame(knockout_stats, columns=['Make Round of 16', 'Make Quarters', 'Make Semis', 'Make Final', 'Win Final'])
    flag_links = []
    for team in world_cup['teams']:
        flag_links.append('<img src="'+ f'https://cloudinary.fifa.com/api/v3/picture/flags-sq-4/{abbreviations[team]}' + '", width="20px", height="20px" > ')
    df.insert(0, '', flag_links)
    df.insert(1, 'Team', world_cup['teams'])
    df = df.sort_values(
        ['Win Final','Make Final', 'Make Semis', 'Make Quarters', 'Make Round of 16'], ascending = [False, False, False, False, False])
    # df['Team'] = all_teams
    df = df.reset_index(drop=True)
    df.index += 1 
    df_style = df.style.background_gradient(cmap='YlOrRd', vmin=0, vmax=100, subset=['Make Round of 16', 'Make Quarters', 'Make Semis', 'Make Final', 'Win Final'])
    df_style = df_style.format((
        {'Make Round of 16':lambda x:  "{:2.2f}%".format(x) if x>0 else '< 1%',
         'Make Quarters':lambda x:  "{:2.2f}%".format(x) if x>0 else '< 1%',
         'Make Semis':lambda x:  "{:2.2f}%".format(x) if x>0 else '< 1%',
         'Make Final':lambda x:  "{:2.2f}%".format(x) if x>0 else '< 1%',
         'Win Final':lambda x:  "{:2.2f}%".format(x) if x>0 else '< 1%',
         }))
    df_style = df_style.set_properties(**{'text-align': 'center'})
    df_style = df_style.set_properties(**{'text-align': 'left'}, subset=['Team'])
    df_style = df_style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    return df_style

We will extract the list of games and groups Fifa World Cup pages on Wikipedia

In [None]:
from collections.abc import Iterable

def get_world_cup(year):
    world_cup = {}
    if year == 2022:
        dfs = pd.read_html(r"https://en.wikipedia.org/wiki/2022_FIFA_World_Cup#Teams")
        world_cup['rank_date'] ='2022-10-06'
    elif year == 2018:
        dfs = pd.read_html(r"https://en.wikipedia.org/wiki/2018_FIFA_World_Cup#Teams")
        world_cup['rank_date'] ='2018-06-07'
    else:
        raise NotImplementedError
    
    # Locate where the group stage info starts.
    # This little script was borrowed from https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml
    for i in range(len(dfs)):
        df = dfs[i]
        cols = list(df.columns.values)

        if isinstance(cols[0], Iterable):
            if any("Tie-breaking criteria" in c for c in cols):
                start = i+1
        
    groups_names = ["A", "B", "C", "D", "E", "F", "G", "H"]
    groups = {"A":[], "B":[], "C":[], "D":[], "E":[], "F":[], "G":[], "H":[]}
    gx = 0
    matches = []
    for i in range(start,start+7*8):
        if (i-start)%7 == 0:
            groups[groups_names[gx]] = list(np.unique(dfs[i].iloc[:,1]))
            gx += 1

        else:
            team1 = dfs[i].columns[0]
            result = dfs[i].columns[1].split('–')
            team2 = dfs[i].columns[2]
            if len(result)>1:
                goals_team1 = int(result[0])
                goals_team2 = int(result[1])
            else:
                goals_team1 = np.nan
                goals_team2 = np.nan
            matches.append((team1, team2, groups_names[gx-1], goals_team1, goals_team2))

    teams_world_cup = []
    for letter in groups:
        teams_group = []
        for team in groups[letter]:
            team = team.replace(' (H)', '')
            teams_group.append(team)
            teams_world_cup.append(team)
        groups[letter] = teams_group

    world_cup['year'] = year
    world_cup['teams'] = teams_world_cup
    world_cup['groups'] = groups
    world_cup['games'] = pd.DataFrame(matches, columns=['Team1', 'Team2', 'Group', 'GT1', 'GT2'])
    return world_cup

We define a base class for our forecaster models. The `predict_game` method needs to be defined for each class. There are then four methods defined:

- `get_game_stats`

  We perform n_sim Monte Carlo simulations using the predict_game method. These allows us to compute the game winning probabilities for each team and the most likely final scores.

- `predict_group_stage`

  For each game in the group stage, we forecast the number of goals score for each team and output the number of points each team will obtain. Each game is forecasted n_sim times.
  
- `point_score_predictions_group_stage`

  For each game in the group stage, the final score is forecasted n_sim times. The most observed score is chosen as the model's prediction. The predictions are score following the FIFA Match Predictor game: https://play.fifa.com/match-predictor/help/guidelines:
      - Correct Outcome: +3 points
      - Correct Goals (Home Team): +1 points
      - Correct Goals (Away Team): +1 points
      - Correct Goal Difference: +1 points
      - Correct Score Bonus: +2 points


- `get_group_standings_stats`
 
  Based on the points predicted for each team with `predict_group_stage`, the final standings for each simulation are determined. The most frequent final standings for each group is chosen as the prediction. The probabilities of the final standing of each team are also computed.
  
- `predict_knockout_rounds`

  Given the final group standings predicted with `predict_group_stage` forecasts the knockout stage. In case the forecasted result is a tie, the forecast is repeated until a winner is found.

In [1]:
class BaseForecaster():
    def __init__(self):
        pass
    
    def predict_game(self, team1, team2, n_sim):
        raise NotImplementedError

    def print_game_stats(self, team1, team2, n_sim):
        gt1, gt2 = self.predict_game(team1, team2, n_sim)
        # Computing match-win percentages
        outcomes = np.where(gt1>gt2, 'W', np.where(gt2>gt1, 'L', 'D'))
        outcomes, counts = np.unique(outcomes, return_counts=True)
        pct_win_team1 = (counts[outcomes=='W']/sum(counts))[0]*100
        pct_win_team2 = (counts[outcomes=='L']/sum(counts))[0]*100
        pct_draw = (counts[outcomes=='D']/sum(counts))[0]*100
        print(f"Win {abbreviations[team1]}: {pct_win_team1:.2f}%")
        print(f"Win {abbreviations[team2]}: {pct_win_team2:.2f}%")
        print(f"Draw: {pct_draw:.2f}%")

        # Computing most likely score
        values, counts = np.unique(np.vstack([gt1,gt2]), return_counts=True, axis=1)
        goals_team1, goals_team2 = values[:, counts==max(counts)]
        print(f'Most likely final score: {abbreviations[team1]} {goals_team1[0]} - {goals_team2[0]} {abbreviations[team2]}')

    def predict_group_stage(self, world_cup, n_sim):        
        gt1 = np.zeros((len(world_cup['games']), n_sim))
        gt2 = np.zeros((len(world_cup['games']), n_sim))
        for i in range(len(world_cup['games'])):
            team1 = world_cup['games'].iloc[i]['Team1']
            team2 = world_cup['games'].iloc[i]['Team2']
            gt1[i,:], gt2[i,:] = self.predict_game(team1, team2, n_sim)

        pt1 = np.where(gt1>gt2, 3, np.where(gt1<gt2, 0, 1))
        pt2 = np.where(gt1>gt2, 0, np.where(gt1<gt2, 3, 1))
        return pt1, pt2
    
    def points_score_predictions_group_stage(self, world_cup, n_sim):
        gt1 = np.zeros((len(world_cup['games']), n_sim))
        gt2 = np.zeros((len(world_cup['games']), n_sim))
        for i in range(len(world_cup['games'])):
            team1 = world_cup['games'].iloc[i]['Team1']
            team2 = world_cup['games'].iloc[i]['Team2']
            gt1[i,:], gt2[i,:] = self.predict_game(team1, team2, n_sim)

        predicted_gt1 = np.zeros(len(world_cup['games']))
        predicted_gt2 = np.zeros(len(world_cup['games']))
        for i in range(len(world_cup['games'])):
            values, counts = np.unique(np.vstack([gt1[i,:], gt2[i,:]]), return_counts=True, axis=1)
            result = values[:,counts==max(counts)].flatten()
            predicted_gt1[i] = result[0]
            predicted_gt2[i] = result[1]
        true_outcome = np.where(world_cup['games']['GT1']>world_cup['games']['GT2'], 
                 'H', np.where(world_cup['games']['GT1']<world_cup['games']['GT2'], 'A','D'))
        predicted_outcome = np.where(predicted_gt1>predicted_gt2,'H', np.where(predicted_gt2>predicted_gt1, 'A','D'))
        correct_outcome = true_outcome == predicted_outcome
        correct_t1 = world_cup['games']['GT1'] == predicted_gt1
        correct_t2 = world_cup['games']['GT2'] == predicted_gt2
        correct_difference = (world_cup['games']['GT1']-world_cup['games']['GT2']) == predicted_gt1-predicted_gt2
        correct_score = np.logical_and(world_cup['games']['GT1'] == predicted_gt1, world_cup['games']['GT2'] == predicted_gt2)
        points = 3*sum(correct_outcome)+1*sum(correct_t1)+1*sum(correct_t2)+1*sum(correct_difference)+2*sum(correct_score)
        return points

    def get_group_stage_stats(self, pt1, pt2, world_cup):
        n_sim = pt1.shape[1]
        points_per_simulation = {}
        for team in world_cup['teams']:
            points_per_simulation[team] = pt1[world_cup['games']['Team1']==team,:].sum(axis=0)+pt2[world_cup['games']['Team2']==team,:].sum(axis=0)
        probs = {}
        prediction = {}
        positions = {}
        for letter in world_cup['groups']:
            teams = np.array(world_cup['groups'][letter])
            points_per_team = []
            for team in teams:
                points_per_team.append(points_per_simulation[team])
            positions[letter] = 5-scipy.stats.rankdata(points_per_team, axis=0, method='ordinal')
            probs[letter] = {}
            for i, team in enumerate(teams):
                probs[letter][team] = {}
                for pos in range(1,5):
                    probs[letter][team][pos] = sum(positions[letter][i,:]==pos)/n_sim*100
            values, counts = np.unique(positions[letter], return_counts=True, axis=1)
            prediction[letter] = list(values[:,counts==max(counts)].flatten())
        return probs, positions, prediction
    
    def predict_knockouts(self, positions, world_cup):   
        knockout_stages = ['round_of_16', 'quarterfinal', 'semifinal', 'final', 'winner']
        pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]
        n_teams = 16
        n_sim = positions['A'].shape[1]
        knockouts = {}
        for stage_ix, stage in enumerate(knockout_stages):
            if stage == 'round_of_16':
                knockouts[stage] = np.zeros((16, n_sim))
                ix = 0
                for letter in world_cup['groups']:
                    teams = np.array(world_cup['groups'][letter])
                    for i in range(4):
                        knockouts[stage][pairing[ix], positions[letter][i,:] == 1] = np.arange(32)[np.array(world_cup['teams'])==teams[i]]
                        knockouts[stage][pairing[ix+1], positions[letter][i,:] == 2] = np.arange(32)[np.array(world_cup['teams'])==teams[i]]
                    ix += 2
            else:
                prev_stage = knockout_stages[stage_ix-1]
                n_games = knockouts[prev_stage].shape[0]//2
                knockouts[stage] = np.zeros((n_games, n_sim))
                for i in range(n_games):
                    for sim in range(n_sim):
                        team1 = world_cup['teams'][int(knockouts[prev_stage][2*i,sim])]
                        team2 = world_cup['teams'][int(knockouts[prev_stage][2*i+1,sim])]
                        gt1, gt2 = self.predict_game(team1, team2, 1)
                        while gt1 == gt2:
                            gt1, gt2 = self.predict_game(team1, team2, 1)
                        if gt1>gt2:
                            knockouts[stage][i, sim] = np.arange(32)[np.array(world_cup['teams'])==team1]
                        elif gt2>gt1:
                            knockouts[stage][i, sim] = np.arange(32)[np.array(world_cup['teams'])==team2]
        return knockouts
    
    def get_knockouts_stats(self, knockouts, world_cup):
        n_sim = knockouts['winner'].shape[1]
        knockouts_stats = np.zeros((len(world_cup['teams']),5))
        for stage_ix, stage in enumerate(knockouts):
            values, counts = np.unique(knockouts[stage], return_counts=True)
            for values_ix, v in enumerate(values):
                knockouts_stats[int(v),stage_ix] = counts[values_ix]/n_sim*100
        return knockouts_stats

# Poisson Distribution Model (goal based)

We model model the goals scored with a Poisson Distribution. For each team, an attacking and defensive strength coefficient is determined for home and away games. Since we are dealing with national teams, a lot of the games take place in a neutral field. Those games are used to compute both the home and away coefficients of each team. Doing so allows to naturally give the home advantage to the organizing country (Russia for the 2018 World Cup and Qatar for the 2022 World Cup). More details here: https://www.sbo.net/strategy/football-prediction-model-poisson-distribution/

In [None]:
class PoissonGoals(BaseForecaster):
    def __init__(self, world_cup):
        db_games =  pd.read_csv(link_results)
        db_games["date"] = pd.to_datetime(db_games["date"])
        db_games.dropna(inplace=True)
        db_games = select_cycle(db_games, world_cup['year'], "date")
        self.db_games = db_games
        self.year = world_cup['year']
        data = []
        for team in world_cup['teams']:
            select = np.logical_and(db_games['home_team']==team, np.logical_not(db_games['neutral']))
            n_hg = sum(select)
            gf_hg = sum(db_games[select]['home_score'])
            ga_hg = sum(db_games[select]['away_score'])
            select = np.logical_and(db_games['away_team']==team, np.logical_not(db_games['neutral']))
            n_ag = sum(select)
            gf_ag = sum(db_games[select]['away_score'])
            ga_ag = sum(db_games[select]['home_score'])
            select = np.logical_and(db_games['home_team']==team, db_games['neutral'])
            n_ngh = sum(select)
            gf_ngh = sum(db_games[select]['home_score'])
            ga_ngh = sum(db_games[select]['away_score'])
            select = np.logical_and(db_games['away_team']==team, db_games['neutral'])
            n_nga = sum(select)
            gf_nga = sum(db_games[select]['away_score'])
            ga_nga = sum(db_games[select]['home_score'])
            gf_ng = gf_ngh+gf_nga
            ga_ng = ga_ngh+ga_nga
            n_ng = n_ngh+n_nga
            data.append([team, n_hg, gf_hg, ga_hg, n_ag, gf_ag, ga_ag, n_ng, gf_ng, ga_ng])
        df = pd.DataFrame(data, columns=['Team', 'n_hg', 'gf_hg', 'ga_hg', 'n_ag', 'gf_ag', 'ga_ag', 'n_ng', 'gf_ng', 'ga_ng'])
        self.df = df
        has = ((df['gf_hg']+df['gf_ng'])/(df['n_hg']+df['n_ng']))/((sum(df['gf_hg'])+sum(df['gf_ng']))/(sum(df['n_hg'])+sum(df['n_ng'])))
        hds = ((df['ga_hg']+df['ga_ng'])/(df['n_hg']+df['n_ng']))/((sum(df['ga_hg'])+sum(df['ga_ng']))/(sum(df['n_hg'])+sum(df['n_ng'])))
        aas = ((df['gf_ag']+df['gf_ng'])/(df['n_hg']+df['n_ng']))/((sum(df['gf_ag'])+sum(df['gf_ng']))/(sum(df['n_hg'])+sum(df['n_ng'])))
        ads = ((df['ga_ag']+df['ga_ng'])/(df['n_hg']+df['n_ng']))/((sum(df['ga_ag'])+sum(df['ga_ng']))/(sum(df['n_hg'])+sum(df['n_ng'])))
        self.strengths = {}
        for i, team in enumerate(world_cup['teams']):
            self.strengths[team] = {}
            self.strengths[team]['has'] = has[i]
            self.strengths[team]['hds'] = hds[i]
            self.strengths[team]['aas'] = aas[i]
            self.strengths[team]['ads'] = ads[i]
    
    def predict_game(self, team1, team2, n_sim):
        if (self.year == 2022 and team1 == 'Qatar') or (self.year == 2018 and team1 == 'Russia'):
            gt1 = np.random.poisson(lam=self.strengths[team1]['has']*self.strengths[team2]['ads'], size=n_sim)
            gt2 = np.random.poisson(lam=self.strengths[team2]['aas']*self.strengths[team1]['hds'], size=n_sim)
        elif (self.year == 2022 and team2 == 'Qatar') or (self.year == 2018 and team2 == 'Russia'):
            gt1 = np.random.poisson(lam=self.strengths[team1]['aas']*self.strengths[team2]['hds'], size=n_sim)
            gt2 = np.random.poisson(lam=self.strengths[team2]['has']*self.strengths[team1]['ads'], size=n_sim)
        else:
            gt1 = np.random.poisson(lam=(self.strengths[team1]['has']*self.strengths[team2]['ads']+self.strengths[team1]['aas']*self.strengths[team2]['hds'])/2, size=n_sim)
            gt2 = np.random.poisson(lam=(self.strengths[team2]['aas']*self.strengths[team1]['hds']+self.strengths[team2]['has']*self.strengths[team1]['ads'])/2, size=n_sim)
        return gt1, gt2

# Poisson Distribution Model (rank based)

Instead of using the goals scored for and agaisnt to ultimately determine the expected value of the Poisson distribution that models the goals scored, we will instead estimate this parameter based on the Fifa rank point difference. Here we ignore the home advantage and consider all games to have been played in neutral field. We learn two models: one corresponding to the team with the highest Fifa rank and one for the lowest ranked team.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import PoissonRegressor
class PoissonFifaRank(BaseForecaster):
    def __init__(self, world_cup):
        db_games =  pd.read_csv(link_results)
        db_games["date"] = pd.to_datetime(db_games["date"])
        db_games.dropna(inplace=True)
        db_games = select_cycle(db_games, world_cup['year'], "date")
        db_games = db_games[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'neutral']]
        rank = pd.read_csv(link_fifa_rank)
        rank["rank_date"] = pd.to_datetime(rank["rank_date"])
        rank = select_cycle(rank, world_cup['year'], key='rank_date')
        rank["country_full"] = rank["country_full"].str.replace("IR Iran", "Iran").str.replace("Korea Republic", "South Korea").str.replace("USA", "United States")
        rank = rank.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample('D').first().fillna(method='ffill').reset_index()
        df_wc_ranked = db_games.merge(rank[["country_full", "total_points", "rank_date"]], left_on=["date", "home_team"], right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"], axis=1)
        df_wc_ranked = df_wc_ranked.merge(rank[["country_full", "total_points", "rank_date"]], left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away")).drop(["rank_date", "country_full"], axis=1)
        df_wc_ranked['is_friendly'] = np.where(df_wc_ranked['tournament'] == 'Friendly',1,0)
        df_wc_ranked['diff_rank'] = np.abs(df_wc_ranked['total_points_home']-df_wc_ranked['total_points_away'])
        df_wc_ranked['ratio_rank'] = np.maximum(df_wc_ranked['total_points_home'],df_wc_ranked['total_points_away'])/np.minimum(df_wc_ranked['total_points_home'],df_wc_ranked['total_points_away'])
        df_wc_ranked['best_score'] = np.where(df_wc_ranked['total_points_home']>df_wc_ranked['total_points_away'], df_wc_ranked['home_score'], df_wc_ranked['away_score'])
        df_wc_ranked['worst_score'] = np.where(df_wc_ranked['total_points_home']>df_wc_ranked['total_points_away'], df_wc_ranked['away_score'], df_wc_ranked['home_score'])
        self.poisson_gml_best = make_pipeline(StandardScaler(), PoissonRegressor(alpha=1e-12, max_iter=500))
        self.poisson_gml_best.fit(df_wc_ranked[['diff_rank']].values, df_wc_ranked['best_score'])
        self.poisson_gml_worst = make_pipeline(StandardScaler(), PoissonRegressor(alpha=1e-12, max_iter=500))
        self.poisson_gml_worst.fit(df_wc_ranked[['diff_rank']].values, df_wc_ranked['worst_score'])
        
        self.fifa_rank = {}
        for team in world_cup['teams']:
            self.fifa_rank[team] = rank[np.logical_and(rank["rank_date"]== world_cup['rank_date'], rank['country_full'] == team)]['total_points'].values[0]
        
    def predict_game(self, team1, team2, n_sim):
        diff = np.abs(self.fifa_rank[team1]-self.fifa_rank[team2])
        if self.fifa_rank[team1]>self.fifa_rank[team2]:   
            gt1 = np.random.poisson(lam=self.poisson_gml_best.predict([[diff]]), size=n_sim)
            gt2 = np.random.poisson(lam=self.poisson_gml_worst.predict([[diff]]), size=n_sim)
        else:
            gt1 = np.random.poisson(lam=self.poisson_gml_worst.predict([[diff]]), size=n_sim)
            gt2 = np.random.poisson(lam=self.poisson_gml_best.predict([[diff]]), size=n_sim)
        return gt1, gt2

# World Cup 2018

We test our two models on the 2018 World Cup. We predict the scores for each group stage and score the predictions based on the Fifa Match Predictor Game.

In [None]:
world_cup = get_world_cup(2018)

In [None]:
scores = []
for n in tqdm(range(10)):
    scores.append(PoissonGoals(world_cup).points_score_predictions_group_stage(world_cup, 1000))
print('Poisson Goal Based')
print(f'Avg Score: {np.mean(scores):.2f}\u00B1{np.std(scores):.2f}')
print(f'Max Score: {np.max(scores)}')
print(f'Min Score: {np.min(scores)}')

In [None]:
scores = []
for n in tqdm(range(10)):
    scores.append(PoissonFifaRank(world_cup).points_score_predictions_group_stage(world_cup, 1000))
print('Poisson Rank Based')
print(f'Avg Score: {np.mean(scores):.2f}\u00B1{np.std(scores):.2f}')
print(f'Max Score: {np.max(scores)}')
print(f'Min Score: {np.min(scores)}')

The model based on the Fifa rank is substantially better. Let us see what were the knockout predictions for each model.

In [None]:
n_sim = 1000
model_goals = PoissonGoals(world_cup)
pt1, pt2 = model_goals.predict_group_stage(world_cup, n_sim)
probs, positions, prediction = model_goals.get_group_stage_stats(pt1, pt2, world_cup)
knockouts = model_goals.predict_knockouts(positions, world_cup)
knockout_stats = model_goals.get_knockouts_stats(knockouts, world_cup)
plot_knockout_stats(knockout_stats, world_cup)

In [None]:
n_sim = 1000
model_fifa_rank = PoissonFifaRank(world_cup)
pt1, pt2 = model_fifa_rank.predict_group_stage(world_cup, n_sim)
probs, positions, prediction = model_fifa_rank.get_group_stage_stats(pt1, pt2, world_cup)
knockouts = model_fifa_rank.predict_knockouts(positions, world_cup)
knockout_stats = model_fifa_rank.get_knockouts_stats(knockouts, world_cup)
plot_knockout_stats(knockout_stats, world_cup)

One clear difference between the models is the estimated probability of Iran reaching the Round of 16: 52.40% by the goal based model versus 29.9080% by the Fifa Rank based model. Clearly, using only goal statistics is not the best approach given that the teams do not all play against each other. Iran being part of the Asian Football Confederation (AFC) plays most of his games against teams in the AFC. In the 2014 Fifa World Cup qualifying round, it placed first in its group so it is not surprising that the goal based model overestimates its chances.

# World Cup 2022

We now use our models to forecast the 2022 World Cup.

In [None]:
world_cup = get_world_cup(2022)

In [None]:
n_sim = 1000
model_goals = PoissonGoals(world_cup)
pt1, pt2 = model_goals.predict_group_stage(world_cup, n_sim)
probs, positions, prediction = model_goals.get_group_stage_stats(pt1, pt2, world_cup)
knockouts = model_goals.predict_knockouts(positions, world_cup)
knockout_stats = model_goals.get_knockouts_stats(knockouts, world_cup)
plot_knockout_stats(knockout_stats, world_cup)

In [None]:
n_sim = 1000
model_fifa_rank = PoissonFifaRank(world_cup)
pt1, pt2 = model_fifa_rank.predict_group_stage(world_cup, n_sim)
probs, positions, prediction = model_fifa_rank.get_group_stage_stats(pt1, pt2, world_cup)
knockouts = model_fifa_rank.predict_knockouts(positions, world_cup)
knockout_stats = model_fifa_rank.get_knockouts_stats(knockouts, world_cup)
plot_knockout_stats(knockout_stats, world_cup)

The significant differences between the two models are Canada and Australia chances. They are 2nd and 3rd favorite to win the World Cup for the model based on goals alone, while being 26th and 30th for the Fifa ranked based model.

We will also use the Fifa ranked based model to predict the 1st game of the World Cup  (Qatar-Ecuador) that Ecuador won 2-0.

In [None]:
team1 = 'Qatar'
team2 = 'Ecuador'
model_fifa_rank.print_game_stats(team1, team2, 1000)

The most likely outcome was a win for Ecuador, but the most likely outcome was 1-1.