# Predicting the outcome of Belgian Pro League football games

## Exploratory data analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
df_football = pd.read_csv('dataset.csv')
df_football.shape

(1508, 93)

In [3]:
df_football.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,B1,2024-09-01,18:15:00,Kortrijk,St Truiden,1.0,1.0,D,0.0,1.0,...,1.84,-0.25,1.75,2.05,1.79,2.11,1.83,2.12,1.78,2.05
1,B1,2024-09-01,17:30:00,St. Gilloise,Anderlecht,0.0,0.0,D,0.0,0.0,...,1.98,-0.5,1.93,1.93,1.94,1.94,2.0,1.94,1.95,1.85
2,B1,2024-09-01,15:00:00,Gent,Antwerp,1.0,1.0,D,1.0,1.0,...,2.13,0.0,1.8,2.05,1.85,2.05,1.91,2.06,1.84,2.0
3,B1,2024-09-01,12:30:00,Club Brugge,Cercle Brugge,3.0,0.0,H,2.0,0.0,...,2.58,-1.0,1.9,1.95,1.93,1.96,1.93,2.01,1.88,1.93
4,B1,2024-08-31,19:45:00,Oud-Heverlee Leuven,Standard,2.0,0.0,H,1.0,0.0,...,1.8,-0.5,1.98,1.88,1.96,1.93,1.98,1.93,1.95,1.86


In [4]:
df_football.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508 entries, 0 to 1507
Data columns (total 93 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Div        1508 non-null   object 
 1   Date       1508 non-null   object 
 2   Time       1508 non-null   object 
 3   HomeTeam   1508 non-null   object 
 4   AwayTeam   1508 non-null   object 
 5   FTHG       1508 non-null   float64
 6   FTAG       1508 non-null   float64
 7   FTR        1508 non-null   object 
 8   HTHG       1504 non-null   float64
 9   HTAG       1504 non-null   float64
 10  HTR        1504 non-null   object 
 11  HS         1503 non-null   float64
 12  AS         1503 non-null   float64
 13  HST        1503 non-null   float64
 14  AST        1503 non-null   float64
 15  HF         1503 non-null   float64
 16  AF         1503 non-null   float64
 17  HC         1503 non-null   float64
 18  AC         1503 non-null   float64
 19  HY         1504 non-null   float64
 20  AY      

In [5]:
# Finding the number of null values per column and printing column containing null values
df_isna = df_football.isna().sum().sort_values(ascending=False)
df_isna[df_isna > 1]

WHCH       355
WHCD       355
WHCA       355
WHD        349
WHA        349
WHH        349
BWCA         8
BWCH         8
BWCD         8
AST          5
AC           5
HC           5
AF           5
HF           5
HST          5
AS           5
HS           5
PAHA         4
PSH          4
PSD          4
PSA          4
B365AHH      4
P>2.5        4
P<2.5        4
B365AHA      4
PAHH         4
AR           4
HR           4
AY           4
HTR          4
HTAG         4
HTHG         4
HY           4
BWD          3
BWA          3
BWH          3
dtype: int64

In [6]:
# Looking at the date column to see how dates are formatted
df_football['Date']

0       2024-09-01
1       2024-09-01
2       2024-09-01
3       2024-09-01
4       2024-08-31
           ...    
1503    2019-07-27
1504    2019-07-27
1505    2019-07-27
1506    2019-07-27
1507    2019-07-26
Name: Date, Length: 1508, dtype: object

## Data cleaning and feature engineering

In [7]:
class FootballDataPreprocessor:
    """
    A class to preprocess football match data for analysis and feature engineering.
    
    Attributes:
        df (DataFrame): A copy of the original dataframe containing football match data.
    """
    def __init__(self, df):
        """
        Initializing the FootballDataPreprocessor class with a copy of the provided dataframe.
        
        Args:
            df (DataFrame): Original football match dataframe.
        """
        self.df = df.copy()

    def create_season(self):
        """
        Creates a 'season' column based on match dates.
        The function assigns a season label (e.g., '20192020') to each match based on the date.
        """
        date_formats = ['%Y-%m-%d', '%d/%m/%Y']
        for date_format in date_formats:
            self.df['Date'] = pd.to_datetime(self.df['Date'], format=date_format, errors='coerce')
            bins = [
                pd.Timestamp('2019-07-01'),
                pd.Timestamp('2020-07-01'),
                pd.Timestamp('2021-07-01'),
                pd.Timestamp('2022-07-01'),
                pd.Timestamp('2023-07-01'),
                pd.Timestamp('2024-07-01'),
                pd.Timestamp('2025-07-01')
            ]
            season_labels = [
                '20192020',
                '20202021',
                '20212022',
                '20222023',
                '20232024',
                '20242025'
            ]
            self.df['season'] = pd.cut(self.df['Date'], bins=bins, labels=season_labels)

    def subset_df(self, cols_to_keep):
        """
        Subsets the dataframe to keep only the specified columns.
        
        Args:
            cols_to_keep (list): List of column names to retain in the dataframe.
        """
        self.df = self.df[cols_to_keep]

    def rewrite_date(self):
        """
        Rewrites the 'Date' column format from 'dd/mm/yyyy' to a standardized datetime format.
        """
        self.df["Date"] = pd.to_datetime(self.df["Date"], format="%d/%m/%Y")

    def clean_data(self):
        """
        Removes rows with missing values (NaN) from the dataframe.
        """
        self.df = self.df.dropna()

    def label_column_result(self):
        """
        Adds a numeric result column ('FTR_num') based on the full-time result ('FTR').
        'H' (Home Win) is mapped to 1, 'D' (Draw) to 0, and 'A' (Away Win) to 2.
        """
        self.df['FTR_num'] = self.df['FTR'].map({'H':1, 'D':0, 'A':2})

    def window_rows_last_n_games(self, team, n_list):
        """
        Retrieves the last 'n' games played by a specified team, either home or away.
        
        Args:
            team (str): The team name.
            n_list (list): A list of integers specifying the number of games to retrieve.

        Returns:
            list: A list of dataframes containing the last 'n' games.
        """
        team_games = self.df[(self.df['HomeTeam'] == team)|(self.df['AwayTeam'] == team)]
        team_games = team_games.sort_values(by="Date", ascending=False)
        return [team_games.head(n) for n in n_list]

    def window_rows_last_n_games_home(self, team, n_list):
        """
        Retrieves the last 'n' home games played by a specified team.
        
        Args:
            team (str): The team name.
            n_list (list): A list of integers specifying the number of games to retrieve.

        Returns:
            list: A list of dataframes containing the last 'n' home games.
        """
        team_games = self.df[self.df['HomeTeam'] == team]
        team_games = team_games.sort_values(by="Date", ascending=False)
        return [team_games.head(n) for n in n_list]

    def window_rows_last_n_games_away(self, team, n_list):
        """
        Retrieves the last 'n' away games played by a specified team.
        
        Args:
            team (str): The team name.
            n_list (list): A list of integers specifying the number of games to retrieve.

        Returns:
            list: A list of dataframes containing the last 'n' away games.
        """
        team_games = self.df[self.df['AwayTeam'] == team]
        team_games = team_games.sort_values(by="Date", ascending=False)
        return [team_games.head(n) for n in n_list]

    def window_rows_head_to_head(self, team1, team2):
        """
        Retrieves all head-to-head games between two specified teams.
        
        Args:
            team1 (str): Name of the first team.
            team2 (str): Name of the second team.

        Returns:
            DataFrame: A dataframe containing all head-to-head matches sorted by date.
        """
        teams_h2h = [team1, team2]
        h2h_df = self.df[(self.df['HomeTeam'].isin(teams_h2h)) & (self.df['AwayTeam'].isin(teams_h2h))].sort_values(by="Date", ascending=False)
        return h2h_df

    def compute_team_form(self, n_last_games, team):
        """
        Computes a form score for a team based on their last 'n' games.
        The score is based on results from the current and previous seasons, with different weights.

        Args:
            n_last_games (DataFrame): Dataframe containing the team's last 'n' games.
            team (str): The team name.

        Returns:
            float: The computed form score.
        """
        score = 0
        current_season = n_last_games.iloc[0]["season"]
        for idx, row in n_last_games.iloc[1:].iterrows():
            if row["season"] == current_season:
                score += 1 if (team == row['HomeTeam'] and row['FTR'] == 'H') or (team == row['AwayTeam'] and row['FTR'] == 'A') else 0.5 if row['FTR'] == 'D' else 0
            else:
                score += 0.5 if (team == row['HomeTeam'] and row['FTR'] == 'H') or (team == row['AwayTeam'] and row['FTR'] == 'A') else 0.25 if row['FTR'] == 'D' else 0
        return score
    
    def compute_team_form_home(self, n_last_games_home):
        """
        Computes a form score for a team based on their last 'n' home games.
        
        Args:
            n_last_games_home (DataFrame): Dataframe containing the team's last 'n' home games.

        Returns:
            float: The computed home form score.
        """
        home_score = 0
        current_season = n_last_games_home.iloc[0]["season"]
        for idx, row in n_last_games_home.iloc[1:].iterrows():
            if row["season"] == current_season:
                home_score += 1 if row['FTR'] == 'H' else 0.5 if row['FTR'] == 'D' else 0
            else:
                home_score += 0.5 if row['FTR'] == 'H' else 0.25 if row['FTR'] == 'D' else 0
        return home_score
    
    def compute_team_form_away(self, n_last_games_away):
        """
        Computes a form score for a team based on their last 'n' away games.
        
        Args:
            n_last_games_away (DataFrame): Dataframe containing the team's last 'n' away games.

        Returns:
            float: The computed away form score.
        """
        away_score = 0
        current_season = n_last_games_away.iloc[0]["season"]
        for idx, row in n_last_games_away.iloc[1:].iterrows():
            if row["season"] == current_season:
                away_score += 1 if row['FTR'] == 'A' else 0.5 if row['FTR'] == 'D' else 0
            else:
                away_score += 0.5 if row['FTR'] == 'A' else 0.25 if row['FTR'] == 'D' else 0
        return away_score

    def compute_team_stats(self, n_last_games, team):
        """
        Computes aggregate statistics (yellow cards, red cards, shots, goals, etc.) for a team's last 'n' games.
        The stats are averaged, and games from past seasons are weighted less.

        Args:
            n_last_games (DataFrame): Dataframe containing the team's last 'n' games.
            team (str): The team name.

        Returns:
            list: A list of averaged team statistics over the last 'n' games.
        """
        if len(n_last_games) <= 1:
            return [0] * 6
        
        def sum_stats(row, is_home, factor=1):
            return [row[col] / factor for col in (['HY', 'HR', 'HS', 'HST', 'HC', 'FTHG'] if is_home else ['AY', 'AR', 'AS', 'AST', 'AC', 'FTAG'])]
        
        stats = [0] * 6
        current_season = n_last_games.iloc[0]['season']
        for idx, row in n_last_games.iloc[1:].iterrows():
            factor = 2 if row['season'] != current_season else 1
            stats = [a + b for a, b in zip(stats, sum_stats(row, row['HomeTeam'] == team, factor))]
        return [stat / (len(n_last_games) - 1) for stat in stats]
        
    def compute_h2h_score(self, h2h_df, hometeam):
        """
        Computes a score for a team's head-to-head performance against another team.
        The score is based on wins, losses, and draws between the two teams.
        
        Args:
            h2h_df (DataFrame): Dataframe containing the head-to-head match data.
            hometeam (str): Name of the home team for which to compute the score.

        Returns:
            float: The computed head-to-head score.
        """
        score = sum(1 if row['HomeTeam'] == hometeam and row['FTR'] == 'H' else -1.5 if row['HomeTeam'] == hometeam and row['FTR'] == 'A' else -1 if row['FTR'] == 'H' else 1.5 if row["FTR"] == "A" else 0 for idx, row in h2h_df.iloc[1:].iterrows())
        return score / len(h2h_df) if len(h2h_df) else 0
    
    def add_features(self, n_list):
        """
        Adds various form and statistical features for each team in the dataset, such as form, averages for yellow cards, red cards, goals, etc.
        
        Args:
            n_list (list): A list of integers specifying the number of last games to consider for each feature.
            
        Returns:
            DataFrame: The updated dataframe with new features added.
        """
        for index, row in self.df.iterrows():
            home_team, away_team = row['HomeTeam'], row['AwayTeam']
            n_last_games_hometeam = self.window_rows_last_n_games(home_team, n_list)
            n_last_games_awayteam = self.window_rows_last_n_games(away_team, n_list)
            n_last_games_home = self.window_rows_last_n_games_home(home_team, n_list)
            n_last_games_away = self.window_rows_last_n_games_away(away_team, n_list)
            h2h_df = self.window_rows_head_to_head(home_team, away_team)

            for i, n in enumerate(n_list):
                self.df.loc[index, f'home_team_form{n}'] = self.compute_team_form(n_last_games_hometeam[i], home_team)
                self.df.loc[index, f'away_team_form{n}'] = self.compute_team_form(n_last_games_awayteam[i], away_team)
                self.df.loc[index, f'home_team_form_home{n}'] = self.compute_team_form_home(n_last_games_home[i])
                self.df.loc[index, f'away_team_form_away{n}'] = self.compute_team_form_away(n_last_games_away[i])
                self.df.loc[index, [f'home_avg_yellow{n}', f'home_avg_red{n}', f'home_avg_shots{n}', f'home_avg_target{n}', f'home_avg_corners{n}', f'home_avg_goals{n}']] = self.compute_team_stats(n_last_games_hometeam[i], home_team)
                self.df.loc[index, [f'away_avg_yellow{n}', f'away_avg_red{n}', f'away_avg_shots{n}', f'away_avg_target{n}', f'away_avg_corners{n}', f'away_avg_goals{n}']] = self.compute_team_stats(n_last_games_awayteam[i], away_team)

            
            self.df.loc[index, 'h2h_record'] = self.compute_h2h_score(h2h_df, home_team)
        self.df['home_away_ratio'] = self.df['AvgH'] / self.df['AvgA']
        return self.df

    def subset_clean_rewrite(self):
        """
        A helper function that subsets the dataframe, adds a 'season' column, cleans the data, and rewrites the date format.
        
        Returns:
            DataFrame: The cleaned and subset dataframe.
        """
        self.create_season()
        cols_to_keep = ['Date', 'season', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 
                        'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'AvgH', 'AvgA', 
                        'AHCh', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'PSH', 'PSD', 'PSA']
        self.subset_df(cols_to_keep)
        self.clean_data()
        return self.df

    def full_process(self):
        """
        Executes the full preprocessing pipeline: cleaning, subsetting, feature engineering, and labeling results.
        
        Returns:
            DataFrame: The fully processed dataframe with additional features.
        """
        self.subset_clean_rewrite()
        self.add_features([6, 8, 10, 12, 14])
        self.label_column_result()
        return self.df
    
    def add_new_row(self, date, home_team, away_team, n_list):
        """
        Adds a new row to the DataFrame with the provided match information (date, home team, away team),
        computes additional features based on the last n games for each team, and imputes missing values
        for specific columns using KNN. 
        Ensures that the new row is not added if it already exists in the DataFrame.

        Args:
            date (str or datetime): The date of the match to be added.
            home_team (str): The name of the home team for the match.
            away_team (str): The name of the away team for the match.
            n_list (list of int): The list specifying how many last games to consider when calculating aggregate features 
        
        Returns:
            DataFrame: The updated DataFrame with the new row added, computed features, and missing values imputed.

        Steps:
            1. Append a new row to the DataFrame with match information (Date, HomeTeam, AwayTeam), sort the DataFrame by the 'Date', reset the index.
            2. Compute and add to the new row all last n games statistics for both the home and away teams (team form, average stats).
            3. Impute missing values in the specified betting columns using KNNImputer.

        Missing Value Imputation:
            - Uses KNNImputer with 5 nearest neighbors to fill missing values in specific betting-related columns: 
            ['AvgH', 'AvgA', 'AHCh', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'PSH', 'PSD', 'PSA'].
        """
        if not self.df[(self.df['Date'] == pd.to_datetime(date, format="%d/%m/%Y")) & (self.df['HomeTeam'] == home_team) & (self.df['AwayTeam'] == away_team)].empty:
            return self.df
        
        new_row = {'Date':date, 'HomeTeam': home_team, 'AwayTeam': away_team}
        new_row = pd.DataFrame([{'Date': date, 'HomeTeam': home_team, 'AwayTeam': away_team}])
        self.df = pd.concat([new_row, self.df], ignore_index=True)        
        self.rewrite_date()
        self.df = self.df.sort_values(by="Date", ascending=False).reset_index(drop=True)
        new_row_index = self.df[(self.df['Date'] == date) & (self.df['HomeTeam'] == home_team) & (self.df['AwayTeam'] == away_team)].index

        n_last_games_hometeam = self.window_rows_last_n_games(home_team, n_list)
        n_last_games_awayteam = self.window_rows_last_n_games(away_team, n_list)
        n_last_games_home = self.window_rows_last_n_games_home(home_team, n_list)
        n_last_games_away = self.window_rows_last_n_games_away(away_team, n_list)
        h2h_df = self.window_rows_head_to_head(home_team, away_team)

        for i, n in enumerate(n_list):
            self.df.loc[new_row_index, f'home_team_form{n}'] = self.compute_team_form(n_last_games_hometeam[i], home_team)
            self.df.loc[new_row_index, f'away_team_form{n}'] = self.compute_team_form(n_last_games_awayteam[i], away_team)
            self.df.loc[new_row_index, f'home_team_form_home{n}'] = self.compute_team_form_home(n_last_games_home[i])
            self.df.loc[new_row_index, f'away_team_form_away{n}'] = self.compute_team_form_away(n_last_games_away[i])

            home_stats = self.compute_team_stats(n_last_games_hometeam[i], home_team)
            self.df.loc[new_row_index, f'home_avg_yellow{n}'] = home_stats[0]
            self.df.loc[new_row_index, f'home_avg_red{n}'] = home_stats[1]
            self.df.loc[new_row_index, f'home_avg_shots{n}'] = home_stats[2]
            self.df.loc[new_row_index, f'home_avg_target{n}'] = home_stats[3]
            self.df.loc[new_row_index, f'home_avg_corners{n}'] = home_stats[4]
            self.df.loc[new_row_index, f'home_avg_goals{n}'] = home_stats[5]

            away_stats = self.compute_team_stats(n_last_games_awayteam[i], away_team)
            self.df.loc[new_row_index, f'away_avg_yellow{n}'] = away_stats[0]
            self.df.loc[new_row_index, f'away_avg_red{n}'] = away_stats[1]
            self.df.loc[new_row_index, f'away_avg_shots{n}'] = away_stats[2]
            self.df.loc[new_row_index, f'away_avg_target{n}'] = away_stats[3]
            self.df.loc[new_row_index, f'away_avg_corners{n}'] = away_stats[4]
            self.df.loc[new_row_index, f'away_avg_goals{n}'] = away_stats[5]

        self.df.loc[new_row_index, 'h2h_record'] = self.compute_h2h_score(h2h_df, home_team)

        imputer = KNNImputer(n_neighbors=5)
        betting_cols_to_impute = ['AvgH', 'AvgA', 'AHCh', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'PSH', 'PSD', 'PSA']
        self.df[betting_cols_to_impute] = imputer.fit_transform(self.df[betting_cols_to_impute])
        self.df.loc[new_row_index, 'home_away_ratio'] = self.df.loc[new_row_index, 'AvgH'] / self.df.loc[new_row_index, 'AvgA']

        return self.df

In [8]:
# Initializes a `FootballDataPreprocessor` instance with the football match data DataFrame `df_football`
preprocessor_df = FootballDataPreprocessor(df_football)
# Runs the full preprocessing pipeline using the `full_process` method, which cleans the df and add features in the data
updated_df = preprocessor_df.full_process()

updated_df.to_csv('updated.csv')
updated_df.shape

(1495, 112)

## Target: Possible game outcomes: Home win, Draw, Away win

- **FTR_num**: Numerical representation of the full-time result (1 for home win, 0 for draw, 2 for away win).
    - This numeric version of the result is suited for modelling.

## Features to keep for the model and underlying reasoning

#### Features Explanation & Reasons for Keeping:

- **season**: The season during which the match occurred.
    - **Reason**: Important for tracking seasonal performance and accounting for roster changes, form variability, or season-specific factors.

- **AHCh**: Asian Handicap for the home team.
    - **Reason**: Provides a more nuanced view of team strength, often used for predicting match outcomes and accounting for perceived strengths and weaknesses.

- **home_away_ratio**: Ratio of the home team's average odds (AvgH) to the away team's average odds (AvgA).
    - **Reason**: Gives a measure of market-based team strength, useful for comparing the teams’ relative strengths in a given match.

- **B365H, B365D, B365A**: Bet365 odds for home win, draw, and away win.
    - **Reason**: Betting odds are predictive of outcomes; Bet365 data gives a concrete indication of how markets expect matches to end.

- **BWH, BWD, BWA**: Betway odds for home win, draw, and away win.
    - **Reason**: Having multiple odds sources allows you to compare predictions and see how different bookmakers perceive match outcomes.

- **PSH, PSD, PSA**: Pinnacle odds for home win, draw, and away win.
    - **Reason**: Another key odds provider, often more reflective of professional betting behavior, which can aid in prediction models.

- **home_team_form6 / away_team_form6 / home_team_form_home6 / away_team_form_away6**: Form of the home and away teams over their last 6 games (overall and in specific conditions).
    - **Reason**: Provides recent performance data, crucial for short-term prediction, as form is often a key factor in determining match outcomes.

- **home_avg_yellow6 / home_avg_red6 / home_avg_shots6 / home_avg_target6 / home_avg_corners6 / home_avg_goals6**: Average statistics for the home team over the last 6 games (yellow/red cards, shots, goals, etc.).
    - **Reason**: Recent team statistics are more predictive than overall season averages, as they capture current trends and performances.

- **away_avg_yellow6 / away_avg_red6 / away_avg_shots6 / away_avg_target6 / away_avg_corners6 / away_avg_goals6**: Same as above, but for the away team.
    - **Reason**: Similarly, for the away team, recent match data helps in predicting upcoming game outcomes, particularly for away performance.

- **home_team_form10 / away_team_form10 / home_team_form_home10 / away_team_form_away10**: Form of home and away teams over the last 10 games (overall and specific conditions).
    - **Reason**: Medium-term form helps in balancing short-term form and more general team strength, capturing team performance over a larger set of games.

- **home_avg_yellow10 / home_avg_red10 / home_avg_shots10 / home_avg_target10 / home_avg_corners10 / home_avg_goals10**: Average stats for the home team over the last 10 games.
    - **Reason**: Slightly longer-term statistics give a broader picture of the team’s performance, while still focusing on recent games.

- **away_avg_yellow10 / away_avg_red10 / away_avg_shots10 / away_avg_target10 / away_avg_corners10 / away_avg_goals10**: Same as above, but for the away team.
    - **Reason**: Similar rationale for away team stats over 10 games—balances short-term and long-term trends for predictive purposes.

- **home_team_form14 / away_team_form14 / home_team_form_home14 / away_team_form_away14**: Form of home and away teams over the last 14 games (overall and specific conditions).
    - **Reason**: Long-term form gives a clearer idea of the team’s overall consistency and strength throughout the season.

- **home_avg_yellow14 / home_avg_red14 / home_avg_shots14 / home_avg_target14 / home_avg_corners14 / home_avg_goals14**: Average stats for the home team over the last 14 games.
    - **Reason**: These columns reflect more general team behavior over a longer term, important for identifying overall trends or tendencies.

- **away_avg_yellow14 / away_avg_red14 / away_avg_shots14 / away_avg_target14 / away_avg_corners14 / away_avg_goals14**: Same as above, but for the away team.
    - **Reason**: Similar importance as for home team stats, giving a long-term view of away performance.

- **h2h_record**: Head-to-head record between the two teams.
    - **Reason**: Head-to-head matchups often influence outcomes, especially in long-standing rivalries, so this can be critical for prediction.

#### Why Keep These Columns in the DataFrame:

1. **Form Analysis**: The team form columns (e.g., home_team_form6, away_team_form6) reflect recent performance trends, crucial for short- and medium-term predictions.
2. **Team Strength Comparison**: Metrics like AvgH, AvgA, and team stats over 6, 10, and 14 games provide detailed views of team performance across different timeframes.
3. **Betting Market Data**: Including betting odds columns from multiple sources allows for comparing actual match outcomes with market predictions.
4. **Head-to-Head Impact**: The h2h_record column helps in identifying how specific team matchups affect game outcomes, especially in rivalries.
5. **Discipline Metrics**: Yellow and red card columns are important for assessing potential game disruptions due to bookings.
6. **Historical Consistency**: The use of form data over different game windows (6, 10, 14) gives flexibility in analyzing both recent and long-term trends.
7. **Market Comparison**: By keeping odds data from multiple sources (B365, BW, PS), we can evaluate market consistency and potential biases.

## Important note

**We do not use any information from the actual games to predict the outcome.**  
All features in the dataset are derived either from:  
- Aggregated information based on previous game sequences, or  
- Betting odds disclosed prior to the start of the match.  

This ensures that the prediction model is based solely on data available before the game is played,  
preventing any leakage of future information into the model.


## Training a Random forest model 

#### Objective to identify as a first step the best combination of n previous games for each aggregation feature

### Explanation

This code performs a parallelized search over 1,000 random feature combinations to evaluate their performance in predicting football match outcomes using a Random Forest Classifier. 

The feature combinations are constructed by randomly selecting one `n` value (e.g., 6, 8, 10) for each aggregated feature 
(e.g., `home_team_form`, `away_avg_shots`). 

Constant features like betting odds and seasonal information are included in every combination. 

The model accuracy is calculated for each feature set, and the results are ranked to identify the best-performing combination.

### Relevance of the Random Forest Classifier

The Random Forest Classifier is relevant here because it is a robust ensemble learning method that excels at handling large datasets with many features. 

Its ability to handle complex feature interactions and reduce overfitting by averaging multiple decision trees makes it a good choice for predicting football match outcomes based on a variety of feature combinations.

In [22]:
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed

# Constant columns to include in every feature set
constant_cols = ['season', 'h2h_record', 'home_away_ratio', 'AHCh', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'PSH', 'PSD', 'PSA']

# Possible values for n (related to the number of rows included in the previous game sequences used for feature engineering)
n_values = [6, 8, 10, 12, 14]

# Aggregate features with variable number of game sequences
features_aggregated = [
    'home_team_form', 'away_team_form', 'home_team_form_home', 'away_team_form_away',
    'home_avg_yellow', 'home_avg_red', 'home_avg_shots', 'home_avg_target', 'home_avg_corners', 'home_avg_goals',
    'away_avg_yellow', 'away_avg_red', 'away_avg_shots', 'away_avg_target', 'away_avg_corners', 'away_avg_goals'
]

# Function to randomly select one n for each aggregate feature
def get_random_feature_set(features_aggregated, n_values):
    return [f'{feature}{random.choice(n_values)}' for feature in features_aggregated]

# Evaluation of the random forest classifier model
def evaluation_model(df, features, target_col='FTR_num'):
    X = df[features]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    return accuracy_score(y_test, predictions)

# Function to return and train a model on each combination of features with variable games sequences n
def process_single_combination(i, updated_df, features_aggregated, n_values):
    # Generating a random feature set
    random_feature_set = get_random_feature_set(features_aggregated, n_values)

    # Combine constant columns and the randomly generated set of aggregate features
    total_features = constant_cols + random_feature_set

    # Check if all selected features exist in the DataFrame
    available_features = [f for f in total_features if f in updated_df.columns]
    
    # Skip this combination if not all features exist
    if len(available_features) < len(total_features):
        return None

    # Evaluate the model for the available features
    accuracy = evaluation_model(updated_df, available_features)
    
    return (random_feature_set, accuracy)

# Number of combinations to try
n_combinations = 25000

# Parallel processing to evaluate multiple combinations of features
results = Parallel(n_jobs=-1)(delayed(process_single_combination)(i, updated_df, features_aggregated, n_values) for i in range(n_combinations))

# Filter out None values in the results (in case some combinations had missing features)
results = [result for result in results if result is not None]

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results, columns=['final_feature_set', 'accuracy'])

# Sort results by accuracy to find the best combinations
sorted_results = results_df.sort_values(by='accuracy', ascending=False)

# Print the top 5 combinations with the best accuracy
print(sorted_results.head())

# Printing the best feature set and its accuracy score
best_feature_set = sorted_results.iloc[0]['final_feature_set']
accuracy_score_best_set = sorted_results.iloc[0]['accuracy']
print("Best feature set:", best_feature_set)
print("Best accuracy score:", accuracy_score_best_set)

                                       final_feature_set  accuracy
6754   [home_team_form6, away_team_form12, home_team_...  0.608889
23889  [home_team_form10, away_team_form8, home_team_...  0.604444
3534   [home_team_form6, away_team_form14, home_team_...  0.600000
20006  [home_team_form12, away_team_form10, home_team...  0.600000
15531  [home_team_form8, away_team_form14, home_team_...  0.600000
Best feature set: ['home_team_form6', 'away_team_form12', 'home_team_form_home14', 'away_team_form_away10', 'home_avg_yellow6', 'home_avg_red6', 'home_avg_shots6', 'home_avg_target12', 'home_avg_corners10', 'home_avg_goals10', 'away_avg_yellow14', 'away_avg_red8', 'away_avg_shots12', 'away_avg_target12', 'away_avg_corners12', 'away_avg_goals10']
Best accuracy score: 0.6088888888888889


## Adding a future match to the dataframe


The objective is now to a new row for the latest match, to compute the corresponding aggregate stats, and to impute the missing betting odds using KNN. 

1. **Add New Row**: Adds a row with `date`, `home_team`, and `away_team` at the top of the DataFrame, sorted by date.
2. **Input Teams**: Uses `home_team` and `away_team` to populate the row and compute performance stats.
3. **Game Stats**: Calculates team form, yellow/red cards, shots, corners, goals, and head-to-head based on the last `n` games for the new row.
4. **KNN Imputation**: Imputes missing betting odds (`AvgH`, `AvgA`, etc.) using KNN, filling in NaN values.
5. **Return DataFrame**: Updates the row with calculated features and returns the modified DataFrame.

### Integration into Preprocessor

This function is integrated into the preprocessor, automatically handling new match rows, game sequence calculations, and missing betting odds imputation.


In [10]:
new_df = preprocessor_df.add_new_row('23/09/2024', 'Charleroi', 'Club Brugge', [6, 8, 10, 12, 14])

print(new_df.iloc[0].to_dict())

new_df.head()

{'Date': Timestamp('2024-09-23 00:00:00'), 'HomeTeam': 'Charleroi', 'AwayTeam': 'Club Brugge', 'season': nan, 'FTHG': nan, 'FTAG': nan, 'FTR': nan, 'HS': nan, 'AS': nan, 'HST': nan, 'AST': nan, 'HC': nan, 'AC': nan, 'HY': nan, 'AY': nan, 'HR': nan, 'AR': nan, 'AvgH': 2.571846153846154, 'AvgA': 3.874648829431438, 'AHCh': -0.25301003344481604, 'B365H': 2.5789632107023412, 'B365D': 3.9316923076923076, 'B365A': 3.8922675585284283, 'BWH': 2.588227424749164, 'BWD': 4.023598662207358, 'BWA': 3.9238795986622073, 'PSH': 2.634849498327759, 'PSD': 4.1127023411371235, 'PSA': 4.03961872909699, 'home_team_form6': 1.5, 'away_team_form6': 1.5, 'home_team_form_home6': 2.0, 'away_team_form_away6': 1.5, 'home_avg_yellow6': 1.2, 'home_avg_red6': 0.0, 'home_avg_shots6': 7.5, 'home_avg_target6': 2.2, 'home_avg_corners6': 2.2, 'home_avg_goals6': 0.8, 'away_avg_yellow6': 0.8, 'away_avg_red6': 0.0, 'away_avg_shots6': 8.1, 'away_avg_target6': 3.2, 'away_avg_corners6': 2.6, 'away_avg_goals6': 0.8, 'home_team_for

Unnamed: 0,Date,HomeTeam,AwayTeam,season,FTHG,FTAG,FTR,HS,AS,HST,...,home_avg_goals14,away_avg_yellow14,away_avg_red14,away_avg_shots14,away_avg_target14,away_avg_corners14,away_avg_goals14,h2h_record,home_away_ratio,FTR_num
0,2024-09-23,Charleroi,Club Brugge,,,,,,,,...,0.730769,0.769231,0.038462,7.769231,2.5,2.461538,0.884615,-0.545455,0.663762,
1,2024-09-01,Gent,Antwerp,20242025.0,1.0,1.0,D,15.0,15.0,8.0,...,1.230769,1.538462,0.076923,9.076923,3.307692,4.307692,0.884615,-0.772727,0.946768,0.0
2,2024-09-01,Club Brugge,Cercle Brugge,20242025.0,3.0,0.0,H,21.0,11.0,10.0,...,1.115385,1.423077,0.038462,8.923077,3.038462,3.692308,0.923077,0.384615,0.276364,1.0
3,2024-09-01,Kortrijk,St Truiden,20242025.0,1.0,1.0,D,15.0,8.0,4.0,...,0.615385,0.884615,0.076923,6.5,2.307692,1.923077,0.807692,-0.454545,0.628743,0.0
4,2024-09-01,St. Gilloise,Anderlecht,20242025.0,0.0,0.0,D,22.0,10.0,6.0,...,1.115385,1.0,0.038462,8.538462,3.192308,3.153846,0.961538,0.555556,0.621212,0.0


## Saving the model with the best feature set and using it the predict the outcome of the new match

In [23]:
import joblib

# Retraining the model using the best feature set
def train_model_with_best_features(df, best_feature_set, target_col='FTR_num'):
    total_features = constant_cols + best_feature_set
    
    X = df[total_features]
    y = df[target_col]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    # Initialize the RandomForestClassifier model
    best_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    best_model.fit(X_train, y_train)

    # Save the model with the best feature set
    joblib.dump(best_model, 'best_random_forest_model.joblib')

    # Return accuracy on the test set for validation
    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

# Retraining and saving the model using the best feature set
best_model_accuracy = train_model_with_best_features(updated_df, best_feature_set)

In [24]:
def predict_match_result(new_df, model_path, best_feature_set, constant_cols):
    # Loading the pre-trained model
    loaded_model = joblib.load(model_path)
    
    # Selecting the first row of the appended df for prediction
    match_for_prediction = new_df.head(1)
    
    # Selecting the features used in the model
    total_features = constant_cols + best_feature_set

    match_for_prediction_features = match_for_prediction[total_features]
    
    # Making the prediction and mapping to a readbale outcome
    predicted_result = loaded_model.predict(match_for_prediction_features)
    result_map = {1: 'Home Win', 0: 'Draw', 2: 'Away Win'}

    # Calculating the probabilities associated with each of the 3 possible outcomes
    probs = loaded_model.predict_proba(match_for_prediction_features)
    
    return result_map[predicted_result[0]], probs

predicted_result = predict_match_result(new_df, 'best_random_forest_model.joblib', best_feature_set, constant_cols)[0]
probabilities_outcomes = predict_match_result(new_df, 'best_random_forest_model.joblib', best_feature_set, constant_cols)[1]

print(f"Predicted result for the first row: {predicted_result}")
print(probabilities_outcomes)

Predicted result for the first row: Home Win
[[0.3  0.38 0.32]]
