# Imports

In [90]:
import pandas as pd
import seaborn as sns
import sklearn as sk
import datetime as dt

# To ignore annoying warning
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing (drop-in stand-in for ``warnings.warn``)."""
    return None


# Silence DeprecationWarning through the regular warning filters...
warnings.filterwarnings("ignore", category=DeprecationWarning)
# ...and rebind warnings.warn itself, so any later call that goes through
# `warnings.warn(...)` hits the no-op above, whatever the warning category.
warnings.warn = ignore_warn

# Loading all Premier League results since 2000

In [94]:
# Open the Excel workbook from an absolute, machine-specific path.
# NOTE(review): create_stat_dico opens its own workbook from a different path
# and no other visible cell reads this handle - confirm it is still needed.
xls_file = pd.ExcelFile('/Users/theophanegregoir/Desktop/BettingAI/Football_Data/European_Leagues/Premier_League/Entire_Data_PL.xlsx')

# Creation and selection of features

In [69]:
# Raw per-game columns kept from each season sheet (events that happened
# during each game). The concatenation order below is the column order of the
# filtered table.
relevant_features = (
    ['Date', 'HomeTeam', 'AwayTeam']    # fixture identification
    + ['IWH', 'IWD', 'IWA']             # bookmaker odds (presumably home / draw / away - TODO confirm)
    + ['FTR', 'FTHG', 'FTAG']           # final result letter and goals, home / away
    + ['HTHG', 'HTAG']                  # half-time goals, home / away
    + ['HS', 'AS', 'HST', 'AST']        # shots and shots on target, home / away
    + ['HF', 'AF']                      # fouls, home / away
    + ['HY', 'AY', 'HR', 'AR']          # yellow and red cards, home / away
    + ['HC', 'AC']                      # presumably corners, home / away - TODO confirm
)

# Function for data processing of each season

In [70]:
def create_stat_dico(
    season: str,
    xls_path: str = '/Users/theophanegregoir/Desktop/SportsCoding/Football/Entire_Data_PL.xlsx',
) -> pd.DataFrame:
    """Build the table of pre-game statistics for every game of one season.

    The season sheet is read from the workbook, restricted to the module-level
    ``relevant_features`` columns and stripped of rows with missing values.
    The games of each club are then walked in sheet order while running totals
    are kept, and every game row receives the figures both teams had *before*
    that game (the game itself is never included in its own features).

    Added columns:
      * ``week``: rank of the game among the home team's own games of the
        season (1 for its first game); this is not a calendar match week.
      * ``HT_league_points`` / ``AT_league_points``: points collected so far
        (3 for a win, 1 for a draw).
      * ``HT_average_*`` / ``AT_average_*``: running total divided by the
        number of games already played; 0.0 before a club's first game.

    Arguments :
        season: name of the workbook sheet to parse, e.g. ``'2015-2016'``.
        xls_path: location of the Excel workbook; defaults to the path that
            used to be hard-coded in the function body.

    Returns :
        The filtered season table with the feature columns above appended.
        Errors raised by pandas while opening or parsing the workbook are not
        caught here.
    """
    # (output column suffix, raw column counted when the club is at home,
    #  raw column counted when the club is away)
    averaged_stats = [
        ('average_goal_scored', 'FTHG', 'FTAG'),
        ('average_goal_conceeded', 'FTAG', 'FTHG'),
        ('average_goal_scored_half', 'HTHG', 'HTAG'),
        ('average_goal_conceeded_half', 'HTAG', 'HTHG'),
        ('average_yellow', 'HY', 'AY'),
        ('average_red', 'HR', 'AR'),
        ('average_shots', 'HS', 'AS'),
        ('average_shots_on_target', 'HST', 'AST'),
        ('average_shots_conceeded', 'AS', 'HS'),
        ('average_fouls_committed', 'HF', 'AF'),
        ('average_fouls_conceeded', 'AF', 'HF'),
    ]

    # The context manager closes the workbook handle once the sheet is parsed.
    with pd.ExcelFile(xls_path) as xls_file:
        table = xls_file.parse(season)[relevant_features].dropna(axis=0)

    # Feature columns, created up front so every row has a defined value.
    table['week'] = 0
    for prefix in ('HT', 'AT'):
        table[prefix + '_league_points'] = 0.0
        for suffix, _, _ in averaged_stats:
            table[prefix + '_' + suffix] = 0.0

    # Clubs are collected from both columns so that a club is still processed
    # if dropna removed all of its home games.
    clubs = sorted(set(table['HomeTeam']) | set(table['AwayTeam']))

    for club in clubs:
        involved = (table['HomeTeam'] == club) | (table['AwayTeam'] == club)

        # Running totals over the games already processed for this club.
        league_points = 0.0
        totals = {suffix: 0.0 for suffix, _, _ in averaged_stats}

        # Iterate over the games the club actually has in the table rather
        # than over len(table) / 10 "weeks": the latter is only right for a
        # complete 20-club season and otherwise either indexes past the
        # club's last game or leaves its later games without features.
        # NOTE(review): assumes the sheet rows are in chronological order.
        for game_number, game in enumerate(table.index[involved.values], start=1):
            is_home = table.at[game, 'AwayTeam'] != club
            prefix = 'HT' if is_home else 'AT'
            # Games played before this one; clamped to 1 so the first game
            # divides zero totals by 1 instead of by 0.
            games_played = max(game_number - 1.0, 1.0)

            # Pre-game features, written before this game is accumulated.
            if is_home:
                table.at[game, 'week'] = game_number
            table.at[game, prefix + '_league_points'] = league_points
            for suffix, _, _ in averaged_stats:
                table.at[game, prefix + '_' + suffix] = totals[suffix] / games_played

            # Accumulate this game for the club's following games.
            result = table.at[game, 'FTR']
            if result == 'D':
                league_points += 1.0
            elif result == ('H' if is_home else 'A'):
                league_points += 3.0

            for suffix, home_column, away_column in averaged_stats:
                totals[suffix] += table.at[game, home_column if is_home else away_column]

    return table

# Using the previous function to get the relevant data

In [83]:
#list of dataframes, one per loaded season, in chronological order
data_seasons = []

#first season loaded starts in start_year; end_year is excluded by range() below
start_year = 2015
end_year = 2019

#The last season actually loaded is (end_year - 1)-(end_year): the banner used to
#announce end_year-(end_year + 1), a season the loop never reaches.
#NOTE(review): if that later season was meant to be loaded, extend the range instead.
print("Loading seasons : " + str(start_year) + '-' + str(start_year + 1) + " to " + str(end_year - 1) + '-' + str(end_year))

for i in range(start_year, end_year):

    #sheet name of the season, e.g. '2015-2016'
    season = str(i) + '-' + str(i + 1)

    table = create_stat_dico(season)

    #keep the computed pre-game features, the result label and the odds
    full_table_add = table[['week', 'HT_league_points', 'HT_average_goal_scored',
       'HT_average_goal_conceeded', 'HT_average_goal_scored_half',
       'HT_average_goal_conceeded_half', 'HT_average_yellow', 'HT_average_red',
       'HT_average_shots', 'HT_average_shots_on_target',
       'HT_average_shots_conceeded', 'HT_average_fouls_committed',
       'HT_average_fouls_conceeded', 'AT_league_points',
       'AT_average_goal_scored', 'AT_average_goal_conceeded',
       'AT_average_goal_scored_half', 'AT_average_goal_conceeded_half',
       'AT_average_yellow', 'AT_average_red', 'AT_average_shots',
       'AT_average_shots_on_target', 'AT_average_shots_conceeded',
       'AT_average_fouls_committed', 'AT_average_fouls_conceeded','FTR','IWH','IWD','IWA']]

    data_seasons.append(full_table_add)

Loading seasons : 2015-2016 to 2019-2020


In [84]:
#Visualize the format of dataframe
#(full_table_add still holds the table of the last season handled by the loading loop)
full_table_add

Unnamed: 0,week,HT_league_points,HT_average_goal_scored,HT_average_goal_conceeded,HT_average_goal_scored_half,HT_average_goal_conceeded_half,HT_average_yellow,HT_average_red,HT_average_shots,HT_average_shots_on_target,...,AT_average_red,AT_average_shots,AT_average_shots_on_target,AT_average_shots_conceeded,AT_average_fouls_committed,AT_average_fouls_conceeded,FTR,IWH,IWD,IWA
0,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,H,1.55,3.80,7.00
1,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,H,1.90,3.50,4.10
2,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,A,2.40,3.30,2.95
3,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,A,6.20,4.00,1.55
4,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,A,3.70,3.35,2.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,38,94.0,2.351351,0.594595,1.000000,0.270270,1.000000,0.054054,15.162162,5.972973,...,0.027027,12.621622,4.027027,11.972973,10.756757,8.945946,H,1.31,5.40,9.30
376,38,66.0,1.756757,1.405405,0.945946,0.594595,1.891892,0.108108,13.513514,5.810811,...,0.027027,10.918919,3.297297,14.729730,10.216216,10.972973,A,1.28,6.10,9.20
377,38,38.0,1.189189,1.729730,0.567568,0.810811,1.918919,0.081081,12.702703,4.297297,...,0.108108,10.540541,3.108108,13.783784,11.189189,10.162162,D,1.42,4.80,7.10
378,38,70.0,1.756757,1.000000,0.810811,0.297297,1.513514,0.081081,14.189189,5.027027,...,0.108108,12.918919,4.324324,10.567568,11.324324,11.621622,D,2.05,3.45,3.60


# Starting to define train and test datasets

# Training dataset

In [85]:
#How many seasons, taken from the start of data_seasons, feed the training set
nb_train_seasons = 2
train_df = pd.concat([data_seasons[season_rank] for season_rank in range(nb_train_seasons)])

print("The algorithm will train on " + str(nb_train_seasons) + " seasons i.e " + str(len(train_df)) + " matches !")

#Model inputs: every pre-game statistic; the odds and the result are left out
train_x = train_df.loc[:, [
    'week',
    'HT_league_points',
    'HT_average_goal_scored',
    'HT_average_goal_conceeded',
    'HT_average_goal_scored_half',
    'HT_average_goal_conceeded_half',
    'HT_average_yellow',
    'HT_average_red',
    'HT_average_shots',
    'HT_average_shots_on_target',
    'HT_average_shots_conceeded',
    'HT_average_fouls_committed',
    'HT_average_fouls_conceeded',
    'AT_league_points',
    'AT_average_goal_scored',
    'AT_average_goal_conceeded',
    'AT_average_goal_scored_half',
    'AT_average_goal_conceeded_half',
    'AT_average_yellow',
    'AT_average_red',
    'AT_average_shots',
    'AT_average_shots_on_target',
    'AT_average_shots_conceeded',
    'AT_average_fouls_committed',
    'AT_average_fouls_conceeded',
]]

#Target: result letter of each training match
train_y = train_df['FTR']


The algorithm will train on 2 seasons i.e 760 matches !


# Testing dataset

In [87]:
#Number of seasons used to test: every loaded season not used for training
nb_test_seasons = len(data_seasons) - nb_train_seasons
#Seasons are picked from the end of data_seasons, most recent first
test_df = pd.concat([data_seasons[len(data_seasons) - 1 - offset] for offset in range(nb_test_seasons)])

#Model inputs: every pre-game statistic; the odds and the result are left out
test_x = test_df.loc[:, [
    'week',
    'HT_league_points',
    'HT_average_goal_scored',
    'HT_average_goal_conceeded',
    'HT_average_goal_scored_half',
    'HT_average_goal_conceeded_half',
    'HT_average_yellow',
    'HT_average_red',
    'HT_average_shots',
    'HT_average_shots_on_target',
    'HT_average_shots_conceeded',
    'HT_average_fouls_committed',
    'HT_average_fouls_conceeded',
    'AT_league_points',
    'AT_average_goal_scored',
    'AT_average_goal_conceeded',
    'AT_average_goal_scored_half',
    'AT_average_goal_conceeded_half',
    'AT_average_yellow',
    'AT_average_red',
    'AT_average_shots',
    'AT_average_shots_on_target',
    'AT_average_shots_conceeded',
    'AT_average_fouls_committed',
    'AT_average_fouls_conceeded',
]]

#Target: result letter of each testing match
test_y = test_df['FTR']