# Imports

In [2]:
import pandas as pd
import seaborn as sns
import sklearn as sk
import datetime as dt

# Silence warnings for the rest of the session
import warnings


def ignore_warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, returns None."""
    return None


# Deprecation warnings are filtered out through the regular filter mechanism...
warnings.filterwarnings("ignore", category=DeprecationWarning)
# ...and every call made to warnings.warn after this point does nothing at all.
warnings.warn = ignore_warn

# Loading all Premier League results since 2000

In [3]:
# Open the Excel workbook that holds the Premier League data.
# NOTE(review): this is an absolute, machine-specific path — adapt it before running elsewhere.
# create_stat_dico opens the same file again by itself and parses one sheet per season name.
xls_file = pd.ExcelFile('/Users/theophanegregoir/Desktop/BettingAI/Football_Data/European_Leagues/Premier_League/Entire_Data_PL.xlsx')

# Creation and selection of features

In [4]:
# Raw columns kept from each season sheet: who played, the result, and the events
# recorded during the game. The order below is the column order of the parsed table.
relevant_features = [
    # match identification
    'Date', 'HomeTeam', 'AwayTeam',
    # IW* columns (presumably bookmaker odds for home / draw / away — confirm with the data source)
    'IWH', 'IWD', 'IWA',
    # full-time result and goals, half-time goals
    'FTR', 'FTHG', 'FTAG', 'HTHG', 'HTAG',
    # shots and shots on target
    'HS', 'AS', 'HST', 'AST',
    # fouls, yellow cards, red cards
    'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
    # remaining per-team counters (not used by the feature computation)
    'HC', 'AC',
]

# Function for data processing of each season

In [5]:
def create_stat_dico(season: str, games: "pd.DataFrame | None" = None) -> pd.DataFrame:

    """Compute, for every game of a season, the statistics of both teams before the game

    The games of each club are walked in table order while running totals
    (league points, goals, shots, cards, fouls) are kept. Before a game is added
    to the totals, the per-game averages of the club over its previous games are
    written on the row of that game : in the ``HT_*`` columns when the club is
    the home team, in the ``AT_*`` columns when it is the away team. ``week`` is
    the number of the game in the season of the home team.

    Rows are assumed to be in chronological order (this is not checked here).

    Arguments :
        season : name of the season as a string, used as sheet name of the Excel workbook
        games : optional table of the games of the season, already loaded. When given,
                the workbook is not read and ``season`` is ignored. It must contain
                the HomeTeam, AwayTeam, FTR and per-game statistics columns used below.

    Output : a Dataframe with the added feature columns, restricted to the games
             whose ``week`` is strictly greater than 10
    """

    # One entry per statistic :
    # (running total, suffix of the feature column, raw column when home, raw column when away)
    # League points have no raw column : they are derived from the result (FTR).
    stat_specs = [
        ('league_points',        'league_points',               None,   None),
        ('goals_scored',         'average_goal_scored',         'FTHG', 'FTAG'),
        ('goals_conceeded',      'average_goal_conceeded',      'FTAG', 'FTHG'),
        ('goals_scored_half',    'average_goal_scored_half',    'HTHG', 'HTAG'),
        ('goals_conceeded_half', 'average_goal_conceeded_half', 'HTAG', 'HTHG'),
        ('yellow',               'average_yellow',              'HY',   'AY'),
        ('red',                  'average_red',                 'HR',   'AR'),
        ('shots',                'average_shots',               'HS',   'AS'),
        ('shots_on_target',      'average_shots_on_target',     'HST',  'AST'),
        ('shots_conceeded',      'average_shots_conceeded',     'AS',   'HS'),
        ('fouls_committed',      'average_fouls_committed',     'HF',   'AF'),
        ('fouls_conceeded',      'average_fouls_conceeded',     'AF',   'HF'),
    ]

    # Games up to this week are not returned (presumably so that the averages rely
    # on at least 10 previous games of the home team)
    last_week_discarded = 10

    if games is None:
        # The context manager closes the workbook as soon as the sheet is parsed
        with pd.ExcelFile('/Users/theophanegregoir/Desktop/BettingAI/Football_Data/European_Leagues/Premier_League/Entire_Data_PL.xlsx') as workbook:
            table = workbook.parse(season)[relevant_features].dropna(axis=0)
    else:
        table = games.dropna(axis=0)
    # Work on a private copy so that the caller's data is never modified
    table = table.copy()

    ###Features that describe team before the game
    table['week'] = 0
    for side in ('HT', 'AT'):
        for _, suffix, _, _ in stat_specs:
            table[side + '_' + suffix] = 0.0

    #Every club of the season, whether it appears as home or as away team
    clubs = set(table['HomeTeam']) | set(table['AwayTeam'])

    for club in clubs:

        #Rows of the games where the club is involved, in table order
        involved = table.index[(table['HomeTeam'] == club) | (table['AwayTeam'] == club)]

        #Running totals of the club before its first game
        totals = {key: 0.0 for key, _, _, _ in stat_specs}

        # Each club is walked over its OWN games : a season with missing rows or
        # with a number of clubs other than 20 neither fails nor loses games
        for week, idx in enumerate(involved, start=1):

            is_home = table.at[idx, 'AwayTeam'] != club
            side = 'HT' if is_home else 'AT'

            #Number of games already played (at least 1 to avoid dividing by zero)
            games_played = max(week - 1.0, 1.0)

            #Statistics before the game (based on previous weeks), used as features for ML
            if is_home:
                table.at[idx, 'week'] = week
            for key, suffix, _, _ in stat_specs:
                table.at[idx, side + '_' + suffix] = totals[key] / games_played

            #Gathering the statistics of the game for the club for next games
            #League points : 1 for a draw, 3 for a win, 0 otherwise
            result = table.at[idx, 'FTR']
            if result == "D":
                totals['league_points'] += 1.0
            elif result == ("H" if is_home else "A"):
                totals['league_points'] += 3.0

            #Goals, shots, fouls and cards : the raw column depends on the side of the club
            for key, _, home_column, away_column in stat_specs:
                if home_column is None:
                    continue
                totals[key] += table.at[idx, home_column if is_home else away_column]

    valid = table['week'] > last_week_discarded

    return table[valid]

# Using the previous function to get the relevant data

In [7]:
#list of dataframes, one per season, in chronological order
data_seasons = []

start_year = 2003
end_year = 2020

#Columns kept for the modelling : pre-game features, result (FTR) and the IW* columns
season_columns = ['week', 'HT_league_points', 'HT_average_goal_scored',
       'HT_average_goal_conceeded', 'HT_average_goal_scored_half',
       'HT_average_goal_conceeded_half', 'HT_average_yellow', 'HT_average_red',
       'HT_average_shots', 'HT_average_shots_on_target',
       'HT_average_shots_conceeded', 'HT_average_fouls_committed',
       'HT_average_fouls_conceeded', 'AT_league_points',
       'AT_average_goal_scored', 'AT_average_goal_conceeded',
       'AT_average_goal_scored_half', 'AT_average_goal_conceeded_half',
       'AT_average_yellow', 'AT_average_red', 'AT_average_shots',
       'AT_average_shots_on_target', 'AT_average_shots_conceeded',
       'AT_average_fouls_committed', 'AT_average_fouls_conceeded','FTR','IWH','IWD','IWA']

#range() excludes end_year : the last season loaded starts in end_year - 1
print("Loading seasons : " + str(start_year) + '-' + str(start_year + 1) + " to " + str(end_year - 1) + '-' + str(end_year))

for i in range(start_year, end_year):

    #Sheet name of the season, e.g. '2003-2004'
    season = str(i) + '-' + str(i + 1)

    table = create_stat_dico(season)

    full_table_add = table[season_columns]

    data_seasons.append(full_table_add)

Loading seasons : 2003-2004 to 2020-2021


In [9]:
#Visualize the format of the dataframe : full_table_add still holds the table of the
#last season processed by the loading loop
full_table_add

Unnamed: 0,week,HT_league_points,HT_average_goal_scored,HT_average_goal_conceeded,HT_average_goal_scored_half,HT_average_goal_conceeded_half,HT_average_yellow,HT_average_red,HT_average_shots,HT_average_shots_on_target,...,AT_average_red,AT_average_shots,AT_average_shots_on_target,AT_average_shots_conceeded,AT_average_fouls_committed,AT_average_fouls_conceeded,FTR,IWH,IWD,IWA
100,11,1.30,1.30,1.30,0.80,0.70,2.00,0.00,11.10,3.90,...,0.00,13.80,4.60,9.80,10.50,13.90,H,3.30,3.50,2.15
101,11,1.60,1.50,1.40,0.70,0.80,2.60,0.10,13.60,4.80,...,0.20,9.40,2.90,13.90,11.30,9.40,D,1.77,3.80,4.40
102,11,1.10,1.50,1.60,0.70,0.30,1.50,0.10,14.30,5.80,...,0.00,16.20,6.60,9.50,9.30,8.80,A,7.90,5.30,1.36
103,11,1.20,1.20,1.40,0.50,0.40,1.30,0.20,12.20,3.60,...,0.00,11.00,3.80,17.30,10.30,11.20,H,1.70,4.05,4.60
104,11,2.20,3.20,0.90,1.60,0.50,1.90,0.10,21.50,7.90,...,0.20,12.70,4.20,13.20,10.80,8.90,H,1.08,11.00,27.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,26,0.72,0.96,1.88,0.52,0.88,1.88,0.04,12.00,3.96,...,0.04,15.56,6.24,9.68,8.36,8.32,A,9.60,5.50,1.30
256,26,1.00,1.28,1.88,0.60,0.76,1.84,0.04,12.80,4.40,...,0.12,11.96,4.40,13.80,9.96,11.24,A,3.95,3.80,1.87
257,26,1.24,1.28,1.36,0.60,0.72,2.40,0.12,11.32,3.84,...,0.08,9.72,3.20,15.88,9.20,10.80,H,1.45,4.50,7.00
258,26,1.64,1.72,1.36,0.88,0.48,1.80,0.00,16.28,5.76,...,0.00,14.88,5.72,10.16,11.24,12.24,A,1.85,3.60,4.30


# Starting to define train and test datasets

# Training dataset

In [10]:
#The first nb_train_seasons seasons of data_seasons are used to fit the models
nb_train_seasons = 12
train_df = pd.concat([data_seasons[season_idx] for season_idx in range(nb_train_seasons)])

print("The algorithm will train on {} seasons i.e {} matches !".format(nb_train_seasons, len(train_df)))

#Every pre-game feature available (kept for reference)
features1 = [
    'week',
    'HT_league_points',
    'HT_average_goal_scored',
    'HT_average_goal_conceeded',
    'HT_average_goal_scored_half',
    'HT_average_goal_conceeded_half',
    'HT_average_yellow',
    'HT_average_red',
    'HT_average_shots',
    'HT_average_shots_on_target',
    'HT_average_shots_conceeded',
    'HT_average_fouls_committed',
    'HT_average_fouls_conceeded',
    'AT_league_points',
    'AT_average_goal_scored',
    'AT_average_goal_conceeded',
    'AT_average_goal_scored_half',
    'AT_average_goal_conceeded_half',
    'AT_average_yellow',
    'AT_average_red',
    'AT_average_shots',
    'AT_average_shots_on_target',
    'AT_average_shots_conceeded',
    'AT_average_fouls_committed',
    'AT_average_fouls_conceeded',
]

#Subset of features actually given to the models
features = [
    'HT_league_points',
    'HT_average_goal_scored',
    'HT_average_goal_conceeded',
    'HT_average_shots',
    'HT_average_shots_on_target',
    'HT_average_shots_conceeded',
    'AT_league_points',
    'AT_average_goal_scored',
    'AT_average_goal_conceeded',
    'AT_average_shots_on_target',
    'AT_average_shots_conceeded',
]

n_features = len(features)

#Inputs and labels (full-time result) for training
train_x = train_df.loc[:, features]
train_y = train_df.loc[:, 'FTR']


The algorithm will train on 12 seasons i.e 3336 matches !


# Testing dataset

In [11]:
#Number of seasons used to test : every loaded season that is not used for training
nb_test_seasons = len(data_seasons) - nb_train_seasons

#The test seasons are taken from the end of data_seasons, most recent season first
test_df = pd.concat([data_seasons[len(data_seasons) - 1 - offset] for offset in range(nb_test_seasons)])

#Same feature columns as for training (the IW* columns are left out)
test_x = test_df.loc[:, features]

#Labels (full-time result) for testing
test_y = test_df.loc[:, 'FTR']

# Let's apply classical ML algorithms

In [12]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Inverse regularisation strength shared by all the classifiers
C = 10
kernel = 1.0 * RBF([1.0 for k in range(n_features)])  # for GPC

# The 'saga' solver shuffles the data, and SVC(probability=True) does so too for
# its probability estimates : fixing the seed makes the scores reproducible
# from one run of the cell to the next.
SEED = 0

# Create different classifiers.
classifiers = {
    'L1 logistic': LogisticRegression(C=C, penalty='l1',
                                      solver='saga',
                                      multi_class='multinomial',
                                      max_iter=1000000,
                                      random_state=SEED),
    'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2',
                                                    solver='saga',
                                                    multi_class='multinomial',
                                                    max_iter=1000000,
                                                    random_state=SEED),
    'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2',
                                            solver='saga',
                                            multi_class='ovr',
                                            max_iter=10000,
                                            random_state=SEED),
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,
                      random_state=SEED)
    #'GPC': GaussianProcessClassifier(kernel)
}

n_classifiers = len(classifiers)

# Fit every classifier on the training seasons and score it on the held-out seasons
for name, classifier in classifiers.items():
    classifier.fit(train_x, train_y)

    y_pred = classifier.predict(test_x)
    accuracy = accuracy_score(test_y, y_pred)
    print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))

    # View probabilities (one column per class, in the order of classifier.classes_):
    probas = classifier.predict_proba(test_x)
    print(probas)

Accuracy (test) for L1 logistic: 55.5% 
[[0.50835582 0.22249216 0.26915202]
 [0.22230042 0.25355525 0.52414433]
 [0.85062234 0.0997673  0.04961036]
 ...
 [0.27676782 0.25685488 0.46637731]
 [0.36631749 0.27559825 0.35808426]
 [0.19531414 0.27747399 0.52721188]]
Accuracy (test) for L2 logistic (Multinomial): 55.5% 
[[0.50896936 0.22204421 0.26898642]
 [0.22149711 0.25277194 0.52573095]
 [0.85113756 0.09851826 0.05034418]
 ...
 [0.27674685 0.25680728 0.46644587]
 [0.36647022 0.27511958 0.35841019]
 [0.19547914 0.27707099 0.52744987]]
Accuracy (test) for L2 logistic (OvR): 55.5% 
[[0.49530462 0.22691828 0.27777709]
 [0.22102212 0.24967337 0.5293045 ]
 [0.8188677  0.1194377  0.06169459]
 ...
 [0.27664268 0.2574474  0.46590992]
 [0.36190247 0.27973744 0.35836009]
 [0.19508642 0.27560832 0.52930525]]
Accuracy (test) for Linear SVC: 55.6% 
[[0.42619703 0.23592176 0.33788121]
 [0.20161207 0.24946398 0.54892395]
 [0.79348332 0.17410647 0.03241021]
 ...
 [0.23527009 0.2556394  0.50909051]
 [0.31