In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("E0.csv")

In [4]:
data.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,05/08/2022,20:00,Crystal Palace,Arsenal,0,2,A,0,1,...,1.76,0.5,2.09,1.84,2.04,1.88,2.09,1.88,2.03,1.85
1,E0,06/08/2022,12:30,Fulham,Liverpool,2,2,D,1,0,...,2.73,1.75,1.9,2.03,1.91,2.02,2.01,2.06,1.89,1.99
2,E0,06/08/2022,15:00,Bournemouth,Aston Villa,2,0,H,1,0,...,1.76,0.5,1.93,2.0,1.93,2.0,1.94,2.04,1.88,2.0
3,E0,06/08/2022,15:00,Leeds,Wolves,2,1,H,1,1,...,1.87,-0.25,2.08,1.85,2.1,1.84,2.14,1.87,2.08,1.81
4,E0,06/08/2022,15:00,Newcastle,Nott'm Forest,2,0,H,0,0,...,1.89,-1.0,1.97,1.96,1.99,1.93,2.19,1.97,2.03,1.86


In [5]:
epl = data[['HomeTeam', 'AwayTeam','FTHG', 'FTAG']]
epl = epl.rename(columns={'FTHG': 'HomeGoals', 'FTAG':'AwayGoals'})
print(epl.head())

         HomeTeam       AwayTeam  HomeGoals  AwayGoals
0  Crystal Palace        Arsenal          0          2
1          Fulham      Liverpool          2          2
2     Bournemouth    Aston Villa          2          0
3           Leeds         Wolves          2          1
4       Newcastle  Nott'm Forest          2          0


In [6]:
test = epl[-20:]
epl = epl[:-20]
print(epl[['HomeGoals', 'AwayGoals']].mean())

HomeGoals    1.598174
AwayGoals    1.141553
dtype: float64


In [7]:
from scipy.stats import skellam, poisson
# probability of a draw
skellam.pmf(0.0, epl.HomeGoals.mean(), epl.AwayGoals.mean())
# probability of a win by one goal
skellam.pmf(1.0, epl.HomeGoals.mean(), epl.AwayGoals.mean())

0.23078876992933062

In [8]:
home = epl.iloc[:,0:3].assign(home=1).rename(columns={'HomeTeam':'team', 'AwayTeam':'opponent', 'HomeGoals':'goals'})
away = epl.iloc[:, [1, 0, 3]].assign(home=0).rename(columns={'AwayTeam': 'team', 'HomeTeam': 'opponent', 'AwayGoals': 'goals'})
df = pd.concat([home, away])
print(df)

               team       opponent  goals  home
0    Crystal Palace        Arsenal      0     1
1            Fulham      Liverpool      2     1
2       Bournemouth    Aston Villa      2     1
3             Leeds         Wolves      2     1
4         Newcastle  Nott'm Forest      2     1
..              ...            ...    ...   ...
214       Tottenham      Leicester      1     0
215          Wolves    Southampton      2     0
216       Newcastle    Bournemouth      1     0
217      Man United          Leeds      2     0
218     Aston Villa       Man City      1     0

[438 rows x 4 columns]


In [9]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
formula = 'goals ~ team + opponent + home'
model = smf.glm(formula=formula, data=df, family=sm.families.Poisson()).fit()
print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  goals   No. Observations:                  438
Model:                            GLM   Df Residuals:                      398
Model Family:                 Poisson   Df Model:                           39
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -611.15
Date:                Thu, 02 Mar 2023   Deviance:                       454.99
Time:                        02:32:30   Pearson chi2:                     380.
No. Iterations:                     5   Pseudo R-squ. (CS):             0.2575
Covariance Type:            nonrobust                                         
                                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept           

In [10]:
def predict_match(model, homeTeam, awayTeam, max_goals=10):
    home_goals = model.predict(pd.DataFrame(data={'team': homeTeam,
                                                  'opponent':awayTeam,
                                                  'home': 1},
                                            index=[1])).values[0]
    away_goals = model.predict(pd.DataFrame(data={'team': awayTeam,
                                                  'opponent': homeTeam,
                                                  'home':0},
                                            index=[1])).values[0]
    pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals, away_goals]]
    return(np.outer(np.array(pred[0]), np.array(pred[1])))

In [11]:
print(model.predict(pd.DataFrame(data={'team': 'Arsenal', 'opponent': 'Man City', 'home':1}, index=[1])))

1    1.992139
dtype: float64


In [12]:
print(model.predict(pd.DataFrame(data={'team': 'Man City', 'opponent': 'Arsenal', 'home':0}, index=[1])))

1    1.306237
dtype: float64


In [13]:
ars_man = predict_match(model, 'Arsenal', 'Man City', max_goals=3)

In [15]:
np.sum(np.tril(ars_man, -1)) * 100
# 40.23456259724963
# victory for Man City
np.sum(np.triu(ars_man, 1)) * 100
# 20.34309498981432
# a draw
np.sum(np.diag(ars_man)) * 100

21.36989057716443