In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson

from scipy import stats
from itertools import product


%matplotlib qt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Import DataSet
Statistics from the English Premier League for the 2019-2020 season are used here

In [11]:
home = pd.read_csv('home_table_full.csv')
away = pd.read_csv('away_table_full.csv')
elo = pd.read_csv('elo.csv') #football ELO rating from http://elofootball.com/
matches = pd.read_csv('matches.csv')
home_shots = pd.read_csv('epl_2020_2021_shots_home.csv')
away_shots = pd.read_csv('epl_2020_2021_shots_away.csv')

Define all function

In [5]:
class stat:
    def goals_avg(team:str, ha = True, scored = True):
        if ha == True:
            if scored == True:
                team_stats = home[['Team','Played','F','A']]
                team_value = team_stats.loc[team_stats['Team']==team]
                return team_value['F']/team_value['Played']
            if scored == False:
                team_stats = home[['Team','Played','F','A']]
                team_value = team_stats.loc[team_stats['Team']==team]
                return team_value['A']/team_value['Played']
        if ha == False:
            if scored == True:
                team_stats = away[['Team','Played','F','A']]
                team_value = team_stats.loc[team_stats['Team']==team]
                return team_value['F']/team_value['Played']
            if scored == False:
                team_stats = away[['Team','Played','F','A']]
                team_value = team_stats.loc[team_stats['Team']==team]
                return team_value['A']/team_value['Played']
    def xG_stat(home_team,away_team):
        team_home = stat.goals_avg(home_team, ha=True, scored=True), stat.goals_avg(home_team, ha=True, scored=False)
        team_away = stat.goals_avg(away_team, ha=False, scored=True), stat.goals_avg(away_team, ha=False, scored=False)

        a, b = team_home
        a1, b1 = team_away
        xG_home = (a.values[0]+b1.values[0])/2
        xG_away = (a1.values[0]+b.values[0])/2
        return xG_home,xG_away
    def elo_goals(home_team:str, away_team:str):
        eloh = elo.loc[elo['Team']==home_team]
        eloa = elo.loc[elo['Team']==away_team]
        elo_home = eloh['ELO'].values
        elo_away = eloa['ELO'].values

        dataframe = matches
        total_avg = dataframe['home_goal_count'].mean()+dataframe['away_goal_count'].mean()
        elo1 = 1/(10**((elo_away[0]-elo_home[0])/400)+1)
        elo2 = 1/(10**((elo_home[0]-elo_away[0])/400)+1)
        xG_home = elo1 * total_avg
        xG_away = elo2 * total_avg
        return xG_home, xG_away

In [6]:
class expected_goals:
    def xG(home_team,away_team, values = True):
        stat_xG = stat.xG_stat(home_team, away_team)
        elo_xG = stat.elo_goals(home_team, away_team)
        home_xG = (stat_xG[0]+elo_xG[0])/2
        away_xG = (stat_xG[1]+elo_xG[1])/2
        if values == True:
            return float("{:.2f}".format(home_xG)), float("{:.2f}".format(away_xG))
        if values == False:
            return home_team + ' ' + str(home_xG), away_team + ' ' + str(away_xG)

Coefficients are indicators that adjust xG. Here draw_coef is a coefficient that increases the probabilities of a draw, as it is observed that draws are played more often by equal teams according to the ELO rating.

Zero-inf is a suggested Dixon and Cole score that increases the score to 0.



In [7]:
class coef:
    def zero_inf():
        zero = 0
        n=0
        for i in matches.values:
            a = matches.loc[n]
            n +=1
            if a['home_goal_count'] == 0 or a['away_goal_count'] == 0:
                zero += 1
            else:
                continue
        avg_home = matches['home_goal_count'].values.mean()
        avg_away = matches['away_goal_count'].values.mean()

        home_prb = []
        for i in np.arange(10):
            home_prb.append(poisson.pmf(k=i, mu=avg_home))

        away_prb = []
        for i in np.arange(10):
            away_prb.append(poisson.pmf(k=i, mu=avg_away)) 
        
        prod_table = np.array([(i*j) for i, j in product(home_prb, away_prb)])
        prod_table.shape = (10, 10)

        prob_df = pd.DataFrame(prod_table, index=range(10), columns=range(10))

        zero_result = [prob_df.iat[0,0],prob_df.iat[0,1],prob_df.iat[0,2],prob_df.iat[0,3],prob_df.iat[0,4],prob_df.iat[0,5],prob_df.iat[0,6],prob_df.iat[0,7],prob_df.iat[0,8],
        prob_df.iat[1,0],prob_df.iat[2,0],prob_df.iat[3,0],prob_df.iat[4,0],prob_df.iat[5,0],prob_df.iat[6,0],prob_df.iat[7,0],prob_df.iat[8,0],prob_df.iat[9,0]]
        
        poisson_0 = np.array(zero_result).sum()
        count = len(matches['home_goal_count'].values)+len(matches['away_goal_count'].values)
        return poisson_0 - n/count
    def draw_coef(home_team:str, away_team:str):
        t1 = elo.loc[elo['Team']==home_team]
        t2 = elo.loc[elo['Team']==away_team]
        elo1 = t1['ELO'].values
        elo2 = t2['ELO'].values
        if abs(elo1-elo2) <= 50:
            return 0.1
        if abs(elo1-elo2)>50 and abs(elo1-elo2) <= 150:
            return 0.08
        if abs(elo1-elo2)>150 and abs(elo1-elo2) <= 250:
            return 0.06
        if abs(elo1-elo2)>250 and abs(elo1-elo2) <= 350:
            return 0.03
        if abs(elo1-elo2)>350:
            return 0
    def home_advantage(home_team,away_team):
        a = home.loc[home['Team']==home_team]
        b= home.loc[home['Team']==away_team]
        a1 = (a['F']-a['A'])/a['Played']
        b1 = (b['F']-b['A'])/b['Played']
        c = ((a1.values[0]-b1.values[0]))
        
        return c/7

In [8]:
class prb:
    def prb_matrix(home_team:str, away_team:str):
        zero_inf = abs(coef.zero_inf())
        draw_coef= coef.draw_coef(home_team,away_team)
        a = expected_goals.xG(home_team,away_team)
        xG_home = a[0]+coef.home_advantage(home_team,away_team)
        xG_away = a[1]
        
        home_prb = []
        for i in np.arange(10):
            home_prb.append(poisson.pmf(k=i, mu=xG_home)) 

        away_prb = []
        for i in np.arange(10):
            away_prb.append(poisson.pmf(k=i, mu=xG_away)) 

        home_prb1 = [(zero_inf+(1-draw_coef)*(1-zero_inf)*home_prb[0]),(draw_coef+(1-draw_coef)*(1-zero_inf)*home_prb[1])]
        lst = home_prb[2::]
        for i in lst:
            home_prb1.append(i*(1-draw_coef)*(1-zero_inf))
        
        away_prb1 = [(zero_inf+(1-draw_coef)*(1-zero_inf)*away_prb[0]),(draw_coef+(1-draw_coef)*(1-zero_inf)*away_prb[1])]
        lst1 = away_prb[2::]
        for i in lst1:
            away_prb1.append(i*(1-draw_coef)*(1-zero_inf))
        

        prod_table = np.array([(i*j) for i, j in product(home_prb1, away_prb1)])
        prod_table.shape = (10, 10)

        prob_df = pd.DataFrame(prod_table, index=range(10), columns=range(10))
        return prob_df
    def prb_chances(home_team,away_team):
        data = prb.prb_matrix(home_team,away_team)
        home_win = sum((data.iat[1,0]+data.iat[2,0]+data.iat[3,0]+data.iat[4,0]+data.iat[5,0]+data.iat[6,0]+data.iat[7,0]+data.iat[8,0]+data.iat[9,0],
        data.iat[2,1]+data.iat[3,1]+data.iat[4,1]+data.iat[5,1]+data.iat[6,1]+data.iat[7,1]+data.iat[8,1]+data.iat[9,1],
        data.iat[3,2]+data.iat[4,2]+data.iat[5,2]+data.iat[6,2]+data.iat[7,2]+data.iat[8,2]+data.iat[9,2],
        data.iat[4,3]+data.iat[5,3]+data.iat[6,3]+data.iat[7,3]+data.iat[8,3]+data.iat[9,3],
        data.iat[5,4]+data.iat[6,4]+data.iat[7,4]+data.iat[8,4]+data.iat[9,4],
        data.iat[6,5]+data.iat[7,5]+data.iat[8,5]+data.iat[9,5],
        data.iat[7,6]+data.iat[8,6]+data.iat[9,6],
        data.iat[8,7]+data.iat[9,7]+data.iat[9,8]))*100
        draw = (data.iat[0,0]+data.iat[1,1]+data.iat[2,2]+data.iat[3,3]+data.iat[4,4]+data.iat[5,5]+data.iat[6,6]+data.iat[7,7]+data.iat[8,8]+data.iat[9,9])*100
        away_win = sum((data.iat[0,1]+data.iat[0,2]+data.iat[0,3]+data.iat[0,4]+data.iat[0,5]+data.iat[0,6]+data.iat[0,7]+data.iat[0,8]+data.iat[0,9],
        data.iat[1,2]+data.iat[1,3]+data.iat[1,4]+data.iat[1,5]+data.iat[1,6]+data.iat[1,7]+data.iat[1,8]+data.iat[1,9],
        data.iat[2,3]+data.iat[2,4]+data.iat[2,5]+data.iat[2,6]+data.iat[2,7]+data.iat[2,8]+data.iat[2,9],
        data.iat[3,4]+data.iat[3,5]+data.iat[3,6]+data.iat[3,7]+data.iat[3,8]+data.iat[3,9],
        data.iat[4,5]+data.iat[4,6]+data.iat[4,7]+data.iat[4,8]+data.iat[4,9],
        data.iat[5,6]+data.iat[5,7]+data.iat[5,8]+data.iat[5,9],
        data.iat[6,7]+data.iat[6,8]+data.iat[6,9],
        data.iat[7,8]+data.iat[7,9]+data.iat[8,8]))*100
        return float("{:.2f}".format(home_win)), float("{:.2f}".format(draw)), float("{:.2f}".format(away_win))
    def xG_final(home_team,away_team):
        a = expected_goals.xG(home_team,away_team)
        xG_home = a[0]+coef.home_advantage(home_team,away_team)
        xG_away = a[1]
        return xG_home,xG_away
    
        


Draw regression by shots. 

This allows you to see how one or another factor correlates with goals. Using regression, we can obtain xG without additional factors (own goals, penalty).

In [12]:
y = home_shots['On_Target_pg_for'].values
x = home_shots['Shots_pg_for'].values
cg = home_shots['Team'].values

fig, g = plt.subplots()
slope, intercept, r, p, std_err = stats.linregress(x,y)

def myfunc(m):
    return slope * m + intercept
mymodel = list(map(myfunc, x))
res = stats.linregress(x, y)

g.scatter(x,y)
for i, txt in enumerate(cg):
    g.annotate(txt, (x[i], y[i]))

plt.plot(x, mymodel, color = 'red')
rsquare = r * r
g.annotate(str(rsquare), (min(x)+0.5,max(y))) 


plt.show()
slope, intercept

(0.3584558678130462, -0.04847040189200147)

In [13]:
# All team keys
home['Team'].values

array(['Man City', 'Liverpool', 'Watford', 'Brentford', 'Tottenham',
       'West Ham', 'Arsenal', 'Chelsea', 'Man United', 'Norwich',
       'Leicester', 'Southampton', 'Leeds', 'Wolves', 'Aston Villa',
       'Crystal Palace', 'Everton', 'Newcastle', 'Brighton', 'Burnley'],
      dtype=object)

In [14]:
# Draw propabilities matrix
prb.prb_matrix('Arsenal', 'Norwich')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.087151,0.052507,0.014338,0.002868,0.0004301364,5.161637e-05,5.161637e-06,4.42426e-07,3.318195e-08,2.21213e-09
1,0.146104,0.088024,0.024037,0.004807,0.0007210982,8.653179e-05,8.653179e-06,7.41701e-07,5.562758e-08,3.708505e-09
2,0.13809,0.083196,0.022718,0.004544,0.0006815443,8.178531e-05,8.178531e-06,7.01017e-07,5.257627e-08,3.505085e-09
3,0.098193,0.059159,0.016154,0.003231,0.0004846349,5.815619e-05,5.815619e-06,4.984817e-07,3.738612e-08,2.492408e-09
4,0.052368,0.03155,0.008615,0.001723,0.000258462,3.101544e-05,3.101544e-06,2.658466e-07,1.993849e-08,1.329233e-09
5,0.022343,0.013461,0.003676,0.000735,0.0001102728,1.323274e-05,1.323274e-06,1.134235e-07,8.506762e-09,5.671174e-10
6,0.007944,0.004786,0.001307,0.000261,3.92066e-05,4.704792e-06,4.704792e-07,4.032679e-08,3.024509e-09,2.01634e-10
7,0.002421,0.001459,0.000398,8e-05,1.194822e-05,1.433786e-06,1.433786e-07,1.228959e-08,9.217196e-10,6.144797e-11
8,0.000646,0.000389,0.000106,2.1e-05,3.186068e-06,3.823281e-07,3.823281e-08,3.277098e-09,2.457824e-10,1.638549e-11
9,0.000153,9.2e-05,2.5e-05,5e-06,7.551868e-07,9.062242e-08,9.062242e-09,7.767636e-10,5.825727e-11,3.883818e-12


In [15]:
#Calculate Chances
prb.prb_chances('Arsenal', 'Norwich')

(69.56, 20.14, 10.58)

We have such probabilities and we can put a margin and derive the coefficients

In [19]:
odd1, odd2, odd3 = prb.prb_chances('Arsenal', 'Norwich')
print(100/(odd1+2))
print(100/(odd2+2))
print(100/(odd3+2))

1.397428731134712
4.5167118337850045
7.94912559618442
