# [1] BUILD HOME & AWAY data frames
*note: eventually I want to scrape this data from , I don't know how yet. For the moment, this data is coming from a google sheet that I'm copying and pasting from understat.*

### Running this frame will:
1. generate average goals scored and conceded for the whole league. 
2. create dataframes to access specific clubs for later functions

In [3]:
import pandas as pd

# HOME DATA FRAME
# Goals scored, goals conceded, matches playing at home

# all clubs
clubs = [
    "Arsenal",
    "Aston Villa",
    "Bournemouth",
    "Brentford",
    "Brighton",
    "Chelsea",
    "Crystal Palace",
    "Everton",
    "Fulham",
    "Ipswich",
    "Leicester",
    "Liverpool",
    "Manchester City",
    "Manchester United",
    "Newcastle United",
    "Nottingham Forest",
    "Southampton",
    "Tottenham",
    "West Ham",
    "Wolverhampton Wanderers",
]

# ============================== HOME ===================================

home_scored = [
    12,
    7,
    8,
    15,
    9,
    8,
    3,
    5,
    9,
    4,
    5,
    9,
    12,
    4,
    5,
    6,
    4,
    15,
    8,
    7,
]

home_conceded = [
    6,
    6,
    4,
    9,
    7,
    7,
    5,
    8,
    7,
    8,
    7,
    3,
    6,
    8,
    3,
    3,
    8,
    4,
    10,
    14,
]

# currently (matchday 10), all clubs with 5 home and away matches 
matches = [5] * 20

# create initial data frame
home_df = pd.DataFrame(
    list(zip(matches, home_scored, home_conceded)), 
    index=clubs,
    columns=["matches", "scored", "conceded"]
)

# add goals per match (scored and conceded)
home_df["gpm_scored"] = home_df["scored"] / home_df["matches"]
home_df["gpm_conceded"] = home_df["conceded"] / home_df["matches"]

# Calculate league averages
epl_home_avg_gpm_scored = round(sum(home_df["gpm_scored"].values) / 20, 2)
epl_home_avg_gpm_conceded = round(sum(home_df["gpm_conceded"].values) / 20, 2)
print("\nAVERAGE HOME GOALS SCORED PER MATCH: ", epl_home_avg_gpm_scored)
print("AVERAGE HOME GOALS CONCEDED PER MATCH: ", epl_home_avg_gpm_conceded)

print("\n======================= HOME TEAM RESULTS (matchday 10) ========================\n")
print(home_df)



# ======================================= AWAY ==============================================
away_scored = [
    5,
    10,
    5,
    4,
    8,
    12,
    5,
    5,
    5,
    6,
    9,
    10,
    9,
    5,
    5,
    8,
    3,
    7,
    5,
    7,
]

away_conceded = [
    5,
    9,
    8,
    11,
    7,
    5,
    8,
    9,
    6,
    13,
    11,
    3,
    5,
    4,
    7,
    4,
    11,
    7,
    9,
    13, 
]

matches = [5] * 20

away_df = pd.DataFrame(
    list(zip(matches, away_scored, away_conceded)),
    index=clubs,
    columns=["matches", "scored", "conceded"]
)

away_df["gpm_scored"] = away_df["scored"] / away_df["matches"]
away_df["gpm_conceded"] = away_df["conceded"] / away_df["matches"]

print("\n======================= AWAY TEAM RESULTS (matchday 10) ========================\n")
print(away_df)






AVERAGE HOME GOALS SCORED PER MATCH:  1.55
AVERAGE HOME GOALS CONCEDED PER MATCH:  1.33


                         matches  scored  conceded  gpm_scored  gpm_conceded
Arsenal                        5      12         6         2.4           1.2
Aston Villa                    5       7         6         1.4           1.2
Bournemouth                    5       8         4         1.6           0.8
Brentford                      5      15         9         3.0           1.8
Brighton                       5       9         7         1.8           1.4
Chelsea                        5       8         7         1.6           1.4
Crystal Palace                 5       3         5         0.6           1.0
Everton                        5       5         8         1.0           1.6
Fulham                         5       9         7         1.8           1.4
Ipswich                        5       4         8         0.8           1.6
Leicester                      5       5         7         1.0

# [2] Calculate projected goals
### 1. define home and away sides for the match in question
### 2. retrieve from the respective home and away data frames the scoring data per match for each club
### 3. create attack and defense ratings by comparing average goals per match to the league average
   
In this step, I create a ratio comparing the home and away data to the league average.

### 4. Make a projection for goals scored using attack, defense and league average

Using the away attack rating and the home defense rating I adjust the league average to reflect the stats of the home and away sides. This is best demonstrated with an example. If the away side attack rating is high, they score more per game than the average team in the league. If the home side defense rating is high, they concede more than average team in the league. In this case, we expect a high number of goals scored in the matchup.

If the same high attack rating goes against a club with a low defense rating, our expected goals should be less as a result.

So, multiplying our attack and defense ratios, we adjust the league average for 'home' or 'away' scores by a calculated amount to project the number of goals we expect for each side.

In [32]:
home_side = "Manchester United"
away_side = "Leicester"

print(f"MATCH: {home_side} (home) vs {away_side} (away)")

# retrieve data for home scored and away conceded to project goals for home side
home_side_gpm_scored = home_df.loc[home_side, "gpm_scored"]
away_side_gpm_conceded = away_df.loc[away_side, "gpm_conceded"]

print("\n================== HOME SCORE DATA ==================")
print("home_side_gpm_scored:", home_side_gpm_scored)
print("away_side_gpm_conceded:", away_side_gpm_conceded)

# retreive data for away scored and home conceded to project goals for away side 
away_side_gpm_scored = away_df.loc[away_side, "gpm_scored"]
home_side_gpm_conceded = home_df.loc[home_side, "gpm_conceded"]

print("\n================== AWAY SCORE DATA ==================")
print("away_side_gpm_scored:", away_side_gpm_scored)
print("home_side_gpm_conceded:", home_side_gpm_conceded)

print("\n================== LEAGUE AVG DATA ==================")
print("epl_home_avg_gpm_scored", epl_home_avg_gpm_scored)
print("epl_home_avg_gpm_conceded", epl_home_avg_gpm_conceded)

# attack rating - a ratio of the home scored to the average 'scored' for the league (home table)
home_attack_rating = home_side_gpm_scored / epl_home_avg_gpm_scored

# defense rating - a ratio of away side conceded to the average 'conceded' in the away table
away_defense_rating = away_side_gpm_conceded / epl_home_avg_gpm_scored
# NOTE: league average for home scored is the same as league average for away 
# conceded (which is the ratio we want here)

# away attack rating - a ratio of away scored to the average 'scored' in the away table
away_attack_rating = away_side_gpm_scored / epl_home_avg_gpm_conceded 
# NOTE: home average conceded for the league is the same as aways side scored

# away defense rating - a ratio of home conceded to average 'conceded' in the away table
home_defense_rating = home_side_gpm_conceded / epl_home_avg_gpm_conceded

# print ratios:
print("home_attack_rating: ", home_attack_rating)
print("away_defense_rating: ", away_defense_rating)
print("away_attack_rating: ", away_attack_rating)
print("home_defense_rating: ", home_defense_rating)


# home projected goals
home_projected_goals = home_attack_rating * away_defense_rating * epl_home_avg_gpm_scored
print("\n================== PROJECTED GOALS ====================")
print("HOME projected goals:", home_projected_goals)
# away projected goals
away_projected_goals = away_attack_rating * home_defense_rating * epl_home_avg_gpm_conceded
print("AWAY projected goals: ", away_projected_goals)

MATCH: Manchester United (home) vs Leicester (away)

home_side_gpm_scored: 0.8
away_side_gpm_conceded: 2.2

away_side_gpm_scored: 1.8
home_side_gpm_conceded: 1.6

epl_home_avg_gpm_scored 1.55
epl_home_avg_gpm_conceded 1.33
home_attack_rating:  0.5161290322580645
away_defense_rating:  1.4193548387096775
away_attack_rating:  1.3533834586466165
home_defense_rating:  1.2030075187969924

HOME projected goals: 1.135483870967742
AWAY projected goals:  2.1654135338345863


In [56]:
# At this point, we've projected scores based on club and league records.
# Now, we create the poisson distribution. I need to understand what that is
# and how to create the table I want. From that table, I'm creating win
# probabilities. Understand this table, write the code for it. 

"""
Poisson distribution: with the expectation of mu events (goals) in a given interval (match),
the probability of k events ocurring in that interval is:
( mu^k * e^(-mu) ) / k!
This equation will be handled with scipy in the code.
"""
from scipy.stats import poisson
import numpy as np
# Poisson probability for home team goals 0 - 8
home_score_prob = []
away_score_prob = []
for i in range(9):
    prob = poisson.pmf(i, home_projected_goals)
    home_score_prob.append(prob)
for i in range(9):
    prob = poisson.pmf(i, away_projected_goals)
    away_score_prob.append(prob)

home_score_prob = np.array(home_score_prob)
away_score_prob = np.array(away_score_prob)
# print(home_score_prob)
# print(away_score_prob)

exact_score_prob = np.outer(home_score_prob, away_score_prob)

# print(exact_score_prob)
home_win_prob = np.sum(np.triu(exact_score_prob, k=1))
away_win_prob = np.sum(np.tril(exact_score_prob, k=-1))
draw_prob = np.trace(exact_score_prob)
print("home_win_prob: ", home_win_prob, "\naway_win_prob: ", away_win_prob, "\ndraw_prob", draw_prob)

home_win_prob:  0.6078429858501884 
away_win_prob:  0.19078081305644942 
draw_prob 0.20095338694695747


In [54]:
# little learning corner :-)

test_1 = np.array([1, 2, 3, 4])
test_2 = np.array([1, 2, 3, 4])

test_3 = np.outer(test_1, test_2)
print(test_3)
print(np.trace(test_3))
print(np.tril(test_3, k=-1))

[[ 1  2  3  4]
 [ 2  4  6  8]
 [ 3  6  9 12]
 [ 4  8 12 16]]
30
35
