In [1]:
import pandas as pd
import numpy as np


In [5]:
import glob

files = glob.glob(r"C:\Users\shrey\Downloads\epl_temp\*.csv")
len(files)


13

In [6]:
import pandas as pd

df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
df.shape


(4801, 183)

In [7]:
df.to_csv("../data/raw/epl_matches.csv", index=False)


In [8]:
pd.read_csv("../data/raw/epl_matches.csv").shape


(4801, 183)

In [9]:
required_cols = [
    "Date",
    "HomeTeam",
    "AwayTeam",
    "FTHG",
    "FTAG",
    "FTR"
]

df = df[required_cols]
df.head()


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,16/08/14,Arsenal,Crystal Palace,2.0,1.0,H
1,16/08/14,Leicester,Everton,2.0,2.0,D
2,16/08/14,Man United,Swansea,1.0,2.0,A
3,16/08/14,QPR,Hull,0.0,1.0,A
4,16/08/14,Stoke,Aston Villa,0.0,1.0,A


In [10]:
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4801 entries, 0 to 4800
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      4800 non-null   datetime64[ns]
 1   HomeTeam  4800 non-null   object        
 2   AwayTeam  4800 non-null   object        
 3   FTHG      4800 non-null   float64       
 4   FTAG      4800 non-null   float64       
 5   FTR       4800 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 225.2+ KB


  df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)


In [11]:
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")


In [12]:
df["Date"].isna().sum()


np.int64(1)

In [13]:
df = df.dropna(subset=["Date"])


In [14]:
df = df.sort_values("Date").reset_index(drop=True)


In [15]:
df.to_csv("../data/processed/base_matches.csv", index=False)


In [16]:
base_df = pd.read_csv("../data/processed/base_matches.csv")
base_df.head()


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,2013-08-17,Arsenal,Aston Villa,1.0,3.0,A
1,2013-08-17,Liverpool,Stoke,1.0,0.0,H
2,2013-08-17,Norwich,Everton,2.0,2.0,D
3,2013-08-17,Sunderland,Fulham,0.0,1.0,A
4,2013-08-17,Swansea,Man United,1.0,4.0,A


In [17]:
teams = pd.unique(base_df[["HomeTeam", "AwayTeam"]].values.ravel())
len(teams)


35

In [18]:
base_df["home_goals_last5"] = (
    base_df
    .groupby("HomeTeam")["FTHG"]
    .transform(lambda x: x.shift().rolling(5).mean())
)


In [19]:
base_df["away_goals_last5"] = (
    base_df
    .groupby("AwayTeam")["FTAG"]
    .transform(lambda x: x.shift().rolling(5).mean())
)


In [20]:
base_df["home_conceded_last5"] = (
    base_df
    .groupby("HomeTeam")["FTAG"]
    .transform(lambda x: x.shift().rolling(5).mean())
)


In [21]:
base_df["away_conceded_last5"] = (
    base_df
    .groupby("AwayTeam")["FTHG"]
    .transform(lambda x: x.shift().rolling(5).mean())
)


In [22]:
base_df = base_df.dropna().reset_index(drop=True)


In [23]:
base_df.to_csv("../data/processed/features.csv", index=False)


In [24]:
df_feat = pd.read_csv("../data/processed/features.csv")
df_feat.shape


(4552, 10)

In [25]:
X = df_feat[
    [
        "home_goals_last5",
        "away_goals_last5",
        "home_conceded_last5",
        "away_conceded_last5",
    ]
]

y_home = df_feat["FTHG"]
y_away = df_feat["FTAG"]


In [26]:
split_idx = int(len(df_feat) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_home_train = y_home.iloc[:split_idx]
y_home_test  = y_home.iloc[split_idx:]

y_away_train = y_away.iloc[:split_idx]
y_away_test  = y_away.iloc[split_idx:]


In [27]:
from sklearn.linear_model import LinearRegression

home_model = LinearRegression()
away_model = LinearRegression()

home_model.fit(X_train, y_home_train)
away_model.fit(X_train, y_away_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [28]:
home_pred = home_model.predict(X_test)
away_pred = away_model.predict(X_test)


In [29]:
from sklearn.metrics import mean_absolute_error

home_mae = mean_absolute_error(y_home_test, home_pred)
away_mae = mean_absolute_error(y_away_test, away_pred)

home_mae, away_mae


(1.0229933890264569, 0.9106141566216205)

In [30]:
home_mae, away_mae


(1.0229933890264569, 0.9106141566216205)

In [31]:
from sklearn.ensemble import RandomForestRegressor

rf_home = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_away = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_home.fit(X_train, y_home_train)
rf_away.fit(X_train, y_away_train)


0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
rf_home_pred = rf_home.predict(X_test)
rf_away_pred = rf_away.predict(X_test)


In [33]:
rf_home_mae = mean_absolute_error(y_home_test, rf_home_pred)
rf_away_mae = mean_absolute_error(y_away_test, rf_away_pred)

rf_home_mae, rf_away_mae


(1.117997182900037, 0.9901155537656635)

In [34]:
df_feat["home_advantage"] = 1


In [35]:
X = df_feat[
    [
        "home_goals_last5",
        "away_goals_last5",
        "home_conceded_last5",
        "away_conceded_last5",
        "home_advantage",
    ]
]


In [36]:
split_idx = int(len(df_feat) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_home_train = y_home.iloc[:split_idx]
y_home_test  = y_home.iloc[split_idx:]

y_away_train = y_away.iloc[:split_idx]
y_away_test  = y_away.iloc[split_idx:]


In [37]:
home_model = LinearRegression()
away_model = LinearRegression()

home_model.fit(X_train, y_home_train)
away_model.fit(X_train, y_away_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [38]:
home_pred = home_model.predict(X_test)
away_pred = away_model.predict(X_test)


In [39]:
home_mae = mean_absolute_error(y_home_test, home_pred)
away_mae = mean_absolute_error(y_away_test, away_pred)

home_mae, away_mae


(1.0229933890264569, 0.9106141566216205)

In [40]:
def match_points(row, team):
    if row["FTHG"] > row["FTAG"]:
        return 3 if row[team] == row["HomeTeam"] else 0
    if row["FTHG"] < row["FTAG"]:
        return 3 if row[team] == row["AwayTeam"] else 0
    return 1

df_feat["home_points"] = (
    df_feat
    .apply(lambda r: match_points(r, "HomeTeam"), axis=1)
    .groupby(df_feat["HomeTeam"])
    .shift()
    .rolling(5)
    .mean()
)

df_feat["away_points"] = (
    df_feat
    .apply(lambda r: match_points(r, "AwayTeam"), axis=1)
    .groupby(df_feat["AwayTeam"])
    .shift()
    .rolling(5)
    .mean()
)


In [41]:
df_feat = df_feat.dropna().reset_index(drop=True)


In [42]:
X = df_feat[
    [
        "home_goals_last5",
        "away_goals_last5",
        "home_conceded_last5",
        "away_conceded_last5",
        "home_points",
        "away_points",
    ]
]


In [43]:
split_idx = int(len(df_feat) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_home_train = y_home.iloc[:split_idx]
y_home_test  = y_home.iloc[split_idx:]

y_away_train = y_away.iloc[:split_idx]
y_away_test  = y_away.iloc[split_idx:]


In [44]:
home_model = LinearRegression()
away_model = LinearRegression()

home_model.fit(X_train, y_home_train)
away_model.fit(X_train, y_away_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [45]:
home_pred = home_model.predict(X_test)
away_pred = away_model.predict(X_test)


In [46]:
home_mae = mean_absolute_error(y_home_test, home_pred)
away_mae = mean_absolute_error(y_away_test, away_pred)

home_mae, away_mae


ValueError: Found input variables with inconsistent numbers of samples: [1031, 881]

In [47]:
y_home = df_feat["FTHG"]
y_away = df_feat["FTAG"]


In [48]:
split_idx = int(len(df_feat) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_home_train = y_home.iloc[:split_idx]
y_home_test  = y_home.iloc[split_idx:]

y_away_train = y_away.iloc[:split_idx]
y_away_test  = y_away.iloc[split_idx:]


In [49]:
home_model = LinearRegression()
away_model = LinearRegression()

home_model.fit(X_train, y_home_train)
away_model.fit(X_train, y_away_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [50]:
home_pred = home_model.predict(X_test)
away_pred = away_model.predict(X_test)


In [51]:
home_mae = mean_absolute_error(y_home_test, home_pred)
away_mae = mean_absolute_error(y_away_test, away_pred)

home_mae, away_mae


(1.0286373903849466, 0.910857642582655)

In [52]:
df_feat["home_gd_last5"] = (
    (df_feat["FTHG"] - df_feat["FTAG"])
    .groupby(df_feat["HomeTeam"])
    .shift()
    .rolling(5)
    .mean()
)

df_feat["away_gd_last5"] = (
    (df_feat["FTAG"] - df_feat["FTHG"])
    .groupby(df_feat["AwayTeam"])
    .shift()
    .rolling(5)
    .mean()
)


In [53]:
df_feat = df_feat.dropna().reset_index(drop=True)


In [54]:
X = df_feat[
    [
        "home_goals_last5",
        "away_goals_last5",
        "home_conceded_last5",
        "away_conceded_last5",
        "home_points",
        "away_points",
        "home_gd_last5",
        "away_gd_last5",
    ]
]


In [56]:
y_home = df_feat["FTHG"]
y_away = df_feat["FTAG"]

split_idx = int(len(df_feat) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_home_train = y_home.iloc[:split_idx]
y_home_test  = y_home.iloc[split_idx:]

y_away_train = y_away.iloc[:split_idx]
y_away_test  = y_away.iloc[split_idx:]


In [57]:
home_model = LinearRegression()
away_model = LinearRegression()

home_model.fit(X_train, y_home_train)
away_model.fit(X_train, y_away_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [58]:
home_pred = home_model.predict(X_test)
away_pred = away_model.predict(X_test)


In [59]:
home_mae = mean_absolute_error(y_home_test, home_pred)
away_mae = mean_absolute_error(y_away_test, away_pred)

home_mae, away_mae


(1.018727281170612, 0.9106803191579307)

In [60]:
from sklearn.linear_model import PoissonRegressor


In [61]:
pois_home = PoissonRegressor(alpha=0.1, max_iter=1000)
pois_home.fit(X_train, y_home_train)


[WinError 2] The system cannot find the file specified
  File "C:\Users\shrey\premier-league-score-prediction\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "C:\Users\shrey\premier-league-score-prediction\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
  File "C:\Users\shrey\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\shrey\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\shrey\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


0,1,2
,alpha,0.1
,fit_intercept,True
,solver,'lbfgs'
,max_iter,1000
,tol,0.0001
,warm_start,False
,verbose,0


In [62]:
home_pred_pois = pois_home.predict(X_test)


In [63]:
home_pois_mae = mean_absolute_error(y_home_test, home_pred_pois)
home_pois_mae


1.0194733346373692

In [64]:
pois_away = PoissonRegressor(alpha=0.1, max_iter=1000)
pois_away.fit(X_train, y_away_train)


0,1,2
,alpha,0.1
,fit_intercept,True
,solver,'lbfgs'
,max_iter,1000
,tol,0.0001
,warm_start,False
,verbose,0


In [65]:
away_pred_pois = pois_away.predict(X_test)


In [66]:
away_pois_mae = mean_absolute_error(y_away_test, away_pred_pois)
away_pois_mae


0.9047231307514007

In [67]:
results = {
    "Linear_home_MAE": home_mae,
    "Linear_away_MAE": away_mae,
    "Poisson_home_MAE": home_pois_mae,
    "Poisson_away_MAE": away_pois_mae,
}

results


{'Linear_home_MAE': 1.018727281170612,
 'Linear_away_MAE': 0.9106803191579307,
 'Poisson_home_MAE': 1.0194733346373692,
 'Poisson_away_MAE': 0.9047231307514007}

In [68]:
import joblib

joblib.dump(home_model, "../results/linear_home_model.pkl")
joblib.dump(away_model, "../results/linear_away_model.pkl")
joblib.dump(pois_home, "../results/poisson_home_model.pkl")
joblib.dump(pois_away, "../results/poisson_away_model.pkl")


['../results/poisson_away_model.pkl']

In [69]:
# take the most recent match features as an example
latest_features = X_test.iloc[-1].values.reshape(1, -1)

pred_home_goals = pois_home.predict(latest_features)[0]
pred_away_goals = pois_away.predict(latest_features)[0]

pred_home_goals, pred_away_goals




(np.float64(1.6862491595044342), np.float64(1.0174179286407947))

In [70]:
latest_features_df = X_test.iloc[[-1]]

pred_home_goals = pois_home.predict(latest_features_df)[0]
pred_away_goals = pois_away.predict(latest_features_df)[0]

pred_home_goals, pred_away_goals


(np.float64(1.6862491595044342), np.float64(1.0174179286407947))

In [71]:
pred_home_score = int(round(pred_home_goals))
pred_away_score = int(round(pred_away_goals))

pred_home_score, pred_away_score


(2, 1)

In [74]:
def predict_match(home_team, away_team, df_feat, pois_home, pois_away):
    # get latest match involving home team (home OR away)
    home_matches = df_feat[
        (df_feat["HomeTeam"] == home_team) | (df_feat["AwayTeam"] == home_team)
    ]
    
    away_matches = df_feat[
        (df_feat["HomeTeam"] == away_team) | (df_feat["AwayTeam"] == away_team)
    ]
    
    if len(home_matches) == 0 or len(away_matches) == 0:
        raise ValueError("One of the teams has no historical data after filtering.")
    
    home_last = home_matches.iloc[-1]
    away_last = away_matches.iloc[-1]
    
    X_match = pd.DataFrame([{
        "home_goals_last5": home_last["home_goals_last5"],
        "away_goals_last5": away_last["away_goals_last5"],
        "home_conceded_last5": home_last["home_conceded_last5"],
        "away_conceded_last5": away_last["away_conceded_last5"],
        "home_points": home_last["home_points"],
        "away_points": away_last["away_points"],
        "home_gd_last5": home_last["home_gd_last5"],
        "away_gd_last5": away_last["away_gd_last5"],
    }])
    
    home_xg = pois_home.predict(X_match)[0]
    away_xg = pois_away.predict(X_match)[0]
    
    return int(round(home_xg)), int(round(away_xg)), home_xg, away_xg


In [76]:
sorted(df_feat["HomeTeam"].unique())


['Arsenal',
 'Aston Villa',
 'Bournemouth',
 'Brentford',
 'Brighton',
 'Burnley',
 'Cardiff',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Huddersfield',
 'Hull',
 'Ipswich',
 'Leeds',
 'Leicester',
 'Liverpool',
 'Luton',
 'Man City',
 'Man United',
 'Middlesbrough',
 'Newcastle',
 'Norwich',
 "Nott'm Forest",
 'QPR',
 'Sheffield United',
 'Southampton',
 'Stoke',
 'Sunderland',
 'Swansea',
 'Tottenham',
 'Watford',
 'West Brom',
 'West Ham',
 'Wolves']

In [77]:
predict_match("Arsenal", "Man City", df_feat, pois_home, pois_away)


(2, 1, np.float64(1.6345166171702306), np.float64(1.2848901440827267))

In [81]:
import numpy as np
import pandas as pd
from scipy.stats import poisson

def predict_match_with_probs(
    home_team,
    away_team,
    df_feat,
    pois_home,
    pois_away,
    max_goals=6
):
    # get latest matches involving the teams
    home_matches = df_feat[
        (df_feat["HomeTeam"] == home_team) | (df_feat["AwayTeam"] == home_team)
    ]
    away_matches = df_feat[
        (df_feat["HomeTeam"] == away_team) | (df_feat["AwayTeam"] == away_team)
    ]
    
    if len(home_matches) == 0 or len(away_matches) == 0:
        raise ValueError("One of the teams has no historical data.")
    
    home_last = home_matches.iloc[-1]
    away_last = away_matches.iloc[-1]
    
    # build feature row
    X_match = pd.DataFrame([{
        "home_goals_last5": home_last["home_goals_last5"],
        "away_goals_last5": away_last["away_goals_last5"],
        "home_conceded_last5": home_last["home_conceded_last5"],
        "away_conceded_last5": away_last["away_conceded_last5"],
        "home_points": home_last["home_points"],
        "away_points": away_last["away_points"],
        "home_gd_last5": home_last["home_gd_last5"],
        "away_gd_last5": away_last["away_gd_last5"],
    }])
    
    # expected goals
    home_xg = float(pois_home.predict(X_match)[0])
    away_xg = float(pois_away.predict(X_match)[0])
    
    # predicted scoreline
    pred_home = int(round(home_xg))
    pred_away = int(round(away_xg))
    
    # goal probability distributions
    home_probs = poisson.pmf(np.arange(0, max_goals + 1), home_xg)
    away_probs = poisson.pmf(np.arange(0, max_goals + 1), away_xg)
    
    win_prob = 0.0
    draw_prob = 0.0
    loss_prob = 0.0
    
    for i in range(max_goals + 1):
        for j in range(max_goals + 1):
            p = home_probs[i] * away_probs[j]
            if i > j:
                win_prob += p
            elif i == j:
                draw_prob += p
            else:
                loss_prob += p
    
    return {
        "predicted_score": f"{pred_home} - {pred_away}",
        "home_xG": home_xg,
        "away_xG": away_xg,
        "home_win_prob": float(win_prob),
        "draw_prob": float(draw_prob),
        "away_win_prob": float(loss_prob),
    }


In [82]:
predict_match_with_probs("Arsenal", "Man City", df_feat, pois_home, pois_away)


{'predicted_score': '2 - 1',
 'home_xG': 1.6345166171702306,
 'away_xG': 1.2848901440827267,
 'home_win_prob': 0.45460597668787206,
 'draw_prob': 0.24268099434191556,
 'away_win_prob': 0.3008308577188214}

In [83]:
predict_match_with_probs("Liverpool", "Chelsea", df_feat, pois_home, pois_away)


{'predicted_score': '2 - 1',
 'home_xG': 1.6532645184079127,
 'away_xG': 1.0379092937912557,
 'home_win_prob': 0.5158501290537898,
 'draw_prob': 0.24466310490879575,
 'away_win_prob': 0.23777707379405832}

In [84]:
predict_match_with_probs("Tottenham", "Newcastle", df_feat, pois_home, pois_away)


{'predicted_score': '1 - 1',
 'home_xG': 1.2903616599248038,
 'away_xG': 1.4159294830087454,
 'home_win_prob': 0.342105627647234,
 'draw_prob': 0.2572419485391406,
 'away_win_prob': 0.3996019242437913}

In [85]:
predict_match_with_probs("Liverpool", "West Ham", df_feat, pois_home, pois_away)


{'predicted_score': '2 - 1',
 'home_xG': 1.6532645184079127,
 'away_xG': 1.0379092937912557,
 'home_win_prob': 0.5158501290537898,
 'draw_prob': 0.24466310490879575,
 'away_win_prob': 0.23777707379405832}

In [86]:
predict_match_with_probs("Sunderland", "Burnley", df_feat, pois_home, pois_away)


{'predicted_score': '2 - 1',
 'home_xG': 1.6862491595044342,
 'away_xG': 1.0174179286407947,
 'home_win_prob': 0.5286199963665892,
 'draw_prob': 0.24157980797966114,
 'away_win_prob': 0.22791550937587318}

In [87]:
predict_match_with_probs("Tottenham", "Man City", df_feat, pois_home, pois_away)

{'predicted_score': '1 - 1',
 'home_xG': 1.251319802621914,
 'away_xG': 1.4747486616126673,
 'home_win_prob': 0.3210647191079797,
 'draw_prob': 0.2548989902131123,
 'away_win_prob': 0.4228742528690706}