## Load data

In [2]:
import pandas as pd

# Processed Arsenal tables, read relative to the notebook's working directory:
#   tm        - team-level match stats
#   pm        - per-player match stats
#   ars_sched - fixture schedule
tm = pd.read_csv("../data/processed/arsenal_team_match_stats.csv")
pm = pd.read_csv("../data/processed/arsenal_player_match_stats.csv")
ars_sched = pd.read_csv("../data/processed/arsenal_schedule.csv")

# Per-match margin: GF minus GA (presumably goals for / goals against).
tm["goal_diff"] = tm["GF"].sub(tm["GA"])

## Prepare schedule mapping

In [4]:
import re

# Normalise header names: trim whitespace, lower-case, spaces -> underscores.
ars_sched.columns = ars_sched.columns.map(
    lambda name: name.strip().lower().replace(" ", "_")
)
# Convert the fixture date column to pandas datetimes.
ars_sched["date"] = ars_sched["date"].pipe(pd.to_datetime)

def parse_score(score):
    """Split a score string such as ``"3-4"`` into a pair of integers.

    The separator may be an ASCII hyphen, an en dash, an em dash, a minus
    sign, or the mojibake sequence produced when a UTF-8 en dash is decoded
    as cp1252. Whitespace around either number is ignored.

    Parameters
    ----------
    score : object
        Raw score value; it is converted with ``str()`` before parsing, so
        missing values (``None``, ``NaN``) are accepted.

    Returns
    -------
    tuple
        ``(first, second)`` as ints, in the order they appear in the string,
        or ``(None, None)`` when no separator is found or either side is not
        an integer.
    """
    text = str(score)
    # Fold every dash variant into a plain hyphen so a single split suffices.
    for dash in ("â€“", "\u2013", "\u2014", "\u2212"):
        text = text.replace(dash, "-")
    if "-" not in text:
        return (None, None)
    first, second = text.split("-", 1)
    try:
        return int(first.strip()), int(second.strip())
    except ValueError:
        # Non-numeric side (e.g. empty string or text): treat as unparseable.
        return (None, None)

# Split each raw score into home/away goal columns (None where unparseable).
hg_ag = ars_sched["score"].apply(parse_score)
ars_sched["home_goals"] = [home for home, _ in hg_ag]
ars_sched["away_goals"] = [away for _, away in hg_ag]

# A fixture counts as an Arsenal home game when the whitespace-trimmed
# home_team is exactly "Arsenal"; everything else is labelled "Away".
arsenal_at_home = ars_sched["home_team"].astype(str).str.strip().eq("Arsenal")
ars_sched["venue"] = arsenal_at_home.map({True: "Home", False: "Away"})

# Re-express each fixture from Arsenal's side. Series.where keeps the first
# series where the mask is True and takes the second elsewhere, replacing the
# previous row-by-row DataFrame.apply(axis=1) calls with vectorised selects.
ars_sched["opponent"] = ars_sched["away_team"].where(
    arsenal_at_home, ars_sched["home_team"]
)
ars_sched["gf"] = ars_sched["home_goals"].where(
    arsenal_at_home, ars_sched["away_goals"]
)
ars_sched["ga"] = ars_sched["away_goals"].where(
    arsenal_at_home, ars_sched["home_goals"]
)

# Derive game_id from the match-report URL only when the schedule has no
# game_id column of its own.
if "match_report" in ars_sched.columns and "game_id" not in ars_sched.columns:
    def extract_game_id(url):
        """Return the path segment that follows ``/matches/`` in ``url``, or None."""
        found = re.search(r"/matches/([^/]+)/", str(url))
        if found is None:
            return None
        return found.group(1)

    ars_sched["game_id"] = ars_sched["match_report"].apply(extract_game_id)

# Preview the derived per-fixture columns.
ars_sched[["date", "venue", "opponent", "gf", "ga", "game_id"]].head()


Unnamed: 0,date,venue,opponent,gf,ga,game_id
0,2016-08-14,Home,Liverpool,3,4,0e815975
1,2016-08-20,Away,Leicester City,0,0,7dcbb8b3
2,2016-08-27,Away,Watford,3,1,b123da8c
3,2016-09-10,Home,Southampton,2,1,a8053d4d
4,2016-09-17,Away,Hull City,4,1,f2805e6a


## Merge game_id into team match table

In [5]:
# Work on a copy of the team table with date parsed and venue title-cased so
# both can be compared with the schedule's values.
tm_clean = tm.assign(
    date=lambda df: pd.to_datetime(df["date"]),
    venue=lambda df: df["venue"].astype(str).str.strip().str.title(),
)

# De-duplicated lookup from fixture attributes to game_id.
sched_map = ars_sched.loc[
    :, ["date", "venue", "opponent", "gf", "ga", "game_id"]
].drop_duplicates()

# Left merge: every team row is kept; game_id is NaN where no fixture matches.
tm_with_id = pd.merge(
    tm_clean,
    sched_map,
    how="left",
    left_on=["date", "venue", "opponent", "GF", "GA"],
    right_on=["date", "venue", "opponent", "gf", "ga"],
)

# Number of team rows left without a game_id.
tm_with_id["game_id"].isna().sum()


np.int64(0)

## Build player minutes feature matrix

In [6]:
# Wide minutes table: one row per game_id, one column per player. Each cell is
# the sum of that player's "min" values for the game, 0 where there are none.
player_minutes = pd.pivot_table(
    pm,
    values="min",
    index="game_id",
    columns="player",
    aggfunc="sum",
    fill_value=0,
)

player_minutes.shape


(304, 80)

In [7]:
# Match-level columns (target and venue), indexed by game_id.
match_features = tm_with_id[["game_id", "goal_diff", "venue"]].set_index("game_id")

# Inner join on the index: keep only game_ids present in both tables.
model_df = match_features.join(player_minutes, how="inner")

model_df.shape


(304, 82)

In [8]:
# Swap the text venue column for a 0/1 indicator (1 = home, 0 = anything else).
model_df = model_df.assign(
    is_home=lambda df: df["venue"].str.lower().eq("home").astype(int)
).drop(columns="venue")


In [9]:
# Features: every column except the target. Target: per-match goal difference.
X = model_df.drop(columns="goal_diff")
y = model_df.loc[:, "goal_diff"]

(X.shape, y.shape)


((304, 81), (304,))

- Each player coefficient $\beta_X$ is read as: **"Holding every other feature constant, one extra minute played by player X is associated with a change of $\beta_X$ in the match goal difference."**

We will not be training a model on this data just yet, 