In [2]:
import json
import pandas as pd
import os

# Load player data
with open("../data/raw/bootstrap-static.json") as f:
    data = json.load(f)

players_df = pd.DataFrame(data["elements"])
teams_df = pd.DataFrame(data["teams"])

# Create team ID → team name mapping
team_id_to_name = dict(zip(teams_df["id"], teams_df["name"]))

# Group players by team ID
team_players = players_df.groupby("team")["id"].apply(list).to_dict()

In [4]:
def load_player_history(player_id):
    filepath = f"../data/raw/player_histories/player_{player_id}.json"
    if os.path.exists(filepath):
        with open(filepath) as f:
            return json.load(f)["history"]
    return []

In [21]:
from collections import defaultdict

def compute_team_form_home_away(last_n_home=3, last_n_away=3):
    team_stats = defaultdict(lambda: {
        "goals_scored_home": [],
        "goals_conceded_home": [],
        "goals_scored_away": [],
        "goals_conceded_away": []
    })

    for team_id, player_ids in team_players.items():
        # Group stats by fixture (to avoid double-counting goals from multiple players)
        fixture_stats_home = defaultdict(lambda: {"scored": 0, "conceded": 0})
        fixture_stats_away = defaultdict(lambda: {"scored": 0, "conceded": 0})

        for pid in player_ids:
            history = load_player_history(pid)

            for match in history:
                fixture_id = match["fixture"]
                if match["was_home"]:
                    fixture_stats_home[fixture_id]["scored"] += match["goals_scored"]
                    fixture_stats_home[fixture_id]["conceded"] += match["goals_conceded"]
                else:
                    fixture_stats_away[fixture_id]["scored"] += match["goals_scored"]
                    fixture_stats_away[fixture_id]["conceded"] += match["goals_conceded"]

        # Get only the last N fixtures
        home_matches = list(fixture_stats_home.values())[-last_n_home:]
        away_matches = list(fixture_stats_away.values())[-last_n_away:]

        # Calculate averages
        def avg(lst, key): return round(sum(m[key] for m in lst) / len(lst), 2) if lst else 0

        team_stats[team_id]["att_form_home"] = avg(home_matches, "scored")
        team_stats[team_id]["att_form_away"] = avg(away_matches, "scored")
        team_stats[team_id]["def_form_home"] = avg(home_matches, "conceded")
        team_stats[team_id]["def_form_away"] = avg(away_matches, "conceded")

        # Add team name
        team_stats[team_id]["team"] = team_id_to_name[team_id]

    return pd.DataFrame.from_dict(team_stats, orient="index").reset_index(drop=True)

In [24]:
team_form_df = compute_team_form_home_away(last_n_home=3, last_n_away=3)
team_form_df = team_form_df[["team", "att_form_home", "def_form_home", "att_form_away", "def_form_away"]]
team_form_df.sort_values("att_form_home", ascending=False)

Unnamed: 0,team,att_form_home,def_form_home,att_form_away,def_form_away
14,Newcastle,2.0,20.67,1.33,6.67
11,Liverpool,2.0,3.67,2.0,18.33
8,Fulham,1.67,10.33,0.33,10.33
4,Brighton,1.33,16.33,2.0,13.33
17,Spurs,1.33,11.67,1.33,12.67
19,Wolves,1.33,23.0,1.33,14.0
12,Man City,1.33,13.33,0.33,3.67
7,Everton,1.33,14.0,0.67,10.33
15,Nott'm Forest,1.0,2.33,2.67,28.33
13,Man Utd,1.0,9.67,1.67,10.0


In [26]:
# Make a copy just in case
norm_df = team_form_df.copy()

# Normalize attacking form (higher is better)
for col in ["att_form_home", "att_form_away"]:
    norm_df[col + "_norm"] = (
        norm_df[col] - norm_df[col].min()
    ) / (norm_df[col].max() - norm_df[col].min())

In [27]:
for col in ["def_form_home", "def_form_away"]:
    normalized = (norm_df[col] - norm_df[col].min()) / (norm_df[col].max() - norm_df[col].min())
    norm_df[col + "_norm"] = 1 - normalized  # invert so higher = better defense

In [28]:
norm_df

Unnamed: 0,team,att_form_home,def_form_home,att_form_away,def_form_away,att_form_home_norm,att_form_away_norm,def_form_home_norm,def_form_away_norm
0,Arsenal,1.0,7.33,0.67,7.33,0.5,0.250936,0.792528,0.741264
1,Aston Villa,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,Bournemouth,0.33,14.0,1.33,13.0,0.165,0.498127,0.603736,0.541122
3,Brentford,0.33,7.0,1.0,11.0,0.165,0.374532,0.801868,0.611719
4,Brighton,1.33,16.33,2.0,13.33,0.665,0.749064,0.537787,0.529474
5,Chelsea,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
6,Crystal Palace,1.0,10.0,0.33,9.33,0.5,0.123596,0.716954,0.670667
7,Everton,1.33,14.0,0.67,10.33,0.665,0.250936,0.603736,0.635369
8,Fulham,1.67,10.33,0.33,10.33,0.835,0.123596,0.707614,0.635369
9,Ipswich,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [29]:
norm_df.to_csv("../data/processed/team_form_dynamic.csv", index=False)