In [14]:
import json
import pandas as pd
from pathlib import Path

# Locate the repository root. When the working directory is the `notebooks/`
# folder, step up one level so the `data/` paths below resolve correctly.
project_root = Path().resolve()
if project_root.name == 'notebooks':
    project_root = project_root.parent

raw_dir = project_root / 'data' / 'raw'
processed_dir = project_root / 'data' / 'processed'
# Create the output directory up front; a no-op when it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)

bootstrap_path = raw_dir / 'fpl_bootstrap.json'
parquet_path = raw_dir / 'fpl_histories_SAMPLE.parquet'
understat_path = raw_dir / 'understat_players_2023.csv'
fixtures_path = raw_dir / 'fpl_fixtures.json'

# Inputs the notebook cannot run without, keyed by the `--resource` name that
# is echoed in the error message below.
required_inputs = {
    'fpl_bootstrap': bootstrap_path,
    'fpl_histories': parquet_path,
    'understat_players': understat_path,
}
missing = [
    (resource, path)
    for resource, path in required_inputs.items()
    if not path.exists()
]

# Fail once, listing every absent file, instead of stopping at the first one.
if missing:
    msg = "\n".join(
        f"Missing {path}. Run: python -m src.data_fetch --resource {resource} --out {path}"
        for resource, path in missing
    )
    raise FileNotFoundError(msg)


# Decode the JSON explicitly as UTF-8: without an `encoding` argument,
# `read_text()` falls back to the locale's preferred encoding, which can
# corrupt or reject non-ASCII text on platforms whose default is not UTF-8.
bootstrap = json.loads(bootstrap_path.read_text(encoding='utf-8'))
fpl_hist = pd.read_parquet(parquet_path)
understat = pd.read_csv(understat_path)

# Fixtures are optional: when the file is absent, `fixtures` is set to None
# and a notice is printed instead of raising.
if fixtures_path.exists():
    fixtures = pd.read_json(fixtures_path)
else:
    fixtures = None
    print('[info] fixtures file not found; skipping fixtures merge')


In [15]:
# Lookup tables built from the bootstrap payload. The `id` columns of the team
# and position tables are renamed so each table has a distinct key name.
elements = pd.DataFrame(bootstrap['elements'])
teams = pd.DataFrame(bootstrap['teams']).rename(columns={'id': 'team_id'})
positions = pd.DataFrame(bootstrap['element_types']).rename(columns={'id': 'position_id'})

# Only the columns needed for the joins and the final output are carried over.
player_lookup = elements[['id', 'web_name', 'team', 'element_type']]
team_lookup = teams[['team_id', 'name', 'short_name']]
position_lookup = positions[['position_id', 'singular_name_short']]

# Left joins keep every history row, even when a lookup has no matching key.
# Join order matters: `team` and `element_type` only exist after the player
# lookup has been merged in.
fpl_hist = (
    fpl_hist
    .merge(player_lookup, how='left', left_on='element', right_on='id')
    .merge(
        team_lookup,
        how='left',
        left_on='team',
        right_on='team_id',
        # Existing columns keep their names; clashing team columns get `_team`.
        suffixes=('', '_team'),
    )
    .merge(position_lookup, how='left', left_on='element_type', right_on='position_id')
)

# Friendlier output names for the merged-in columns.
readable_names = {
    'web_name': 'player_name',
    'name': 'team_name',
    'short_name': 'team_short_name',
    'singular_name_short': 'position_short',
}
# Right-hand join keys that are no longer needed once the merges are done.
join_key_cols = ['id', 'team_id', 'position_id']

fpl_hist = fpl_hist.rename(columns=readable_names).drop(columns=join_key_cols)


In [16]:
def _rolling_player_mean(frame, column, window=3):
    """Return the per-player rolling mean of `column`, indexed like `frame`.

    Rows are grouped by `element` and averaged over the trailing `window` rows
    in the frame's current order. `min_periods=1` means the first rows of each
    player are averaged over however many rows are available so far. The group
    level is dropped from the result so it aligns with `frame` on assignment.
    """
    per_player = frame.groupby('element')[column]
    trailing_mean = per_player.rolling(window, min_periods=1).mean()
    return trailing_mean.reset_index(level=0, drop=True)


# Order each player's rows by round so the trailing window follows round order.
fpl_hist = fpl_hist.sort_values(['element', 'round'])

# The window ends at the current row, so each value includes that round's own
# figure (for a player's first row the mean equals the value itself).
# NOTE(review): if these columns are meant to predict the same round's
# `total_points`, the window would need shifting by one round -- confirm intent.
fpl_hist['points_last3'] = _rolling_player_mean(fpl_hist, 'total_points')
fpl_hist['minutes_last3'] = _rolling_player_mean(fpl_hist, 'minutes')


In [17]:
# Rename the Understat columns to the names used by the join below.
understat = understat.rename(columns={'player_name': 'understat_name', 'team_title': 'understat_team'})

# Optional lookup file linking FPL `element` ids to Understat player names.
mapping_path = project_root / 'data' / 'mappings' / 'fpl_understat_map.csv'
if mapping_path.exists():
    player_map = pd.read_csv(mapping_path)
else:
    print('[info] mapping file not found; creating fallback using player names.')
    # Fallback: reuse the FPL `player_name` as the Understat name. Rows only
    # pick up Understat values when the two sources spell the name identically.
    player_map = (
        fpl_hist[['element', 'player_name']]
        .drop_duplicates()
        .rename(columns={'player_name': 'understat_name'})
    )

# Both columns are selected just below, so validate both here: a mapping file
# lacking either one then fails with a clear message rather than a bare KeyError.
for required_col in ('understat_name', 'element'):
    if required_col not in player_map.columns:
        raise ValueError(f'player_map must contain an "{required_col}" column.')

player_map = player_map[['element', 'understat_name']].drop_duplicates()

# The joins below are left joins, so names without an Understat row silently
# yield NaN in the Understat columns. Report how many mapped names are affected.
unmatched_names = ~player_map['understat_name'].isin(understat['understat_name'])
if unmatched_names.any():
    print(
        f'[warn] {int(unmatched_names.sum())} of {len(player_map)} mapped players '
        'have no Understat match; their Understat columns will be NaN.'
    )

# NOTE(review): if `player_map` holds several names for one `element`, or
# `understat` holds several rows for one `understat_name`, these joins multiply
# player-week rows -- confirm both keys are unique in the source data.
fpl_hist = fpl_hist.merge(player_map, on='element', how='left')
fpl_hist = fpl_hist.merge(
    understat[['understat_name', 'understat_team', 'xG', 'xA', 'shots', 'key_passes']],
    on='understat_name',
    how='left'
)


[info] mapping file not found; creating fallback using player names.


In [18]:
# Columns written to the processed player-week file, in output order.
feature_cols = [
    # identifiers
    'element', 'round',
    'player_name', 'team_name', 'team_short_name', 'position_short',
    # FPL history values and the rolling means derived from them
    'total_points', 'points_last3',
    'minutes', 'minutes_last3',
    'goals_scored', 'assists', 'clean_sheets',
    # Understat columns
    'xG', 'xA', 'shots', 'key_passes',
]

# Stop before writing anything if an expected column is absent.
available_cols = set(fpl_hist.columns)
missing_cols = [col for col in feature_cols if col not in available_cols]
if missing_cols:
    raise KeyError(f'Missing expected columns: {missing_cols}')

# Copy the selection so `processed` is independent of `fpl_hist`.
processed = fpl_hist.loc[:, feature_cols].copy()
output_path = processed_dir / 'fpl_player_weeks.parquet'
processed.to_parquet(output_path, index=False)
processed.head()


Unnamed: 0,element,round,player_name,team_name,team_short_name,position_short,total_points,points_last3,minutes,minutes_last3,goals_scored,assists,clean_sheets,xG,xA,shots,key_passes
0,1,1,Raya,Arsenal,ARS,GKP,10,10.0,90,90.0,0,0,1,,,,
1,1,2,Raya,Arsenal,ARS,GKP,6,8.0,90,90.0,0,0,1,,,,
2,1,3,Raya,Arsenal,ARS,GKP,2,6.0,90,90.0,0,0,0,,,,
3,1,4,Raya,Arsenal,ARS,GKP,6,4.666667,90,90.0,0,0,1,,,,
4,1,5,Raya,Arsenal,ARS,GKP,2,3.333333,90,90.0,0,0,0,,,,
