# Conception

We have Time Series Data (a sequence of events).

    GW1​,GW2​,GW3​...

Machine Learning models (like XGBoost) require Supervised Learning Data (X→y).

    We want to predict Next Week's Points (yt+1​).

    We use Current Week's Stats (Xt​) to do it.

We must construct a Sliding Window:

    Input (X): Stats from Gameweek 1 to 5.

    Target (y): Points in Gameweek 6.

# Data cleaning

## Merging the History

In [18]:
import pandas as pd
from pathlib import Path
import glob

# --- MAP TEAM IDS TO NAMES (The Fix) ---
url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/master_team_list.csv"
teams_db = pd.read_csv(url)

# Create a lookup dictionary: {(Season, ID) -> Team Name}
team_map = teams_db.set_index(['season', 'team']).to_dict()['team_name']

# Apply the map to the dataframe
# We need to ensure 'season' and 'opponent_team' match the index structure
# Note: master_df['opponent_team'] are IDs here
master_df['opponent_team_name'] = master_df.set_index(['season', 'opponent_team']).index.map(team_map)
# Replace IDs with Names where available
master_df['opponent_team'] = master_df['opponent_team_name'].fillna(master_df['opponent_team'])

print("Team IDs successfully translated to Names.")

# 1. Setup Paths
DATA_PATH = Path("../data/raw")

# 2. Load all Season Files
all_files = glob.glob(str(DATA_PATH / "*_merged_gw.csv"))

dfs = []
for filename in all_files:
    # Extract season name from filename
    season_name = Path(filename).name.split('_')[0]
    
    # Read CSV
    # Use the python engine and skip malformed lines so a bad row doesn't break the whole load.
    # on_bad_lines='skip' requires pandas >= 1.3.0
    # Note: low_memory is not supported with engine='python', so it's removed.
    df = pd.read_csv(filename, encoding='latin-1', engine='python', on_bad_lines='skip')
    
    # Add metadata column (so we know which season this row belongs to)
    df['season'] = season_name
    
    dfs.append(df)

# 3. Concatenate into one massive dataset
master_df = pd.concat(dfs, axis=0, ignore_index=True)

# 4. Clean Column Names (Engineering Best Practice)
# Some seasons use 'Kickoff time' vs 'kickoff_time'. We normalize this.
master_df.columns = master_df.columns.str.lower().str.replace(' ', '_')

print(f"Dataset Shape: {master_df.shape}")
print(f"Columns: {master_df.columns.tolist()}")
master_df.head()

Team IDs successfully translated to Names.
Dataset Shape: (120220, 43)
Columns: ['name', 'position', 'team', 'xp', 'assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves', 'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points', 'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards', 'gw', 'season', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'expected_goals_conceded', 'starts', 'modified']


Unnamed: 0,name,position,team,xp,assists,bonus,bps,clean_sheets,creativity,element,...,was_home,yellow_cards,gw,season,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,starts,modified
0,Aaron Connolly,FWD,Brighton,0.5,0,0,-3,0,0.3,78,...,True,0,1,2020-21,,,,,,
1,Aaron Cresswell,DEF,West Ham,2.1,0,0,11,0,11.2,435,...,True,0,1,2020-21,,,,,,
2,Aaron Mooy,MID,Brighton,0.0,0,0,0,0,0.0,60,...,True,0,1,2020-21,,,,,,
3,Aaron Ramsdale,GK,Sheffield Utd,2.5,0,0,12,0,0.0,483,...,True,0,1,2020-21,,,,,,
4,Abdoulaye DoucourÃ©,MID,Everton,1.3,0,0,20,1,44.6,512,...,False,0,1,2020-21,,,,,,


## Advanced Feature Construction.

In [19]:
# --- 0. SETUP & CLEANING ---
# 1. Convert to datetime
master_df['kickoff_time'] = pd.to_datetime(master_df['kickoff_time'], errors='coerce')

# 2. STRIP TIMEZONES (Crucial for merging)
master_df['kickoff_time'] = master_df['kickoff_time'].dt.tz_localize(None)

# 3. FIX COLUMN NAMES (The 'KeyError' Fix)
# Historical data uses 'value', Live data uses 'now_cost'. We normalize to 'now_cost'.
if 'value' in master_df.columns:
    master_df = master_df.rename(columns={'value': 'now_cost'})

# 4. GLOBAL SORT
master_df = master_df.sort_values(by=['kickoff_time'])

# 5. RESET (Make this cell re-runnable)
cols_to_drop = ['opp_def_strength_vs_pos', 'xp_delta', 'naive_xP', 'last_3_xp_delta', 
                'team_form', 'opp_form', 'team_strength_diff']
master_df = master_df.drop(columns=[c for c in cols_to_drop if c in master_df.columns], errors='ignore')
# --- INSERT THIS BLOCK AFTER LOADING master_df ---

print("0. Mapping Team IDs to Names...")

# 6. Download the official Team Master List
url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/master_team_list.csv"
teams_db = pd.read_csv(url)

# 7. Create a lookup dictionary: {(Season, ID) -> Team Name}
# We need to map (Season='2020-21', ID=1) -> 'Arsenal'
# Because ID 1 changes every year.
team_map = teams_db.set_index(['season', 'team'])['team_name'].to_dict()

# 8. Create a helper column for mapping
# We zip season and opponent_team to match the dictionary keys
master_df['temp_id'] = list(zip(master_df['season'], master_df['opponent_team']))

# 9. Map the names
master_df['opponent_team_name'] = master_df['temp_id'].map(team_map)

# 10. Fill missing values (Safety net)
# If the map fails, keep the ID so we don't crash, but warn us
master_df['opponent_team'] = master_df['opponent_team_name'].fillna(master_df['opponent_team'])

# 11. Clean up
master_df = master_df.drop(columns=['temp_id', 'opponent_team_name'])

print(f"Mapping Complete. Example Opponents: {master_df['opponent_team'].unique()[:5]}")

# --- PATCH FOR 2024-25 SEASON ---
# The generic map sometimes misses the current active season.
# We hardcode the 2024-25 IDs to ensure consistency.

patch_map_2425 = {
    1: 'Arsenal', 2: 'Aston Villa', 3: 'Bournemouth', 4: 'Brentford', 
    5: 'Brighton', 6: 'Chelsea', 7: 'Crystal Palace', 8: 'Everton', 
    9: 'Fulham', 10: 'Ipswich', 11: 'Leicester', 12: 'Liverpool', 
    13: 'Man City', 14: 'Man Utd', 15: 'Newcastle', 16: "Nott'm Forest", 
    17: 'Southampton', 18: 'Spurs', 19: 'West Ham', 20: 'Wolves'
}

# Apply patch ONLY to rows where season is 2024-25 and opponent_team is numeric
def patch_teams(row):
    if row['season'] == '2024-25' and str(row['opponent_team']).isdigit():
        return patch_map_2425.get(int(row['opponent_team']), row['opponent_team'])
    return row['opponent_team']

master_df['opponent_team'] = master_df.apply(patch_teams, axis=1)

print(f"Final Cleanup. Remaining numeric opponents: {master_df[master_df['opponent_team'].astype(str).str.isdigit()]['opponent_team'].unique()}")
# --- 1. THE VULNERABILITY MATRIX ---
print("1. Calculating Defensive Vulnerabilities...")
df_opp = master_df.sort_values(by=['opponent_team', 'position', 'kickoff_time'])

def calculate_opponent_weakness(df, window=5):
    # We use 'total_points' to see how much the opponent conceded
    return df.groupby(['opponent_team', 'position'])['total_points'].transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=2).mean()
    ).fillna(0)

master_df['opp_def_strength_vs_pos'] = calculate_opponent_weakness(df_opp, window=5)


# --- 2. THE REGRESSION DELTA (Luck Detection) ---
print("2. Calculating Luck/Regression...")
# Heuristic: ~100 Threat is a Goal (5pts), ~200 Creativity is an Assist (3pts)
# We fillna(0) to prevent errors with missing stats in very old data
master_df['threat'] = pd.to_numeric(master_df['threat'], errors='coerce').fillna(0)
master_df['creativity'] = pd.to_numeric(master_df['creativity'], errors='coerce').fillna(0)

master_df['naive_xP'] = (master_df['threat'] / 15.0) + (master_df['creativity'] / 30.0) + 2.0 
master_df['xp_delta'] = master_df['total_points'] - master_df['naive_xP']

master_df['last_3_xp_delta'] = master_df.groupby('name')['xp_delta'].transform(
    lambda x: x.shift(1).rolling(window=3).mean()
).fillna(0)


# --- 3. TEAM STRENGTH DIFF ---
print("3. Calculating Team Strength Differences...")

# Calculate Team Daily Points
team_points = master_df.groupby(['team', 'kickoff_time'])['total_points'].sum().reset_index()

# Calculate Rolling Form for the TEAM
team_points['team_form'] = team_points.groupby('team')['total_points'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=2).mean()
)

# Merge 1: Attach 'team_form' to the Player's Team
master_df = master_df.merge(team_points[['team', 'kickoff_time', 'team_form']], 
                            on=['team', 'kickoff_time'], how='left')

# Merge 2: Attach 'opp_form' to the Opponent Team
opp_stats = team_points[['team', 'kickoff_time', 'team_form']].rename(
    columns={'team': 'opponent_team', 'team_form': 'opp_form'}
)
# Ensure both columns are the same type (string)
master_df['opponent_team'] = master_df['opponent_team'].astype(str)
opp_stats['opponent_team'] = opp_stats['opponent_team'].astype(str)
master_df = master_df.merge(opp_stats, on=['opponent_team', 'kickoff_time'], how='left')

# The Calculation
master_df['team_strength_diff'] = master_df['team_form'] - master_df['opp_form']
master_df['team_strength_diff'] = master_df['team_strength_diff'].fillna(0)


# --- 4. PLAYER FORM & SAVE ---
print("4. Calculating Standard Lags...")
grouped = master_df.groupby('name')
master_df['mean_pts_3'] = grouped['total_points'].transform(lambda x: x.shift(1).rolling(3).mean()).fillna(0)
master_df['mean_threat_3'] = grouped['threat'].transform(lambda x: x.shift(1).rolling(3).mean()).fillna(0)
master_df['mean_creativity_3'] = grouped['creativity'].transform(lambda x: x.shift(1).rolling(3).mean()).fillna(0)
master_df['mean_mins_3'] = grouped['minutes'].transform(lambda x: x.shift(1).rolling(3).mean()).fillna(0)

# Normalize Cost (Now guaranteed to exist)
master_df['now_cost'] = master_df['now_cost'] / 10.0


# --- 5. FINAL SELECTION ---
final_cols = [
    'name', 'season', 'kickoff_time', 
    'position', 'was_home', 'opponent_team', 'now_cost',
    'opp_def_strength_vs_pos',  # Tactical
    'last_3_xp_delta',          # Luck
    'team_strength_diff',       # Strength
    'mean_pts_3', 'mean_threat_3', 'mean_creativity_3', 'mean_mins_3',
    'total_points'
]

train_df = master_df[final_cols].dropna()

# Save
output_path = Path("../data/processed/training_data_advanced.csv")
train_df.to_csv(output_path, index=False)

print(f"\nSuccess! Advanced Training Data Saved: {len(train_df)} rows.")

0. Mapping Team IDs to Names...
Mapping Complete. Example Opponents: ['Fulham' 'Arsenal' 'Southampton' 'Crystal Palace' 'Leeds']
Final Cleanup. Remaining numeric opponents: []
1. Calculating Defensive Vulnerabilities...
2. Calculating Luck/Regression...
3. Calculating Team Strength Differences...
4. Calculating Standard Lags...

Success! Advanced Training Data Saved: 120220 rows.


In [20]:
# Check unique values first to see exactly how they are spelled
print(train_df['opponent_team'].unique())

# Then run your specific check
check = train_df[
    (train_df['opponent_team'].astype(str).str.contains('Sheffield')) & 
    (train_df['position'] == 'MID')
].sort_values('kickoff_time', ascending=False).head(5)

print(check[['kickoff_time', 'opponent_team', 'opp_def_strength_vs_pos', 'total_points']])

['Fulham' 'Arsenal' 'Southampton' 'Crystal Palace' 'Leeds' 'Liverpool'
 'West Ham' 'Newcastle' 'West Brom' 'Leicester' 'Spurs' 'Everton'
 'Sheffield Utd' 'Wolves' 'Brighton' 'Chelsea' 'Man Utd' 'Burnley'
 'Aston Villa' 'Man City' 'Brentford' 'Watford' 'Norwich' "Nott'm Forest"
 'Bournemouth' 'Luton' 'Ipswich']
              kickoff_time  opponent_team  opp_def_strength_vs_pos  \
105937 2024-05-19 15:00:00  Sheffield Utd                      0.2   
105672 2024-05-19 15:00:00  Sheffield Utd                      2.2   
105179 2024-05-19 15:00:00  Sheffield Utd                      1.6   
105243 2024-05-19 15:00:00  Sheffield Utd                      1.4   
105318 2024-05-19 15:00:00  Sheffield Utd                      4.0   

        total_points  
105937             0  
105672             7  
105179             0  
105243            16  
105318             3  
