# Model Selection

Objective: Build and compare models for 3-gameweek forecasting while
          addressing the 61% zero-point distribution problem.

Model Architecture:
1. Naive Regression (Baseline)
2. Filtered Regression (Train only on playing time)
3. Two-Stage Model (Classification + Regression)

Mathematical Foundation:
For zero-inflated data, the expected value decomposes as:

E[Y] = P(Y > 0) Ã— E[Y | Y > 0]

Two-stage approach models these components separately for better accuracy.

## Setup & Configuration


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import lightgbm as lgb

sns.set_style('darkgrid')

# Paths
BASE_DIR = Path.cwd().parent
PROCESSED_DIR = BASE_DIR / "data" / "processed"
FEATURES_FILE = PROCESSED_DIR / "fpl_features_engineered.csv"

# Load
df = pd.read_csv(FEATURES_FILE)
print(f"Loaded: {df.shape}")
print(f"Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

Loaded: (93666, 151)
Memory: 144.2 MB


## 1. Feature Selection

Select features for modeling, excluding:
- Target variables
- IDs and metadata
- High-cardinality strings

In [11]:
# Encode position for analysis later
df['position_encoded'] = df['position'].map({'GK': 0, 'DEF': 1, 'MID': 2, 'FWD': 3})

# Exclude columns
exclude_cols = [
    'total_points',  # Target variable
    'element', 'name', 'season', 'kickoff_time',  # IDs and metadata
    'fixture', 'opponent_team', 'team',  # High cardinality
    'match_score', 'opponent_team_name',  # Derived strings
    'round', 'modified', 'xP', 'starts',  # Redundant/sparse
    'points_above_4', 'points_above_6',  # Intermediate features
]

# Available feature columns
feature_cols = [col for col in df.columns if col not in exclude_cols]

# Separate by category for analysis
rolling_features = [c for c in feature_cols if 'roll' in c]
lag_features = [c for c in feature_cols if 'lag' in c]
momentum_features = [c for c in feature_cols if any(x in c for x in ['momentum', 'streak', 'trend', 'acceleration'])]
position_features = [c for c in feature_cols if 'position' in c or 'vs_' in c]
xg_features = [c for c in feature_cols if 'expected' in c or 'xg' in c]
base_features = [c for c in feature_cols if c not in rolling_features + lag_features + 
                 momentum_features + position_features + xg_features]

print("Feature Categories:")
print(f"  Rolling stats: {len(rolling_features)}")
print(f"  Lags: {len(lag_features)}")
print(f"  Momentum: {len(momentum_features)}")
print(f"  Position: {len(position_features)}")
print(f"  Expected goals: {len(xg_features)}")
print(f"  Base stats: {len(base_features)}")
print(f"  TOTAL: {len(feature_cols)}")

Feature Categories:
  Rolling stats: 79
  Lags: 10
  Momentum: 7
  Position: 6
  Expected goals: 12
  Base stats: 31
  TOTAL: 136


## 2. Temporal Train-Validation-Test Split

In [12]:
def create_temporal_split(df):
    """
    Splits data maintaining strict temporal order.
    
    Train: 2021-22, 2022-23, 2023-24
    Validation: 2024-25 (GW 20-21)
    Test: 2025-26 (GW 1-15, current season)
    
    Returns
    -------
    tuple
        (train_df, val_df, test_df)
    """
    train_mask = df['season'].isin(['2021-22', '2022-23', '2023-24'])
    val_mask = (df['season'] == '2024-25') & (df['GW'] >= 20)
    test_mask = df['season'] == '2025-26'
    
    train_df = df[train_mask].copy()
    val_df = df[val_mask].copy()
    test_df = df[test_mask].copy()
    
    print("Temporal Split:")
    print(f"  Train: {len(train_df):,} observations ({train_df['season'].unique()})")
    print(f"  Val:   {len(val_df):,} observations ({val_df['season'].unique()})")
    print(f"  Test:  {len(test_df):,} observations ({test_df['season'].unique()})")
    
    return train_df, val_df, test_df

In [13]:
train_df, val_df, test_df = create_temporal_split(df)

# Analyze zero-inflation by split
print("\nZero-Point Distribution:")
for name, data in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
    zero_pct = (data['total_points'] == 0).sum() / len(data) * 100
    print(f"  {name}: {zero_pct:.1f}% zeros")

Temporal Split:
  Train: 68,398 observations (['2021-22' '2022-23' '2023-24'])
  Val:   1,435 observations (['2024-25'])
  Test:  11,090 observations (['2025-26'])

Zero-Point Distribution:
  Train: 60.5% zeros
  Val: 61.0% zeros
  Test: 60.5% zeros


## 3. Create Multi-Horizon Targets

In [14]:
def create_targets(df, horizons=[1, 2, 3]):
    """
    Creates target variables for each forecast horizon.
    Also creates binary classification target (will_play).
    
    Returns
    -------
    pd.DataFrame
        DataFrame with target_h1, target_h2, target_h3, will_play_h1, etc.
    """
    df = df.sort_values(['element', 'season', 'GW']).copy()
    
    for h in horizons:
        # Regression target (points)
        df[f'target_h{h}'] = df.groupby('element')['total_points'].shift(-h)
        
        # Classification target (will play?)
        df[f'will_play_h{h}'] = (df.groupby('element')['minutes'].shift(-h) > 0).astype(int)
    
    # Remove rows without targets (last 3 GWs per player)
    df = df.dropna(subset=[f'target_h{h}' for h in horizons])
    
    print(f"Created targets for horizons: {horizons}")
    print(f"Final observations: {len(df):,}")
    
    return df

In [15]:
train_df = create_targets(train_df)
val_df = create_targets(val_df)
test_df = create_targets(test_df)

Created targets for horizons: [1, 2, 3]
Final observations: 65,911
Created targets for horizons: [1, 2, 3]
Final observations: 0
Created targets for horizons: [1, 2, 3]
Final observations: 8,818


## 4. Prepare Feature Matrices

In [None]:
def prepare_features(df, feature_cols):
    """
    Prepares feature matrix with missing value handling.
    
    Returns
    -------
    np.ndarray
        Feature matrix
    """
    X = df[feature_cols].copy()
    
    # Fill missing values
    # Strategy: For tree-based models, explicit NaN handling is better than imputation
    # LightGBM can handle NaN natively, but we'll fill for compatibility
    
    # Fill numeric columns with median
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        X[col] = X[col].fillna(X[col].median())
    
    # Encode categorical (position, was_home)
    if 'position' in X.columns:
        X['position'] = X['position'].map({'GK': 0, 'DEF': 1, 'MID': 2, 'FWD': 3})
    
    if 'was_home' in X.columns:
        X['was_home'] = X['was_home'].astype(int)
    
    return X.values

In [None]:
# Prepare feature matrices
X_train = prepare_features(train_df, feature_cols)
X_val = prepare_features(val_df, feature_cols)
X_test = prepare_features(test_df, feature_cols)

print("Feature matrix shapes:")
print(f"  Train: {X_train.shape}")
print(f"  Val:   {X_val.shape}")
print(f"  Test:  {X_test.shape}")