# Exploring the Impact of the Lionesses' Success on the Women's Super League (WSL)
## modelling.ipynb — Time-Series Forecasting
This notebook builds a time-series forecasting model to predict future WSL attendance trends, both league-wide and for individual clubs. 
The dataset contains one attendance value per club per season (home attendance), spanning from 2017/18 to 2024/25.

## Set-up, Imports and Config

In [1]:
# Import
from pathlib import Path
import pandas as pd
import sys

import plotly.io as pio
pio.renderers.default = "iframe"

# Working directory
# Walk upwards from the current working directory until a folder containing a
# `.git` entry is found; that folder is treated as the repository root.
current = Path.cwd()
repo_root = next(
    (candidate for candidate in (current, *current.parents) if (candidate / '.git').exists()),
    None,
)
if repo_root is None:
    # Fail with a clear message instead of a NameError (or a stale `repo_root`
    # left in the kernel by an earlier run) on the sys.path line below.
    raise FileNotFoundError(
        f"Could not find a '.git' entry in {current} or any of its parent folders; "
        "run this notebook from inside the project repository."
    )

# Put the repository root on the import path so the project-local imports
# below resolve. Skip the append when it is already there so that re-running
# the cell does not pile up duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

# Function Imports
from config.general_config import DATA_OUT
from src.data_utils import load_csv_data, drop_columns
from src.modelling import fit_wsl_linear_model
from src.wsl_plots import create_attendance_capacity_forecast_chart

In [2]:
# Config
# File names of the prepared CSV inputs (joined onto DATA_OUT when loaded).
wsl_data_file_name = "combined_wsl_data.csv"          # club-season league table data
nat_data_file_name = "nationality_combined_data.csv"  # player nationality data
stadium_data_file_name = "wsl_stadium_data.csv"       # stadium data incl. capacity

## Data Import and Prep

In [3]:
# Load the three input tables from DATA_OUT, in the same order as before:
# league data, nationality data, stadium data.
df_wsl, df_nationality, df_stadiums = (
    load_csv_data(DATA_OUT / file_name)
    for file_name in (wsl_data_file_name, nat_data_file_name, stadium_data_file_name)
)

In [4]:
# --- Club-season table: league data plus stadium capacity --------------------
# Left join keeps every row of df_wsl; Capacity is NaN where the stadium table
# has no matching (Team, Season) pair.
stadium_capacity = df_stadiums[['Team', 'Capacity', 'Season']]
model_df = df_wsl.copy().merge(
    stadium_capacity,
    how='left',
    left_on=['Club', 'Season'],
    right_on=['Team', 'Season'],
)

# Columns passed to drop_columns for removal from the modelling table
# ('Team' is only the stadium-side join key brought in by the merge above).
cols_to_drop = [
    'Team',
    'Top_Scorer',
    'Top_Scorer_Name',
    'Expected_Goals_Difference_Per_90_Mins',
    'unique_ID',
]
model_df = drop_columns(model_df, cols_to_drop)

# --- Season-level nationality aggregates --------------------------------------
# League-wide player and minute totals per season (all nationalities).
league_totals = df_nationality.groupby('Season').agg(
    Total_Players_League=('Num_Players', 'sum'),
    Total_Minutes_League=('Minutes_Played', 'sum'),
)

# The same totals restricted to rows whose Nationality is 'England', plus the
# mean of the 'FIFA Ranking' column over those rows.
eng = df_nationality.loc[df_nationality['Nationality'] == 'England']
eng_stats = eng.groupby('Season').agg(
    Eng_Num_Players=('Num_Players', 'sum'),
    Eng_Minutes_Played=('Minutes_Played', 'sum'),
    Eng_FIFA_Ranking=('FIFA Ranking', 'mean'),
)

# One row per season: league totals, England stats and England's share of each.
nat_agg = league_totals.merge(eng_stats, on='Season', how='left')
nat_agg = nat_agg.assign(
    pct_players_eng=nat_agg['Eng_Num_Players'] / nat_agg['Total_Players_League'],
    pct_minutes_eng=nat_agg['Eng_Minutes_Played'] / nat_agg['Total_Minutes_League'],
)

# Attach the season-level aggregates to every club-season row.
model_df = model_df.merge(nat_agg, on='Season', how='left')

# --- Euro 2022 indicators ------------------------------------------------------
# euro_winner: 1 for seasons >= 2022, else 0.
# years_since_euro: Season - 2022, floored at 0.
season = model_df['Season']
model_df['euro_winner'] = season.ge(2022).astype(int)
model_df['years_since_euro'] = season.sub(2022).clip(lower=0)

# --- One-hot encode the club ---------------------------------------------------
# drop_first=True removes the dummy for the first club level, so that club is
# the reference category: its rows have every Club_* column equal to 0.
model_df = pd.get_dummies(model_df, columns=['Club'], drop_first=True)

## Modelling

In [5]:
target = 'Attendance'

# Feature groups, concatenated below in the same order as the original list.
performance_features = [
    'Matches', 'Wins', 'Draws', 'Losses',
    'Goals_For', 'Goals_Against', 'Goal_Difference',
    'Points', 'Points_Per_Match',
    'Top_Scorer_Goals',
]
stadium_features = ['Capacity']
nationality_features = [
    'Total_Players_League', 'Total_Minutes_League',
    'Eng_Num_Players', 'Eng_Minutes_Played',
    'Eng_FIFA_Ranking', 'pct_players_eng', 'pct_minutes_eng',
]
euro_and_rank_features = ['euro_winner', 'years_since_euro', 'Rank']
club_dummy_features = [col for col in model_df.columns if col.startswith('Club_')]

features = (
    performance_features
    + stadium_features
    + nationality_features
    + euro_and_rank_features
    + club_dummy_features
)

# Median imputation of missing feature values, column by column.
# NOTE(review): the medians are taken over the whole table, i.e. before the
# train/test split performed by the model-fitting call, and missing target
# values are imputed as well — confirm this is intended.
feature_medians = model_df[features].median()
model_df[features] = model_df[features].fillna(feature_medians)
model_df[target] = model_df[target].fillna(model_df[target].median())

In [6]:
# Fit model
# NOTE(review): split_year / forecast_years are interpreted inside
# fit_wsl_linear_model (presumably train/test cut-off season and forecast
# horizon in seasons) — confirm against src.modelling.
model_options = dict(
    feature_cols=features,
    split_year=2022,
    forecast_years=5,
)
result = fit_wsl_linear_model(model_df, **model_options)

# Extract predictions df
train_df, test_df, future_df = (
    result[frame_key] for frame_key in ("train_df", "test_df", "future_df")
)

# Combine the train, test and forecast rows into one dataframe
full_df = pd.concat([train_df, test_df, future_df], ignore_index=True)

# Rebuild the Club column from the one-hot dummy columns.
# The dummies were created with drop_first=True, so one club (the reference
# category) has no Club_* column and its rows are all zeros. A plain
# idxmax(axis=1) would label those rows with the first dummy column, i.e. with
# the wrong club, so the three cases are handled separately below.
club_prefix = "Club_"
club_cols = [c for c in full_df.columns if c.startswith(club_prefix)]
# Cast to float so bool / object (bool mixed with NaN after concat) columns
# can be summed and compared uniformly.
club_flags = full_df[club_cols].astype(float)

# Start from NaN: rows whose dummies are all missing keep NaN, which is what
# idxmax currently returns for them (with a deprecation warning).
club_names = pd.Series(float("nan"), index=full_df.index, dtype="object")

# Case 1: a dummy is set -> that column's name without the "Club_" prefix.
# idxmax is only applied to these rows, so it never sees an all-NA row.
has_club_flag = club_flags.sum(axis=1) > 0
if has_club_flag.any():
    club_names[has_club_flag] = (
        club_flags.loc[has_club_flag].idxmax(axis=1).str[len(club_prefix):]
    )

# Case 2: dummies are present but none is set -> the reference club, i.e. the
# one club in the raw data that has no dummy column.
# NOTE(review): assumes fit_wsl_linear_model returns the dummies unchanged and
# that all-zero forecast rows also belong to the reference club — confirm.
is_reference_row = club_flags.notna().any(axis=1) & ~has_club_flag
if is_reference_row.any():
    encoded_clubs = {c[len(club_prefix):] for c in club_cols}
    reference_clubs = sorted(set(df_wsl["Club"].dropna().astype(str)) - encoded_clubs)
    if len(reference_clubs) != 1:
        raise ValueError(
            "Cannot identify the reference club dropped by get_dummies(drop_first=True); "
            f"candidates without a dummy column: {reference_clubs}"
        )
    club_names[is_reference_row] = reference_clubs[0]

full_df["Club"] = club_names

# Drop dummy columns
full_df = full_df.drop(columns=club_cols)

# Inspect
full_df.head()


The behavior of DataFrame.idxmax with all-NA values, or any-NA and skipna=False, is deprecated. In a future version this will raise ValueError



Unnamed: 0,Rank,Matches,Wins,Draws,Losses,Goals_For,Goals_Against,Goal_Difference,Points,Points_Per_Match,...,Total_Minutes_League,Eng_Num_Players,Eng_Minutes_Played,Eng_FIFA_Ranking,pct_players_eng,pct_minutes_eng,euro_winner,years_since_euro,Predicted_Attendance,Club
0,1.0,18,13,5,0,44,13,31,44,2.44,...,176533.0,150,106092.0,3.0,0.6,0.6,0.0,0,2221.23,Chelsea
1,2.0,18,12,2,4,51,17,34,38,2.11,...,176533.0,150,106092.0,3.0,0.6,0.6,0.0,0,2305.02,Manchester_City
2,3.0,18,11,4,3,38,18,20,37,2.06,...,176533.0,150,106092.0,3.0,0.6,0.6,0.0,0,1209.97,Aston_Villa
3,4.0,18,9,5,4,40,18,22,32,1.78,...,176533.0,150,106092.0,3.0,0.6,0.6,0.0,0,1627.5,Reading
4,5.0,18,9,3,6,30,18,12,30,1.67,...,176533.0,150,106092.0,3.0,0.6,0.6,0.0,0,530.89,Birmingham_City


In [7]:
# Train / test evaluation metrics returned alongside the fitted model.
# NOTE(review): in the saved output the test R2 is negative (about -0.21) and
# the test MAE (~4,489) is far above the train MAE (~634), so the model does
# not generalise to the held-out rows — re-check after re-running.
combined_metrics_df = result["combined_metrics_df"]
combined_metrics_df.head(n=5)

Unnamed: 0,MAE,MSE,RMSE,R2,ExplainedVariance,MaxError,PctWithin10pct,PctWithin15pct,PctWithin500Attendees,PctWithin1000Attendees
Train,634.261806,639103.2,799.439299,0.688614,0.688614,1815.907556,0.192982,0.263158,0.438596,0.789474
Test,4489.360631,51277670.0,7160.842636,-0.209347,-0.200015,24343.506323,0.111111,0.138889,0.055556,0.138889


In [8]:
# Build the attendance / capacity forecast chart from the combined frame.
# save_fig=True also writes the figure to disk; fig_name is kept as-is because
# it forms part of the saved file name.
chart_options = dict(show_logo=True, save_fig=True, fig_name='forcast_fig')
forcast_fig = create_attendance_capacity_forecast_chart(full_df, **chart_options)
forcast_fig.show()

Saving figure to: C:\Python Stuff\project\AM_Apprenticeship\reports\plots\wsl_project_forcast_fig.html
