In [1]:
import pandas as pd
import soccerdata as sd
import numpy as np
import polars as pl
from scipy.stats import weibull_min
from statsbombpy import sb
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import datetime
from lifelines import KaplanMeierFitter, WeibullFitter
from lifelines.statistics import logrank_test
from lifelines.utils import restricted_mean_survival_time
from sklearn.utils import resample
from mplsoccer import Pitch
import warnings
warnings.filterwarnings("ignore")

## Function Definitions

In [2]:
def season_agg(season_team_stats_all_league):
    """Aggregate match-level team stats into one row per league, season and team.

    Parameters
    ----------
    season_team_stats_all_league : pandas.DataFrame
        One row per team per match. The columns read here are ``league``,
        ``season``, ``team``, ``result``, ``GF``, ``GA``, ``xG``, ``xGA``,
        ``Poss``, ``Standard_Sh``, ``Standard_SoT`` and ``Standard_Dist``.
        NOTE: this frame is modified in place: a ``match_no`` column is added
        and ``GF``/``GA``/``xG``/``xGA``/``Poss`` are coerced to numeric
        (unparseable values become NaN).

    Returns
    -------
    pandas.DataFrame
        One row per (season, team) within each league, with summed goals,
        expected goals and shots, mean possession and shot distance, the number
        of played matches, ``Wins``/``Losses``/``Draws`` counts, per-game rates
        and the ``league`` label. ``season`` and ``team`` are regular columns.
    """
    # 1. Number each team's matches within a season, in the order the rows
    #    appear in the input. Written onto the caller's frame.
    season_team_stats_all_league['match_no'] = (
        season_team_stats_all_league.groupby(['season', 'team']).cumcount() + 1
    )

    # 2. Coerce the score, expected-goal and possession columns to numbers.
    for col in ["GF", "GA", "xG", "xGA", "Poss"]:
        season_team_stats_all_league[col] = pd.to_numeric(
            season_team_stats_all_league[col], errors='coerce'
        )

    # 3. Rows without a result are excluded from every aggregate below.
    played_stats = season_team_stats_all_league.dropna(subset=['result'])

    outcome_labels = {'W': 'Wins', 'L': 'Losses', 'D': 'Draws'}
    count_columns = list(outcome_labels.values())

    league_frames = []
    for league in played_stats.league.unique():
        league_stats = played_stats[played_stats['league'] == league]

        # 4. Season totals / averages per team.
        season_totals = league_stats.groupby(['season', 'team']).agg(
            Goals_Scored=("GF", 'sum'),
            Goals_against=("GA", 'sum'),
            Expected_Goals=("xG", 'sum'),
            Expected_Goals_against=("xGA", 'sum'),
            Shots_Taken=("Standard_Sh", 'sum'),
            Shot_OnTarget=("Standard_SoT", 'sum'),
            Average_Possession=("Poss", 'mean'),
            Average_Shot_Distance=("Standard_Dist", 'mean'),
            matches=("result", 'size')
        )

        # 5. Count results per team: rows are (season, team), columns are the
        #    distinct result labels found in this league.
        result_counts = (
            league_stats.groupby(['season', 'team', 'result'])["result"]
            .count()
            .unstack(fill_value=0)
        )
        # An outcome nobody in this league has recorded yet produces no column
        # at all; add it as zeros so every league shares the same schema and
        # the final concat does not introduce NaN counts.
        for outcome in outcome_labels:
            if outcome not in result_counts.columns:
                result_counts[outcome] = 0
        result_counts = result_counts.sort_index(axis=1).rename(columns=outcome_labels)

        # 6. Merge the stats and result counts on the shared index levels.
        league_summary = season_totals.merge(
            result_counts,
            on=['season', 'team'],
            how='left'
        )
        # Only the outcome counts default to 0; a missing average stays NaN
        # instead of being reported as a genuine 0.
        league_summary[count_columns] = league_summary[count_columns].fillna(0)

        # 7. Per-game metrics using the team-specific 'matches' count.
        matches_played = league_summary['matches']
        league_summary['Goals/Game'] = league_summary['Goals_Scored'] / matches_played
        league_summary['Goals Conceded/Game'] = league_summary['Goals_against'] / matches_played
        league_summary['Expected Goals/Game'] = league_summary['Expected_Goals'] / matches_played
        league_summary['Expected Goals Conceded/Game'] = (
            league_summary['Expected_Goals_against'] / matches_played
        )
        league_summary['Shots_Taken/Game'] = league_summary['Shots_Taken'] / matches_played
        league_summary['Shot_OnTarget/Game'] = league_summary['Shot_OnTarget'] / matches_played

        # Add league back for context.
        league_summary['league'] = league

        league_frames.append(league_summary)

    # 8. Concatenate once instead of growing a frame inside the loop; with no
    #    played matches at all, fall back to an empty frame.
    combined = pd.concat(league_frames) if league_frames else pd.DataFrame()

    return combined.reset_index()

## FBref: match-level data collection

In [3]:
# Competitions and seasons to request from FBref through soccerdata.
league_selector = ['ENG-Premier League','ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A']
seasons_list = ['2024/2025','2025/2026']

# Not used in this cell; kept because other cells may reference it.
last_2season_df = pd.DataFrame()

# One merged schedule + shooting frame per (league, season). Each frame is
# collected exactly once and concatenated after the loop, instead of growing
# `match_wise_stats` inside the loop (which previously also appended every
# frame twice).
match_frames = []
for league in league_selector:
    print(league)
    for season in seasons_list:
        # FBREF
        fbref = sd.FBref(league, season)
        temp_goals = fbref.read_team_match_stats(stat_type="schedule").reset_index()
        temp_shooting = fbref.read_team_match_stats(stat_type="shooting").reset_index()
        # Flatten the two-part column labels into "<first>_<second>" names
        # (assumes the shooting table has two-level columns - TODO confirm);
        # labels with an empty second part end in "_", e.g. "game_".
        temp_shooting.columns = [f"{col[0]}_{col[1]}" for col in temp_shooting.columns]

        # Restore the plain key names so they match the schedule table.
        temp_shooting.rename(columns={'game_':'game','team_':'team'},inplace=True)
        # Keep only matches present in both tables and attach the shooting columns.
        temp = pd.merge(temp_goals,temp_shooting[['league_','season_','team','game','Standard_Sh', 'Standard_SoT','Standard_Dist']],
                        how='inner',on=['game','team'])
        match_frames.append(temp)

# A single concat; fall back to an empty frame if nothing was requested.
match_wise_stats = pd.concat(match_frames) if match_frames else pd.DataFrame()

print("Data collection complete.")

ENG-Premier League


ESP-La Liga


FRA-Ligue 1


GER-Bundesliga


ITA-Serie A


Data collection complete.


In [4]:
## Aggregating FBref data per team and season
# Drop exact duplicate rows in place (the first occurrence is kept) before
# building the per-team season summary.
match_wise_stats.drop_duplicates(keep='first', inplace=True)
season_restricted_stats_final = season_agg(season_team_stats_all_league=match_wise_stats)

In [5]:
# Export the match-level frame and the per-team season summary to Excel
# workbooks in the current working directory.
match_wise_stats.to_excel(excel_writer='FBREF - Last 2 Seasons.xlsx')
season_restricted_stats_final.to_excel(excel_writer='FBREF - Processed.xlsx')