In [None]:
import pandas as pd
import soccerdata as sd
import numpy as np
import polars as pl
from scipy.stats import weibull_min
from statsbombpy import sb
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import datetime
from lifelines import KaplanMeierFitter, WeibullFitter
from lifelines.statistics import logrank_test
from lifelines.utils import restricted_mean_survival_time
from sklearn.utils import resample
from mplsoccer import Pitch
import warnings
warnings.filterwarnings("ignore")

## Function Definitions

In [94]:
def season_agg(season_team_stats_all_league):
    """Aggregate match-level team stats into one row per (league, season, team).

    Side effects (kept on purpose, the caller's frame is reused afterwards):
    the input DataFrame is sorted in place by ``date``/``team``, gains a
    ``match_no`` column (1-based running match count per season and team) and
    has its stat columns coerced to numeric (unparseable values become NaN).

    Parameters
    ----------
    season_team_stats_all_league : pandas.DataFrame
        One row per team per match. Needs ``date``, ``team``, ``season``,
        ``league`` and ``result`` plus the stat columns aggregated below
        (``GF``, ``GA``, ``xG``, ``xGA``, ``Poss``, ``Standard_*``,
        ``Tackles_*``, ``Err``, ``Performance_*``).

    Returns
    -------
    pandas.DataFrame
        Season totals, averages, ``Wins``/``Draws``/``Losses`` counts and
        per-game rates, with ``season`` and ``team`` as regular columns and a
        ``league`` column. Rows whose ``result`` is missing (unplayed
        fixtures) are excluded from every aggregate.
    """
    stats = season_team_stats_all_league  # same object on purpose, see docstring

    # 1. Assign match order
    stats.sort_values(['date', 'team'], ascending=True, inplace=True)
    stats['match_no'] = stats.groupby(['season', 'team']).cumcount() + 1

    # 2. Coerce every column that is summed/averaged below to numeric.
    #    An explicit membership check replaces the former bare ``except`` so
    #    that real conversion errors are no longer silently swallowed.
    numeric_cols = [
        "GF", "GA", "xG", "xGA", "Poss",
        "Standard_Sh", "Standard_SoT", "Standard_Dist",
        'Tackles_Tkl', 'Tackles_TklW', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd', 'Err',
        'Performance_CrdY', 'Performance_CrdR', 'Performance_Fls', 'Performance_Fld',
    ]
    for col in numeric_cols:
        if col in stats.columns:
            stats[col] = pd.to_numeric(stats[col], errors='coerce')

    # 3. Only matches that have a result count as played.
    played_stats = stats.dropna(subset=['result'])

    # Season total column -> name of the derived per-game column (order = output order).
    per_game_columns = {
        'Goals_Scored': 'Goals/Game',
        'Goals_against': 'Goals Conceded/Game',
        'Expected_Goals': 'Expected Goals/Game',
        'Expected_Goals_against': 'Expected Goals Conceded/Game',
        'Shots_Taken': 'Shots_Taken/Game',
        'Shot_OnTarget': 'Shot_OnTarget/Game',
        'Tackles_Tkl': 'Tackles_Tkl/Game',
        'Tackles_TklW': 'Tackles_TklW/Game',
        'Tackles_Def_3rd': 'Tackles_Def_3rd/Game',
        'Tackles_Mid_3rd': 'Tackles_Mid_3rd/Game',
        'Tackles_Att_3rd': 'Tackles_Att_3rd/Game',
        'err_opp_shot': 'err_opp_shot/Game',
        'CrdY': 'CrdY/Game',
        'CrdR': 'CrdR/Game',
        'Fouls_Commited': 'Fouls_Commited/Game',
        'Fouls_Drawn': 'Fouls_Drawn/Game',
    }

    league_frames = []
    for league in played_stats['league'].unique():
        league_stats = played_stats[played_stats['league'] == league]

        # 4. Season totals / averages per team.
        season_restricted_stats = league_stats.groupby(['season', 'team']).agg(
            Goals_Scored=("GF", 'sum'),
            Goals_against=("GA", 'sum'),
            Expected_Goals=("xG", 'sum'),
            Expected_Goals_against=("xGA", 'sum'),
            Shots_Taken=("Standard_Sh", 'sum'),
            Shot_OnTarget=("Standard_SoT", 'sum'),
            Average_Possession=("Poss", 'mean'),
            Average_Shot_Distance=("Standard_Dist", 'mean'),
            # Intensity
            Tackles_Tkl=("Tackles_Tkl", 'sum'),
            Tackles_TklW=("Tackles_TklW", 'sum'),
            Tackles_Def_3rd=("Tackles_Def 3rd", 'sum'),
            Tackles_Mid_3rd=("Tackles_Mid 3rd", 'sum'),
            Tackles_Att_3rd=("Tackles_Att 3rd", 'sum'),
            # Errors & fouls
            err_opp_shot=('Err', "sum"),
            CrdY=('Performance_CrdY', "sum"),
            CrdR=('Performance_CrdR', "sum"),
            Fouls_Commited=('Performance_Fls', "sum"),
            Fouls_Drawn=('Performance_Fld', "sum"),
            matches=("result", 'size'),
        )

        # 5. Result counts: one column per result value, indexed by (season, team).
        result_counts = (
            league_stats.groupby(['season', 'team', 'result'])["result"]
            .count()
            .unstack(fill_value=0)
            .rename(columns={'W': 'Wins', 'L': 'Losses', 'D': 'Draws'})
        )
        # A league sample with e.g. no draws would otherwise lack the column
        # entirely and produce a ragged schema across leagues.
        for outcome in ('Wins', 'Draws', 'Losses'):
            if outcome not in result_counts.columns:
                result_counts[outcome] = 0
        count_cols = list(result_counts.columns)

        # 6. Merge the stats and result counts
        final_stats_temp = season_restricted_stats.merge(
            result_counts,
            on=['season', 'team'],
            how='left'
        )
        # Zero-fill only the result counts; a missing average (possession,
        # shot distance) must stay NaN instead of being reported as 0.
        final_stats_temp[count_cols] = final_stats_temp[count_cols].fillna(0)

        # 7. Calculate per-game metrics using the team-specific 'matches' count
        for total_col, rate_col in per_game_columns.items():
            final_stats_temp[rate_col] = final_stats_temp[total_col] / final_stats_temp['matches']
        final_stats_temp['league'] = league

        league_frames.append(final_stats_temp)

    # 8. Concatenate once instead of growing a frame inside the loop.
    if not league_frames:
        # No played matches: same empty result shape as before.
        return pd.DataFrame().reset_index()
    return pd.concat(league_frames).reset_index()

## FBREF

#### Goals, Shooting

In [3]:
def fbref_data_collection(league,season,df):
    """Fetch FBref schedule and shooting match stats for one league-season and append them to ``df``.

    Parameters
    ----------
    league : league identifier passed straight to ``sd.FBref``.
    season : season identifier passed straight to ``sd.FBref``; also written
        verbatim into the ``season`` column of the fetched rows.
    df : pandas.DataFrame
        Rows collected so far; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the newly fetched rows (schedule columns plus
        ``Standard_Sh``, ``Standard_SoT`` and ``Standard_Dist``).
    """
    reader = sd.FBref(league, season)

    schedule = reader.read_team_match_stats(stat_type="schedule").reset_index()

    shooting = reader.read_team_match_stats(stat_type="shooting").reset_index()
    # Collapse each column label into "<first>_<second>"; assumes every label
    # is a tuple with at least two entries. Labels with an empty second entry
    # end in "_", hence the rename of 'game_' / 'team_' right after.
    shooting.columns = [f"{label[0]}_{label[1]}" for label in shooting.columns]
    shooting = shooting.rename(columns={'game_': 'game', 'team_': 'team'})

    shooting_fields = ['league_', 'season_', 'team', 'game',
                       'Standard_Sh', 'Standard_SoT', 'Standard_Dist']
    merged = schedule.merge(shooting[shooting_fields], how='inner', on=['game', 'team'])

    # Tag the rows with the identifiers exactly as the caller supplied them.
    merged['league'] = league
    merged['season'] = season

    return pd.concat([df, merged])

#### Defensive Metrics

In [4]:
def defensive_stats_agg(league,season,df):
    """Fetch FBref defensive and miscellaneous match stats for one league-season and append them to ``df``.

    Parameters
    ----------
    league : league identifier passed straight to ``sd.FBref``.
    season : season identifier passed straight to ``sd.FBref``; also written
        verbatim into the ``season`` column of the fetched rows.
    df : pandas.DataFrame
        Rows collected so far; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per (game, team) holding the tackle, error,
        card and foul columns plus ``league`` and ``season``.
    """
    def _flat_labels(columns):
        # Join the two entries of each column label with "_"; a label whose
        # second entry is empty keeps just its first entry.
        return [f"{top}_{bottom}".strip("_") if bottom else top for top, bottom in columns]

    reader = sd.FBref(league,season)

    defense = reader.read_team_match_stats(stat_type="defense").reset_index()
    defense.columns = _flat_labels(defense.columns)
    defense = defense[['game','team','Tackles_Tkl', 'Tackles_TklW','Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd','Err']]

    misc = reader.read_team_match_stats(stat_type="misc").reset_index()
    misc.columns = _flat_labels(misc.columns)
    misc = misc[['game','team','Performance_CrdY','Performance_CrdR','Performance_Fls','Performance_Fld']]

    # Keep only (game, team) pairs present in both tables.
    combined = defense.merge(misc, how='inner', on=['game','team'])
    combined['league'] = league
    combined['season'] = season

    return pd.concat([df, combined])

## For 5 Leagues, 2 Seasons

In [None]:
# Competitions and seasons to pull from FBref.
league_selector = [
    'ENG-Premier League',
    'ESP-La Liga',
    'FRA-Ligue 1',
    'GER-Bundesliga',
    'ITA-Serie A',
]
seasons_list = ['2024/2025', '2025/2026']

# Accumulators: schedule + shooting rows, and defensive + misc rows.
match_wise_stats, match_wise_stats_2 = pd.DataFrame(), pd.DataFrame()

for league in league_selector:
    print(league)  # progress marker, one line per league
    for season in seasons_list:
        # Each helper returns the accumulator extended with this league-season.
        match_wise_stats = fbref_data_collection(league, season, match_wise_stats)
        match_wise_stats_2 = defensive_stats_agg(league, season, match_wise_stats_2)

print("Data collection complete.")

In [95]:
## Aggregating FBREF data - team, season wise
# Remove exact duplicate rows from both match-level tables.
match_wise_stats = match_wise_stats.drop_duplicates()
match_wise_stats_2 = match_wise_stats_2.drop_duplicates()

# Join the two tables per (game, team); both carry league/season, so keep
# only the copy coming from the right-hand table under the plain names.
match_wise_stats_combined = (
    match_wise_stats
    .merge(match_wise_stats_2, on=['game', 'team'], how='inner')
    .drop(columns=['league_x', 'season_x'])
    .rename(columns={'league_y': 'league', 'season_y': 'season'})
)

# season_agg also sorts match_wise_stats_combined and adds columns to it in place.
season_restricted_stats_final = season_agg(match_wise_stats_combined)

In [96]:
# Export the per-team season aggregates returned by season_agg to an Excel
# workbook (relative path, so it lands in the current working directory).
season_restricted_stats_final.to_excel('1226_FBREF Aggregated.xlsx')

In [97]:
# Export the merged match-level table to an Excel workbook (relative path).
# This frame was passed to season_agg earlier, which modifies it in place.
match_wise_stats_combined.to_excel("1226_FBREF Combined.xlsx")