In [3]:
# Suppress noisy numpy "Mean of empty slice" messages and provide a safe helper
import warnings
import numpy as np
# Only suppress the specific message to avoid hiding other important warnings
warnings.filterwarnings("ignore", message="Mean of empty slice")

def safe_nanmean(a, axis=None):
    """Compute np.nanmean but return np.nan when input is empty or all-NaN.
    This prevents the 'Mean of empty slice' RuntimeWarning from numpy and
    gives a predictable result for downstream code.
    """
    a = np.asarray(a)
    # empty array -> return NaN
    if a.size == 0:
        return np.nan
    # all NaN -> return NaN
    if np.all(np.isnan(a)):
        return np.nan
    return np.nanmean(a, axis=axis)

In [4]:
import pandas as pd

In [5]:
df_2014_2015 = pd.read_html('https://fbref.com/en/comps/32/2014-2015/2014-2015-Primeira-Liga-Stats')[0]

# Keep only the main columns (before the nested ones)
# You can select specific columns you need
main_columns = ['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'Attendance']
df_2014_2015_clean = df_2014_2015[main_columns]

print(df_2014_2015_clean.head())

   Rk        Squad  MP   W   D   L  GF  GA  GD  Pts  Pts/MP  Attendance
0   1      Benfica  34  27   4   3  86  16  70   85    2.50       48521
1   2        Porto  34  25   7   2  74  13  61   82    2.41       30823
2   3  Sporting CP  34  22  10   2  67  29  38   76    2.24       34225
3   4        Braga  34  17   7  10  55  28  27   58    1.71       10775
4   5      Vitória  34  15  10   9  50  35  15   55    1.62       14896


In [6]:
import pandas as pd

# 2014-2015
df_2014_2015 = pd.read_html('https://fbref.com/en/comps/32/2014-2015/2014-2015-Primeira-Liga-Stats')[0]
main_columns = ['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'Attendance']
df_2014_2015_clean = df_2014_2015[main_columns]

# 2015-2016
df_2015_2016 = pd.read_html('https://fbref.com/en/comps/32/2015-2016/2015-2016-Primeira-Liga-Stats')[0]
df_2015_2016_clean = df_2015_2016[main_columns]

# 2016-2017
df_2016_2017 = pd.read_html('https://fbref.com/en/comps/32/2016-2017/2016-2017-Primeira-Liga-Stats')[0]
df_2016_2017_clean = df_2016_2017[main_columns]

# 2017-2018
df_2017_2018 = pd.read_html('https://fbref.com/en/comps/32/2017-2018/2017-2018-Primeira-Liga-Stats')[0]
df_2017_2018_clean = df_2017_2018[main_columns]

# 2018-2019
df_2018_2019 = pd.read_html('https://fbref.com/en/comps/32/2018-2019/2018-2019-Primeira-Liga-Stats')[0]
df_2018_2019_clean = df_2018_2019[main_columns]

# 2019-2020
df_2019_2020 = pd.read_html('https://fbref.com/en/comps/32/2019-2020/2019-2020-Primeira-Liga-Stats')[0]
df_2019_2020_clean = df_2019_2020[main_columns]

# 2020-2021
df_2020_2021 = pd.read_html('https://fbref.com/en/comps/32/2020-2021/2020-2021-Primeira-Liga-Stats')[0]
df_2020_2021_clean = df_2020_2021[main_columns]

# 2021-2022
df_2021_2022 = pd.read_html('https://fbref.com/en/comps/32/2021-2022/2021-2022-Primeira-Liga-Stats')[0]
df_2021_2022_clean = df_2021_2022[main_columns]

# 2022-2023
df_2022_2023 = pd.read_html('https://fbref.com/en/comps/32/2022-2023/2022-2023-Primeira-Liga-Stats')[0]
df_2022_2023_clean = df_2022_2023[main_columns]

# 2023-2024
df_2023_2024 = pd.read_html('https://fbref.com/en/comps/32/2023-2024/2023-2024-Primeira-Liga-Stats')[0]
df_2023_2024_clean = df_2023_2024[main_columns]

# 2024-2025
df_2024_2025 = pd.read_html('https://fbref.com/en/comps/32/2024-2025/2024-2025-Primeira-Liga-Stats')[0]
df_2024_2025_clean = df_2024_2025[main_columns]

# 2025-2026 (if available)
df_2025_2026 = pd.read_html('https://fbref.com/en/comps/32/2025-2026/2025-2026-Primeira-Liga-Stats')[0]
df_2025_2026_clean = df_2025_2026[main_columns]

In [7]:
#Option 1: Loop through multiple seasons
#import pandas as pd

# Dictionary to store all dataframes
seasons = {}

# Main columns we want to keep
main_columns = ['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'Attendance']

# Loop through seasons from 2014-2015 to 2025-2026
for year in range(2014, 2026):
    season = f"{year}-{year+1}"
    url = f'https://fbref.com/en/comps/32/{season}/{season}-Primeira-Liga-Stats'
    
    try:
        # Read the table
        df = pd.read_html(url)[0]
        
        # Keep only main columns
        df_clean = df[main_columns]
        
        # Store in dictionary
        seasons[season] = df_clean
        
        print(f"✓ Successfully loaded {season}")
        
    except Exception as e:
        print(f"✗ Error loading {season}: {e}")

# Combine all seasons only if we have data
if seasons:
    all_seasons = pd.concat([df.assign(Season=season) for season, df in seasons.items()], ignore_index=True)
    print("\nAll seasons combined:")
    print(all_seasons.head(20))
    print(f"\nTotal rows: {len(all_seasons)}")
else:
    print("No data was loaded successfully")

✓ Successfully loaded 2014-2015
✓ Successfully loaded 2015-2016
✗ Error loading 2016-2017: HTTP Error 429: Too Many Requests
✗ Error loading 2017-2018: HTTP Error 429: Too Many Requests
✗ Error loading 2018-2019: HTTP Error 429: Too Many Requests
✗ Error loading 2019-2020: HTTP Error 429: Too Many Requests
✗ Error loading 2020-2021: HTTP Error 429: Too Many Requests
✗ Error loading 2021-2022: HTTP Error 429: Too Many Requests
✗ Error loading 2022-2023: HTTP Error 429: Too Many Requests
✗ Error loading 2023-2024: HTTP Error 429: Too Many Requests
✗ Error loading 2024-2025: HTTP Error 429: Too Many Requests
✗ Error loading 2025-2026: HTTP Error 429: Too Many Requests

All seasons combined:
    Rk              Squad  MP   W   D   L  GF  GA  GD  Pts  Pts/MP  \
0    1            Benfica  34  27   4   3  86  16  70   85    2.50   
1    2              Porto  34  25   7   2  74  13  61   82    2.41   
2    3        Sporting CP  34  22  10   2  67  29  38   76    2.24   
3    4              Bra

In [8]:
# Aggregate available season DataFrames into one DataFrame with a `Years` column
# Make explicit copies before assignment to avoid SettingWithCopyWarning
import pandas as pd

dfs = []
for y in range(2014, 2026):
    var_name = f"df_{y}_{y+1}_clean"
    df = globals().get(var_name)
    if df is None:
        print(f"Warning: {var_name} not found — skipping")
        continue

    # Work on an explicit copy to avoid SettingWithCopyWarning
    df_copy = df.copy()
    df_copy.loc[:, "Years"] = f"{y}-{y+1}"
    dfs.append(df_copy)

# Concatenate all available DataFrames into `full_data`
if dfs:
    full_data = pd.concat(dfs, ignore_index=True)
    print(f"Combined {len(dfs)} seasons into full_data (rows={len(full_data)})")
else:
    full_data = pd.DataFrame()
    print("No season DataFrames were found; full_data is empty")


Combined 12 seasons into full_data (rows=216)


In [9]:
print(full_data)

     Rk        Squad  MP   W   D   L  GF  GA  GD  Pts  Pts/MP  Attendance  \
0     1      Benfica  34  27   4   3  86  16  70   85    2.50     48521.0   
1     2        Porto  34  25   7   2  74  13  61   82    2.41     30823.0   
2     3  Sporting CP  34  22  10   2  67  29  38   76    2.24     34225.0   
3     4        Braga  34  17   7  10  55  28  27   58    1.71     10775.0   
4     5      Vitória  34  15  10   9  50  35  15   55    1.62     14896.0   
..   ..          ...  ..  ..  ..  ..  ..  ..  ..  ...     ...         ...   
211  14     Casa Pia   8   2   2   4   8  13  -5    8    1.00      1248.0   
212  15      Estrela   8   1   4   3   6   8  -2    7    0.88      4647.0   
213  16      Estoril   8   1   3   4  10  13  -3    6    0.75      2829.0   
214  17      Tondela   8   1   2   5   4  14 -10    5    0.63      1552.0   
215  18  AVS Futebol   8   0   1   7   5  20 -15    1    0.13      1429.0   

         Years  
0    2014-2015  
1    2014-2015  
2    2014-2015  
3    20

In [10]:
full_data.to_csv('primeira_all_seasons.csv', index=False)

In [11]:
print(full_data.shape)

(216, 13)


In [12]:
# number of appearances for the past 11 seasons 
# if below 11 then the team was relegated at some point
full_data['Squad'].value_counts()

Squad
Benfica              12
Porto                12
Sporting CP          12
Braga                12
Vitória              12
Rio Ave              11
Boavista             11
Moreirense           11
Marítimo              9
Estoril               9
Paços de Ferreira     8
Arouca                8
Tondela               8
Nacional              7
Famalicão             7
Santa Clara           7
Portimonense          7
Gil Vicente FC        7
Vitória Setúbal       6
Chaves                5
B-SAD                 4
Belenenses            4
Casa Pia              4
Vizela                3
Feirense              3
Aves                  3
Estrela               3
Farense               3
Académica             2
AVS Futebol           2
Penafiel              1
Gil Vicente           1
União                 1
Alverca               1
Name: count, dtype: int64

In [13]:
# Top 3 ranking count per team
full_data['Squad'].groupby(full_data['Rk']).value_counts().head(10)



Rk  Squad      
1   Benfica        5
    Porto          4
    Sporting CP    3
2   Porto          5
    Benfica        4
    Sporting CP    3
3   Sporting CP    4
    Benfica        3
    Porto          3
    Braga          2
Name: count, dtype: int64

## Quick analysis cells

The next cells load the combined dataset (from memory `full_data` if available, otherwise from `data/primeira_all_seasons.csv`). They perform light cleaning and provide several ready-to-run analysis snippets (team presence, top-3 frequency, average metrics, rank volatility, attendance trends). Run them in order.

In [14]:
# Load combined data (prefer in-memory `full_data` if present, else read CSV saved earlier)
import os
import pandas as pd

if 'full_data' in globals():
    df = full_data.copy()
elif os.path.exists('data/primeira_all_seasons.csv'):
    df = pd.read_csv('data/primeira_all_seasons.csv')
else:
    raise FileNotFoundError("No `full_data` in memory and no data/primeira_all_seasons.csv found. Run the scraping cells first.")

# Basic cleaning: coerce numeric columns and normalize column names/types
numeric_cols = ['Rk','MP','W','D','L','GF','GA','GD','Pts','Pts/MP','Attendance']
for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Trim whitespace in team names
if 'Squad' in df.columns:
    df['Squad'] = df['Squad'].astype(str).str.strip()

print('Rows:', len(df))
df.head()

Rows: 216


Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Attendance,Years
0,1,Benfica,34,27,4,3,86,16,70,85,2.5,48521.0,2014-2015
1,2,Porto,34,25,7,2,74,13,61,82,2.41,30823.0,2014-2015
2,3,Sporting CP,34,22,10,2,67,29,38,76,2.24,34225.0,2014-2015
3,4,Braga,34,17,7,10,55,28,27,58,1.71,10775.0,2014-2015
4,5,Vitória,34,15,10,9,50,35,15,55,1.62,14896.0,2014-2015


In [15]:
# --- Analysis snippets: run a block at a time ---
# 1) Team presence counts
appearance_counts = df['Squad'].value_counts().rename_axis('Squad').reset_index(name='Seasons_present')
appearance_counts_sorted = appearance_counts.sort_values('Seasons_present', ascending=False)
print('Top teams by seasons present:')
display(appearance_counts_sorted.head(10))

# 2) Top-3 frequency
top_positions = df.copy()
top_positions['Top3'] = top_positions['Rk'].le(3)
top_positions['Champion'] = top_positions['Rk'].eq(1)
summary = pd.concat([
    top_positions.groupby('Squad')['Top3'].sum().rename('Top3_count'),
    top_positions.groupby('Squad')['Champion'].sum().rename('Championships')
], axis=1).fillna(0).astype(int).sort_values('Top3_count', ascending=False)
print('\nTop-3 / Championships:')
display(summary.head(10))

# 3) Average metrics (rounded to 2 decimal places)
team_stats = df.groupby('Squad').agg({
    'Pts': 'mean',
    'Pts/MP': 'mean',
    'GD': 'mean',
    'GF': 'mean',
    'GA': 'mean'
}).round(2).rename(columns=lambda x: f'avg_{x}')
print('\nTop teams by avg Pts/MP:')
display(team_stats.sort_values('avg_Pts/MP', ascending=False).head(10))

# 4) Rank volatility (rounded to 2 decimal places)
rank_vol = df.groupby('Squad')['Rk'].agg(['count', 'mean', 'std']).rename(columns={'std':'rank_std','count':'n_seasons'})
rank_vol[['mean', 'rank_std']] = rank_vol[['mean', 'rank_std']].round(2)
print('\nRank volatility (lower std = more consistent):')
display(rank_vol.sort_values('rank_std').head(10))

# 5) Attendance by year (league average, rounded to 2 decimal places)
attendance_by_year = df.groupby('Years')['Attendance'].agg(['mean','median','sum','count']).reset_index()
print('\nAttendance by year:')
display(attendance_by_year)

# You can run more specific analyses by copying and modifying the above blocks.

Top teams by seasons present:


Unnamed: 0,Squad,Seasons_present
0,Benfica,12
1,Porto,12
2,Sporting CP,12
3,Braga,12
4,Vitória,12
5,Rio Ave,11
6,Boavista,11
7,Moreirense,11
8,Marítimo,9
9,Estoril,9



Top-3 / Championships:


Unnamed: 0_level_0,Top3_count,Championships
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1
Porto,12,4
Benfica,12,5
Sporting CP,10,3
Braga,2,0
Académica,0,0
AVS Futebol,0,0
B-SAD,0,0
Aves,0,0
Alverca,0,0
Arouca,0,0



Top teams by avg Pts/MP:


Unnamed: 0_level_0,avg_Pts,avg_Pts/MP,avg_GD,avg_GF,avg_GA
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Porto,75.58,2.4,47.42,68.5,21.08
Benfica,76.25,2.39,52.58,75.25,22.67
Sporting CP,73.25,2.31,41.5,67.58,26.08
Braga,60.25,1.85,23.5,55.75,32.25
Vitória,47.83,1.49,4.33,43.17,38.83
Famalicão,39.86,1.35,-2.14,38.0,40.14
Gil Vicente FC,36.57,1.29,-4.86,34.0,38.86
Rio Ave,40.91,1.27,-3.55,37.27,40.82
Moreirense,38.27,1.26,-8.91,34.0,42.91
Alverca,10.0,1.25,-1.0,11.0,12.0



Rank volatility (lower std = more consistent):


Unnamed: 0_level_0,n_seasons,mean,rank_std
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Estrela,3,14.67,0.58
Porto,12,1.92,0.79
Benfica,12,1.83,0.83
Sporting CP,12,2.42,1.08
Famalicão,7,7.43,1.13
Braga,12,4.25,1.29
AVS Futebol,2,17.0,1.41
Vitória Setúbal,6,14.0,1.41
Vitória,12,6.42,1.73
Académica,2,16.5,2.12



Attendance by year:


Unnamed: 0,Years,mean,median,sum,count
0,2014-2015,10203.333333,4243.0,183660.0,18
1,2015-2016,11086.5,4763.0,199557.0,18
2,2016-2017,11904.111111,3875.5,214274.0,18
3,2017-2018,11869.666667,4009.0,213654.0,18
4,2018-2019,11669.833333,4375.5,210057.0,18
5,2019-2020,7867.0,3098.5,141606.0,18
6,2020-2021,66.5,66.5,133.0,2
7,2021-2022,7762.666667,3279.5,139728.0,18
8,2022-2023,11651.333333,4194.5,209724.0,18
9,2023-2024,12115.444444,4131.0,218078.0,18


# Primeira Liga ML Prediction Workflow

This workflow will:
- Use only completed seasons for training
- Engineer features based on recent performance
- Train a regression model to predict points
- Estimate win probabilities for 2025-2026
- Validate that top 3 predictions match historical averages

In [16]:
# Prepare data for ML models
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Create features from historical data
def create_features(data):
    features = data.copy()
    
    # Create more comprehensive features
    # Previous season stats
    features['prev_pts'] = features.groupby('Squad')['Pts'].shift(1)
    features['prev_gd'] = features.groupby('Squad')['GD'].shift(1)
    features['prev_rank'] = features.groupby('Squad')['Rk'].shift(1)
    features['prev_wins'] = features.groupby('Squad')['W'].shift(1)
    
    # Last 2 seasons averages
    features['avg_pts_2'] = features.groupby('Squad')['Pts'].rolling(window=2).mean().reset_index(0, drop=True)
    features['avg_gd_2'] = features.groupby('Squad')['GD'].rolling(window=2).mean().reset_index(0, drop=True)
    features['avg_wins_2'] = features.groupby('Squad')['W'].rolling(window=2).mean().reset_index(0, drop=True)
    
    # Win ratio and scoring efficiency from previous season
    features['prev_win_ratio'] = features.groupby('Squad')['W'].shift(1) / features['MP']
    features['prev_scoring_rate'] = features.groupby('Squad')['GF'].shift(1) / features['MP']
    
    # Fill NaN values with median (more robust than mean)
    for col in features.columns:
        if col not in ['Squad', 'Years'] and features[col].dtype in ['float64', 'int64']:
            features[col] = features[col].fillna(features[col].median())
    
    return features

# Prepare the data
ml_data = create_features(full_data)

# Remove 2025-2026 season for prediction
train_data = ml_data[ml_data['Years'] != '2025-2026'].copy()
predict_data = ml_data[ml_data['Years'] == '2025-2026'].copy()

# Enhanced feature set
feature_cols = [
    'prev_pts', 'prev_gd', 'prev_rank', 'prev_wins',
    'avg_pts_2', 'avg_gd_2', 'avg_wins_2',
    'prev_win_ratio', 'prev_scoring_rate'
]

X = train_data[feature_cols]
y_points = train_data['Pts']
y_rank = train_data['Rk']

# Scale features for better model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_points_train, y_points_test = train_test_split(X_scaled, y_points, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("\nFeatures used:", feature_cols)
print("\nHistorical points distribution:")
print(f"Min points: {y_points.min():.0f}")
print(f"Max points: {y_points.max():.0f}")
print(f"Mean points: {y_points.mean():.1f}")
print(f"Median points: {y_points.median():.1f}")

Training data shape: (158, 9)

Features used: ['prev_pts', 'prev_gd', 'prev_rank', 'prev_wins', 'avg_pts_2', 'avg_gd_2', 'avg_wins_2', 'prev_win_ratio', 'prev_scoring_rate']

Historical points distribution:
Min points: 17
Max points: 91
Mean points: 46.9
Median points: 40.5


In [17]:
# RE-RUN: Robust feature construction + retrain + predict (standalone)
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from scipy.special import softmax
from collections import defaultdict

# Helper: compute simple trend (slope) over last n points; return 0 if insufficient data
def last_n_slope(s, n=3):
    vals = s.dropna().values[-n:]
    if len(vals) < 2:
        return 0.0
    x = np.arange(len(vals))
    slope, _ = np.polyfit(x, vals, 1)  # Calculate slope
    return slope  # Return it!

In [18]:

# 1) Build training set using only completed seasons
completed_seasons = [f"{y}-{y+1}" for y in range(2014, 2025)]
train_df = full_data[full_data['Years'].isin(completed_seasons)].copy()
# teams to predict (from scraped 2025-2026 page if present)
predict_teams = full_data[full_data['Years']=='2025-2026']['Squad'].unique().tolist()

# Ensure numeric columns
for c in ['Pts','GD','W','Rk','MP','GF','GA']:
    if c in train_df.columns:
        train_df[c] = pd.to_numeric(train_df[c], errors='coerce')

In [19]:
# 2) For training rows, compute previous-season features (shift) and multi-season features
train_df = train_df.sort_values(['Squad','Years'])
for col in ['Pts','GD','W','Rk','MP','GF','GA']:
    train_df[f'prev_{col.lower()}'] = train_df.groupby('Squad')[col].shift(1)
# Normalize previous-season column names for clarity and downstream consistency
train_df = train_df.rename(columns={'prev_w':'prev_wins', 'prev_rk':'prev_rank'})

# Last 2 and last 3 season aggregates (rolling means) on raw Pts (for trend and smoothing)
train_df['last2_pts_mean'] = train_df.groupby('Squad')['Pts'].rolling(window=2, min_periods=1).mean().reset_index(0, drop=True)
train_df['last3_pts_mean'] = train_df.groupby('Squad')['Pts'].rolling(window=3, min_periods=1).mean().reset_index(0, drop=True)

# Compute pts trend (slope) per team using last 3 seasons where available
train_df['pts_trend_last3'] = train_df.groupby('Squad')['Pts'].transform(lambda s: s.expanding().apply(lambda x: last_n_slope(pd.Series(x), n=3), raw=False))

# Per-game rates from previous season (if prev_MP available)
train_df['prev_gf_per_mp'] = train_df['prev_gf'] / train_df['prev_mp']
train_df['prev_ga_per_mp'] = train_df['prev_ga'] / train_df['prev_mp']
train_df['prev_gd_per_mp'] = train_df['prev_gd'] / train_df['prev_mp']

# Fill training missing values sensibly: team median then overall median
fill_cols = ['prev_pts','prev_gd','prev_wins','prev_rank','prev_mp','prev_gf','prev_ga','last2_pts_mean','last3_pts_mean','pts_trend_last3','prev_gf_per_mp','prev_ga_per_mp','prev_gd_per_mp']
# ensure columns exist before filling
fill_cols = [c for c in fill_cols if c in train_df.columns]
for col in fill_cols:
    train_df[col] = train_df.groupby('Squad')[col].transform(lambda x: x.fillna(x.median()))
    train_df[col] = train_df[col].fillna(train_df[col].median())


In [20]:
# 3) Create predict_df for 2025-2026 teams. We'll source their 2024-2025 stats when possible
last_season = '2024-2025'
last_stats = train_df[train_df['Years']==last_season][['Squad','prev_pts','prev_gd','prev_wins','prev_rank','prev_mp','prev_gf','prev_ga','last2_pts_mean','last3_pts_mean','pts_trend_last3']].copy()
# rename to consistent feature names (some cols from earlier steps use different names)
last_stats = last_stats.rename(columns={
    'prev_w':'prev_wins',
    'prev_rk':'prev_rank'
})

predict_df = pd.DataFrame({'Squad': predict_teams})
predict_df = predict_df.merge(last_stats, on='Squad', how='left')

# For promoted/new teams missing 2024-25, get their last available season in train_df if any
missing = predict_df['prev_pts'].isna() if 'prev_pts' in predict_df.columns else pd.Series(False, index=predict_df.index)
if missing.any():
    last_avail = train_df.sort_values('Years').groupby('Squad').tail(1)[['Squad'] + [c for c in ['prev_pts','prev_gd','prev_w','prev_rk','prev_mp','prev_gf','prev_ga','last2_pts_mean','last3_pts_mean','pts_trend_last3'] if c in train_df.columns]].rename(columns={'prev_w':'prev_wins','prev_rk':'prev_rank'})
    predict_df = predict_df.merge(last_avail, on='Squad', how='left', suffixes=('','_last'))
    for col in [c for c in ['prev_pts','prev_gd','prev_wins','prev_rank','prev_mp','prev_gf','prev_ga','last2_pts_mean','last3_pts_mean','pts_trend_last3'] if c in predict_df.columns or c+'_last' in predict_df.columns]:
        if col in predict_df.columns and col+'_last' in predict_df.columns:
            predict_df[col] = predict_df[col].fillna(predict_df[col+'_last'])
    # drop helper columns ending with _last
    predict_df = predict_df[[c for c in predict_df.columns if not c.endswith('_last')]]

# Final fallback: overall median from training for any remaining missing features
final_fill_cols = [c for c in ['prev_pts','prev_gd','prev_wins','prev_rank','prev_mp','prev_gf','prev_ga','last2_pts_mean','last3_pts_mean','pts_trend_last3'] if c in predict_df.columns]
for col in final_fill_cols:
    if predict_df[col].isna().any():
        predict_df[col] = predict_df[col].fillna(train_df[col].median())

# Derive per-game rates for predict set where possible
if 'prev_gf' in predict_df.columns and 'prev_mp' in predict_df.columns:
    predict_df['prev_gf_per_mp'] = predict_df['prev_gf'] / predict_df['prev_mp']
if 'prev_ga' in predict_df.columns and 'prev_mp' in predict_df.columns:
    predict_df['prev_ga_per_mp'] = predict_df['prev_ga'] / predict_df['prev_mp']
if 'prev_gd' in predict_df.columns and 'prev_mp' in predict_df.columns:
    predict_df['prev_gd_per_mp'] = predict_df['prev_gd'] / predict_df['prev_mp']

# Diagnostics: ensure we have variance and no identical rows
print('Prediction features (first rows):')
print(predict_df.head(20))
print('\nFeature spreads:')
desc_cols = [c for c in ['prev_pts','prev_gd','prev_wins','prev_rank','last2_pts_mean','last3_pts_mean','pts_trend_last3','prev_gf_per_mp','prev_ga_per_mp','prev_gd_per_mp'] if c in predict_df.columns]
print(predict_df[desc_cols].describe())
# Check for identical rows (quick)
if predict_df[desc_cols].nunique().min() <= 1:
    print('Warning: some features have no variance; consider checking merges/fallbacks')




Prediction features (first rows):
             Squad  prev_pts  prev_gd  prev_wins  prev_rank  prev_mp  prev_gf  \
0            Porto      72.0    36.00       22.0        3.0     34.0     63.0   
1      Sporting CP      90.0    67.00       29.0        1.0     34.0     96.0   
2          Benfica      80.0    49.00       25.0        2.0     34.0     77.0   
3   Gil Vicente FC      36.0   -10.00        9.0       12.0     34.0     42.0   
4       Moreirense      55.0     1.00       16.0        6.0     34.0     36.0   
5        Famalicão      42.0    -4.00       10.0        8.0     34.0     37.0   
6          Vitória      63.0    14.00       19.0        5.0     34.0     52.0   
7            Braga      68.0    21.00       21.0        4.0     34.0     71.0   
8         Nacional      25.0   -29.00        6.0       18.0     34.0     30.0   
9          Alverca      42.0    -7.25       11.0        9.0     34.0     39.0   
10          Arouca      46.0     4.00       13.0        7.0     34.0     54

In [21]:

# 4) Train model on train_df using expanded features
features = [c for c in ['prev_pts','prev_gd','prev_wins','prev_rank','last2_pts_mean','last3_pts_mean','pts_trend_last3','prev_gf_per_mp','prev_ga_per_mp','prev_gd_per_mp'] if c in train_df.columns]
X = train_df[features]
y = train_df['Pts']

# Simple scaler + RF pipeline (scaler used to help any tree-unfriendly features like ratios)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=300, max_depth=14, random_state=42)
model.fit(X_train, y_train)
print(f"MAE on test set: {mean_absolute_error(y_test, model.predict(X_test)):.2f}")

MAE on test set: 2.94


In [22]:

# 5) Predict for 2025-2026
X_pred = predict_df[[f for f in features if f in predict_df.columns]]
X_pred_scaled = scaler.transform(X_pred)
pred_points = model.predict(X_pred_scaled)
# Clip to plausible league range
pred_points = np.clip(pred_points, 10, 100)
predict_df['Predicted_Points'] = np.round(pred_points,1)


In [33]:


# 6) Compute win probabilities but temper them so they're not overly peaked
# We'll increase temperature until the max probability is under a threshold (e.g., 35%) to avoid one-sided distributions
def tempered_probs(points, max_top_prob=0.35, max_temp=20.0):
    temp = 1.0
    probs = None
    while True:
        scaled = points / temp
        p = softmax(scaled)
        if p.max() <= max_top_prob or temp >= max_temp:
            probs = p
            break
        temp *= 1.5
    return probs, temp

win_probs, used_temp = tempered_probs(pred_points, max_top_prob=0.35)
predict_df['Predicted_Points'] = np.round(pred_points, 0).astype(int)
predict_df['Win_Probability'] = np.round(win_probs * 100, 2)
results = predict_df[['Squad','Predicted_Points','Win_Probability']].sort_values('Predicted_Points', ascending=False).reset_index(drop=True)
print('\nPredictions:')
display(results.style.hide(axis='index').format({'Win_Probability': '{:.2f}%'}))
print(f'\nUsed temperature for softmax: {used_temp:.2f}')


Predictions:


Squad,Predicted_Points,Win_Probability
Sporting CP,83,26.11%
Benfica,80,21.82%
Porto,73,13.94%
Braga,65,9.12%
Vitória,56,5.24%
Famalicão,47,3.03%
Casa Pia,44,2.54%
Alverca,42,2.37%
Santa Clara,42,2.26%
Moreirense,42,2.28%



Used temperature for softmax: 17.09


In [24]:


# 7) Compare predicted top-3 mean to historical top-3 mean
historical_top3 = train_df.groupby('Years').apply(lambda g: g.sort_values('Pts', ascending=False).head(3)['Pts'].mean())
print(f"\nHistorical top-3 mean points: {historical_top3.mean():.2f}")
print(f"Predicted top-3 mean points: {results.head(3)['Predicted_Points'].mean():.2f}")

# 8) Feature importance and diagnostics
importance = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_}).sort_values('Importance', ascending=False)
print('\nFeature Importance:')
display(importance)

# Show per-team features next to predictions
show_cols = ['Squad'] + [f for f in ['prev_pts','prev_gd','prev_wins','prev_rank','last2_pts_mean','last3_pts_mean','pts_trend_last3','prev_gf_per_mp','prev_ga_per_mp','prev_gd_per_mp'] if f in predict_df.columns] + ['Predicted_Points','Win_Probability']
display(predict_df[show_cols].sort_values('Predicted_Points', ascending=False))


Historical top-3 mean points: 80.18
Predicted top-3 mean points: 78.83

Feature Importance:


  historical_top3 = train_df.groupby('Years').apply(lambda g: g.sort_values('Pts', ascending=False).head(3)['Pts'].mean())


Unnamed: 0,Feature,Importance
5,last3_pts_mean,0.491871
4,last2_pts_mean,0.391922
1,prev_gd,0.02762
6,pts_trend_last3,0.022911
9,prev_gd_per_mp,0.017847
3,prev_rank,0.014623
2,prev_wins,0.012741
7,prev_gf_per_mp,0.010988
0,prev_pts,0.006394
8,prev_ga_per_mp,0.003082


Unnamed: 0,Squad,prev_pts,prev_gd,prev_wins,prev_rank,last2_pts_mean,last3_pts_mean,pts_trend_last3,prev_gf_per_mp,prev_ga_per_mp,prev_gd_per_mp,Predicted_Points,Win_Probability
1,Sporting CP,90.0,67.0,29.0,1.0,86.0,82.0,4.0,2.823529,0.852941,1.970588,83.4,26.11
2,Benfica,80.0,49.0,25.0,2.0,80.0,82.333333,-3.5,2.264706,0.823529,1.441176,80.4,21.82
0,Porto,72.0,36.0,22.0,3.0,71.5,76.0,-7.0,1.852941,0.794118,1.058824,72.7,13.94
7,Braga,68.0,21.0,21.0,4.0,67.0,70.666667,-6.0,2.088235,1.470588,0.617647,65.5,9.12
6,Vitória,63.0,14.0,19.0,5.0,58.5,56.666667,0.5,1.529412,1.117647,0.411765,56.0,5.24
5,Famalicão,42.0,-4.0,10.0,8.0,44.5,44.333333,1.5,1.088235,1.205882,-0.117647,46.6,3.03
13,Casa Pia,38.0,-12.0,10.0,9.0,41.5,41.333333,2.0,1.117647,1.470588,-0.352941,43.6,2.54
9,Alverca,42.0,-7.25,11.0,9.0,42.25,41.666667,0.0,1.147059,1.294118,-0.213235,42.4,2.37
4,Moreirense,55.0,1.0,16.0,6.0,47.5,41.333333,5.5,1.058824,1.029412,0.029412,41.8,2.28
12,Santa Clara,22.0,-32.0,5.0,18.0,39.5,39.666667,8.5,0.764706,1.705882,-0.941176,41.6,2.26
