# UCL 2025-26 Knockout Playoff Predictor

Predicting which teams advance in the UEFA Champions League 2025-26 knockout playoff round using Elo ratings, historical statistics, and Monte Carlo simulation.

**Matches:** February 17-18 (1st leg) and February 24-25 (2nd leg), 2026

**Format:** Two legs, aggregate score. No away goals rule. If tied: extra time + penalties. Seeded teams (9-16) host the second leg.

## 1. Data Collection

We collect match data from two sources:
- **openfootball/football.json** — domestic leagues (Premier League, La Liga, Bundesliga, Serie A, Ligue 1)
- **openfootball/champions-league** ‚Äî Champions League matches

Both are open-source datasets hosted on GitHub with match results going back to 2014.

In [None]:
import pandas as pd
import numpy as np
import requests
import time
import warnings
from scipy.special import expit
warnings.filterwarnings('ignore')

print("All imports successful!")

In [None]:
# Root of the raw openfootball/football.json repository on GitHub.
BASE = "https://raw.githubusercontent.com/openfootball/football.json/master"

# Season folder names "2014-15" through "2024-25", derived from the start year.
seasons = [f"{start}-{(start + 1) % 100:02d}" for start in range(2014, 2025)]

# openfootball file code -> display name of each domestic league.
leagues = dict([
    ("en.1", "Premier League"),
    ("es.1", "La Liga"),
    ("de.1", "Bundesliga"),
    ("it.1", "Serie A"),
    ("fr.1", "Ligue 1"),
])

def load_league(url, league_name, timeout=30):
    """Download one openfootball JSON file and return its finished matches.

    Args:
        url: Address of a JSON document holding a top-level ``"matches"`` list.
        league_name: Label written to the ``competition`` column of every row.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole download loop.

    Returns:
        A DataFrame with the columns ``date``, ``home``, ``away``, ``gh``,
        ``ga`` and ``competition``, one row per match that has a full-time
        score. An empty DataFrame is returned when the request fails, the
        status is not 200, or the body is not a JSON object.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Callers treat "no rows" as "file not available"; keep that contract
        # for connection errors and timeouts too.
        return pd.DataFrame()
    if response.status_code != 200:
        return pd.DataFrame()

    try:
        payload = response.json()
    except ValueError:
        # Body was not valid JSON (for example an HTML error page).
        return pd.DataFrame()
    if not isinstance(payload, dict):
        return pd.DataFrame()

    rows = []
    for match in payload.get("matches") or []:
        # "score" may be missing or null for fixtures not yet played.
        full_time = (match.get("score") or {}).get("ft")
        if not full_time or len(full_time) < 2:
            continue
        rows.append({
            "date": match["date"],
            "home": match["team1"],
            "away": match["team2"],
            "gh": full_time[0],
            "ga": full_time[1],
            "competition": league_name,
        })
    return pd.DataFrame(rows)

# One DataFrame per successfully downloaded (competition, season) file.
all_data = []

print("Downloading domestic leagues...")
for season in seasons:
    for code, name in leagues.items():
        url = f"{BASE}/{season}/{code}.json"
        df = load_league(url, name)
        if not df.empty:
            all_data.append(df)
            print(f"  ‚úÖ {name} {season}: {len(df)} matches")
        time.sleep(0.3)  # pause between requests

print("\nDownloading Champions League...")
CL_BASE = "https://raw.githubusercontent.com/openfootball/champions-league/master"
for season in seasons:
    # Two candidate file names are tried per season; the first that yields
    # matches wins.
    for cl_file in ["cl.json", "champions-league.json"]:
        url = f"{CL_BASE}/{season}/{cl_file}"
        df = load_league(url, "Champions League")
        if not df.empty:
            all_data.append(df)
            print(f"  ‚úÖ CL {season}: {len(df)} matches")
            break
    time.sleep(0.3)

# pd.concat refuses an empty list with an unhelpful message; fail clearly.
if not all_data:
    raise RuntimeError(
        "No match data could be downloaded; check the network connection "
        "and the openfootball URLs."
    )

df_all = pd.concat(all_data, ignore_index=True)
# Coerce unparseable values to NaT/NaN so the dropna below removes those rows
# instead of the conversion aborting the whole run.
df_all['date'] = pd.to_datetime(df_all['date'], errors='coerce')
df_all['gh'] = pd.to_numeric(df_all['gh'], errors='coerce')
df_all['ga'] = pd.to_numeric(df_all['ga'], errors='coerce')
df_all = df_all.dropna(subset=['date', 'gh', 'ga'])
# Match outcome from the home side's perspective: H(ome), A(way) or D(raw).
df_all['result'] = np.where(df_all['gh'] > df_all['ga'], 'H',
                   np.where(df_all['gh'] < df_all['ga'], 'A', 'D'))
df_all = df_all.sort_values('date').reset_index(drop=True)

print(f"\n{'='*50}")
print(f"Total matches: {len(df_all):,}")
print(f"Period: {df_all['date'].min().date()} ‚Äî {df_all['date'].max().date()}")
print(f"Competitions: {df_all['competition'].unique().tolist()}")

## 2. Elo Rating System

We calculate Elo ratings for every team based on the chess rating system:

$$E_A = \frac{1}{1 + 10^{(R_B - R_A - 100) / 400}}$$

- Starting rating: 1500
- +100 home advantage bonus
- K-factor: 32 (domestic), 48 (Champions League)
- Goal difference multiplier: 1.5x for 2 goals, 1.75x for 3, 2.0x for 4+

In [None]:
def calculate_elo(df, initial_rating=1500, home_advantage=100,
                  k_domestic=32, k_champions_league=48):
    """Replay every match in date order and return the final Elo ratings.

    Args:
        df: Match table with the columns ``date``, ``home``, ``away``, ``gh``,
            ``ga``, ``result`` ('H', 'A' or anything else for a draw) and
            ``competition``.
        initial_rating: Rating given to a team on its first appearance.
        home_advantage: Rating points added to the home side when computing
            the expected score.
        k_domestic: Base K-factor for every competition other than
            'Champions League'.
        k_champions_league: Base K-factor for 'Champions League' matches.

    Returns:
        Dict mapping team name to its rating after the last match.
    """
    ratings = {}
    ordered = df.sort_values('date')

    # Iterating the columns directly avoids building a Series per row.
    fixtures = zip(ordered['home'], ordered['away'], ordered['gh'],
                   ordered['ga'], ordered['result'], ordered['competition'])
    for home, away, goals_home, goals_away, result, competition in fixtures:
        home_rating = ratings.setdefault(home, initial_rating)
        away_rating = ratings.setdefault(away, initial_rating)

        expected_home = 1 / (1 + 10 ** ((away_rating - home_rating - home_advantage) / 400))
        expected_away = 1 - expected_home

        if result == 'H':
            actual_home, actual_away = 1, 0
        elif result == 'A':
            actual_home, actual_away = 0, 1
        else:
            actual_home, actual_away = 0.5, 0.5

        k = k_champions_league if competition == 'Champions League' else k_domestic
        # Larger winning margins move the ratings further.
        margin = abs(goals_home - goals_away)
        if margin == 2:
            k *= 1.5
        elif margin == 3:
            k *= 1.75
        elif margin >= 4:
            k *= 2.0

        ratings[home] = home_rating + k * (actual_home - expected_home)
        ratings[away] = away_rating + k * (actual_away - expected_away)

    return ratings

# Rate every team that appears in the combined match table.
print("Calculating Elo ratings across all matches...")
elo = calculate_elo(df_all)
rated_count = len(elo)
print(f"Done! {rated_count} teams rated.")

In [None]:
# The sixteen clubs in the knockout playoff round.
# NOTE(review): the names containing non-ASCII letters look mis-encoded; as
# written they are unlikely to match any dataset team name and would fall
# back to the 1500 default below. Confirm the file encoding.
playoff_teams = [
    "Monaco",
    "Paris Saint-Germain",
    "Galatasaray",
    "Juventus",
    "Benfica",
    "Real Madrid",
    "Borussia Dortmund",
    "Atalanta",
    "Qarabaƒü",
    "Newcastle United",
    "Club Brugge",
    "Atl√©tico Madrid",
    "Bod√∏/Glimt",
    "Inter",
    "Olympiacos",
    "Bayer Leverkusen",
]

print("\nüèÜ ELO RATINGS ‚Äî PLAYOFF TEAMS")
print("=" * 50)

# A club's rating is the highest rating among all rated teams whose name
# contains the club name (case-insensitive); 1500 when nothing matches.
team_elos = {}
for team in playoff_teams:
    needle = team.lower()
    candidates = [value for club, value in elo.items() if needle in club.lower()]
    team_elos[team] = max(candidates, default=1500)

# Strongest first; one bar character per 10 rating points above 1300.
ranking = sorted(team_elos.items(), key=lambda pair: pair[1], reverse=True)
for team, rating in ranking:
    bar_length = int((rating - 1300) / 10)
    bar = "‚ñà" * bar_length
    print(f"  {rating:.0f}  {bar}  {team}")

## 3. Feature Engineering

For each playoff matchup we compute 15 features:

| # | Feature | Source |
|---|---------|--------|
| 1 | Elo difference | Calculated from 21K+ matches |
| 2 | Win rate difference | Last 30 matches |
| 3 | Goal difference per game | Last 30 matches |
| 4 | Form (last 5 matches) | Points: W=3, D=1, L=0 |
| 5 | CL win rate difference | CL matches among the last 30 |
| 6 | H2H win rate | Last 10 years |
| 7 | H2H goal difference | Last 10 years |
| 8 | Injury impact difference | Expert assessment 0-1 |
| 9 | Home atmosphere (unseeded) | Stadium rating 0-1 |
| 10 | Home atmosphere (seeded) | Stadium rating 0-1 |
| 11 | Knockout experience diff | Expert assessment 0-1 |
| 12 | Squad depth difference | Expert assessment 0-1 |
| 13 | Qualitative form diff | Strength of schedule adjusted |
| 14 | Manager difference | Career CL track record |
| 15 | League phase position diff | Official 2025-26 standings |

In [None]:
def get_stats(df, team, before_date, n=30):
    """Summarise a team's most recent matches played before a given date.

    A match belongs to the team when ``team`` occurs (case-insensitively, as
    plain text rather than as a regular expression) in the home or away name.

    Args:
        df: Match table with the columns ``date``, ``home``, ``away``, ``gh``,
            ``ga``, ``result`` and ``competition``.
        team: Team name, or a fragment of it, to look for.
        before_date: Only matches strictly earlier than this are considered.
        n: Maximum number of most recent matches to use.

    Returns:
        Dict with ``win_rate`` (wins per match), ``goal_diff`` (goal
        difference per match), ``form`` (points from the five most recent
        matches, W=3 D=1 L=0) and ``cl_wr`` (win rate in the Champions League
        matches inside the window, 0.33 when there are none). When no match
        is found the fixed defaults 0.33 / 0.0 / 5 / 0.33 are returned.
    """
    # regex=False: team names are literal text; characters such as "(", "."
    # or "+" must not be interpreted as pattern syntax.
    home_hit = df['home'].str.contains(team, case=False, na=False, regex=False)
    away_hit = df['away'].str.contains(team, case=False, na=False, regex=False)
    recent = (
        df[(home_hit | away_hit) & (df['date'] < before_date)]
        .sort_values('date', ascending=False)
        .head(n)
    )
    if recent.empty:
        return {'win_rate': 0.33, 'goal_diff': 0.0, 'form': 5, 'cl_wr': 0.33}

    needle = team.lower()
    wins = goals_for = goals_against = form = 0
    cl_played = cl_wins = 0
    rows = zip(recent['home'], recent['gh'], recent['ga'],
               recent['result'], recent['competition'])
    for idx, (home, goals_home, goals_away, result, competition) in enumerate(rows):
        # str() guards against a missing home name on a row matched via 'away'.
        is_home = needle in str(home).lower()
        won = result == ('H' if is_home else 'A')

        goals_for += goals_home if is_home else goals_away
        goals_against += goals_away if is_home else goals_home
        if won:
            wins += 1
        if idx < 5:
            # Rows are newest-first, so the first five are the current form.
            form += 3 if won else (1 if result == 'D' else 0)
        if competition == 'Champions League':
            cl_played += 1
            if won:
                cl_wins += 1

    total = len(recent)
    return {
        'win_rate': wins / total,
        'goal_diff': (goals_for - goals_against) / total,
        'form': form,
        'cl_wr': cl_wins / cl_played if cl_played > 0 else 0.33,
    }

def get_h2h(df, team1, team2, before_date):
    """Head-to-head record of ``team1`` against ``team2`` over ten years.

    Only meetings strictly before ``before_date`` and strictly after the date
    ten years earlier are counted. Team names are matched case-insensitively
    as plain text (not as regular expressions) inside the home/away names.

    Args:
        df: Match table with the columns ``date``, ``home``, ``away``, ``gh``,
            ``ga`` and ``result``.
        team1: Team whose perspective the record is reported from.
        team2: The opponent.
        before_date: Upper bound (exclusive) of the ten-year window.

    Returns:
        Dict with ``h2h_wr`` (share of meetings won by ``team1``) and
        ``h2h_gd`` (``team1`` goal difference per meeting). Without any
        meeting the neutral values 0.5 and 0.0 are returned.
    """
    cutoff = before_date - pd.DateOffset(years=10)

    def _contains(column, name):
        # regex=False keeps characters like "(" or "." from being read as
        # pattern syntax.
        return df[column].str.contains(name, case=False, na=False, regex=False)

    paired = ((_contains('home', team1) & _contains('away', team2)) |
              (_contains('home', team2) & _contains('away', team1)))
    in_window = (df['date'] < before_date) & (df['date'] > cutoff)
    meetings = df[paired & in_window]
    if meetings.empty:
        return {'h2h_wr': 0.5, 'h2h_gd': 0.0}

    needle = team1.lower()
    wins = goals_for = goals_against = 0
    rows = zip(meetings['home'], meetings['gh'], meetings['ga'], meetings['result'])
    for home, goals_home, goals_away, result in rows:
        team1_home = needle in str(home).lower()
        goals_for += goals_home if team1_home else goals_away
        goals_against += goals_away if team1_home else goals_home
        if result == ('H' if team1_home else 'A'):
            wins += 1

    total = len(meetings)
    return {'h2h_wr': wins / total, 'h2h_gd': (goals_for - goals_against) / total}

print("Functions defined. ‚úÖ")

In [None]:
# League phase finishing positions (source noted by the author: NBC Sports).
# Clubs are listed in finishing order, starting from 9th place.
league_pos = {
    club: position
    for position, club in enumerate(
        [
            "Real Madrid", "Inter", "Paris Saint-Germain", "Newcastle United",
            "Atalanta", "Atl√©tico Madrid", "Juventus", "Bayer Leverkusen",
            "Club Brugge", "Olympiacos", "Borussia Dortmund", "Galatasaray",
            "Monaco", "Qarabaƒü", "Bod√∏/Glimt", "Benfica",
        ],
        start=9,
    )
}

# Hand-assigned qualitative scores on a 0-1 scale, one row per club.
# Column order: inj, home_atm, ko_exp, depth, form_q, mgr.
qualitative = {
    club: dict(zip(("inj", "home_atm", "ko_exp", "depth", "form_q", "mgr"), scores))
    for club, scores in [
        ("Monaco",              (0.15, 0.55, 0.30, 0.55, 0.50, 0.55)),
        ("Paris Saint-Germain", (0.90, 0.75, 0.85, 0.90, 0.45, 0.80)),
        ("Galatasaray",         (0.20, 0.95, 0.40, 0.55, 0.35, 0.50)),
        ("Juventus",            (0.65, 0.80, 0.70, 0.75, 0.65, 0.60)),
        ("Benfica",             (0.35, 0.80, 0.50, 0.60, 0.60, 0.90)),
        ("Real Madrid",         (0.55, 0.90, 0.90, 0.80, 0.50, 0.50)),
        ("Borussia Dortmund",   (0.30, 0.85, 0.65, 0.65, 0.40, 0.55)),
        ("Atalanta",            (0.05, 0.75, 0.55, 0.65, 0.75, 0.80)),
        ("Qarabaƒü",             (0.20, 0.60, 0.15, 0.30, 0.30, 0.35)),
        ("Newcastle United",    (0.20, 0.85, 0.45, 0.70, 0.65, 0.70)),
        ("Club Brugge",         (0.15, 0.65, 0.35, 0.45, 0.50, 0.50)),
        ("Atl√©tico Madrid",     (0.25, 0.80, 0.80, 0.75, 0.55, 0.85)),
        ("Bod√∏/Glimt",          (0.10, 0.75, 0.10, 0.35, 0.45, 0.45)),
        ("Inter",               (0.15, 0.80, 0.80, 0.80, 0.60, 0.85)),
        ("Olympiacos",          (0.20, 0.75, 0.35, 0.45, 0.60, 0.55)),
        ("Bayer Leverkusen",    (0.20, 0.70, 0.55, 0.70, 0.50, 0.80)),
    ]
}

print("Standings and qualitative data loaded. ‚úÖ")

In [None]:
# Playoff ties as (unseeded, seeded); the unseeded club is the "h_" side and
# the seeded club the "a_" side in the variables below.
# NOTE(review): names with non-ASCII letters look mis-encoded here; they must
# stay identical to the keys of `qualitative` and `league_pos` to be found.
matches = [
    ("Monaco", "Paris Saint-Germain"),
    ("Galatasaray", "Juventus"),
    ("Benfica", "Real Madrid"),
    ("Borussia Dortmund", "Atalanta"),
    ("Qarabaƒü", "Newcastle United"),
    ("Club Brugge", "Atl√©tico Madrid"),
    ("Bod√∏/Glimt", "Inter"),
    ("Olympiacos", "Bayer Leverkusen"),
]

pred_date = pd.Timestamp('2026-02-17')
all_features = []


def _elo_of(club):
    """Highest rating among rated teams whose name contains `club`; 1500 if none."""
    needle = club.lower()
    return max((value for name, value in elo.items() if needle in name.lower()),
               default=1500)


def _gap(first, second, key, fallback):
    """Difference first[key] - second[key], using `fallback` for missing keys."""
    return first.get(key, fallback) - second.get(key, fallback)


print("Building feature matrix...\n")
for unseeded, seeded in matches:
    h_elo = _elo_of(unseeded)
    a_elo = _elo_of(seeded)
    h_stats = get_stats(df_all, unseeded, pred_date)
    a_stats = get_stats(df_all, seeded, pred_date)
    h2h = get_h2h(df_all, unseeded, seeded, pred_date)
    unseeded_qual = qualitative.get(unseeded, {})
    seeded_qual = qualitative.get(seeded, {})
    unseeded_pos = league_pos.get(unseeded, 20)
    seeded_pos = league_pos.get(seeded, 20)

    # Differences are unseeded minus seeded, except the injury and league
    # position terms, which are seeded minus unseeded.
    features = {
        'unseeded': unseeded,
        'seeded': seeded,
        'elo_diff': h_elo - a_elo,
        'wr_diff': h_stats['win_rate'] - a_stats['win_rate'],
        'gd_diff': h_stats['goal_diff'] - a_stats['goal_diff'],
        'form_diff': h_stats['form'] - a_stats['form'],
        'cl_wr_diff': h_stats['cl_wr'] - a_stats['cl_wr'],
        'h2h_wr': h2h['h2h_wr'],
        'h2h_gd': h2h['h2h_gd'],
        'inj_diff': _gap(seeded_qual, unseeded_qual, 'inj', 0.2),
        'home_atm_unseeded': unseeded_qual.get('home_atm', 0.5),
        'home_atm_seeded': seeded_qual.get('home_atm', 0.5),
        'ko_exp_diff': _gap(unseeded_qual, seeded_qual, 'ko_exp', 0.3),
        'depth_diff': _gap(unseeded_qual, seeded_qual, 'depth', 0.5),
        'form_q_diff': _gap(unseeded_qual, seeded_qual, 'form_q', 0.5),
        'mgr_diff': _gap(unseeded_qual, seeded_qual, 'mgr', 0.5),
        'lp_pos_diff': seeded_pos - unseeded_pos,
    }
    all_features.append(features)

    print(f"‚öΩ {unseeded} vs {seeded}")
    print(f"   Elo: {h_elo:.0f} vs {a_elo:.0f} ({features['elo_diff']:+.0f})")
    print(f"   WR: {h_stats['win_rate']:.2f} vs {a_stats['win_rate']:.2f} | Form: {h_stats['form']} vs {a_stats['form']}")
    print(f"   H2H: wr={h2h['h2h_wr']:.2f} gd={h2h['h2h_gd']:+.2f}\n")

df_pred = pd.DataFrame(all_features)
print(f"Feature matrix: {df_pred.shape}")

## 4. Prediction Model

We use a **weighted scoring model** with sigmoid calibration and **Monte Carlo simulation** (100,000 runs).

**Why not train XGBoost?** Our qualitative features (injuries, atmosphere, manager quality) are manually assessed for these 8 matches only — we don't have labeled training data with these features for historical matches.

**How it works:**
1. Each feature × weight → single score (positive = unseeded team stronger)
2. Sigmoid converts score to probability
3. Simulate both legs 100K times with realistic goal distributions
4. Count who advances more often

In [None]:
# Linear weight applied to each feature when building the neutral score.
weights = {
    'elo_diff': 0.0025,
    'wr_diff': 0.8,
    'gd_diff': 0.3,
    'form_diff': 0.04,
    'cl_wr_diff': 0.5,
    'h2h_wr': 0.4,
    'h2h_gd': 0.15,
    'inj_diff': 0.6,
    'ko_exp_diff': 0.3,
    'depth_diff': 0.2,
    'form_q_diff': 0.4,
    'mgr_diff': 0.3,
    'lp_pos_diff': 0.015,
}

HOME_ATM_WEIGHT = 0.5      # multiplier on the host's atmosphere rating
SECOND_LEG_BONUS = 0.08    # extra score for the side hosting the second leg
N_SIM = 100_000            # simulated ties per matchup

# Goal-count distributions: each *_GOALS array lists the possible goal counts
# and the matching *_PROBS array their probabilities.
WIN_GOALS = np.arange(1, 5)
WIN_PROBS = np.array([0.42, 0.33, 0.18, 0.07])
LOSE_GOALS = np.arange(3)
LOSE_PROBS = np.array([0.55, 0.32, 0.13])
DRAW_GOALS = np.arange(4)
DRAW_PROBS = np.array([0.25, 0.42, 0.25, 0.08])

def neutral_score(f, feature_weights=None):
    """Combine a matchup's features into one weighted score.

    Each feature value is multiplied by its weight and the products are
    summed. ``h2h_wr`` is centred on 0.5 first, so an even head-to-head record
    contributes nothing.

    Args:
        f: Mapping from feature name to value; it must contain every key of
            the weight table.
        feature_weights: Mapping from feature name to weight. Defaults to the
            module-level ``weights`` table.

    Returns:
        The weighted sum (0 when the weight table is empty).
    """
    if feature_weights is None:
        feature_weights = weights

    score = 0
    for feat, w in feature_weights.items():
        value = f[feat] - 0.5 if feat == 'h2h_wr' else f[feat]
        score += value * w
    return score

def leg_probs(ns, home_atm, second_leg=False):
    """Turn a score into (home win, draw, away win) probabilities for one leg.

    Args:
        ns: Score from the host's point of view, before home effects.
        home_atm: Host's atmosphere rating; scaled by HOME_ATM_WEIGHT.
        second_leg: When true, SECOND_LEG_BONUS is added to the host's score.

    Returns:
        Tuple ``(p_home, p_draw, p_away)`` that sums to 1.
    """
    strength = ns + home_atm * HOME_ATM_WEIGHT
    if second_leg:
        strength = strength + SECOND_LEG_BONUS

    # Draws get less likely as the matchup gets more lopsided; the value is
    # clamped to the range [0.12, 0.28].
    p_draw = 0.24 - 0.05 * abs(strength)
    p_draw = max(0.12, min(0.28, p_draw))

    # The remaining mass is split by the logistic of the host's score.
    decisive = 1 - p_draw
    host_share = expit(strength)
    return decisive * host_share, p_draw, decisive * (1 - host_share)

print("Model parameters loaded. ‚úÖ")

In [None]:
rng = np.random.default_rng(42)
results = []

# Chance that the unseeded side goes through when the aggregate is level.
UNSEEDED_TIEBREAK_PROB = 0.45


def _sample_leg(generator, p_home, p_draw, size):
    """Simulate `size` legs and return (home_goals, away_goals) arrays.

    The outcome (home win / draw / away win) is drawn first. For a decisive
    leg the winner's goals come from WIN_GOALS/WIN_PROBS and the loser's from
    LOSE_GOALS/LOSE_PROBS restricted to values below the winner's goals, so
    the scoreline always agrees with the drawn outcome. Drawn legs take one
    value from DRAW_GOALS/DRAW_PROBS for both sides.
    """
    outcome = generator.random(size)
    home_wins = outcome < p_home
    drawn = (outcome >= p_home) & (outcome < p_home + p_draw)

    winner_goals = generator.choice(WIN_GOALS, size=size, p=WIN_PROBS)
    loser_goals = np.zeros(size, dtype=LOSE_GOALS.dtype)
    for goals in WIN_GOALS:
        picked = winner_goals == goals
        # Assumes every WIN_GOALS value exceeds at least one LOSE_GOALS value
        # (true for the current tables, whose minimums are 1 and 0).
        allowed = LOSE_GOALS < goals
        conditional = LOSE_PROBS[allowed] / LOSE_PROBS[allowed].sum()
        loser_goals[picked] = generator.choice(
            LOSE_GOALS[allowed], size=int(picked.sum()), p=conditional)

    level_goals = generator.choice(DRAW_GOALS, size=size, p=DRAW_PROBS)

    home_goals = np.where(drawn, level_goals,
                          np.where(home_wins, winner_goals, loser_goals))
    away_goals = np.where(drawn, level_goals,
                          np.where(home_wins, loser_goals, winner_goals))
    return home_goals, away_goals


print(f"Running {N_SIM:,} Monte Carlo simulations per match...\n")

for f in all_features:
    ns = neutral_score(f)
    # Leg 1 is hosted by the unseeded side, leg 2 by the seeded side, so the
    # score is negated for the second leg.
    p1_h, p1_d, p1_a = leg_probs(ns, f['home_atm_unseeded'])
    p2_h, p2_d, p2_a = leg_probs(-ns, f['home_atm_seeded'], second_leg=True)

    g1u, g1s = _sample_leg(rng, p1_h, p1_d, N_SIM)
    g2s, g2u = _sample_leg(rng, p2_h, p2_d, N_SIM)

    tu = g1u + g2u
    ts = g1s + g2s
    tiebreak_to_unseeded = rng.random(N_SIM) < UNSEEDED_TIEBREAK_PROB
    unseeded_advances = np.count_nonzero(
        (tu > ts) | ((tu == ts) & tiebreak_to_unseeded))

    t1 = float(unseeded_advances / N_SIM)
    t2 = 1 - t1
    results.append({
        'unseeded': f['unseeded'], 'seeded': f['seeded'],
        't1_pct': t1, 't2_pct': t2,
        'leg1': f"{f['unseeded'][:3]} {p1_h:.0%} | Draw {p1_d:.0%} | {f['seeded'][:3]} {p1_a:.0%}",
        'leg2': f"{f['seeded'][:3]} {p2_h:.0%} | Draw {p2_d:.0%} | {f['unseeded'][:3]} {p2_a:.0%}",
    })

print("Simulation complete! ‚úÖ")

## 5. Results

In [None]:
# Report header.
print("üèÜ UCL 2025-26 KNOCKOUT PLAYOFF PREDICTIONS")
print("=" * 65)
print("Format: Two legs, aggregate score, no away goals rule")
print("Seeded team (9-16) hosts the second leg")
print("=" * 65)


def _favourite_share(row):
    """Advancement probability of whichever side is favoured in `row`."""
    return max(row['t1_pct'], row['t2_pct'])


# Most clear-cut ties first.
results.sort(key=_favourite_share, reverse=True)

for i, r in enumerate(results, 1):
    # The unseeded side is named as favourite when the two shares are equal.
    if r['t2_pct'] > r['t1_pct']:
        fav = r['seeded']
    else:
        fav = r['unseeded']
    fav_pct = _favourite_share(r)

    if fav_pct > 0.70:
        conf = "üî¥"
    elif fav_pct > 0.58:
        conf = "üü°"
    else:
        conf = "üü¢"

    # 30-character bar split between the two sides' shares.
    bar1 = "‚ñà" * int(r['t1_pct'] * 30)
    bar2 = "‚ñë" * int(r['t2_pct'] * 30)

    print(f"\n {i}. {r['unseeded']} vs {r['seeded']}")
    print(f"    {bar1}{bar2}")
    print(f"    {r['unseeded']}: {r['t1_pct']:.1%}  vs  {r['seeded']}: {r['t2_pct']:.1%}")
    print(f"    1st leg ({r['unseeded']} home): {r['leg1']}")
    print(f"    2nd leg ({r['seeded']} home):  {r['leg2']}")
    print(f"    {conf} {fav} advances ({fav_pct:.0%})")

print(f"\n{'='*65}")
print("üî¥ >70%  üü° 58-70%  üü¢ <58%")
print("100,000 Monte Carlo simulations")