In [1]:
import pandas as pd
matches = pd.read_csv("ucl_matches_stats_2017_2024.csv")

In [2]:
# xg and xga are null
matches=matches.drop(columns=[col for col in ["xg", "xga"] if col in matches.columns])

In [3]:
matches.head()

Unnamed: 0,date,time,round,venue,season,team,opponent,gf,ga,result
0,2016-09-13,19:45,Group stage,Estádio do Sport Lisboa e Benfica,2016/2017,Benfica pt,tr Beşiktaş,1,1,D
1,2016-09-13,20:45,Group stage,Allianz Arena,2016/2017,Bayern Munich de,ru Rostov,5,0,H
2,2016-09-13,20:45,Group stage,Parc des Princes,2016/2017,Paris S-G fr,eng Arsenal,1,1,D
3,2016-09-13,20:45,Group stage,Camp Nou,2016/2017,Barcelona es,sct Celtic,7,0,H
4,2016-09-13,20:45,Group stage,Philips Stadion,2016/2017,PSV Eindhoven nl,es Atlético Madrid,0,1,A


In [4]:
matches.shape

(994, 10)

In [5]:
matches["round"].value_counts()

round
Group stage       768
Round of 16       128
Quarter-finals     60
Semi-finals        30
Final               8
Name: count, dtype: int64

In [6]:
matches.dtypes

date        object
time        object
round       object
venue       object
season      object
team        object
opponent    object
gf           int64
ga           int64
result      object
dtype: object

In [7]:
# converting date to date type
matches["date"] = pd.to_datetime(matches["date"])

In [8]:
matches["target"] = matches["result"].map({"A": 0, "D": 1, "H": 2})

In [9]:
import re

def clean_team_name(name: str) -> str:
    if pd.isna(name):
        return name
    # lowercase
    name = name.lower().strip()
    # remove 2-3 letter country prefixes like "eng " or "es "
    name = re.sub(r'^[a-z]{2,3}\s+', '', name)
    # remove trailing country tags like " pt", " es"
    name = re.sub(r'\s+[a-z]{2,3}$', '', name)
    # normalize whitespace
    name = re.sub(r'\s+', ' ', name)
    return name

In [10]:
matches['team'] = matches['team'].apply(clean_team_name)
matches['opponent'] = matches['opponent'].apply(clean_team_name)

In [11]:
all_teams = pd.concat([matches['team'], matches['opponent']]).unique()
team_to_id = {team: idx for idx, team in enumerate(sorted(all_teams))}

matches['home_id'] = matches['team'].map(team_to_id)
matches['away_id'] = matches['opponent'].map(team_to_id)

In [12]:
matches.head()

Unnamed: 0,date,time,round,venue,season,team,opponent,gf,ga,result,target,home_id,away_id
0,2016-09-13,19:45,Group stage,Estádio do Sport Lisboa e Benfica,2016/2017,benfica,beşiktaş,1,1,D,1,14,15
1,2016-09-13,20:45,Group stage,Allianz Arena,2016/2017,bayern munich,rostov,5,0,H,2,12,75
2,2016-09-13,20:45,Group stage,Parc des Princes,2016/2017,paris s-g,arsenal,1,1,D,1,63,6
3,2016-09-13,20:45,Group stage,Camp Nou,2016/2017,barcelona,celtic,7,0,H,2,10,17
4,2016-09-13,20:45,Group stage,Philips Stadion,2016/2017,eindhoven,atlético madrid,0,1,A,0,25,9


In [13]:
matches['time'] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [14]:
round_map = {
    'Group stage': 0,
    'Round of 16': 1,
    'Quarter-finals': 2,
    'Semi-finals': 3,
    'Final': 4
}
matches['round_code'] = matches['round'].map(round_map)

In [15]:
# Add month, season, and venue_code features
matches['month'] = matches['date'].dt.month
matches['season_code'] = matches['season'].astype('category').cat.codes
if 'venue' in matches.columns:
    matches['venue_code'] = matches['venue'].astype('category').cat.codes
else:
    matches['venue_code'] = 0  # fallback if venue column is missing

In [16]:
import numpy as np
def build_form_features(df, window=5):
    df = df.sort_values('date')
    all_stats = []
    for team in pd.concat([df['team'], df['opponent']]).unique():
        team_matches = df[(df['team']==team) | (df['opponent']==team)].sort_values('date').copy()
        team_matches['gf_team'] = np.where(team_matches['team']==team, team_matches['gf'], team_matches['ga'])
        team_matches['ga_team'] = np.where(team_matches['team']==team, team_matches['ga'], team_matches['gf'])
        team_matches['pts'] = np.where(team_matches['gf_team']>team_matches['ga_team'], 3, np.where(team_matches['gf_team']==team_matches['ga_team'], 1, 0))
        roll = team_matches[['gf_team','ga_team','pts']].rolling(window, min_periods=1).mean().shift(1)
        roll.columns = [f'form_{c}_r{window}' for c in roll.columns]
        roll['team'] = team
        roll['date'] = team_matches['date'].values
        all_stats.append(roll)
    features = pd.concat(all_stats)
    return features
form_feats = build_form_features(matches, window=5)
matches = matches.merge(form_feats, left_on=['date','team'], right_on=['date','team'], how='left')
matches = matches.rename(columns={
    'form_gf_team_r5':'home_avg_gf5',
    'form_ga_team_r5':'home_avg_ga5',
    'form_pts_r5':'home_avg_pts5'
})
matches = matches.merge(form_feats, left_on=['date','opponent'], right_on=['date','team'], how='left', suffixes=('','_away'))
matches = matches.rename(columns={
    'form_gf_team_r5':'away_avg_gf5',
    'form_ga_team_r5':'away_avg_ga5',
    'form_pts_r5':'away_avg_pts5'
})
matches = matches.drop(columns=['team_away'])

In [17]:
# Fill missing values for away_avg_gf5 and away_avg_ga5
away_gf_mean = matches['gf'].mean()
away_ga_mean = matches['ga'].mean()
matches['away_avg_gf5'] = matches['away_avg_gf5'].fillna(away_gf_mean)
matches['away_avg_ga5'] = matches['away_avg_ga5'].fillna(away_ga_mean)

In [18]:
matches.head()

Unnamed: 0,date,time,round,venue,season,team,opponent,gf,ga,result,...,round_code,month,season_code,venue_code,home_avg_gf5,home_avg_ga5,home_avg_pts5,away_avg_gf5,away_avg_ga5,away_avg_pts5
0,2016-09-13,19,Group stage,Estádio do Sport Lisboa e Benfica,2016/2017,benfica,beşiktaş,1,1,D,...,0,9,0,35,,,,1.68008,1.366197,
1,2016-09-13,20,Group stage,Allianz Arena,2016/2017,bayern munich,rostov,5,0,H,...,0,9,0,1,,,,1.68008,1.366197,
2,2016-09-13,20,Group stage,Parc des Princes,2016/2017,paris s-g,arsenal,1,1,D,...,0,9,0,62,,,,1.68008,1.366197,
3,2016-09-13,20,Group stage,Camp Nou,2016/2017,barcelona,celtic,7,0,H,...,0,9,0,12,,,,1.68008,1.366197,
4,2016-09-13,20,Group stage,Philips Stadion,2016/2017,eindhoven,atlético madrid,0,1,A,...,0,9,0,64,,,,1.68008,1.366197,


In [19]:
# model training
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1
    )

In [21]:
train = matches[matches['season'] != '2023/2024']
test = matches[matches['season'] == '2023/2024']

In [22]:
test

Unnamed: 0,date,time,round,venue,season,team,opponent,gf,ga,result,...,round_code,month,season_code,venue_code,home_avg_gf5,home_avg_ga5,home_avg_pts5,away_avg_gf5,away_avg_ga5,away_avg_pts5
869,2023-09-19,18,Group stage,Stadio Giuseppe Meazza,2023/2024,milan,newcastle,0,0,D,...,0,9,7,87,0.4,0.8,1.0,1.68008,1.366197,
870,2023-09-19,18,Group stage,Stadion Wankdorf,2023/2024,young boys,rb leipzig,1,3,A,...,0,9,7,97,1.0,2.2,0.4,2.20000,1.800000,1.8
871,2023-09-19,20,Group stage,Etihad Stadium,2023/2024,manchester city,red star,3,1,H,...,0,9,7,37,2.0,0.4,2.2,0.20000,3.200000,0.0
872,2023-09-19,21,Group stage,Parc des Princes,2023/2024,paris s-g,dortmund,2,0,H,...,0,9,7,62,2.0,1.4,1.4,0.60000,0.800000,1.2
873,2023-09-19,21,Group stage,Stadio Olimpico,2023/2024,lazio,atlético madrid,1,1,D,...,0,9,7,88,1.6,2.0,1.0,0.60000,1.600000,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,2024-04-30,21,Semi-finals,Allianz Arena,2023/2024,bayern munich,real madrid,2,2,D,...,3,4,7,1,1.4,0.6,2.0,1.80000,1.400000,1.8
990,2024-05-01,21,Semi-finals,Signal Iduna Park,2023/2024,dortmund,paris s-g,1,0,H,...,3,5,7,74,1.8,1.2,1.6,2.20000,1.200000,2.0
991,2024-05-07,21,Semi-finals,Parc des Princes,2023/2024,paris s-g,dortmund,0,1,A,...,3,5,7,62,2.0,1.2,1.8,1.80000,1.000000,2.0
992,2024-05-08,21,Semi-finals,Estadio Santiago Bernabéu,2023/2024,real madrid,bayern munich,2,1,H,...,3,5,7,23,1.6,1.4,1.4,1.60000,1.000000,1.6


In [26]:
predictors = [
    'home_id', 'away_id', 'time', 'round_code', 'month', 'season_code', 'venue_code',
    'home_avg_gf5','home_avg_ga5','home_avg_pts5',
    'away_avg_gf5','away_avg_ga5','away_avg_pts5'
    ]

In [27]:
rf.fit(train[predictors],train['target'])

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
preds = rf.predict(test[predictors])

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
acc = accuracy_score(test['target'],preds)

In [31]:
acc

0.488

In [None]:
combined = pd.DataFrame(dict(actual=test['target'],prediction=preds))

In [None]:
pd.crosstab(index=combined['actual'],columns=combined['prediction'])

In [None]:
# Extra features + stronger model + proper time split
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingClassifier

# 1) Basic sanity: sort by date to avoid leakage
matches = matches.sort_values('date').reset_index(drop=True)

# 2) Elo rating feature (simple implementation)
K = 20
start_elo = 1500
elos = {}
elo_list = []

for _, row in matches.iterrows():
    h = row['team']
    a = row['opponent']
    elos.setdefault(h, start_elo)
    elos.setdefault(a, start_elo)
    Rh, Ra = elos[h], elos[a]
    exp_h = 1/(1+10**((Ra-Rh)/400))
    # Map target to 1/0 from home pov: H=2 win->1, D=1 draw->0.5, A=0 loss->0
    if row['target'] == 2:
        score_h = 1
    elif row['target'] == 1:
        score_h = 0.5
    else:
        score_h = 0
    # Update
    elos[h] = Rh + K*(score_h - exp_h)
    elos[a] = Ra + K*((1-score_h) - (1-exp_h))
    elo_list.append((elos[h], elos[a]))

elo_df = pd.DataFrame(elo_list, columns=['home_elo','away_elo'])
matches = pd.concat([matches.reset_index(drop=True), elo_df], axis=1)

# 3) Rest days feature (days since team's last match)
last_seen = {}
rest_home = []
rest_away = []
for i, row in matches.iterrows():
    d = row['date']
    h = row['team']
    a = row['opponent']
    rest_home.append((d - last_seen[h]).days if h in last_seen else np.nan)
    rest_away.append((d - last_seen[a]).days if a in last_seen else np.nan)
    last_seen[h] = d
    last_seen[a] = d

matches['home_rest_days'] = pd.Series(rest_home)
matches['away_rest_days'] = pd.Series(rest_away)
# Fill NaNs safely (assign, not inplace)
matches['home_rest_days'] = matches['home_rest_days'].fillna(matches['home_rest_days'].median())
matches['away_rest_days'] = matches['away_rest_days'].fillna(matches['away_rest_days'].median())

# 4) Explicit home-advantage recent form delta
matches['avg_gf_delta5'] = matches['home_avg_gf5'] - matches['away_avg_gf5']
matches['avg_ga_delta5'] = matches['home_avg_ga5'] - matches['away_avg_ga5']
matches['avg_pts_delta5'] = matches['home_avg_pts5'] - matches['away_avg_pts5']

# 5) Define new predictor set
new_predictors = [
    'home_id','away_id','time','round_code','month','season_code','venue_code',
    'home_avg_gf5','home_avg_ga5','home_avg_pts5',
    'away_avg_gf5','away_avg_ga5','away_avg_pts5',
    'avg_gf_delta5','avg_ga_delta5','avg_pts_delta5',
    'home_elo','away_elo','home_rest_days','away_rest_days'
]

# 6) Train/test split by season (same as before) and a stronger model that accepts NaNs
train = matches[matches['season'] != '2023/2024']
test = matches[matches['season'] == '2023/2024']

model = HistGradientBoostingClassifier(random_state=42)
model.fit(train[new_predictors], train['target'])

preds = model.predict(test[new_predictors])
probs = model.predict_proba(test[new_predictors])
acc_boost = accuracy_score(test['target'], preds)
print({'accuracy': round(acc_boost, 4)})

# Confident predictions (only evaluate when model is sure)
maxp = probs.max(axis=1)
mask_conf = maxp >= 0.6
if mask_conf.any():
    acc_conf = accuracy_score(test['target'][mask_conf], preds[mask_conf])
    coverage = mask_conf.mean()
    print({'confident_accuracy': round(acc_conf, 4), 'coverage': round(float(coverage), 3)})
else:
    print({'confident_accuracy': None, 'coverage': 0.0})

# Optional diagnostics
print('Class distribution (test):', test['target'].value_counts(normalize=True).to_dict())
print('Confusion matrix (test):')
print(confusion_matrix(test['target'], preds))


{'accuracy': 0.584}
{'confident_accuracy': 0.6111, 'coverage': 0.864}
Class distribution (test): {2: 0.464, 0: 0.304, 1: 0.232}
Confusion matrix (test):
[[25  4  9]
 [10  3 16]
 [ 5  8 45]]


In [40]:
def _infer_season_from_date(date: pd.Timestamp) -> str:
    y = date.year
    if date.month >= 7:
        return f"{y}/{y+1}"
    else:
        return f"{y-1}/{y}"


def predict_match_minimal(home_team: str,
                          away_team: str,
                          date_str: str,
                          time_hour: int,
                          round_name: str):
    """
    Minimal interface: home_team, away_team, date (YYYY-MM-DD), time hour (int), round name.
    Returns predicted label and probabilities [Away, Draw, Home].
    """
    # Ensure prior training artifacts exist
    assert 'model' in globals(), 'Model not trained. Run the training cell first.'
    assert 'new_predictors' in globals(), 'Feature list missing. Run the training cell first.'

    # Clean and parse
    h = clean_team_name(home_team)
    a = clean_team_name(away_team)
    mdate = pd.to_datetime(date_str)

    # Historical data strictly before match date
    hist = matches[matches['date'] < mdate].sort_values('date').copy()

    # Team ID map: reuse if available
    mapper = globals().get('team_to_id', None)
    if mapper is None:
        all_teams_full = pd.concat([matches['team'], matches['opponent']]).dropna().unique()
        mapper = {team: idx for idx, team in enumerate(sorted(all_teams_full))}

    # Form features
    hist_form = build_form_features(hist, window=5)

    def last_form(team):
        r = hist_form[hist_form['team'] == team].sort_values('date').tail(1)
        if r.empty:
            return pd.Series({'gf': np.nan, 'ga': np.nan, 'pts': np.nan})
        return pd.Series({
            'gf': r['form_gf_team_r5'].values[0],
            'ga': r['form_ga_team_r5'].values[0],
            'pts': r['form_pts_r5'].values[0],
        })

    hf = last_form(h)
    af = last_form(a)

    # Elo over history
    K = 20
    start_elo = 1500
    elos = {}
    for _, row in hist.iterrows():
        th = row['team']; ta = row['opponent']
        elos.setdefault(th, start_elo); elos.setdefault(ta, start_elo)
        Rh, Ra = elos[th], elos[ta]
        exp_h = 1/(1+10**((Ra-Rh)/400))
        score_h = 1 if row['target'] == 2 else (0.5 if row['target'] == 1 else 0)
        elos[th] = Rh + K*(score_h - exp_h)
        elos[ta] = Ra + K*((1-score_h) - (1-exp_h))

    home_elo = elos.get(h, start_elo)
    away_elo = elos.get(a, start_elo)

    # Rest days
    last_seen = {}
    for _, row in hist.iterrows():
        d = row['date']; th = row['team']; ta = row['opponent']
        last_seen[th] = d; last_seen[ta] = d
    home_rest = (mdate - last_seen[h]).days if h in last_seen else np.nan
    away_rest = (mdate - last_seen[a]).days if a in last_seen else np.nan

    # Round code map: prefer existing variable
    round_map_local = globals().get('round_map', {
        'Group stage': 0,
        'Round of 16': 1,
        'Quarter-finals': 2,
        'Semi-finals': 3,
        'Final': 4
    })

    # Season code from training mapping
    season_str = _infer_season_from_date(mdate)
    sc_map = matches[['season','season_code']].drop_duplicates()
    sc_map = dict(zip(sc_map['season'], sc_map['season_code']))
    season_code_val = sc_map.get(season_str, float(np.nan))

    # Assemble features
    feats = {
        'home_id': mapper.get(h, -1),
        'away_id': mapper.get(a, -1),
        'time': int(time_hour),
        'round_code': round_map_local.get(round_name, 0),
        'month': mdate.month,
        'season_code': season_code_val,
        'venue_code': 0,  # unknown
        'home_avg_gf5': hf['gf'],
        'home_avg_ga5': hf['ga'],
        'home_avg_pts5': hf['pts'],
        'away_avg_gf5': af['gf'],
        'away_avg_ga5': af['ga'],
        'away_avg_pts5': af['pts'],
        'avg_gf_delta5': (hf['gf'] - af['gf']) if not (pd.isna(hf['gf']) or pd.isna(af['gf'])) else np.nan,
        'avg_ga_delta5': (hf['ga'] - af['ga']) if not (pd.isna(hf['ga']) or pd.isna(af['ga'])) else np.nan,
        'avg_pts_delta5': (hf['pts'] - af['pts']) if not (pd.isna(hf['pts']) or pd.isna(af['pts'])) else np.nan,
        'home_elo': home_elo,
        'away_elo': away_elo,
        'home_rest_days': home_rest,
        'away_rest_days': away_rest,
    }

    # Build DataFrame and align to model's expected features
    X_df = pd.DataFrame([feats])
    expected = list(getattr(model, 'feature_names_in_', new_predictors))

    # Add any missing expected columns as NaN
    for col in expected:
        if col not in X_df.columns:
            X_df[col] = np.nan
    # Keep only expected columns in correct order
    X_df = X_df[expected]

    # Ensure numeric types
    X_df = X_df.apply(pd.to_numeric, errors='coerce')

    # Optional sanity
    if X_df.shape[1] != getattr(model, 'n_features_in_', X_df.shape[1]):
        raise ValueError(f"Feature mismatch: built {X_df.shape[1]} but model expects {getattr(model, 'n_features_in_', 'unknown')}")

    pred = model.predict(X_df)[0]
    prob = model.predict_proba(X_df)[0]

    label_map = {0: 'Away', 1: 'Draw', 2: 'Home'}
    return {
        'prediction': int(pred),
        'label': label_map[int(pred)],
        'proba_away_draw_home': prob.tolist(),
        'max_prob': float(prob.max())
    }

# Example:
# predict_match_minimal(
#     home_team='Real Madrid',
#     away_team='Manchester City',
#     date_str='2024-04-15',
#     time_hour=20,
#     round_name='Semi-finals'
# )

In [41]:
# Demo (uncomment and edit):
predict_match_minimal(
    home_team='Real Madrid',
    away_team='Manchester City',
    date_str='2024-04-15',
    time_hour=20,
    round_name='Semi-finals'
)

{'prediction': 2,
 'label': 'Home',
 'proba_away_draw_home': [0.3397819053703489,
  0.16168555019907702,
  0.4985325444305742],
 'max_prob': 0.4985325444305742}