# ML2 Semestral Project - Football O/U 2.5
**Authors:** Phuong Nhi Tranová, Vít Maruniak, Šimon Slánský, Radim Škoukal, Ondřej Zetek, Martin Kareš, Jan Korčák, Jakub Maličkay, Jáchym Janouch  
**Course:** FIS 4IT344 Machine Learning 2 (2025/2026)  
**Goal:** Compare baseline (current features) vs extended (richer features) models for O/U 2.5 goals across markets; translate accuracy gains into optimal profit and **maximum data subscription price per country** *.  



---


***maximum data subscription price per country**
- the most money our company should be willing to pay for that country's additional data
- that's how much extra profit the improved model generates
- baseline model → accuracy = A₀
    - Generates profit Π*(A₀)
- extended model → accuracy = A₁
    - Generates profit Π*(A₁)
- profit improvement = ΔΠ = Π(A₁) − Π(A₀)*
    - basically how much more money the comany earns each year by using the better data
- the maximum data subscription price per country = ΔΠ


# 0. Imports and paths

In [2]:
import os, glob, math, json, re, gc, itertools, warnings, textwrap
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, brier_score_loss, confusion_matrix
)
from sklearn.calibration import CalibrationDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,5)

In [3]:
DATA_DIR = "./data"  
OUTPUT_DIR = f"./processed"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Mapping

In [4]:
# --- Base mapping from your spec (exact -> readable) ---
BASE_MAP = {
    "Div": "league_division",
    "Date": "match_date",
    "Time": "kickoff_time",
    "HomeTeam": "home_team",
    "AwayTeam": "away_team",
    "FTHG": "fulltime_home_goals",
    "HG": "fulltime_home_goals",
    "FTAG": "fulltime_away_goals",
    "AG": "fulltime_away_goals",
    "FTR": "fulltime_result",
    "Res": "fulltime_result",
    "HTHG": "halftime_home_goals",
    "HTAG": "halftime_away_goals",
    "HTR":  "halftime_result",
    "Attendance": "attendance",
    "Referee": "referee",
    "HS": "home_shots",
    "AS": "away_shots",
    "HST": "home_shots_on_target",
    "AST": "away_shots_on_target",
    "HHW": "home_hit_woodwork",
    "AHW": "away_hit_woodwork",
    "HC": "home_corners",
    "AC": "away_corners",
    "HF": "home_fouls_committed",
    "AF": "away_fouls_committed",
    "HFKC": "home_free_kicks_conceded",
    "AFKC": "away_free_kicks_conceded",
    "HO": "home_offsides",
    "AO": "away_offsides",
    "HY": "home_yellow_cards",
    "AY": "away_yellow_cards",
    "HR": "home_red_cards",
    "AR": "away_red_cards",
    "HBP": "home_bookings_points",
    "ABP": "away_bookings_points",
}

# --- Helpers to normalize O/U tokens & general safe snake_case ---
import re

def normalize_ou_token(col: str) -> str:
    # map >2.5 / <2.5 to "over25" / "under25"
    return col.replace(">2.5", "over25").replace("<2.5", "under25")

def normalize_colname(col: str) -> str:
    col = normalize_ou_token(col)
    col = col.replace("%", "pct").replace("+", "plus").replace("-", "minus")
    col = re.sub(r"[^\w]+", "_", col)        # non-alnum -> _
    col = re.sub(r"_+", "_", col).strip("_") # collapse repeats
    return col.lower()

# --- Known odds keys (OPENING) -> readable names (UPPERCASE KEYS) ---
ODDS_PREFIX_MAP = {
    # BetBrain 1x2 counts + opening 1x2 averages/maxima
    "BB1X2": "bb_1x2_count",
    "BBMXH": "bb_max_home",
    "BBAVH": "bb_avg_home",
    "BBMXD": "bb_max_draw",
    "BBAVD": "bb_avg_draw",
    "BBMXA": "bb_max_away",
    "BBAVA": "bb_avg_away",

    # Market opening 1x2
    "MAXH": "market_max_home",
    "MAXD": "market_max_draw",
    "MAXA": "market_max_away",
    "AVGH": "market_avg_home",
    "AVGD": "market_avg_draw",
    "AVGA": "market_avg_away",

    # Betfair Exchange (opening) 1x2
    "BFEH": "betfair_exch_home",
    "BFED": "betfair_exch_draw",
    "BFEA": "betfair_exch_away",

    # O/U opening
    "BBOU": "bb_ou_count",
    "BBMXOVER25": "bb_max_over25",
    "BBAVOVER25": "bb_avg_over25",
    "BBMXUNDER25": "bb_max_under25",
    "BBAVUNDER25": "bb_avg_under25",
    "GBOVER25": "gamebookers_over25",
    "GBUNDER25": "gamebookers_under25",
    "B365OVER25": "bet365_over25",
    "B365UNDER25": "bet365_under25",
    "POVER25": "pinnacle_over25",
    "PUNDER25": "pinnacle_under25",
    "MAXOVER25": "market_max_over25",
    "MAXUNDER25": "market_max_under25",
    "AVGOVER25": "market_avg_over25",
    "AVGUNDER25": "market_avg_under25",

    # Asian handicap opening
    "BBAH": "bb_ah_count",
    "BBAHH": "bb_ah_handicap_home",
    "AHH": "market_ah_handicap_home",
    "BBMXAHH": "bb_max_ah_home",
    "BBAVAHH": "bb_avg_ah_home",
    "BBMXAHA": "bb_max_ah_away",
    "BBAVAHA": "bb_avg_ah_away",
    "GBAHH": "gamebookers_ah_home",
    "GBAHA": "gamebookers_ah_away",
    "GBAH": "gamebookers_ah_handicap_home",
    "LBAHH": "ladbrokes_ah_home",
    "LBAHA": "ladbrokes_ah_away",
    "LBAH": "ladbrokes_ah_handicap_home",
    "B365AHH": "bet365_ah_home",
    "B365AHA": "bet365_ah_away",
    "B365AH": "bet365_ah_handicap_home",
    "PAHH": "pinnacle_ah_home",
    "PAHA": "pinnacle_ah_away",
    "MAXAHH": "market_max_ah_home",
    "MAXAHA": "market_max_ah_away",
    "AVGAHH": "market_avg_ah_home",
    "AVGAHA": "market_avg_ah_away",

    # Special 3-letter OPENING (do not fit prefix+tail pattern)
    "BWH": "betandwin_home",
    "BWD": "betandwin_draw",
    "BWA": "betandwin_away",
}

# --- Add explicit CLOSING odds mappings (UPPERCASE KEYS) ---
CLOSING_MAP = {
    # Bet365 closing 1x2 / OU / AH
    "B365CH": "bet365_close_home",
    "B365CD": "bet365_close_draw",
    "B365CA": "bet365_close_away",
    "B365COVER25": "bet365_close_over25",
    "B365CUNDER25": "bet365_close_under25",
    "B365CAHH": "bet365_close_ah_home",
    "B365CAHA": "bet365_close_ah_away",

    # Bet&Win closing 1x2
    "BWCH": "betandwin_close_home",
    "BWCD": "betandwin_close_draw",
    "BWCA": "betandwin_close_away",

    # Interwetten closing 1x2
    "IWCH": "interwetten_close_home",
    "IWCD": "interwetten_close_draw",
    "IWCA": "interwetten_close_away",

    # Pinnacle closing 1x2 / OU / AH  (PC…)
    "PSCH": "pinnacle_close_home",   # some datasets use PS… for Pinnacle
    "PSCD": "pinnacle_close_draw",
    "PSCA": "pinnacle_close_away",
    "PCH":  "pinnacle_close_home",   # safety alias
    "PCD":  "pinnacle_close_draw",
    "PCA":  "pinnacle_close_away",
    "PCOVER25": "pinnacle_close_over25",
    "PCUNDER25": "pinnacle_close_under25",
    "PCAHH": "pinnacle_close_ah_home",
    "PCAHA": "pinnacle_close_ah_away",

    # William Hill closing 1x2
    "WHCH": "william_hill_close_home",
    "WHCD": "william_hill_close_draw",
    "WHCA": "william_hill_close_away",

    # VC Bet closing 1x2
    "VCCH": "vc_bet_close_home",
    "VCCD": "vc_bet_close_draw",
    "VCCA": "vc_bet_close_away",

    # Market closing 1x2
    "MAXCH": "market_max_close_home",
    "MAXCD": "market_max_close_draw",
    "MAXCA": "market_max_close_away",
    "AVGCH": "market_avg_close_home",
    "AVGCD": "market_avg_close_draw",
    "AVGCA": "market_avg_close_away",

    # Betfair Exchange closing 1x2 / OU / AH
    "BFECH": "betfair_exch_close_home",
    "BFECD": "betfair_exch_close_draw",
    "BFECA": "betfair_exch_close_away",
    "BFECOVER25": "betfair_exch_close_over25",
    "BFECUNDER25": "betfair_exch_close_under25",
    "BFECAHH": "betfair_exch_close_ah_home",
    "BFECAHA": "betfair_exch_close_ah_away",

    # 1XBet closing 1x2
    "1XBCH": "1xbet_close_home",
    "1XBCD": "1xbet_close_draw",
    "1XBCA": "1xbet_close_away",
}

# Merge opening+closing into one dictionary
ODDS_PREFIX_MAP.update(CLOSING_MAP)

# --- 1x2 prefixes that DO follow PREFIX + H/D/A (case-insensitive) ---
BOOKMAKER_1X2_PREFIXES = [
    "1XB","B365","BF","BFD","BMGM","BV","BS","CL","GB","IW","LB","PS","PH","PD","PA",
    "SO","SB","SJ","SY","VC","WH"
]
BOOKMAKER_NAME_MAP = {
    "1XB": "1xbet", "B365": "bet365", "BF": "betfair", "BFD": "betfred", "BMGM": "betmgm",
    "BV": "betvictor", "BS": "blue_square", "CL": "coral", "GB": "gamebookers",
    "IW": "interwetten", "LB": "ladbrokes", "PS": "pinnacle", "PH": "pinnacle",
    "PD": "pinnacle", "PA": "pinnacle", "SO": "sporting_odds", "SB": "sportingbet",
    "SJ": "stan_james", "SY": "stanleybet", "VC": "vc_bet", "WH": "william_hill",
}

def bookmaker_1x2_alias(col: str):
    """Case-insensitive matcher for PREFIX + (H/D/A) and closing PREFIX + C(H/D/A)."""
    col_up = col.upper()
    for pfx in BOOKMAKER_1X2_PREFIXES:
        if col_up.startswith(pfx):
            tail = col_up[len(pfx):]
            if tail in ("H","D","A"):
                side = {"H":"home","D":"draw","A":"away"}[tail]
                return f"{BOOKMAKER_NAME_MAP.get(pfx, pfx.lower())}_{side}"
            if tail.startswith("C") and tail[1:] in ("H","D","A"):
                side = {"H":"home","D":"draw","A":"away"}[tail[1:]]
                return f"{BOOKMAKER_NAME_MAP.get(pfx, pfx.lower())}_close_{side}"
    return None

def make_column_renamer(base_map: dict):
    """
    Returns a function that:
      1) applies exact base_map replacements (case-sensitive),
      2) handles known odds keys (case-insensitive via an UPPER lookup),
      3) handles bookmaker PREFIX+(H/D/A) and closing variants,
      4) falls back to a safe snake_case normalizer.
    """
    # Build lookups
    exact_map = dict(base_map)

    # Case-insensitive exacts for odds: use uppercase keys
    odds_map_upper = {k.upper(): v for k, v in ODDS_PREFIX_MAP.items()}

    def renamer(cols):
        new_cols = []
        for c in cols:
            # 1) exact spec remap (e.g., Div -> league_division)
            if c in exact_map:
                new_cols.append(exact_map[c])
                continue

            # 2) odds known keys (opening + closing), case-insensitive
            c_up = c.upper()
            if c_up in odds_map_upper:
                new_cols.append(odds_map_upper[c_up])
                continue

            # 3) 1x2 bookmaker prefix logic
            alias = bookmaker_1x2_alias(c)
            if alias:
                new_cols.append(alias)
                continue

            # 4) fallback: normalized snake_case preserving meaning
            new_cols.append(normalize_colname(c))
        return new_cols

    return renamer

rename_columns = make_column_renamer(BASE_MAP)


# 2. Data load

In [5]:
def load_all_matches_basic(data_dir: str) -> pd.DataFrame:
    csv_files = glob.glob(os.path.join(data_dir, "**", "*.csv"), recursive=True)
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found under {data_dir}")

    frames = []
    for fp in csv_files:
        # extract path info
        rel = os.path.relpath(fp, data_dir)
        parts = Path(rel).parts
        country = parts[0] if len(parts) >= 1 else None
        league  = parts[1] if len(parts) >= 2 else None
        season_file = parts[2] if len(parts) >= 3 else None
        season = os.path.splitext(season_file)[0] if season_file else None

        # read and rename
        try:
            df = pd.read_csv(fp, low_memory=False)
        except Exception as e:
            print(f"Skipping {fp}: {e}")
            continue

        df.columns = rename_columns(df.columns.tolist())

        # attach metadata
        df["country"] = country
        df["league_tier"] = league
        df["season_code"] = season

        frames.append(df)

    all_df = pd.concat(frames, ignore_index=True, sort=False)
    return all_df

# run the loader
all_matches = load_all_matches_basic(DATA_DIR)
all_matches.shape

(42593, 140)

In [6]:
all_matches.head()

Unnamed: 0,league_division,match_date,kickoff_time,home_team,away_team,fulltime_home_goals,fulltime_away_goals,fulltime_result,halftime_home_goals,halftime_away_goals,...,betfair_exch_close_away,bfecover25,bfecunder25,betfair_exch_close_ah_home,betfair_exch_close_ah_away,unnamed_105,unnamed_119,unnamed_120,unnamed_121,unnamed_106
0,E0,13/08/2021,20:00,Brentford,Arsenal,2,0,H,1.0,0.0,...,,,,,,,,,,
1,E0,14/08/2021,12:30,Man United,Leeds,5,1,H,1.0,0.0,...,,,,,,,,,,
2,E0,14/08/2021,15:00,Burnley,Brighton,1,2,A,1.0,0.0,...,,,,,,,,,,
3,E0,14/08/2021,15:00,Chelsea,Crystal Palace,3,0,H,2.0,0.0,...,,,,,,,,,,
4,E0,14/08/2021,15:00,Everton,Southampton,3,1,H,0.0,1.0,...,,,,,,,,,,


# 3. Data cleaning

It seems like the renaming and loading went smoothly! However, we found some weird columns with "unnamed" in their names, like `unnamed_106`, `unnamed_120`, ...  
That sometimes happens when excel files have extra blank columns. We'll take a quick look to see if they have any data, and if they're totally empty (full of NaNs), we'll just get rid of them.

In [7]:
unnamed_cols = [c for c in all_matches.columns if c.startswith("unnamed")]
all_matches[unnamed_cols].isna().mean().sort_values()

unnamed_105    1.0
unnamed_119    1.0
unnamed_120    1.0
unnamed_121    1.0
unnamed_106    1.0
dtype: float64

They're 100% full of NaNs so we can now safely drop them.

In [8]:
all_matches = all_matches.drop(columns=unnamed_cols)