# ML2 Semestral Project - Football O/U 2.5
**Authors:** Phuong Nhi Tranová, Vít Maruniak, Šimon Slánský, Radim Škoukal, Ondřej Zetek, Martin Kareš, Jan Korčák, Jakub Maličkay, Jáchym Janouch  
**Course:** FIS 4IT344 Machine Learning 2 (2025/2026)  
**Goal:** Compare baseline (current features) vs extended (richer features) models for O/U 2.5 goals across markets; translate accuracy gains into optimal profit and **maximum data subscription price per country** *.  



---


***maximum data subscription price per country**
- the most money our company should be willing to pay for that country's additional data
- that's how much extra profit the improved model generates
- baseline model → accuracy = A₀
    - Generates profit Π*(A₀)
- extended model → accuracy = A₁
    - Generates profit Π*(A₁)
- profit improvement = ΔΠ = Π(A₁) − Π(A₀)*
    - basically how much more money the comany earns each year by using the better data
- the maximum data subscription price per country = ΔΠ


## Enviromental setup

### a) Configuration

In [19]:
# === Paths ===
from google.colab import drive
drive.mount('/content/drive')

DATA_DIR = "/content/drive/MyDrive/data"  # <- root with {country}/{league}/{season}.csv
OUTPUT_DIR = f"{DATA_DIR}/processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Modeling ===
RANDOM_STATE = 42
METRICS      = ["accuracy", "f1", "roc_auc", "log_loss"]

@dataclass(frozen=True)
class EconParams:
    m_operations: float = 0.03   # m_operations
    k: float = 0.3               # risk margin parameter
    alpha: float = 1000          # demand parameter
    eps: float = 3               # elasticity (>1)
    avg_bet: float = 12          # USD

ECON = EconParams()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### b) Installation of necessary libraries

In [7]:
!pip -q install xgboost==2.0.3 scikit-learn==1.4.2 pandas==2.2.2 numpy==1.26.4 matplotlib==3.8.4 seaborn==0.13.2 scipy==1.11.4 sympy==1.12

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m120.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m111.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m124.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.8/35.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m108.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver

### b) Imports

In [21]:
import os, glob, math, json, re, gc, itertools, warnings, textwrap
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, brier_score_loss, confusion_matrix
)
from sklearn.calibration import CalibrationDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,5)

## Data Preprocessing

### a) Helpers
since this semestrals project is quite specific with this football domain that i must frankly admit that i do not know anything about it... let alone all these crazy abbreviations and when i look at the data my head spins from all the letters. so we have made a collect decision to rename the columns for easier understanding

In [63]:
# --- Base mapping from your spec (exact -> readable) ---
BASE_MAP = {
    "Div": "league_division",
    "Date": "match_date",
    "Time": "kickoff_time",
    "HomeTeam": "home_team",
    "AwayTeam": "away_team",
    "FTHG": "fulltime_home_goals",
    "HG": "fulltime_home_goals",
    "FTAG": "fulltime_away_goals",
    "AG": "fulltime_away_goals",
    "FTR": "fulltime_result",
    "Res": "fulltime_result",
    "HTHG": "halftime_home_goals",
    "HTAG": "halftime_away_goals",
    "HTR":  "halftime_result",
    "Attendance": "attendance",
    "Referee": "referee",
    "HS": "home_shots",
    "AS": "away_shots",
    "HST": "home_shots_on_target",
    "AST": "away_shots_on_target",
    "HHW": "home_hit_woodwork",
    "AHW": "away_hit_woodwork",
    "HC": "home_corners",
    "AC": "away_corners",
    "HF": "home_fouls_committed",
    "AF": "away_fouls_committed",
    "HFKC": "home_free_kicks_conceded",
    "AFKC": "away_free_kicks_conceded",
    "HO": "home_offsides",
    "AO": "away_offsides",
    "HY": "home_yellow_cards",
    "AY": "away_yellow_cards",
    "HR": "home_red_cards",
    "AR": "away_red_cards",
    "HBP": "home_bookings_points",
    "ABP": "away_bookings_points",
}

# --- Helpers to normalize O/U tokens & general safe snake_case ---
import re

def normalize_ou_token(col: str) -> str:
    # map >2.5 / <2.5 to "over25" / "under25"
    return col.replace(">2.5", "over25").replace("<2.5", "under25")

def normalize_colname(col: str) -> str:
    col = normalize_ou_token(col)
    col = col.replace("%", "pct").replace("+", "plus").replace("-", "minus")
    col = re.sub(r"[^\w]+", "_", col)        # non-alnum -> _
    col = re.sub(r"_+", "_", col).strip("_") # collapse repeats
    return col.lower()

# --- Known odds keys (OPENING) -> readable names (UPPERCASE KEYS) ---
ODDS_PREFIX_MAP = {
    # BetBrain 1x2 counts + opening 1x2 averages/maxima
    "BB1X2": "bb_1x2_count",
    "BBMXH": "bb_max_home",
    "BBAVH": "bb_avg_home",
    "BBMXD": "bb_max_draw",
    "BBAVD": "bb_avg_draw",
    "BBMXA": "bb_max_away",
    "BBAVA": "bb_avg_away",

    # Market opening 1x2
    "MAXH": "market_max_home",
    "MAXD": "market_max_draw",
    "MAXA": "market_max_away",
    "AVGH": "market_avg_home",
    "AVGD": "market_avg_draw",
    "AVGA": "market_avg_away",

    # Betfair Exchange (opening) 1x2
    "BFEH": "betfair_exch_home",
    "BFED": "betfair_exch_draw",
    "BFEA": "betfair_exch_away",

    # O/U opening
    "BBOU": "bb_ou_count",
    "BBMXOVER25": "bb_max_over25",
    "BBAVOVER25": "bb_avg_over25",
    "BBMXUNDER25": "bb_max_under25",
    "BBAVUNDER25": "bb_avg_under25",
    "GBOVER25": "gamebookers_over25",
    "GBUNDER25": "gamebookers_under25",
    "B365OVER25": "bet365_over25",
    "B365UNDER25": "bet365_under25",
    "POVER25": "pinnacle_over25",
    "PUNDER25": "pinnacle_under25",
    "MAXOVER25": "market_max_over25",
    "MAXUNDER25": "market_max_under25",
    "AVGOVER25": "market_avg_over25",
    "AVGUNDER25": "market_avg_under25",

    # Asian handicap opening
    "BBAH": "bb_ah_count",
    "BBAHH": "bb_ah_handicap_home",
    "AHH": "market_ah_handicap_home",
    "BBMXAHH": "bb_max_ah_home",
    "BBAVAHH": "bb_avg_ah_home",
    "BBMXAHA": "bb_max_ah_away",
    "BBAVAHA": "bb_avg_ah_away",
    "GBAHH": "gamebookers_ah_home",
    "GBAHA": "gamebookers_ah_away",
    "GBAH": "gamebookers_ah_handicap_home",
    "LBAHH": "ladbrokes_ah_home",
    "LBAHA": "ladbrokes_ah_away",
    "LBAH": "ladbrokes_ah_handicap_home",
    "B365AHH": "bet365_ah_home",
    "B365AHA": "bet365_ah_away",
    "B365AH": "bet365_ah_handicap_home",
    "PAHH": "pinnacle_ah_home",
    "PAHA": "pinnacle_ah_away",
    "MAXAHH": "market_max_ah_home",
    "MAXAHA": "market_max_ah_away",
    "AVGAHH": "market_avg_ah_home",
    "AVGAHA": "market_avg_ah_away",

    # Special 3-letter OPENING (do not fit prefix+tail pattern)
    "BWH": "betandwin_home",
    "BWD": "betandwin_draw",
    "BWA": "betandwin_away",
}

# --- Add explicit CLOSING odds mappings (UPPERCASE KEYS) ---
CLOSING_MAP = {
    # Bet365 closing 1x2 / OU / AH
    "B365CH": "bet365_close_home",
    "B365CD": "bet365_close_draw",
    "B365CA": "bet365_close_away",
    "B365COVER25": "bet365_close_over25",
    "B365CUNDER25": "bet365_close_under25",
    "B365CAHH": "bet365_close_ah_home",
    "B365CAHA": "bet365_close_ah_away",

    # Bet&Win closing 1x2
    "BWCH": "betandwin_close_home",
    "BWCD": "betandwin_close_draw",
    "BWCA": "betandwin_close_away",

    # Interwetten closing 1x2
    "IWCH": "interwetten_close_home",
    "IWCD": "interwetten_close_draw",
    "IWCA": "interwetten_close_away",

    # Pinnacle closing 1x2 / OU / AH  (PC…)
    "PSCH": "pinnacle_close_home",   # some datasets use PS… for Pinnacle
    "PSCD": "pinnacle_close_draw",
    "PSCA": "pinnacle_close_away",
    "PCH":  "pinnacle_close_home",   # safety alias
    "PCD":  "pinnacle_close_draw",
    "PCA":  "pinnacle_close_away",
    "PCOVER25": "pinnacle_close_over25",
    "PCUNDER25": "pinnacle_close_under25",
    "PCAHH": "pinnacle_close_ah_home",
    "PCAHA": "pinnacle_close_ah_away",

    # William Hill closing 1x2
    "WHCH": "william_hill_close_home",
    "WHCD": "william_hill_close_draw",
    "WHCA": "william_hill_close_away",

    # VC Bet closing 1x2
    "VCCH": "vc_bet_close_home",
    "VCCD": "vc_bet_close_draw",
    "VCCA": "vc_bet_close_away",

    # Market closing 1x2
    "MAXCH": "market_max_close_home",
    "MAXCD": "market_max_close_draw",
    "MAXCA": "market_max_close_away",
    "AVGCH": "market_avg_close_home",
    "AVGCD": "market_avg_close_draw",
    "AVGCA": "market_avg_close_away",

    # Betfair Exchange closing 1x2 / OU / AH
    "BFECH": "betfair_exch_close_home",
    "BFECD": "betfair_exch_close_draw",
    "BFECA": "betfair_exch_close_away",
    "BFECOVER25": "betfair_exch_close_over25",
    "BFECUNDER25": "betfair_exch_close_under25",
    "BFECAHH": "betfair_exch_close_ah_home",
    "BFECAHA": "betfair_exch_close_ah_away",

    # 1XBet closing 1x2
    "1XBCH": "1xbet_close_home",
    "1XBCD": "1xbet_close_draw",
    "1XBCA": "1xbet_close_away",
}

# Merge opening+closing into one dictionary
ODDS_PREFIX_MAP.update(CLOSING_MAP)

# --- 1x2 prefixes that DO follow PREFIX + H/D/A (case-insensitive) ---
BOOKMAKER_1X2_PREFIXES = [
    "1XB","B365","BF","BFD","BMGM","BV","BS","CL","GB","IW","LB","PS","PH","PD","PA",
    "SO","SB","SJ","SY","VC","WH"
]
BOOKMAKER_NAME_MAP = {
    "1XB": "1xbet", "B365": "bet365", "BF": "betfair", "BFD": "betfred", "BMGM": "betmgm",
    "BV": "betvictor", "BS": "blue_square", "CL": "coral", "GB": "gamebookers",
    "IW": "interwetten", "LB": "ladbrokes", "PS": "pinnacle", "PH": "pinnacle",
    "PD": "pinnacle", "PA": "pinnacle", "SO": "sporting_odds", "SB": "sportingbet",
    "SJ": "stan_james", "SY": "stanleybet", "VC": "vc_bet", "WH": "william_hill",
}

def bookmaker_1x2_alias(col: str):
    """Case-insensitive matcher for PREFIX + (H/D/A) and closing PREFIX + C(H/D/A)."""
    col_up = col.upper()
    for pfx in BOOKMAKER_1X2_PREFIXES:
        if col_up.startswith(pfx):
            tail = col_up[len(pfx):]
            if tail in ("H","D","A"):
                side = {"H":"home","D":"draw","A":"away"}[tail]
                return f"{BOOKMAKER_NAME_MAP.get(pfx, pfx.lower())}_{side}"
            if tail.startswith("C") and tail[1:] in ("H","D","A"):
                side = {"H":"home","D":"draw","A":"away"}[tail[1:]]
                return f"{BOOKMAKER_NAME_MAP.get(pfx, pfx.lower())}_close_{side}"
    return None

def make_column_renamer(base_map: dict):
    """
    Returns a function that:
      1) applies exact base_map replacements (case-sensitive),
      2) handles known odds keys (case-insensitive via an UPPER lookup),
      3) handles bookmaker PREFIX+(H/D/A) and closing variants,
      4) falls back to a safe snake_case normalizer.
    """
    # Build lookups
    exact_map = dict(base_map)

    # Case-insensitive exacts for odds: use uppercase keys
    odds_map_upper = {k.upper(): v for k, v in ODDS_PREFIX_MAP.items()}

    def renamer(cols):
        new_cols = []
        for c in cols:
            # 1) exact spec remap (e.g., Div -> league_division)
            if c in exact_map:
                new_cols.append(exact_map[c])
                continue

            # 2) odds known keys (opening + closing), case-insensitive
            c_up = c.upper()
            if c_up in odds_map_upper:
                new_cols.append(odds_map_upper[c_up])
                continue

            # 3) 1x2 bookmaker prefix logic
            alias = bookmaker_1x2_alias(c)
            if alias:
                new_cols.append(alias)
                continue

            # 4) fallback: normalized snake_case preserving meaning
            new_cols.append(normalize_colname(c))
        return new_cols

    return renamer

rename_columns = make_column_renamer(BASE_MAP)


### b) Data Loading


In [64]:
def load_all_matches_basic(data_dir: str) -> pd.DataFrame:
    csv_files = glob.glob(os.path.join(data_dir, "**", "*.csv"), recursive=True)
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found under {data_dir}")

    frames = []
    for fp in csv_files:
        # extract path info
        rel = os.path.relpath(fp, data_dir)
        parts = Path(rel).parts
        country = parts[0] if len(parts) >= 1 else None
        league  = parts[1] if len(parts) >= 2 else None
        season_file = parts[2] if len(parts) >= 3 else None
        season = os.path.splitext(season_file)[0] if season_file else None

        # read and rename
        try:
            df = pd.read_csv(fp, low_memory=False)
        except Exception as e:
            print(f"Skipping {fp}: {e}")
            continue

        df.columns = rename_columns(df.columns.tolist())

        # attach metadata
        df["country"] = country
        df["league_tier"] = league
        df["season_code"] = season

        frames.append(df)

    all_df = pd.concat(frames, ignore_index=True, sort=False)
    return all_df

# run the loader
all_matches = load_all_matches_basic(DATA_DIR)
all_matches.shape

(42593, 140)

### c) Data Cleaning

In [65]:
all_matches.head()

Unnamed: 0,league_division,match_date,kickoff_time,home_team,away_team,fulltime_home_goals,fulltime_away_goals,fulltime_result,halftime_home_goals,halftime_away_goals,halftime_result,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_fouls_committed,away_fouls_committed,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,bet365_home,bet365_draw,bet365_away,betandwin_home,betandwin_draw,betandwin_away,interwetten_home,interwetten_draw,interwetten_away,pinnacle_home,pinnacle_draw,pinnacle_away,william_hill_home,william_hill_draw,william_hill_away,vc_bet_home,vc_bet_draw,vc_bet_away,market_max_home,market_max_draw,market_max_away,market_avg_home,market_avg_draw,market_avg_away,b365over25,b365under25,pover25,punder25,maxover25,maxunder25,avgover25,avgunder25,market_ah_handicap_home,bet365_ah_home,bet365_ah_away,pinnacle_ah_home,pinnacle_ah_away,...,vc_bet_close_draw,vc_bet_close_away,market_max_close_home,market_max_close_draw,market_max_close_away,market_avg_close_home,market_avg_close_draw,market_avg_close_away,b365cover25,b365cunder25,pcover25,pcunder25,maxcover25,maxcunder25,avgcover25,avgcunder25,ahch,bet365_close_ah_home,bet365_close_ah_away,pinnacle_close_ah_home,pinnacle_close_ah_away,maxcahh,maxcaha,avgcahh,avgcaha,country,league_tier,season_code,betfair_home,betfair_draw,betfair_away,1xbet_home,1xbet_draw,1xbet_away,betfair_exch_home,betfair_exch_draw,betfair_exch_away,bfeover25,bfeunder25,bfeahh,bfeaha,betfair_close_home,betfair_close_draw,betfair_close_away,1xbet_close_home,1xbet_close_draw,1xbet_close_away,betfair_exch_close_home,betfair_exch_close_draw,betfair_exch_close_away,bfecover25,bfecunder25,betfair_exch_close_ah_home,betfair_exch_close_ah_away,referee,unnamed_106,unnamed_120,unnamed_121,unnamed_105,unnamed_119
0,SP1,12/08/2022,20:00,Osasuna,Sevilla,2,1,H,1.0,1.0,D,12.0,10.0,5.0,3.0,16.0,9.0,5.0,2.0,5.0,3.0,0.0,0.0,3.2,3.1,2.4,3.2,3.0,2.4,3.2,3.05,2.45,3.38,3.1,2.48,3.2,3.0,2.4,3.25,3.1,2.4,3.41,3.24,2.49,3.28,3.08,2.43,2.62,1.5,2.55,1.57,2.62,1.6,2.5,1.54,0.25,1.83,2.07,1.85,2.09,...,3.0,2.55,3.52,3.14,2.66,3.2,3.04,2.5,2.62,1.5,2.63,1.54,2.69,1.57,2.55,1.52,0.25,1.73,2.08,1.76,2.2,1.88,2.23,1.79,2.09,spain,1,2223,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,SP1,13/08/2022,16:00,Celta,Espanol,2,2,D,1.0,0.0,H,8.0,10.0,3.0,2.0,17.0,15.0,3.0,3.0,5.0,2.0,0.0,0.0,1.8,3.75,4.75,1.83,3.6,4.33,1.83,3.55,4.6,1.81,3.75,4.87,1.83,3.5,4.33,1.75,3.5,4.5,1.9,3.77,4.9,1.83,3.68,4.5,2.0,1.8,2.1,1.81,2.13,1.87,2.04,1.8,-0.5,1.77,2.02,1.8,2.13,...,3.6,5.0,1.84,3.73,5.39,1.78,3.58,5.0,2.2,1.66,2.24,1.71,2.26,1.83,2.19,1.7,-0.75,2.0,1.9,2.02,1.91,2.06,1.93,2.01,1.86,spain,1,2223,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,SP1,13/08/2022,18:00,Valladolid,Villarreal,0,3,A,0.0,0.0,D,12.0,10.0,6.0,4.0,6.0,10.0,4.0,5.0,1.0,0.0,0.0,0.0,3.9,3.6,1.9,3.8,3.5,1.95,3.85,3.6,1.95,4.05,3.78,1.95,3.9,3.5,1.95,3.8,3.5,1.9,4.1,3.8,2.0,3.93,3.66,1.94,1.99,1.91,2.0,1.91,2.0,1.92,1.95,1.87,0.5,1.95,1.95,1.97,1.95,...,3.4,2.2,3.66,3.47,2.25,3.45,3.39,2.19,2.0,1.8,2.09,1.81,2.14,1.9,2.05,1.79,0.25,2.02,1.88,2.02,1.91,2.03,1.91,1.98,1.88,spain,1,2223,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,SP1,13/08/2022,20:00,Barcelona,Vallecano,0,0,D,0.0,0.0,D,21.0,4.0,6.0,2.0,16.0,14.0,8.0,0.0,3.0,6.0,1.0,0.0,1.22,7.0,12.0,1.21,6.75,13.5,1.22,6.75,13.0,1.22,6.83,15.0,1.18,6.5,17.0,1.18,6.5,13.0,1.24,7.4,17.0,1.21,6.79,14.2,1.53,2.5,1.55,2.57,1.55,2.73,1.5,2.58,-2.0,2.09,1.81,2.15,1.78,...,7.0,13.0,1.25,7.4,16.75,1.22,6.82,13.42,1.5,2.62,1.51,2.68,1.58,2.79,1.5,2.61,-1.75,1.82,2.08,1.83,2.08,1.86,2.14,1.82,2.06,spain,1,2223,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,SP1,14/08/2022,16:30,Cadiz,Sociedad,0,1,A,0.0,1.0,A,7.0,15.0,2.0,7.0,9.0,20.0,3.0,6.0,4.0,2.0,0.0,0.0,3.6,3.2,2.2,3.4,3.2,2.2,3.45,3.2,2.2,3.62,3.29,2.26,3.4,3.2,2.2,3.5,3.13,2.15,3.75,3.33,2.26,3.55,3.24,2.2,2.37,1.57,2.5,1.59,2.5,1.65,2.38,1.59,0.25,1.99,1.91,2.0,1.93,...,3.1,2.2,4.05,3.18,2.25,3.8,3.07,2.2,2.62,1.5,2.74,1.51,2.75,1.55,2.61,1.5,0.25,2.05,1.85,2.06,1.87,2.08,1.92,2.0,1.86,spain,1,2223,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


it seems like the renaming and loading went smoothly! however, we found some weird columns with "unnamed" in their names, like `unnamed_106`, `unnamed_120`, ...  
that sometimes happens when excel files have extra blank columns. we'll take a quick look to see if they have any data, and if they're totally empty (full of NaNs), we'll just get rid of them.

In [66]:
unnamed_cols = [c for c in all_matches.columns if c.lower().startswith("unnamed")]
all_matches[unnamed_cols].isna().mean().sort_values()


Unnamed: 0,0
unnamed_106,1.0
unnamed_120,1.0
unnamed_121,1.0
unnamed_105,1.0
unnamed_119,1.0


they're 100% full of NaNs so we can now safely drop them

In [67]:
all_matches = all_matches.drop(columns=unnamed_cols)

In [68]:
all_matches.country.value_counts().head()


Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
england,11952
spain,5052
italy,4560
scotland,4247
france,4135


looks pretty reasonable, especially since england has more leagues compared to other countries.

now, let's inspect the data types of our columns. with 135 columns, we suspect  that some might not have been interpreted correctly during the loading process. checking the data types is an important step before proceeding with any further analysis or modeling, as we have learned during our amazing lectures!

In [69]:
all_matches.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42593 entries, 0 to 42592
Data columns (total 135 columns):
 #    Column                      Non-Null Count  Dtype  
---   ------                      --------------  -----  
 0    league_division             42593 non-null  object 
 1    match_date                  42593 non-null  object 
 2    kickoff_time                42593 non-null  object 
 3    home_team                   42593 non-null  object 
 4    away_team                   42593 non-null  object 
 5    fulltime_home_goals         42593 non-null  int64  
 6    fulltime_away_goals         42593 non-null  int64  
 7    fulltime_result             42593 non-null  object 
 8    halftime_home_goals         42552 non-null  float64
 9    halftime_away_goals         42552 non-null  float64
 10   halftime_result             42552 non-null  object 
 11   home_shots                  42549 non-null  float64
 12   away_shots                  42549 non-null  float64
 13   home_shots_on_

looks pretty solid but there are a few columns that are in wrong types... columns that should be numeric but are object and the date is also object which is a big no no  
on top of that, leaving categorical variables as object is inconvenient because well need to encode them later so lets convert them that to categorical

In [70]:
text_cols = [
    "league_division","match_date","kickoff_time","home_team","away_team",
    "fulltime_result","halftime_result","country","league_tier","season_code","referee"
]

cat_cols = ["league_division", "home_team", "away_team",
            "fulltime_result", "halftime_result", "country",
            "league_tier", "season_code"]

all_matches[cat_cols] = all_matches[cat_cols].astype("category")

for col in all_matches.select_dtypes(include="object"):
    if col not in text_cols:
        all_matches[col] = pd.to_numeric(all_matches[col], errors="coerce")


all_matches["match_date"] = pd.to_datetime(all_matches["match_date"], errors="coerce", dayfirst=True)

all_matches["kickoff_time"] = pd.to_datetime(
    all_matches["kickoff_time"], format="%H:%M", errors="coerce"
).dt.time

# checking again
all_matches.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42593 entries, 0 to 42592
Data columns (total 135 columns):
 #    Column                      Non-Null Count  Dtype         
---   ------                      --------------  -----         
 0    league_division             42593 non-null  category      
 1    match_date                  42593 non-null  datetime64[ns]
 2    kickoff_time                42593 non-null  object        
 3    home_team                   42593 non-null  category      
 4    away_team                   42593 non-null  category      
 5    fulltime_home_goals         42593 non-null  int64         
 6    fulltime_away_goals         42593 non-null  int64         
 7    fulltime_result             42593 non-null  category      
 8    halftime_home_goals         42552 non-null  float64       
 9    halftime_away_goals         42552 non-null  float64       
 10   halftime_result             42552 non-null  category      
 11   home_shots                  42549 non-n

now that weve made sure our data is properly loaded lets check for missing data and duplicates

In [75]:
na_counts = all_matches.isna().sum()

missing_cols = na_counts[na_counts > 0]
print(missing_cols)

halftime_home_goals              41
halftime_away_goals              41
halftime_result                  41
home_shots                       44
away_shots                       44
                              ...  
bfecover25                    35465
bfecunder25                   35465
betfair_exch_close_ah_home    35469
betfair_exch_close_ah_away    35469
referee                       26394
Length: 124, dtype: int64
