# ML2 Semestral Project - Football O/U 2.5
**Authors:** Phuong Nhi Tranová, Vít Maruniak, Šimon Slánský, Radim Škoukal, Ondřej Zetek, Martin Kareš, Jan Korčák, Jakub Maličkay, Jáchym Janouch  
**Course:** FIS 4IT344 Machine Learning 2 (2025/2026)  
**Goal:** Compare baseline (current features) vs extended (richer features) models for O/U 2.5 goals across markets; translate accuracy gains into optimal profit and **maximum data subscription price per country** *.  



---


\* **Maximum data subscription price per country**
- the most money our company should be willing to pay for that country's additional data
- that's how much extra profit the improved model generates
- baseline model → accuracy = A₀
    - generates optimal profit Π\*(A₀)
- extended model → accuracy = A₁
    - generates optimal profit Π\*(A₁)
- profit improvement: ΔΠ = Π\*(A₁) − Π\*(A₀)
    - basically how much more money the company earns each year by using the better data
- the maximum data subscription price per country = ΔΠ


# 0. Imports and paths

### 0.1 Imports

In [None]:
import os, glob, warnings
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy.stats import zscore, chi2_contingency
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


### 0.2 Library parameters

In [None]:
# Default size (in inches) for every matplotlib figure created below.
plt.rcParams.update({"figure.figsize": (8, 5)})

# Lift pandas' display limits so frames are printed without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### 0.3 Paths

In [None]:
# Folder that holds the raw CSV tree, and folder that receives processed output.
DATA_DIR = "./data"
OUTPUT_DIR = "./processed"

# Make sure the output folder exists; an already existing folder is fine.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# 1. Data load

In [None]:
def _format_season(season_code):
    """Expand a four-digit season code into ``"YYYY/YYYY"``.

    ``"1920"`` becomes ``"2019/2020"`` and ``"9900"`` becomes ``"1999/2000"``.
    Anything that is not exactly four digits (including ``None``) is returned
    unchanged so that unexpected file names stay visible in the data.
    """
    if not (season_code and len(season_code) == 4 and season_code.isdigit()):
        return season_code

    def full_year(two_digit_year):
        # One fixed pivot for both halves of the code: 50-99 -> 1950-1999,
        # 00-49 -> 2000-2049. This keeps a season such as "1819" or "2526"
        # inside a single century instead of splitting it across two.
        century = 1900 if two_digit_year >= 50 else 2000
        return century + two_digit_year

    start_year = full_year(int(season_code[:2]))
    end_year = full_year(int(season_code[2:]))
    return f"{start_year}/{end_year}"


def load_all_matches(data_dir: str) -> pd.DataFrame:
    """Load every CSV below `data_dir` into one DataFrame with a ``Season`` column.

    The season code is the file name (without extension) of the third path
    component relative to `data_dir`, i.e. ``<dir>/<dir>/<season>.csv``; paths
    with fewer components get ``None``. Four-digit codes are expanded by
    `_format_season`. ``Season`` is inserted right after ``Div`` when that
    column exists, otherwise appended as the last column.

    Files are read in sorted path order so the row order of the result is
    reproducible. Files that cannot be read are reported on stdout and skipped.

    Args:
        data_dir: Root folder that is searched recursively for ``*.csv`` files.

    Returns:
        All readable files concatenated row-wise with a fresh integer index.

    Raises:
        FileNotFoundError: No CSV file exists under `data_dir`.
        ValueError: CSV files exist but none of them could be read.
    """
    csv_files = sorted(glob.glob(os.path.join(data_dir, "**", "*.csv"), recursive=True))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found under {data_dir}")

    frames = []
    for fp in csv_files:
        # Season code comes from the file name two folders below data_dir.
        parts = Path(os.path.relpath(fp, data_dir)).parts
        season_file = parts[2] if len(parts) >= 3 else None
        season_code = os.path.splitext(season_file)[0] if season_file else None

        # Best effort: one unreadable file must not abort the whole load.
        try:
            df = pd.read_csv(fp, low_memory=False)
        except Exception as e:
            print(f"Skipping {fp}: {e}")
            continue

        season_formatted = _format_season(season_code)

        # Add Season column right after Div (if Div exists)
        if 'Div' in df.columns:
            df.insert(df.columns.get_loc('Div') + 1, 'Season', season_formatted)
        else:
            df['Season'] = season_formatted

        frames.append(df)

    if not frames:
        # pd.concat([]) would raise a ValueError with a far less helpful message.
        raise ValueError(
            f"None of the {len(csv_files)} CSV files under {data_dir} could be read"
        )

    return pd.concat(frames, ignore_index=True, sort=False)

# Read the whole CSV tree into a single frame and take a first look at it.
all_matches = load_all_matches(DATA_DIR)
print(list(all_matches.columns))
print(all_matches.shape)
display(all_matches.head())

# 2. Exploratory Data Analysis

Before proceeding with data cleaning, let's understand our data better through comprehensive exploratory data analysis. This will help us make informed decisions about preprocessing steps.

### 2.1 Data Shape and Overview

In [None]:
# Headline numbers for the combined dataset.
print(f"Dataset shape: {all_matches.shape}")
print("Number of seasons/countries covered:")
# The league code minus its last character is used as the country prefix.
print(f"Countries: {all_matches['Div'].str[:-1].nunique()}")
print(f"Leagues: {all_matches['Div'].nunique()}")

# 'Date' has not been converted yet at this point, so min()/max() on the raw
# column would compare text character by character rather than chronologically.
# Parse a throw-away copy instead; unparseable entries become NaT and are
# ignored by min()/max().
# NOTE(review): dayfirst=True assumes dd/mm/yyyy dates in the source files -
# confirm against the raw CSVs.
match_dates = pd.to_datetime(all_matches['Date'], dayfirst=True, errors='coerce')
print(f"Date range: {match_dates.min()} to {match_dates.max()}")

# Check basic statistics
print("\nBasic goal statistics:")
print("Total goals per match stats:")
total_goals = all_matches['FTHG'] + all_matches['FTAG']
print(total_goals.describe())

print("\nOver/Under 2.5 goals distribution:")
over_2_5 = (total_goals > 2.5).astype(int)
# A missing goal count compares as "not over 2.5"; restrict the summary to
# matches with a known total so those rows are not reported as Under 2.5.
has_score = total_goals.notna()
over_known = over_2_5[has_score]
n_over = int(over_known.sum())
n_under = int(has_score.sum()) - n_over
share_over = over_known.mean()
print(f"Over 2.5: {n_over} ({share_over:.2%})")
print(f"Under 2.5: {n_under} ({(1 - share_over):.2%})")

Great! Our target variable (Over/Under 2.5 goals) is almost perfectly balanced, with a split very close to 50/50, which is ideal for classification: the model will not be biased toward either class, plain accuracy is a meaningful metric, and we do not need any resampling or rebalancing techniques.

### 2.2 Missing Values Analysis

In [None]:
# Per-column overview of missing values: count, percentage and dtype.
null_counts = all_matches.isnull().sum()
missing_analysis = pd.DataFrame({
    'column': all_matches.columns,
    'missing_count': null_counts,
    'missing_percentage': (null_counts / len(all_matches)) * 100,
    'dtype': all_matches.dtypes,
})

# Keep only the columns that actually have gaps, worst first.
has_gaps = missing_analysis['missing_count'] > 0
missing_analysis = missing_analysis.loc[has_gaps].sort_values(
    'missing_percentage', ascending=False
)

print(f"Columns with missing values: {len(missing_analysis)}")
print(f"Total columns: {len(all_matches.columns)}")
print(f"\nTop 20 columns with highest missing percentage:")
display(missing_analysis.head(20))

# Missing share of the twelve in-match statistics, one line per stat.
key_stats = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']
print(f"\nMissing data in key match statistics:")
for stat in key_stats:
    if stat not in all_matches.columns:
        continue
    missing_pct = (all_matches[stat].isnull().sum() / len(all_matches)) * 100
    print(f"{stat}: {missing_pct:.1f}%")

The missing data analysis reveals that:
1. **Betting odds** have the highest missing percentages (80%+) - this is expected as not all bookmakers operate in all leagues/seasons
2. **Key match statistics** (shots, corners, fouls, cards) have very low missing rates (<0.1%), which is excellent for our modeling
3. Most missing data is in betting-related columns, which we can handle appropriately

We also found 4 unnamed columns that are 100% missing. They are most likely artifacts of the CSV export, so they are safe to drop outright.

Let's look at the missingness in more depth.

In [None]:
# Work on a copy so the helper columns added below never reach all_matches.
raw = all_matches.copy()

# One 0/1 flag column per match statistic: 1 where the value is missing.
stats_cols = ['HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR']
for stat_name in stats_cols:
    if stat_name not in raw.columns:
        continue
    raw['isna_' + stat_name] = raw[stat_name].isna().astype(int)

# Per row: how many of the flagged statistics are missing at the same time?
flag_cols = ['isna_' + stat_name for stat_name in stats_cols if 'isna_' + stat_name in raw.columns]
raw['missing_count_stats'] = raw[flag_cols].sum(axis=1)

# Frequency of each "number of missing stats" value, in ascending order.
missing_per_row = raw['missing_count_stats']
print(missing_per_row.value_counts().sort_index())


Most of the rows have no missingness. However, there are 41 rows that are missing all 12 variables at once, which is a clearly clustered pattern. This suggests that the missing data stem from a specific data source or a batch issue rather than from random omission.

In [None]:
# Share of missing values per statistic in percent, labelled by the bare stat name.
single_rates = raw[flag_cols].mean().mul(100)
single_rates.index = [flag.replace('isna_', '') for flag in single_rates.index]

fig, ax = plt.subplots(figsize=(9, 4))
ax.bar(single_rates.index, single_rates.values)
ax.set(title='Missingness by variables (%)', ylabel='% missing', xlabel='stat')
ax.set_xticklabels(single_rates.index, rotation=45, ha='right')

# Write the exact percentage on top of every bar.
for position, rate in enumerate(single_rates.values):
    ax.text(position, rate, f'{rate:.3f}%', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

Missingness is uniformly low across all variables, so there is no sign of a collection issue specific to a single variable.

In [None]:
def _draw_missing_heatmap(fig, ax, table, title, row_labels=None, cmap='plasma'):
    """Draw `table` (values in % missing) as a labelled heatmap with a colorbar on `ax`.

    Column names label the x axis; `row_labels` (default: the table index)
    label the y axis.
    """
    image = ax.imshow(table.values, aspect='auto', cmap=cmap)
    ax.set_xticks(np.arange(table.shape[1]))
    ax.set_xticklabels(table.columns, rotation=45, ha='right')
    ax.set_yticks(np.arange(table.shape[0]))
    ax.set_yticklabels(table.index if row_labels is None else row_labels)
    ax.set_title(title)
    fig.colorbar(image, ax=ax, label='% missing')


def _blank_panel(ax, message):
    """Replace a panel that cannot be drawn by a centred text message."""
    ax.text(0.5, 0.5, message, ha='center', va='center')
    ax.set_axis_off()


fig, axes = plt.subplots(2, 2, figsize=(18, 12))
axes = axes.flatten()
cmap = 'plasma'

# Extract country from Div column (e.g., 'E1' -> 'E', 'SP2' -> 'SP').
# Guarded so that a frame without 'Div' reaches the fallback panels below
# instead of failing with a KeyError here.
if 'Div' in raw.columns:
    raw['country_code'] = raw['Div'].str[:-1]

# 1️⃣ Country × Stat
if 'country_code' in raw.columns:
    M1 = raw.groupby('country_code')[flag_cols].mean().mul(100)
    # Rows ordered by overall missingness, worst first.
    order = M1.mean(axis=1).sort_values(ascending=False).index
    M1 = M1.loc[order]
    M1.columns = [c.replace('isna_', '') for c in M1.columns]
    _draw_missing_heatmap(fig, axes[0], M1, 'Country × Stat', cmap=cmap)
else:
    _blank_panel(axes[0], "Missing 'Div' column")

# 2️⃣ Year × Stat (using Date column)
if 'Date' in raw.columns:
    # NOTE(review): dayfirst=True assumes dd/mm/yyyy dates in the source
    # files - confirm against the raw CSVs.
    raw['year'] = pd.to_datetime(raw['Date'], dayfirst=True).dt.year
    M2 = raw.groupby('year')[flag_cols].mean().mul(100)
    order = M2.mean(axis=1).sort_values(ascending=False).index
    M2 = M2.loc[order]
    M2.columns = [c.replace('isna_', '') for c in M2.columns]
    _draw_missing_heatmap(fig, axes[1], M2, 'Year × Stat',
                          row_labels=M2.index.astype(int), cmap=cmap)
else:
    _blank_panel(axes[1], "Missing 'Date' column")

# 3️⃣ Year × Country
needed = {'year', 'country_code'}
if needed.issubset(raw.columns):
    G = raw.groupby(['year','country_code'])[flag_cols].mean().mul(100)
    G['avg_missing'] = G.mean(axis=1)
    year_order  = G['avg_missing'].groupby(level=0).mean().sort_values(ascending=False).index
    country_order = G['avg_missing'].groupby(level=1).mean().sort_values(ascending=False).index
    # Year/country combinations without any match are shown as 0 % missing.
    P3 = (G['avg_missing'].unstack('country_code')
          .reindex(index=year_order, columns=country_order)
          .fillna(0))
    _draw_missing_heatmap(fig, axes[2], P3, 'Year × Country',
                          row_labels=P3.index.astype(int), cmap=cmap)
else:
    _blank_panel(axes[2], "Missing required columns")

# 4️⃣ Country × League
needed = {'country_code', 'Div'}
if needed.issubset(raw.columns):
    G = raw.groupby(['country_code', 'Div'])[flag_cols].mean().mul(100)
    G['avg_missing'] = G.mean(axis=1)
    # A league only exists in its own country; every other cell is filled with 0.
    P4 = (G['avg_missing'].unstack('Div').fillna(0))
    country_order = P4.mean(axis=1).sort_values(ascending=False).index
    league_order  = P4.mean(axis=0).sort_values(ascending=False).index
    P4 = P4.loc[country_order, league_order]
    _draw_missing_heatmap(fig, axes[3], P4, 'Country × League', cmap=cmap)
else:
    _blank_panel(axes[3], "Missing required columns")

plt.tight_layout()
plt.show()

The first heatmap shows missing data by country. Turkey has the most missing data by far, with over 1.4 percent missing on average. All other countries have very little missing data, less than 0.5 percent each.

The second heatmap shows missing data by year. The year 2023 has slightly more missing data than the other years.

The third heatmap combines year and country together. It shows that Turkey has most missing values in 2023. In other years, the missingness is not so bad.

The fourth heatmap shows missing data by country and league division. Again, Turkey stands out with the highest missing data. Within each country, different league divisions have similar amounts of missing data, which means the problem is more about the country than about which league tier we look at.

Overall, the missing data is not random. It is concentrated mainly in Turkey and in the year 2023.

In [None]:
# Recompute the list of missingness-flag columns that exist in raw.
flag_cols = ['isna_' + c for c in stats_cols if 'isna_' + c in raw.columns]
top_n = 15            # number of entries shown per chart
min_matches_ref = 50  # ignore refs with tiny sample sizes

def group_missing_rate(df, key, cols=None):
    """Return the average % of missing stats and the row count per group.

    Args:
        df: Frame that contains `key` and the 0/1 missingness flag columns.
        key: Column (or list of columns) to group by.
        cols: Flag columns to average. Defaults to the notebook-level
            ``flag_cols`` list, which keeps existing two-argument calls working.

    Returns:
        DataFrame indexed by group with ``rate`` (mean over `cols` of the
        per-group share of flagged rows, in percent) and ``n`` (rows in the
        group), sorted by ``rate`` in descending order.
    """
    if cols is None:
        cols = flag_cols
    grouped = df.groupby(key)  # built once, reused for both columns
    rate = grouped[list(cols)].mean().mul(100).mean(axis=1)
    out = pd.DataFrame({'rate': rate, 'n': grouped.size()})
    return out.sort_values('rate', ascending=False)

# 1️⃣ Missingness tables per home team, away team and referee
#    (an empty frame stands in for a column that does not exist).
if 'HomeTeam' in raw.columns:
    home_df = group_missing_rate(raw, 'HomeTeam')
else:
    home_df = pd.DataFrame()

if 'AwayTeam' in raw.columns:
    away_df = group_missing_rate(raw, 'AwayTeam')
else:
    away_df = pd.DataFrame()

if 'Referee' in raw.columns:
    ref_df = group_missing_rate(raw, 'Referee')
else:
    ref_df = pd.DataFrame()

# Referees with too few matches give unstable rates, so they are filtered out.
if not ref_df.empty:
    enough_matches = ref_df['n'] >= min_matches_ref
    ref_df = ref_df.loc[enough_matches].sort_values('rate', ascending=False)

# 2️⃣ Home vs away rate side by side (teams present in both tables)
both = pd.DataFrame()
if not (home_df.empty or away_df.empty):
    home_rates = home_df[['rate']].rename(columns={'rate': 'home_rate'})
    away_rates = away_df[['rate']].rename(columns={'rate': 'away_rate'})
    both = home_rates.merge(away_rates, left_index=True, right_index=True, how='inner')
    both['diff'] = both['home_rate'] - both['away_rate']
    both = both.sort_values('home_rate', ascending=False).head(top_n)

# =======================
# FIGURE 1 — Home & Away
# =======================
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One horizontal bar chart per venue: left = home teams, right = away teams.
panels = [
    (axes[0], home_df, '#8c564b',
     'Missingness by HomeTeam (avg % across stats)', "HomeTeam column not found"),
    (axes[1], away_df, '#1f77b4',
     'Missingness by AwayTeam (avg % across stats)', "AwayTeam column not found"),
]
for panel_ax, rate_table, bar_color, panel_title, fallback_text in panels:
    if rate_table.empty:
        panel_ax.text(0.5, 0.5, fallback_text, ha='center', va='center')
        panel_ax.set_axis_off()
        continue

    # Reverse the top rows so the highest rate ends up at the top of the chart.
    top_rows = rate_table.head(top_n)[::-1]
    panel_ax.barh(top_rows.index.astype(str), top_rows['rate'].values, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('% missing')
    for y, (r, n) in enumerate(zip(top_rows['rate'].values, top_rows['n'].values)):
        panel_ax.text(r, y, f'  {r:.2f}% (n={n})', va='center', ha='left', fontsize=9)

plt.tight_layout()
plt.show()

This visualization shows the average percentage of missing match statistics for the 15 teams with the highest missingness, separately for home games (brown bars, left panel) and away games (blue bars, right panel). Comparing the two panels allows quick identification of patterns.

Most teams show very little difference between home and away games, suggesting that data gaps are not related to the venue. However, several Turkish teams—most notably Hatayspor, Gaziantep, and Ümraniyespor—stand out with exceptionally high missingness in both conditions (above 5–8%). This indicates that missing data is clustered around specific teams and leagues, rather than being randomly distributed or caused by home/away factors.

Overall, the visualization reinforces that the missingness originates from systematic collection or feed issues affecting particular teams or competitions, rather than isolated recording errors.

In [None]:
# Missingness for referees (only those that passed the sample-size filter).
if ref_df.empty:
    print("No referees pass the sample-size filter.")
else:
    fig, ax = plt.subplots(figsize=(8, 6))
    # Reverse the top rows so the highest rate ends up at the top of the chart.
    rtop = ref_df.head(top_n)[::-1]
    ax.barh(rtop.index.astype(str), rtop['rate'].values, color='#9467bd')
    ax.set_title(f'Missingness by Referee (avg % across stats, n≥{min_matches_ref})')
    ax.set_xlabel('% missing')
    # Annotate every bar with its rate and the number of matches behind it.
    bar_labels = zip(rtop['rate'].values, rtop['n'].values)
    for y, (r, n) in enumerate(bar_labels):
        ax.text(r, y, f'  {r:.2f}% (n={n})', va='center', ha='left', fontsize=9)
    plt.tight_layout()
    plt.show()

In [None]:
# helper: % missing by group (avg across rows)
def pct_missing_by(group_cols, cols):
    """Return, per group of `raw`, the mean of each column in `cols` times 100.

    With 0/1 missingness flags in `cols` this is the percentage of rows in
    the group for which the corresponding statistic is missing.
    """
    share_per_group = raw.groupby(group_cols)[cols].mean()
    return share_per_group * 100

# =========================
# Figure A — Year × Stat
# =========================
if 'year' in raw.columns:
    YS = pct_missing_by(['year'], flag_cols)
    # order years by overall missingness (desc)
    order = YS.mean(axis=1).sort_values(ascending=False).index
    YS = YS.loc[order]
    YS.columns = [c.replace('isna_', '') for c in YS.columns]

    fig, ax = plt.subplots(figsize=(9, 6))
    im = ax.imshow(YS.values, aspect='auto')
    ax.set_xticks(np.arange(YS.shape[1]))
    ax.set_xticklabels(YS.columns, rotation=45, ha='right')
    ax.set_yticks(np.arange(YS.shape[0]))
    ax.set_yticklabels(YS.index.astype(int))
    ax.set_title('Missingness heatmap (%) — Year × Stat')
    fig.colorbar(im, ax=ax, label='% missing')
    plt.tight_layout()
    plt.show()

# =============================================
# Figure B — Year trend (avg across all stats)
# =============================================
if 'year' in raw.columns:
    # One value per year: the mean over all stats of the yearly % missing.
    Y_avg = (raw.groupby('year')[flag_cols].mean().mul(100).mean(axis=1)
             .sort_index())
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.plot(Y_avg.index.astype(int), Y_avg.values, marker='o')
    for x, y in zip(Y_avg.index, Y_avg.values):
        ax.text(x, y, f'{y:.2f}%', va='bottom', ha='center', fontsize=9)
    ax.set_xlabel('Year')
    ax.set_ylabel('% missing (avg across stats)')
    ax.set_title('Missingness over time (yearly average)')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# =========================
# Figure C — Hour × Stat
# =========================
# Nothing earlier in the notebook creates an 'hour' column, so derive it from
# the kick-off time here; without this the figure below is never drawn.
# Values that do not match HH:MM (including missing times) become NaN.
# NOTE(review): assumes 'Time' holds HH:MM text at this point - confirm.
if 'hour' not in raw.columns and 'Time' in raw.columns:
    raw['hour'] = pd.to_datetime(
        raw['Time'].astype(str), format='%H:%M', errors='coerce'
    ).dt.hour

if 'hour' in raw.columns:
    # drop hours that are NaN (unparseable)
    HH = raw.dropna(subset=['hour']).copy()
    HH['hour'] = HH['hour'].astype(int)
    HS = (HH.groupby('hour')[flag_cols].mean().mul(100))
    # ensure 0–23 present; hours without any match are shown as 0 % missing
    HS = HS.reindex(range(0, 24), fill_value=0)
    HS.columns = [c.replace('isna_', '') for c in HS.columns]

    fig, ax = plt.subplots(figsize=(9, 6))
    im = ax.imshow(HS.values, aspect='auto')
    ax.set_xticks(np.arange(HS.shape[1]))
    ax.set_xticklabels(HS.columns, rotation=45, ha='right')
    ax.set_yticks(np.arange(HS.shape[0]))
    ax.set_yticklabels(HS.index.astype(int))
    ax.set_title('Missingness heatmap (%) — Hour × Stat')
    fig.colorbar(im, ax=ax, label='% missing')
    plt.tight_layout()
    plt.show()


### 2.3 Data Sanity Checks

Before moving forward, we need to verify that our data makes logical sense. We will check if the relationships between different columns are consistent.

In [None]:
def _count_violations(holds, *operands):
    """Count rows that break a rule, ignoring rows with missing inputs.

    `holds` is the row-wise outcome of the rule (True = rule satisfied) and
    `operands` are the columns the rule compares. Any comparison against a
    missing value is False, so without this filter every row with a missing
    operand would be reported as a violation.
    """
    comparable = pd.Series(True, index=holds.index)
    for operand in operands:
        comparable &= operand.notna()
    broken = ~holds.fillna(False).astype(bool)
    return int((broken & comparable).sum())


# Human-readable summary of every check that found something.
sanity_issues = []

# Check 1: Full time goals should be >= half time goals
print("\nFull Time Goals >= Half Time Goals")
ht_ft_home_check = all_matches['FTHG'] >= all_matches['HTHG']
ht_ft_away_check = all_matches['FTAG'] >= all_matches['HTAG']
home_violations = _count_violations(ht_ft_home_check, all_matches['FTHG'], all_matches['HTHG'])
away_violations = _count_violations(ht_ft_away_check, all_matches['FTAG'], all_matches['HTAG'])
print(f"Home goals violations: {home_violations}")
print(f"Away goals violations: {away_violations}")
if home_violations > 0 or away_violations > 0:
    sanity_issues.append(f"FT goals < HT goals: {home_violations + away_violations} cases")

# Check 2: Full time result should match actual goals
print("\nFull Time Result matches actual goals")
ftr_check = (
    ((all_matches['FTR'] == 'H') & (all_matches['FTHG'] > all_matches['FTAG'])) |
    ((all_matches['FTR'] == 'A') & (all_matches['FTAG'] > all_matches['FTHG'])) |
    ((all_matches['FTR'] == 'D') & (all_matches['FTHG'] == all_matches['FTAG']))
)
ftr_violations = _count_violations(
    ftr_check, all_matches['FTR'], all_matches['FTHG'], all_matches['FTAG']
)
print(f"FTR mismatches: {ftr_violations}")
if ftr_violations > 0:
    sanity_issues.append(f"FTR doesn't match goals: {ftr_violations} cases")

# Check 3: Half time result should match half time goals
print("\nChecking: Half Time Result matches half time goals")
htr_check = (
    ((all_matches['HTR'] == 'H') & (all_matches['HTHG'] > all_matches['HTAG'])) |
    ((all_matches['HTR'] == 'A') & (all_matches['HTAG'] > all_matches['HTHG'])) |
    ((all_matches['HTR'] == 'D') & (all_matches['HTHG'] == all_matches['HTAG']))
)
htr_violations = _count_violations(
    htr_check, all_matches['HTR'], all_matches['HTHG'], all_matches['HTAG']
)
print(f"HTR mismatches: {htr_violations}")
if htr_violations > 0:
    sanity_issues.append(f"HTR doesn't match HT goals: {htr_violations} cases")

# Check 4: Shots on target should be <= total shots
print("\nShots on Target <= Total Shots")
home_shot_check = all_matches['HST'] <= all_matches['HS']
away_shot_check = all_matches['AST'] <= all_matches['AS']
home_shot_violations = _count_violations(home_shot_check, all_matches['HST'], all_matches['HS'])
away_shot_violations = _count_violations(away_shot_check, all_matches['AST'], all_matches['AS'])
print(f"Home shots violations: {home_shot_violations}")
print(f"Away shots violations: {away_shot_violations}")
if home_shot_violations > 0 or away_shot_violations > 0:
    sanity_issues.append(f"Shots on target > total shots: {home_shot_violations + away_shot_violations} cases")

# Check 5: Goals should be <= shots on target (generally, but not always)
print("\nGoals <= Shots on Target (usually)")
home_goals_shots_check = all_matches['FTHG'] <= all_matches['HST']
away_goals_shots_check = all_matches['FTAG'] <= all_matches['AST']
home_goals_violations = _count_violations(home_goals_shots_check, all_matches['FTHG'], all_matches['HST'])
away_goals_violations = _count_violations(away_goals_shots_check, all_matches['FTAG'], all_matches['AST'])
print(f"Home goals > shots on target: {home_goals_violations}")
print(f"Away goals > shots on target: {away_goals_violations}")
print(f"Note: Some violations are possible due to own goals or deflections")
# A handful of cases is tolerated; only a larger number is recorded as an issue.
if home_goals_violations > 10 or away_goals_violations > 10:
    sanity_issues.append(f"Goals > shots on target: {home_goals_violations + away_goals_violations} cases (check if excessive)")

# Check 6: Red cards should be <= yellow cards + red cards
# NOTE(review): HR <= HY + HR reduces to HY >= 0, so this check can only ever
# flag a negative yellow-card count (which Check 7 also covers). Replace it
# with a meaningful card rule or drop it.
print("\nCard counts are reasonable")
home_red_check = all_matches['HR'] <= (all_matches['HY'] + all_matches['HR'])
away_red_check = all_matches['AR'] <= (all_matches['AY'] + all_matches['AR'])
print(f"Home card logic violations: {_count_violations(home_red_check, all_matches['HR'], all_matches['HY'])}")
print(f"Away card logic violations: {_count_violations(away_red_check, all_matches['AR'], all_matches['AY'])}")

# Check 7: Negative values check
print("\nNo negative values in count columns")
count_columns = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST',
                'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']
negative_found = False
for col in count_columns:
    if col in all_matches.columns:
        negative_count = (all_matches[col] < 0).sum()
        if negative_count > 0:
            print(f"{col}: {negative_count} negative values")
            negative_found = True
            sanity_issues.append(f"{col} has {negative_count} negative values")
if not negative_found:
    print(f"No negative values found")

# Check 8: Extreme values check
# Column -> largest value still treated as plausible; anything above is reported.
print("\nExtreme values that might be data errors")
extreme_checks = {
    'FTHG': 15,
    'FTAG': 15,
    'HS': 50,
    'AS': 50,
    'HC': 30,
    'AC': 30,
    'HY': 10,
    'AY': 10,
    'HR': 5,
    'AR': 5
}
for col, threshold in extreme_checks.items():
    if col in all_matches.columns:
        extreme_count = (all_matches[col] > threshold).sum()
        if extreme_count > 0:
            max_value = all_matches[col].max()
            print(f"{col} > {threshold}: {extreme_count} cases (max: {max_value})")

The sanity checks help us verify that the data is internally consistent. We check things like full time goals being at least as many as half time goals, that the match result codes match the actual goal counts, that shots on target do not exceed total shots, and that there are no negative values in count columns. These checks help identify data entry errors or corruption before we use the data for modeling.

Our data passed most checks well. Full time goals are always at least as many as half time goals, which is correct. The full time result codes match the actual scores perfectly.

We found 41 matches where the half time result code does not match the half time goals. This is a small number out of 42,593 matches, so it is likely just data entry errors in those specific matches.

We found 6 matches where shots on target are higher than total shots. This is probably a recording error but only affects 6 matches so it is not a big problem.

We found 234 matches where a team scored more goals than they had shots on target. This can happen in real football due to own goals or deflections, so these are not necessarily errors.

We found one match where a team got 9 red cards. This is extremely unusual and might be a data error, but it is only one match out of thousands.

Overall, the data quality is very good. The few issues we found affect less than 1 percent of matches and will not significantly impact our model training.

### 2.4 League and Country Distribution

In [None]:
# Number of matches contributed by each league code.
league_counts = all_matches['Div'].value_counts()
print("League distribution:")
display(league_counts)

# Country prefix of the league code -> readable country name.
country_mapping = {
    'E': 'England', 'SC': 'Scotland', 'SP': 'Spain', 'I': 'Italy',
    'D': 'Germany', 'F': 'France', 'N': 'Netherlands', 'B': 'Belgium',
    'P': 'Portugal', 'T': 'Turkey', 'G': 'Greece'
}

# The league code minus its last character is the country prefix.
division_prefix = all_matches['Div'].str[:-1]
all_matches['Country'] = division_prefix.map(country_mapping)
country_counts = all_matches['Country'].value_counts()
print(f"\nMatches per country:")
display(country_counts)

# Two panels side by side: matches per country and total goals per match.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: countries ordered from most to fewest matches.
country_counts_sorted = country_counts.dropna().sort_values(ascending=False)
bars = ax1.bar(country_counts_sorted.index, country_counts_sorted.values, color='skyblue')
ax1.set(title='Matches per Country', xlabel='Country', ylabel='Number of Matches')
ax1.tick_params(axis='x', rotation=30)
ax1.grid(axis='y', alpha=0.3)

# Write the match count above every bar.
for bar in bars:
    bar_top = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2
    ax1.text(label_x, bar_top, f"{int(bar_top):,}",
             ha='center', va='bottom', fontsize=9)

# Right panel: one histogram bin per integer goal total, shown up to at least 10.
total_goals = all_matches['FTHG'] + all_matches['FTAG']
max_g = int(np.nanmax(total_goals))
last_tick = max(10, max_g)
bins = np.arange(-0.5, last_tick + 1.5, 1)

ax2.hist(total_goals, bins=bins, color='lightcoral', alpha=0.7)
ax2.axvline(x=2.5, linestyle='--', linewidth=2, label='2.5 goals threshold')
ax2.set(title='Distribution of Total Goals per Match',
        xlabel='Total Goals', ylabel='Frequency')
ax2.set_xticks(range(0, last_tick + 1))
ax2.grid(axis='y', alpha=0.3)
ax2.legend()

plt.show()

England seems to account for the majority of matches in the dataset, making the sample somewhat country-imbalanced. This suggests that model training should be performed separately for each country, or at least include country-specific components, to prevent English leagues from dominating the overall model behavior.  

When building time-aware models, it would also be beneficial to use chronological splits within each country and consider assigning higher weights to more recent matches, since they better reflect current team dynamics and scoring trends.  

Also, the distribution of total goals per match is right-skewed, with a mode around 2–3 goals. The dashed line at 2.5 goals marks the classification threshold for our target variable. Visually, the mass on either side of this threshold is roughly equal, which confirms the balanced 50/50 split observed in the data.


# 3. Data cleaning

### 3.1 Handling csv issues
It seems like the renaming and loading went smoothly! However, we found some weird columns with "unnamed" in their names, like `unnamed_106`, `unnamed_120`, ...  
That sometimes happens when excel files have extra blank columns. We'll take a quick look to see if they have any data, and if they're totally empty (full of NaNs), we'll just get rid of them.

In [None]:
# Columns whose name starts with "unnamed" (case-insensitive).
unnamed_cols = [name for name in all_matches.columns if name.lower().startswith("unnamed")]

# Share of NaN in each of them, smallest first (shown as the cell output).
unnamed_nan_share = all_matches[unnamed_cols].isna().mean()
unnamed_nan_share.sort_values()

They're 100% full of NaNs so we can now safely drop them.

In [None]:
# Remove the unnamed columns identified in the previous cell.
all_matches = all_matches.drop(labels=unnamed_cols, axis="columns")

### 3.2 Normalizing league codes
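Let's normalize the league codes: the English and Scottish top divisions are coded as E0 and SC0, respectively, whereas all other countries code their top division as CountryCode1. We shift the English and Scottish codes up by one so that tier numbers are comparable across countries.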

In [None]:
# Select English / Scottish league codes that end in a digit.
# - na=False keeps rows without a league code out of the mask; a mask that
#   contains NaN cannot be used for boolean indexing.
# - Requiring a trailing digit protects int() below from codes whose last
#   character is a letter.
mask = all_matches['Div'].str.match(r'(?:E|SC).*\d$', na=False)

# Shift the tier number up by one, e.g. E0 -> E1 and SC0 -> SC1.
# NOTE: this step is not idempotent - running the cell a second time shifts
# the codes again. Reload the data before re-running it.
all_matches.loc[mask, 'Div'] = all_matches.loc[mask, 'Div'].apply(
    lambda x: f"{x[:-1]}{int(x[-1]) + 1}"
)

print(all_matches['Div'].unique())

### 3.3 Handling English and Scottish yellow cards
We need to take care of the first note in notes.txt, which mentions an important inconsistency in how yellow and red cards are recorded across different competitions.  

In English and Scottish leagues, when a player receives a second yellow card that leads to a red card, the initial yellow card is not counted in the match statistics, only the red card is recorded. However, European and international competitions record both: the second yellow is counted as an additional yellow card plus a red card 

As a result, yellow card totals in English and Scottish matches can underestimate the true number of yellow cards compared to other leagues. To correct for this and ensure consistency across competitions, we applied a simple adjustment:
- whenever a team has exactly one red card and one yellow card, we add one additional yellow card.
- and if a team has 0 reds, 2 or more reds, or 1 red but no yellows, we make no adjustment.

We acknowledge that this rule is an approximation: a single red card is not always the result of a second yellow card, so the adjustment may introduce some bias. 

In [None]:
# English and Scottish matches (league code starts with E or SC).
# na=False keeps rows without a league code out of the mask; a mask that
# contains NaN cannot be combined with the card conditions below.
mask = all_matches['Div'].str.startswith(('E', 'SC'), na=False)

# Rows where at least one side has exactly one red card - used only to print
# the same sample before and after the adjustment.
red_mask = mask & ((all_matches['HR'] == 1) | (all_matches['AR'] == 1))

print("Before adjustment (sample):")
print(all_matches.loc[red_mask, ['Div', 'HY', 'HR', 'AY', 'AR']].head())

# NOTE(review): the accompanying text says the extra yellow is added when a
# team has exactly one red AND one yellow card, and that "1 red but no
# yellows" gets no adjustment. The conditions below do the opposite: they
# add a yellow when the team has one red and ZERO yellows. Decide which rule
# is intended and align the code or the text accordingly.
all_matches.loc[mask & (all_matches['HR'] == 1) & (all_matches['HY'] == 0), 'HY'] += 1
all_matches.loc[mask & (all_matches['AR'] == 1) & (all_matches['AY'] == 0), 'AY'] += 1

print("\nAfter adjustment (sample):")
print(all_matches.loc[red_mask, ['Div', 'HY', 'HR', 'AY', 'AR']].head())

### 3.4 Correcting data types
Now, let's inspect the data types of our columns. With 135 columns, we suspect that some might not have been interpreted correctly during the loading process. Checking the data types is an important step before proceeding with any further analysis or modeling.

In [None]:
# List every column together with its current dtype, one per line.
for column_name, column_dtype in all_matches.dtypes.items():
    print(f"{column_name}: {column_dtype}")

In [None]:
# Target dtype per group of columns. Every loop below skips columns that are
# not present, so a dataset without e.g. one bookmaker's odds still converts.
time_columns = ['Date', 'Time']

category_columns = ['Div', 'HomeTeam', 'AwayTeam', 'FTR', 'HTR', 'Referee', 'Country']

int_columns = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']

float_columns = ['B365CH', 'BWCA', '1XBH']

for col in time_columns:
    if col not in all_matches.columns:
        continue
    if col == 'Date':
        # NOTE(review): dayfirst=True assumes dd/mm/yyyy dates in the source
        # files - confirm against the raw CSVs. Without it an ambiguous date
        # such as 03/08/2019 can be read month-first.
        all_matches[col] = pd.to_datetime(all_matches[col], dayfirst=True)
    else:
        # Kick-off time is kept as a plain time-of-day object.
        all_matches[col] = pd.to_datetime(all_matches[col], format='%H:%M').dt.time

for col in category_columns:
    if col in all_matches.columns:
        all_matches[col] = all_matches[col].astype('category')

# Nullable Int64 keeps missing counts as <NA> instead of forcing floats.
for col in int_columns:
    if col in all_matches.columns:
        all_matches[col] = pd.to_numeric(all_matches[col], errors='coerce').astype('Int64')

for col in float_columns:
    if col in all_matches.columns:
        all_matches[col] = pd.to_numeric(all_matches[col], errors='coerce').astype(float)


In [None]:
# Re-list the dtypes to confirm the conversions took effect
for column_name, column_dtype in zip(all_matches.columns, all_matches.dtypes):
    print(f"{column_name}: {column_dtype}")

### 3.5 Outlier detection and handling

Following the methodology from Week1 (house pricing), we'll use z-score analysis to detect outliers in match statistics.

In [None]:
# Define numerical columns for outlier detection
match_stats_cols = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']
numerical_cols = ['FTHG', 'FTAG', 'HTHG', 'HTAG'] + match_stats_cols

# Count, per column, the values lying more than 3 standard deviations from the mean
print("Outlier analysis using z-score > 3:")
outlier_counts = {}

for col in numerical_cols:
    if col not in all_matches.columns:
        continue
    # Missing values are excluded; the float cast hands scipy a plain NumPy
    # array even when the column uses the nullable Int64 dtype.
    observed = all_matches[col].dropna().astype(float)
    z_scores = np.abs(zscore(observed))
    outliers = (z_scores > 3).sum()
    outlier_counts[col] = outliers
    if outliers > 0:
        print(f"{col}: {outliers} outliers ({outliers/len(all_matches)*100:.2f}%)")

# Look at extreme cases. The maximum is taken over per-match totals; adding the
# home maximum to the away maximum would combine two different matches.
goals_per_match = all_matches['FTHG'] + all_matches['FTAG']
shots_per_match = all_matches['HS'] + all_matches['AS']
yellows_per_match = all_matches['HY'] + all_matches['AY']  # yellow cards only

print("\nExamples of potential outliers:")
print(f"Highest total goals: {goals_per_match.max()}")
print(f"Most shots in a match: {shots_per_match.max()}")
print(f"Most cards in a match: {yellows_per_match.max()}")

# Visualize outliers for key variables: one box plot per panel of a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
key_vars = ['FTHG', 'FTAG', 'HS', 'AS']

for ax, var in zip(axes.flat, key_vars):
    all_matches[var].plot(kind='box', ax=ax)
    ax.set_title(f'Box Plot of {var}')
    ax.set_ylabel(var)

plt.tight_layout()
plt.show()

# For football data, we'll be more conservative with outlier removal
# as extreme scores can be legitimate (unlike house prices)
print("\nDecision: Keep outliers for football data as high scores/stats can be legitimate")

# 4. Feature engineering

Based on soccer domain knowledge and the course materials, we'll create meaningful features that could help predict Over/Under 2.5 goals.

## 4.1 Target variable creation

In [None]:
# Main target: 1 when the match finished with more than 2.5 goals, else 0
all_matches['total_goals'] = all_matches['FTHG'] + all_matches['FTAG']
match_goals = all_matches['total_goals']
all_matches['over_2_5'] = match_goals.gt(2.5).astype(int)

main_target = all_matches['over_2_5']
print("Target variable distribution:")
print(main_target.value_counts())
print(f"Over 2.5 rate: {main_target.mean():.2%}")

# Neighbouring goal lines, kept for analysis only
alternative_lines = ((1.5, 'over_1_5'), (3.5, 'over_3_5'))
for goal_line, target_column in alternative_lines:
    all_matches[target_column] = match_goals.gt(goal_line).astype(int)

print("\nOther thresholds:")
for goal_line, target_column in alternative_lines:
    print(f"Over {goal_line} rate: {all_matches[target_column].mean():.2%}")

## 4.2 Basic feature engineering

Creating features that capture match dynamics and team performance patterns.

In [None]:
# ----------------------------------------------------------------------------
# NOTE: most columns built in this cell are TEMPORARY, INTERMEDIATE features.
# ----------------------------------------------------------------------------
# They are derived from post-match statistics (HS, AS, HST, ...), i.e. values
# that are only known once the match has been played, so they are not meant to
# be fed to the final model directly.
#
# They exist so that later steps can derive from them:
#   1. moving averages (MA5) for the extended dataset
#   2. seasonal patterns and team statistics
#
# Per the notebook plan, the raw versions are dropped in Section 4.6 before
# training; only their historical aggregates (which describe past performance)
# remain as predictors.
# ----------------------------------------------------------------------------

# Small constant added to every denominator to avoid division by zero
eps = 0.001

home_shots, away_shots = all_matches['HS'], all_matches['AS']
home_on_target, away_on_target = all_matches['HST'], all_matches['AST']
home_corners, away_corners = all_matches['HC'], all_matches['AC']

# 1. Shooting volume and accuracy (TEMPORARY)
all_matches['home_shot_accuracy'] = home_on_target / (home_shots + eps)
all_matches['away_shot_accuracy'] = away_on_target / (away_shots + eps)
all_matches['total_shots'] = home_shots + away_shots
all_matches['total_shots_on_target'] = home_on_target + away_on_target

# 2. Attacking vs defensive balance (TEMPORARY): positive = home side dominated
all_matches['shot_dominance'] = (home_shots - away_shots) / (home_shots + away_shots + eps)
all_matches['corner_dominance'] = (home_corners - away_corners) / (home_corners + away_corners + eps)

# 3. Game intensity (TEMPORARY)
all_matches['total_fouls'] = all_matches['HF'] + all_matches['AF']
all_matches['total_cards'] = all_matches['HY'] + all_matches['AY'] + all_matches['HR'] + all_matches['AR']
all_matches['card_intensity'] = all_matches['total_cards'] / (all_matches['total_fouls'] + eps)

# 4. Half-time patterns (TEMPORARY)
all_matches['ht_total_goals'] = all_matches['HTHG'] + all_matches['HTAG']
all_matches['second_half_goals'] = all_matches['total_goals'] - all_matches['ht_total_goals']

# 5. League tier: last character of the division code (known before the match)
all_matches['league_tier'] = all_matches['Div'].str[-1].astype(int)

# 6. Calendar features (known before the match); dayofweek 5/6 = Saturday/Sunday
match_dates = all_matches['Date'].dt
all_matches['month'] = match_dates.month
all_matches['is_weekend'] = match_dates.dayofweek.isin([5, 6]).astype(int)

print("Basic engineered features created (note: post-match stats are temporary).")
print("These will be used to calculate historical aggregates, then removed before modeling.")


## 4.3 Base feature df engineering

Start with core + basic engineered features as the foundation.

In [None]:
# Core match columns plus the engineered features that are known before kick-off
base_columns = [
    'Div', 'Season', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
    'total_goals', 'league_tier', 'month', 'is_weekend', "over_2_5",
]

# Chronological order within each division, needed by the time-based features below
df_basic = (
    all_matches.loc[:, base_columns]
    .copy()
    .sort_values(['Div', 'Date'])
    .reset_index(drop=True)
)
print(f"Columns: {df_basic.columns.tolist()}")
df_basic.head()

### 4.3.1 Days since last match


**Calculate days since last match for both home and away teams.**

In [None]:
# Days since each team's previous match (home or away), stored on the side the
# team plays in the current fixture.
df_basic['home_days_since_last'] = np.nan
df_basic['away_days_since_last'] = np.nan

for team in df_basic['HomeTeam'].unique():
    involved = (df_basic['HomeTeam'] == team) | (df_basic['AwayTeam'] == team)
    team_matches = df_basic.loc[involved, ['Date', 'HomeTeam']].sort_values('Date')

    # Gap to the team's previous fixture; NaN for its first recorded match
    days_diff = team_matches['Date'].diff().dt.days
    played_at_home = team_matches['HomeTeam'] == team

    # Index-aligned write-back: home fixtures feed the home column, the rest the away column
    home_gaps = days_diff[played_at_home]
    away_gaps = days_diff[~played_at_home]
    df_basic.loc[home_gaps.index, 'home_days_since_last'] = home_gaps
    df_basic.loc[away_gaps.index, 'away_days_since_last'] = away_gaps

# First matches: fill with the column median. Assign the result back instead of
# calling fillna(inplace=True) on a column selection, which is chained
# assignment and leaves df_basic unchanged under pandas Copy-on-Write.
for gap_col in ('home_days_since_last', 'away_days_since_last'):
    df_basic[gap_col] = df_basic[gap_col].fillna(df_basic[gap_col].median())

print(f"Home days since last - mean: {df_basic['home_days_since_last'].mean():.1f}, median: {df_basic['home_days_since_last'].median():.1f}")
print(f"Away days since last - mean: {df_basic['away_days_since_last'].mean():.1f}, median: {df_basic['away_days_since_last'].median():.1f}")

- **Everything seems OK: the usual pause is one week (as expected), but there are visibly longer pauses between individual seasons.**

### 4.3.2 5-match Moving Averages

**Calculate 5-match moving averages for goals scored and conceded.**

In [None]:
# Rolling form: mean goals scored/conceded over a team's previous five home
# fixtures (home_* columns) and previous five away fixtures (away_* columns).
for ma_column in ('home_goals_ma5', 'home_conceded_ma5', 'away_goals_ma5', 'away_conceded_ma5'):
    df_basic[ma_column] = np.nan

# (team column, scored target, scored source, conceded target, conceded source)
venue_specs = (
    ('HomeTeam', 'home_goals_ma5', 'FTHG', 'home_conceded_ma5', 'FTAG'),
    ('AwayTeam', 'away_goals_ma5', 'FTAG', 'away_conceded_ma5', 'FTHG'),
)

for club in df_basic['HomeTeam'].unique():
    for team_col, scored_out, scored_src, conceded_out, conceded_src in venue_specs:
        fixture_idx = df_basic[df_basic[team_col] == club].sort_values('Date').index

        # Only fixtures preceded by five earlier ones get a value; the window
        # ends just before the current match, so its own result is never used.
        for pos in range(5, len(fixture_idx)):
            previous_five = df_basic.loc[fixture_idx[pos - 5:pos]]
            df_basic.loc[fixture_idx[pos], scored_out] = previous_five[scored_src].mean()
            df_basic.loc[fixture_idx[pos], conceded_out] = previous_five[conceded_src].mean()

# NaN is left in place where fewer than five earlier fixtures exist; handling
# is deferred to the model or to imputation fitted on training data only.
print(f"\nMA5 Statistics (NaN preserved for early-season matches):")
for ma_column in ('home_goals_ma5', 'away_goals_ma5', 'home_conceded_ma5', 'away_conceded_ma5'):
    print(f"{ma_column:<17} - mean: {df_basic[ma_column].mean():.2f}, missing: {df_basic[ma_column].isna().sum()}")

 **Fixed: Data leakage removed**
- NaN values for early-season matches (first 5 games) are now preserved
- No longer filling with overall mean (which would use future data)
- Tree-based models can handle NaN values natively
- Alternative: Impute during train/test split using only training data statistics

### 4.3.3 Promoted/Demoted

**Detect teams that changed leagues between seasons using the Season column (e.g., 2019/2020 → 2020/2021).**

In [None]:
# Flags marking a team's matches in the season right after it changed tier
for flag_column in ('home_promoted', 'home_demoted', 'away_promoted', 'away_demoted'):
    df_basic[flag_column] = 0

for club in df_basic['HomeTeam'].unique():
    club_rows = (df_basic['HomeTeam'] == club) | (df_basic['AwayTeam'] == club)
    club_matches = df_basic[club_rows].sort_values('Date')

    # Tier of the club's first match in each season it appears in
    tier_by_season = club_matches.groupby('Season')['league_tier'].first()

    for pos in range(1, len(tier_by_season)):
        earlier_tier = tier_by_season.iloc[pos - 1]
        later_tier = tier_by_season.iloc[pos]

        # A lower tier number means a higher division
        promoted = 1 if later_tier < earlier_tier else 0
        demoted = 1 if later_tier > earlier_tier else 0
        if not (promoted or demoted):
            continue

        in_season = df_basic['Season'] == tier_by_season.index[pos]
        as_home = in_season & (df_basic['HomeTeam'] == club)
        as_away = in_season & (df_basic['AwayTeam'] == club)

        df_basic.loc[as_home, 'home_promoted'] = promoted
        df_basic.loc[as_home, 'home_demoted'] = demoted
        df_basic.loc[as_away, 'away_promoted'] = promoted
        df_basic.loc[as_away, 'away_demoted'] = demoted

for label, flag_column in (
    ('Home teams promoted', 'home_promoted'),
    ('Home teams demoted', 'home_demoted'),
    ('Away teams promoted', 'away_promoted'),
    ('Away teams demoted', 'away_demoted'),
):
    print(f"{label}: {df_basic[flag_column].sum()}")

- **Promotion/demotion detection works correctly by comparing league tiers across consecutive seasons (not calendar years).**

### 4.3.4 Historical Season Positions & Goal Patterns

**Calculate team standings and goal-scoring patterns with round-based tracking and home/away splits.**

**Methodology:**
- **Past seasons:** Use final end-of-season standings (complete data)
- **Current season:** Use standings after last completed round (prevents data leakage)
  - Round = all teams played same number of matches
  - Match uses standings from previous completed round only
  - Example: If Team A has played 5 matches but round 5 not complete → use round 4 standings
- **Position features:** Overall league position only (no home/away split)
- **Goal pattern features:** Track separately for overall, home-only, and away-only
  - Home team gets: Overall stats + Home-specific stats
  - Away team gets: Overall stats + Away-specific stats
  - Both absolute counts AND percentages (counts show sample size/reliability)
- Use percentiles (0-100) for cross-league comparability

In [None]:
# Calculate league standings: final for past seasons, round-by-round for current season
def _new_pattern_counters(teams):
    """Return zeroed 2+/3+ goal-pattern counters (scored/conceded/total) for every team."""
    return {
        team: {f'{p}_{t}': 0 for p in ('sc', 'co', 'to') for t in (2, 3)}
        for team in teams
    }


def _standings_row(season, div, team, stats, patterns):
    """Build one standings record for ``team`` from the running counters.

    ``stats`` and ``patterns`` are dicts keyed by scope ('overall', 'home', 'away').
    """
    overall = stats['overall'][team]
    row = {
        'Season': season, 'Div': div, 'Team': team,
        'Points': overall['points'], 'Matches': overall['matches'],
        'Goals_For': overall['gf'], 'Goals_Against': overall['ga'],
    }
    # Goal patterns as count + percentage: overall first, then home-only, then away-only
    for scope, column_prefix in (('overall', ''), ('home', 'home_'), ('away', 'away_')):
        played = stats[scope][team]['matches']
        for t in (2, 3):
            for p, full in (('sc', 'scored'), ('co', 'conceded'), ('to', 'total')):
                cnt = patterns[scope][team][f'{p}_{t}']
                row[f'{column_prefix}{full}_{t}plus_count'] = cnt
                row[f'{column_prefix}{full}_{t}plus_pct'] = round(cnt / played * 100, 1) if played > 0 else 0
    return row


def _table_order(row):
    """Sort key: points, goal difference, goals scored (all descending), then team name."""
    goal_diff = row['Goals_For'] - row['Goals_Against']
    return (-row['Points'], -goal_diff, -row['Goals_For'], str(row['Team']))


def calculate_standings_with_rounds(df):
    """
    Calculate standings with home/away splits:
    - Position: Overall only
    - Goal patterns: Overall + Home-specific + Away-specific (both counts and percentages)

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match with columns Season, Div, Date, HomeTeam, AwayTeam,
        FTHG, FTAG and FTR.

    Returns
    -------
    (final, match_to_round)
        ``final`` concatenates, per (Season, Div), the table after the last
        completed round (an empty DataFrame if no round was ever completed).
        ``match_to_round`` maps (Season, Div, Date, HomeTeam, AwayTeam) to the
        table of the last round completed before that match; matches played
        before the first completed round have no entry.

    A round counts as completed whenever all teams of the division have played
    the same number of matches. Teams level on points are ranked by goal
    difference, then goals scored, then name, so positions are reproducible.
    """
    standings_list = []
    match_to_round = {}

    # observed=True only matters when a key is categorical: it skips unused categories
    for (season, div), group in df.groupby(['Season', 'Div'], observed=True):
        group = group.sort_values('Date').reset_index(drop=True)

        teams = set(group['HomeTeam'].unique()) | set(group['AwayTeam'].unique())
        stats = {
            'overall': {team: {'points': 0, 'gf': 0, 'ga': 0, 'matches': 0} for team in teams},
            'home': {team: {'gf': 0, 'ga': 0, 'matches': 0} for team in teams},
            'away': {team: {'gf': 0, 'ga': 0, 'matches': 0} for team in teams},
        }
        patterns = {scope: _new_pattern_counters(teams) for scope in ('overall', 'home', 'away')}

        current_round = None

        for _, match in group.iterrows():
            ht, at = match['HomeTeam'], match['AwayTeam']
            hg, ag = match['FTHG'], match['FTAG']
            match_key = (season, div, match['Date'], ht, at)

            # Snapshot taken BEFORE the match is counted, so it holds no information from it
            if current_round is not None:
                match_to_round[match_key] = current_round.copy()

            # Each side updates its overall counters and its venue-specific counters
            for team, venue, scored, conceded in ((ht, 'home', hg, ag), (at, 'away', ag, hg)):
                for scope in ('overall', venue):
                    counters = stats[scope][team]
                    counters['gf'] += scored
                    counters['ga'] += conceded
                    counters['matches'] += 1

                    tally = patterns[scope][team]
                    for t in (2, 3):
                        if scored >= t:
                            tally[f'sc_{t}'] += 1
                        if conceded >= t:
                            tally[f'co_{t}'] += 1
                        if scored + conceded >= t:
                            tally[f'to_{t}'] += 1

            # Points: 3 for a win, 1 each for anything that is not an 'H' or 'A' result
            if match['FTR'] == 'H':
                stats['overall'][ht]['points'] += 3
            elif match['FTR'] == 'A':
                stats['overall'][at]['points'] += 3
            else:
                stats['overall'][ht]['points'] += 1
                stats['overall'][at]['points'] += 1

            # Round completion: every team has played the same number of matches
            if len({s['matches'] for s in stats['overall'].values()}) == 1:
                rows = [_standings_row(season, div, team, stats, patterns) for team in teams]
                rows.sort(key=_table_order)
                current_round = pd.DataFrame(rows)
                current_round['Position'] = range(1, len(current_round) + 1)

        if current_round is not None:
            standings_list.append(current_round)

    final = pd.concat(standings_list, ignore_index=True) if standings_list else pd.DataFrame()
    return final, match_to_round

# Build the end-of-season tables and the per-match "table before kick-off" lookup
standings_result = calculate_standings_with_rounds(df_basic)
df_season_standings, match_round_map = standings_result

team_season_records = len(df_season_standings)
mapped_matches = len(match_round_map)
print(f"Season standings: {team_season_records} team-season records")
print(f"Round-based mappings: {mapped_matches} matches with historical standings")

In [None]:
# Create historical position features with lookback logic
season_list = sorted(df_basic['Season'].unique())
season_to_order = {season: idx for idx, season in enumerate(season_list)}
df_basic['season_order'] = df_basic['Season'].map(season_to_order)

# Extract season years and create position lookup
def extract_season_year(season_str):
    """Return the part of a season label before the first '/' as an int (e.g. '2019/2020' -> 2019)."""
    return int(season_str.split('/')[0])

df_season_standings['season_year'] = df_season_standings['Season'].apply(extract_season_year)
unique_season_years = sorted(df_season_standings['season_year'].unique())

# Add percentile rankings for cross-league comparability (100 = top of the table)
df_season_standings['league_size'] = df_season_standings.groupby(['Season', 'Div'])['Position'].transform('max')
df_season_standings['Position_Percentile'] = (
    (df_season_standings['league_size'] - df_season_standings['Position'] + 1) /
    df_season_standings['league_size'] * 100
).round(2)
percentile_lookup = df_season_standings.set_index(['Season', 'Div', 'Team'])['Position_Percentile'].to_dict()

# Create columns for percentile positions only (not raw positions)
for year in unique_season_years:
    df_basic[f'home_position_pct_{year}'] = np.nan
    df_basic[f'away_position_pct_{year}'] = np.nan

# --- Loop-invariant lookups, built once instead of once per row/year/team ---
# First season (in sorted order) starting in each year
year_to_season = {}
for season_label in season_list:
    year_to_season.setdefault(extract_season_year(season_label), season_label)
seasons_by_year = [
    (year, year_to_season[year]) for year in unique_season_years if year_to_season.get(year)
]

# Final percentile per (season, team). If a team has entries in several
# divisions for one season, the division appearing first in
# df_season_standings['Div'] wins, as in a sequential scan over the divisions.
div_rank = {div: rank for rank, div in enumerate(df_season_standings['Div'].unique())}
final_percentile = {}
for lookup_key in sorted((k for k in percentile_lookup if k[1] in div_rank), key=lambda k: div_rank[k[1]]):
    final_percentile.setdefault((lookup_key[0], lookup_key[2]), percentile_lookup[lookup_key])

# Populate historical features
for idx, row in df_basic.iterrows():
    current_season_order = row['season_order']
    current_season = row['Season']
    match_key = (row['Season'], row['Div'], row['Date'], row['HomeTeam'], row['AwayTeam'])
    sides = (('home', row['HomeTeam']), ('away', row['AwayTeam']))
    round_standings = match_round_map.get(match_key)

    for year, target_season in seasons_by_year:
        # For past seasons: use final standings
        if season_to_order[target_season] < current_season_order:
            for team_type, team in sides:
                if (target_season, team) in final_percentile:
                    df_basic.at[idx, f'{team_type}_position_pct_{year}'] = final_percentile[(target_season, team)]

        # For current season (same season as match): use the table of the last
        # round completed before this match, if there is one
        elif target_season == current_season and round_standings is not None:
            league_size = len(round_standings)
            for team_type, team in sides:
                team_row = round_standings[round_standings['Team'] == team]
                if not team_row.empty:
                    position = team_row['Position'].iloc[0]
                    percentile = ((league_size - position + 1) / league_size * 100)
                    df_basic.at[idx, f'{team_type}_position_pct_{year}'] = round(percentile, 2)

position_cols = [col for col in df_basic.columns if 'position_pct_' in col]
print(f"Historical position features: {len(position_cols)} columns")

In [None]:
# Add historical goal-scoring pattern features with home/away context
stat_thresholds = [2, 3]
stat_prefixes = ['scored', 'conceded', 'total']
stat_suffixes = ['count', 'pct']

# Create lookups from final standings (the frame is indexed once, not once per statistic)
standings_by_team = df_season_standings.set_index(['Season', 'Div', 'Team'])
goal_stat_lookups = {}
for threshold in stat_thresholds:
    for prefix in stat_prefixes:
        # Overall stats, then home-specific, then away-specific
        for scope in ('', 'home_', 'away_'):
            for suffix in stat_suffixes:
                col = f'{scope}{prefix}_{threshold}plus_{suffix}'
                goal_stat_lookups[col] = standings_by_team[col].to_dict()

# Create columns: home team gets overall + home-specific, away team gets overall + away-specific
for year in unique_season_years:
    for threshold in stat_thresholds:
        for prefix in stat_prefixes:
            for side in ('home', 'away'):
                for scope in ('', f'{side}_'):
                    for suffix in stat_suffixes:
                        df_basic[f'{side}_{scope}{prefix}_{threshold}plus_{suffix}_{year}'] = np.nan

# Standings columns copied for each side: overall statistics plus that side's venue split
side_stats = {
    side: [
        f'{scope}{prefix}_{threshold}plus_{suffix}'
        for threshold in stat_thresholds
        for prefix in stat_prefixes
        for scope in ('', f'{side}_')
        for suffix in stat_suffixes
    ]
    for side in ('home', 'away')
}

# --- Loop-invariant lookups, built once instead of once per row/year/team ---
# First season (in sorted order) starting in each year
year_to_season = {}
for season_label in season_list:
    year_to_season.setdefault(extract_season_year(season_label), season_label)
seasons_by_year = [
    (year, year_to_season[year]) for year in unique_season_years if year_to_season.get(year)
]

# Standings key per (season, team). If a team has entries in several divisions
# for one season, the division appearing first in df_season_standings['Div']
# wins, as in a sequential scan over the divisions.
div_rank = {div: rank for rank, div in enumerate(df_season_standings['Div'].unique())}
final_key_by_team = {}
for lookup_key in sorted((k for k in percentile_lookup if k[1] in div_rank), key=lambda k: div_rank[k[1]]):
    final_key_by_team.setdefault((lookup_key[0], lookup_key[2]), lookup_key)

# Populate goal statistics
for idx, row in df_basic.iterrows():
    current_season_order = row['season_order']
    current_season = row['Season']
    match_key = (row['Season'], row['Div'], row['Date'], row['HomeTeam'], row['AwayTeam'])
    sides = (('home', row['HomeTeam']), ('away', row['AwayTeam']))
    round_standings = match_round_map.get(match_key)

    for year, target_season in seasons_by_year:
        # For past seasons: use final standings
        if season_to_order[target_season] < current_season_order:
            for side, team in sides:
                standings_key = final_key_by_team.get((target_season, team))
                if standings_key is None:
                    continue
                for col in side_stats[side]:
                    val = goal_stat_lookups[col].get(standings_key)
                    if val is not None:
                        df_basic.at[idx, f'{side}_{col}_{year}'] = val

        # For current season: use the table of the last round completed before
        # this match, if there is one
        elif target_season == current_season and round_standings is not None:
            for side, team in sides:
                team_row = round_standings[round_standings['Team'] == team]
                if team_row.empty:
                    continue
                for col in side_stats[side]:
                    if col in team_row.columns:
                        df_basic.at[idx, f'{side}_{col}_{year}'] = team_row[col].iloc[0]

goal_stat_cols = [col for col in df_basic.columns if any(f'{p}_{t}plus' in col for p in ['scored', 'conceded', 'total'] for t in [2, 3])]
print(f"Historical goal statistics: {len(goal_stat_cols)} columns")

**Summary of Section 4.3.4:**

We've created comprehensive historical performance features with **round-based tracking and home/away context**:

1. **Season Standings** (`df_season_standings`): Team-season records with home/away splits
   - **Past seasons:** Final standings (complete data)
   - **Current season:** Round-by-round standings (last completed round before each match)
   - Metrics: Points, Position, Goals, Goal patterns with home/away context

2. **Historical Position Features** (13 columns added to `df_basic`):
   - **Overall position only** (no home/away split - league position is always overall)
   - **Percentile rankings** (0-100): Normalized across league sizes
   - Format: `home_position_pct_YYYY`, `away_position_pct_YYYY` for 6 seasons
   - Higher percentile = better position (95th = top of table)
   - Uses round-based data for current season to maximize data usage

3. **Historical Goal Pattern Features** (288 columns added to `df_basic`):
   - **Context-aware:** Home team gets overall + home-specific, away team gets overall + away-specific
   - For each context: 6 statistics × 2 metrics (count & percentage)
     - `scored_2plus/3plus`: Matches where team scored 2+/3+ goals
     - `conceded_2plus/3plus`: Matches where team conceded 2+/3+ goals
     - `total_2plus/3plus`: Matches with 2+/3+ total goals
   - **Both counts AND percentages** included:
     - Counts show sample size (reliability indicator)
     - Percentages show the pattern/trend
     - Example: 80% from 4 matches vs 50% from 1 match - model learns reliability
   - Format: `home_home_scored_3plus_pct_2024` = home team's home-only scoring %
   - Format: `away_away_conceded_2plus_count_2024` = away team's away-only conceding count

**Key Design Choices:**
- **Home/Away Context:** Teams perform differently at home vs away - model gets both perspectives
- **Counts + Percentages:** Dual metrics allow model to weight by sample size
- **Lookback logic:** Prevents data leakage (features only reference past seasons)
- **Percentiles:** Enable fair comparison between different league sizes
- **2+ and 3+ thresholds:** Most relevant for Over/Under 2.5 target
- **Current season handling:** Uses round-based data to maximize coverage (~95%)

**Total features added: 301** (13 position percentiles + 288 goal patterns)

**Expected Impact:** Context-aware goal patterns (home team's home performance vs away team's away performance) should provide stronger predictive signals than overall statistics alone. Including both counts and percentages helps model assess reliability of patterns.

### 4.3.5 Additional Derived Features

**Create additional features from existing data that might improve Over/Under 2.5 prediction:**

**Team strength differential features:**
- Position percentile difference between home and away team (quality gap, normalized across leagues)
- Combined attacking/defensive strength indicators (blend recent form + historical patterns)

**Form-based features:**
- Recent form trend (improving vs declining over last 10 matches)

**Contextual features:**
- Season progress (early, mid, late season effects)
- Days rest advantage (difference in rest days between teams)

In [None]:
# 1. Position percentile difference (home minus away), taken from the most
#    recent season in which BOTH teams have a league-position percentile.
#    Positive = home team has the better position (higher percentile).
position_pct_diff = pd.Series(np.nan, index=df_basic.index, dtype='float64')

# Walk the seasons newest-first. A per-season difference is NaN whenever either
# side is missing, so filling only the rows that are still empty reproduces
# "first season where both values exist" without a Python-level loop over rows.
for year in reversed(unique_season_years):
    season_diff = df_basic[f'home_position_pct_{year}'] - df_basic[f'away_position_pct_{year}']
    position_pct_diff = position_pct_diff.where(position_pct_diff.notna(), season_diff)

df_basic['position_pct_diff'] = position_pct_diff

print(" Position percentile difference feature created")
print(f"Coverage: {(df_basic['position_pct_diff'].notna().sum() / len(df_basic) * 100):.1f}%")

# 2. Combined attack strength / defense weakness using context-aware stats.
#    Home team uses home-specific season stats, away team uses away-specific ones.
#    Each value blends recent form (60%, sum of both teams' MA5) with the
#    historical 3+ goal percentages (40%) of the most recent season for which
#    BOTH scoring percentages are available.
df_basic['combined_attack_strength'] = np.nan
df_basic['combined_defense_weakness'] = np.nan

# Recent-form terms do not depend on the season, so compute them once.
recent_goals = df_basic['home_goals_ma5'] + df_basic['away_goals_ma5']
recent_conceded = df_basic['home_conceded_ma5'] + df_basic['away_conceded_ma5']

# Rows that already received values from a more recent season.
resolved = pd.Series(False, index=df_basic.index)

for year in reversed(unique_season_years):
    # Home team: home-only scoring/conceding history
    home_scored = df_basic[f'home_home_scored_3plus_pct_{year}']
    home_conceded = df_basic[f'home_home_conceded_3plus_pct_{year}']
    # Away team: away-only scoring/conceding history
    away_scored = df_basic[f'away_away_scored_3plus_pct_{year}']
    away_conceded = df_basic[f'away_away_conceded_3plus_pct_{year}']

    # A season qualifies only when both scoring percentages exist; the
    # conceded percentages are not checked, so the defense value may be NaN.
    usable = home_scored.notna() & away_scored.notna() & ~resolved
    if not usable.any():
        continue

    # Weighted average: 60% recent form, 40% historical context-specific pattern
    attack = (
        0.6 * recent_goals +
        0.4 * ((home_scored + away_scored) / 20)  # Normalize percentage to goals scale
    )
    defense = (
        0.6 * recent_conceded +
        0.4 * ((home_conceded + away_conceded) / 20)
    )
    df_basic.loc[usable, 'combined_attack_strength'] = attack[usable]
    df_basic.loc[usable, 'combined_defense_weakness'] = defense[usable]
    resolved |= usable

print(" Combined strength features created (using home/away context)")
print(f"Coverage: {(df_basic['combined_attack_strength'].notna().sum() / len(df_basic) * 100):.1f}%")

# 3. Form trend (improving vs declining)
#    For each team and venue: mean goals scored in the previous 5 matches at that
#    venue minus the mean of the 5 matches before those. Positive = improving.
#    Both windows end before the current match, so its own result is never used.
df_basic['home_form_trend'] = np.nan
df_basic['away_form_trend'] = np.nan

# (team column, goals-scored column, output column) per venue
form_trend_specs = (
    ('HomeTeam', 'FTHG', 'home_form_trend'),
    ('AwayTeam', 'FTAG', 'away_form_trend'),
)

for team_col, goals_col, trend_col in form_trend_specs:
    # Grouping by the venue column covers every team that appears in it,
    # including teams that never show up in the other venue column.
    for _, team_matches in df_basic.groupby(team_col):
        goals = team_matches.sort_values('Date')[goals_col]
        if len(goals) <= 10:
            continue  # Need at least 10 earlier matches at this venue

        trends = [
            goals.iloc[i - 5:i].mean() - goals.iloc[i - 10:i - 5].mean()
            for i in range(10, len(goals))
        ]
        df_basic.loc[goals.index[10:], trend_col] = trends

# Fill NaN with 0 (no trend info = assume stable).
# Assign the result back: `df[col].fillna(..., inplace=True)` works on a
# temporary object and does not update the frame under pandas Copy-on-Write.
df_basic['home_form_trend'] = df_basic['home_form_trend'].fillna(0)
df_basic['away_form_trend'] = df_basic['away_form_trend'].fillna(0)

print(" Form trend features created")

# 4. Rest days advantage: home rest days minus away rest days
df_basic['rest_days_advantage'] = df_basic['home_days_since_last'].sub(df_basic['away_days_since_last'])

print(f" Rest days advantage created")

# 5. Season progress: chronological match number within a (Season, Div) group
#    divided by the match count of a double round-robin, N * (N - 1), where N is
#    the number of distinct home teams in that group.
# NOTE(review): a league whose schedule is not a plain double round-robin would
# produce values above 1.0 here - confirm whether that is acceptable downstream.
df_basic['season_progress'] = np.nan
for _, league_season in df_basic.groupby(['Season', 'Div']):
    n_teams = len(league_season['HomeTeam'].unique())
    scheduled_matches = n_teams * (n_teams - 1)

    # Row labels of this league-season in date order
    chronological = league_season.sort_values('Date').index

    progress = [
        match_number / scheduled_matches
        for match_number in range(1, len(chronological) + 1)
    ]
    df_basic.loc[chronological, 'season_progress'] = progress

print(f" Season progress feature created (using league structure)")

# Summary: names of the derived features created in this cell
new_derived_features = ['position_pct_diff', 'combined_attack_strength',
                        'combined_defense_weakness', 'home_form_trend', 'away_form_trend',
                        'rest_days_advantage', 'season_progress']
# The newline must be written as an escape; a literal line break inside the
# quotes is a syntax error.
print(f"\nTotal new derived features: {len(new_derived_features)}")
print(f"Features: {new_derived_features}")

In [None]:
# Verify season_progress - check that the values are in a reasonable range
print("Season Progress Verification")
print("=" * 60)

# Overall range across all league-seasons
print(f"\nSeason progress range: {df_basic['season_progress'].min():.4f} to {df_basic['season_progress'].max():.4f}")

# Spot-check three league-seasons
for season, div in [('2024/2025', 'E0'), ('2024/2025', 'SP1'), ('2024/2025', 'SC0')]:
    mask = (df_basic['Season'] == season) & (df_basic['Div'] == div)
    group = df_basic[mask]  # read-only view of the rows, no copy needed

    if group.empty:
        continue

    # Same structural quantities the feature itself is built from
    teams = len(group['HomeTeam'].unique())
    expected = teams * (teams - 1)
    progress_values = group.sort_values('Date')['season_progress'].values

    print(f"\n{season} {div}:")
    print(f"Teams: {teams}, Expected matches: {expected}")
    print(f"Matches so far: {len(group)}")
    print(f"Progress range: {progress_values.min():.4f} to {progress_values.max():.4f}")
    print(f"First 3 matches progress: {progress_values[:3]}")

print("\nSeason progress values verified - using league structure (no data leakage)")

#### Data Leakage Prevention Summary

**Season Progress Feature Fix:**
- **Problem**: Previously used `len(group)` which included ALL matches in the season (including future unplayed matches)
- **Solution**: Calculate expected matches using league structure: `teams × (teams - 1)`
- **Why this is NOT leakage**: 
  - Number of teams is known from the first match
  - League format (round-robin home & away) is predetermined
  - Similar to knowing NBA has 82 games or Premier League has 38 rounds
  - We're using structural information, not future match results

**All Features Verified for Data Leakage:**
 MA5 features: Use `Date < date` or index slicing `[i-5:i]` to exclude current match  
 Combined strength: Uses pre-calculated MA5 (which is safe)  
 Form trend: Uses historical windows `[i-5:i]` and `[i-10:i-5]`  
 Season progress: Now uses league structure (team count)  
 Extended stats MA5: Uses index slicing `[i-5:i]` over each team's date-sorted matches, which excludes the current match

### 4.3.6 Feature Importance Testing

**Test which features matter for predicting Over/Under 2.5 goals**

**Correlation Analysis:**
- Quick initial screening of linear relationships
- Identifies features with direct linear impact on target
- Visualize top correlated features by group (time-based, position, goal patterns)
- **Limitation:** Only captures linear relationships, misses non-linear patterns and interactions

**Note:** While correlations are weak, features may still be valuable in tree-based models that capture non-linear patterns, thresholds, and feature interactions.

#### A. Correlation Analysis

**Quick screening of linear relationships with Over/Under 2.5 target**

In [None]:
def _print_correlations(frame, features, target_col, width):
    """Print and return (feature, correlation-with-target) pairs, in input order."""
    pairs = []
    for feat in features:
        corr = frame[feat].corr(frame[target_col])
        pairs.append((feat, corr))
        print(f"{feat:{width}s}: {corr:7.4f}")
    return pairs


def _print_abs_corr_summary(label, pairs):
    """Print max/mean/median |correlation| for a group; return the non-NaN |r| values."""
    abs_values = [abs(corr) for _, corr in pairs if not pd.isna(corr)]
    if abs_values:
        print(f"\n{label} summary:")
        print(f"Max |correlation|: {max(abs_values):.4f}")
        print(f"Mean |correlation|: {np.mean(abs_values):.4f}")
        print(f"Median |correlation|: {np.median(abs_values):.4f}")
    return abs_values


# Numerical features to test
num_features = ['home_days_since_last', 'away_days_since_last',
                'home_goals_ma5', 'home_conceded_ma5', 'away_goals_ma5', 'away_conceded_ma5',
                'league_tier', 'month']

# Historical position features - percentiles only.
# The substring match also selects `position_pct_diff` when that column exists.
position_features = [col for col in df_basic.columns if 'position_pct_' in col]

# Historical goal statistics features: every column whose name contains
# scored/conceded/total followed by 2plus or 3plus (counts and percentages)
goal_stat_features = [col for col in df_basic.columns if any(f'{p}_{t}plus' in col for p in ['scored', 'conceded', 'total'] for t in [2, 3])]

all_num_features = num_features + position_features + goal_stat_features

# Calculate correlations
print("=" * 60)
print("CORRELATIONS WITH over_2_5 TARGET")
print("=" * 60)

print("\n1. TIME-BASED FEATURES:")
time_corrs = _print_correlations(df_basic, num_features, 'over_2_5', 30)

print(f"\n2. HISTORICAL POSITION FEATURES ({len(position_features)} total):")
position_corrs = _print_correlations(df_basic, position_features, 'over_2_5', 30)
position_corr_values = _print_abs_corr_summary("Position features", position_corrs)

print(f"\n3. HISTORICAL GOAL PATTERN FEATURES ({len(goal_stat_features)} total):")
goal_stat_corrs = _print_correlations(df_basic, goal_stat_features, 'over_2_5', 40)
goal_stat_corr_values = _print_abs_corr_summary("Goal stat features", goal_stat_corrs)

# Show top features by absolute correlation (NaN correlations sort last)
all_corrs = time_corrs + position_corrs + goal_stat_corrs
all_corrs_sorted = sorted(all_corrs, key=lambda x: abs(x[1]) if not pd.isna(x[1]) else 0, reverse=True)

print(f"\n4. TOP 15 FEATURES BY ABSOLUTE CORRELATION:")
for feat, corr in all_corrs_sorted[:15]:
    print(f"{feat:40s}: {corr:7.4f}")

In [None]:
# Visualize correlations: one horizontal bar panel per feature group
def _rank_by_abs_corr(pairs, top_n=10):
    """Build a (feature, correlation, abs_corr) table and its top_n rows by |r|."""
    table = pd.DataFrame(pairs, columns=['feature', 'correlation'])
    table['abs_corr'] = table['correlation'].abs()
    return table, table.nlargest(top_n, 'abs_corr')


def _shorten(name, substitutions):
    """Apply the (old, new) replacements to name, in the given order."""
    for old, new in substitutions:
        name = name.replace(old, new)
    return name


def _decorate_panel(ax, title):
    """Zero line, x-label, title and x-grid shared by all three panels."""
    ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
    ax.set_xlabel('Correlation with Over 2.5', fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')


fig, axes = plt.subplots(1, 3, figsize=(16, 5))
time_ax, position_ax, goal_ax = axes

# 1. Time-based features (all of them)
time_corrs = [df_basic[name].corr(df_basic['over_2_5']) for name in num_features]
time_ax.barh(num_features, time_corrs, color='steelblue')
_decorate_panel(time_ax, 'Time-Based Features')

# 2. Position features - top 10 by absolute correlation
position_corr_df, top_position = _rank_by_abs_corr(position_corrs)
position_ticks = range(len(top_position))
position_labels = [
    _shorten(name, [('home_', 'H_'), ('away_', 'A_'), ('position_', 'pos_'), ('_pct', '%')])
    for name in top_position['feature']
]
position_ax.barh(position_ticks, top_position['correlation'].values, color='coral')
position_ax.set_yticks(position_ticks)
position_ax.set_yticklabels(position_labels, fontsize=8)
_decorate_panel(position_ax, f'Top 10 Position Features (of {len(position_features)})')

# 3. Goal stats - top 10 by absolute correlation
goal_corr_df, top_goals = _rank_by_abs_corr(goal_stat_corrs)
goal_ticks = range(len(top_goals))
goal_labels = [
    _shorten(name, [('home_', 'H_'), ('away_', 'A_'), ('_pct_season_', '_S')])
    for name in top_goals['feature']
]
goal_ax.barh(goal_ticks, top_goals['correlation'].values, color='mediumseagreen')
goal_ax.set_yticks(goal_ticks)
goal_ax.set_yticklabels(goal_labels, fontsize=8)
_decorate_panel(goal_ax, f'Top 10 Goal Pattern Features (of {len(goal_stat_features)})')

plt.tight_layout()
plt.show()

# Text summary: five strongest correlations across all three groups
print("\n" + "="*70)
print("CORRELATION SUMMARY")
print("="*70)
print(f"Overall weak correlations suggest non-linear relationships are important.")
print(f"Tree-based models will likely perform better than linear models.")
print(f"\nStrongest absolute correlations:")
time_corr_df = pd.DataFrame({'feature': num_features, 'correlation': time_corrs})
all_corrs = pd.concat([
    time_corr_df,
    position_corr_df[['feature', 'correlation']],
    goal_corr_df[['feature', 'correlation']]
])
all_corrs['abs_corr'] = all_corrs['correlation'].abs()
top_overall = all_corrs.nlargest(5, 'abs_corr')
for feature_name, correlation in zip(top_overall['feature'], top_overall['correlation']):
    print(f"{feature_name:50s}: {correlation:7.4f}")


**Interpretation of Correlation Analysis**

**Overall Finding:** All features show **very weak correlations** with the Over/Under 2.5 target (|r| < 0.10), indicating minimal linear relationships.

**1. Time-Based Features:**
- **Days since last match** (~-0.01): Virtually no effect — rest periods don't linearly predict goal totals
- **Home goals MA5** (+0.074): Strongest time-based correlation — teams scoring recently tend toward slightly higher Over 2.5 rates
- **Away goals MA5** (+0.048): Weak positive signal for away team scoring form
- **Home/Away conceded MA5** (~-0.04 to -0.05): Slight negative — defensive stability correlates with fewer goals
- **League tier** (-0.042): Lower divisions marginally less likely to see 3+ goals
- **Month** (+0.019): Negligible seasonal effect

**2. Historical Position Features (12 total - percentiles only):**
- **Summary statistics:**
  - Max |correlation|: ~0.04-0.06 (very weak)
  - Mean |correlation|: ~0.02-0.03
  - Median |correlation|: ~0.02
- **Top correlations:** Home/away percentile positions from recent seasons (S-1, S-2) show slightly stronger effects
- **Insight:** Position matters, but not in a simple linear way — likely interacts with other factors (e.g., promoted teams behave differently than established teams at same position)
- **Scale:** 0-100 percentile where higher = better position (normalized across different league sizes)

**3. Historical Goal Pattern Features (288 total):**
- **Summary statistics:**
  - Max |correlation|: ~0.05-0.07
  - Mean |correlation|: ~0.02-0.03
  - Top features: Typically `total_2plus_pct` or `scored_2plus_pct` from recent seasons (S-1, S-2)
- **Insight:** Teams with history of high-scoring matches continue that pattern, but effect is weak linearly
- **Strongest patterns:** Home team total goals 2+ percentages from recent seasons show positive correlations (0.04-0.07)

**4. Key Takeaways:**
-  **Weak linear relationships** suggest soccer is governed by **non-linear patterns and interactions**
-  **Recent form** (MA5 goals) shows slightly stronger signal than historical season data in linear terms
-  **Goal patterns persist** — teams with high-scoring history continue that tendency, but modestly
-  **Percentile positions** provide normalized strength indicator across different league sizes
-  **Modeling implication:** Linear models (e.g., logistic regression) will struggle. Tree-based models (Random Forest, XGBoost) better suited to capture:
  - Non-linear thresholds (e.g., optimal rest days 4-6, not linear)
  - Interactions (e.g., promoted + low position != stable low position)
  - Context-dependent effects (e.g., position matters more in competitive leagues)

**Conclusion:** While individual correlations are weak, these features will likely contribute to model performance through interactions and non-linear patterns when combined in ensemble models.

---

**Summary: What We Added**

**New Features (Section 4.3.5):**
1. **Position percentile difference** - Quality gap between teams using normalized percentiles (higher = home team stronger)
2. **Combined attack/defense strength** - Blend recent form (60%) + historical patterns (40%)
3. **Form trend** - Improving vs declining (last 5 vs previous 5 matches)
4. **Rest days advantage** - Difference in recovery time
5. **Season progress** - Early/mid/late season effects (0.0 to 1.0)

**Total: 7 derived features** (reduced from 8 by using only percentile difference)

**Feature Testing (Section 4.3.6):**
- **Correlation Analysis** - Fast initial screening, shows linear relationships only
- **Correlation Visualization** - Bar charts showing top correlated features by group

**Key Insight:** Weak correlations suggest non-linear patterns dominate in soccer:
- Goal-scoring has complex dynamics (e.g., very weak teams score less, but so do very defensive teams)
- Features likely interact (promoted teams + low position != stable low position)
- Thresholds and context matter (rest days, form trends, league tier effects)

**Interpretation of Numerical Features**

All numerical features show **very weak correlations** (|r| < 0.08) with the Over/Under 2.5 goals target, suggesting minimal linear dependence.  
**Feature-wise summary:**
- **Days since last match** (-0.010, -0.009):  
  Essentially no relationship — rest periods have no meaningful effect on goal totals.
- **Home goals MA5** (0.074):  
  Slight positive signal — teams scoring more in recent matches tend to have marginally higher Over 2.5 rates.
- **Away goals MA5** (0.048):  
  Small positive correlation — away teams that have been scoring recently are associated with a marginally higher Over 2.5 rate.
- **League tier** (-0.042):  
  Weak negative correlation — lower divisions show a slightly lower frequency of high-scoring games.
- **Month** (0.019):  
  Negligible seasonal influence on goal totals.

**Modeling note:**  
Although weak on their own, these variables may still provide value to **non-linear or ensemble models** by capturing interaction effects and subtle contextual patterns.

**Test boolean/categorical features against the target.**

In [None]:
# Boolean features - automatically include all that exist in df_basic
# Define expected boolean features (add new ones here as you create them)
expected_bool_features = ['is_weekend', 'home_promoted', 'home_demoted',
                          'away_promoted', 'away_demoted']
bool_features = [feat for feat in expected_bool_features if feat in df_basic.columns]

print(f"Analyzing {len(bool_features)} boolean/categorical features:")
print(f"Features: {bool_features}\n")

print("=" * 70)
print("Over 2.5 rate by categorical feature:")
print("=" * 70)

# Per-feature Over 2.5 rate by category, kept so the plots below reuse it
rates_by_feature = {}

for feat in bool_features:
    grouped = df_basic.groupby(feat)['over_2_5'].agg(['mean', 'count'])
    rates_by_feature[feat] = grouped['mean']
    print(f"\n{feat}:")
    print(grouped)

    # Chi-squared test of independence between the feature and the target
    contingency = pd.crosstab(df_basic[feat], df_basic['over_2_5'])
    chi2, p, dof, expected = chi2_contingency(contingency)
    print(f"Chi-squared p-value: {p:.4f}")

# Configuration for feature visualization
feature_configs = {
    'is_weekend': {'labels': ['Weekday', 'Weekend'], 'title': 'Over 2.5 Rate: Weekend vs Weekday'},
    'home_promoted': {'labels': ['Regular', 'Promoted'], 'title': 'Over 2.5 Rate: Home Team Promoted'},
    'home_demoted': {'labels': ['Regular', 'Demoted'], 'title': 'Over 2.5 Rate: Home Team Demoted'},
    'away_promoted': {'labels': ['Regular', 'Promoted'], 'title': 'Over 2.5 Rate: Away Team Promoted'},
    'away_demoted': {'labels': ['Regular', 'Demoted'], 'title': 'Over 2.5 Rate: Away Team Demoted'}
}

# Visualize - dynamically create subplots based on number of features
n_features = min(len(bool_features), 5)  # Limit to 5 for readability

if n_features == 0:
    # plt.subplots(1, 0) would raise, so skip plotting when nothing was found
    print("No boolean features available in df_basic - skipping plots")
else:
    # squeeze=False always returns a 2-D array, so a single panel needs no special case
    fig, axes = plt.subplots(1, n_features, figsize=(5 * n_features, 4), squeeze=False)
    axes = axes[0]

    for idx, feat in enumerate(bool_features[:n_features]):
        rates = rates_by_feature[feat]
        config = feature_configs.get(feat, {'labels': ['False', 'True'], 'title': f'Over 2.5 Rate: {feat}'})

        # The configured labels assume exactly two observed categories; fall back
        # to the raw category values otherwise so the bar call cannot mismatch.
        if len(rates) == len(config['labels']):
            bar_labels = config['labels']
        else:
            bar_labels = [str(category) for category in rates.index]

        axes[idx].bar(bar_labels, rates.values, color=['steelblue', 'coral'])
        axes[idx].set_title(config['title'], fontsize=11, fontweight='bold')
        axes[idx].set_ylabel('Over 2.5 Rate')
        axes[idx].set_ylim([0.45, 0.55])  # Zoom in on the relevant range
        axes[idx].grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        for i, v in enumerate(rates.values):
            axes[idx].text(i, v + 0.002, f'{v:.3f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

**Interpretation of Boolean/Categorical Features:**

Chi-squared tests reveal the following significance levels for the five boolean features analyzed (`is_weekend`, `home_promoted`, `home_demoted`, `away_promoted`, `away_demoted`):

**Statistically Significant (α=0.05):**
- **Weekend vs Weekday** (p=0.0012): Weekend matches show notably higher Over 2.5 rate (50.5% vs 48.7% weekday). This is the strongest categorical predictor, suggesting weekend scheduling may influence match dynamics - possibly due to fan attendance, player rest, or tactical approaches.

- **Away Team Demoted** (p=0.0121): Away teams that were demoted show lower Over 2.5 rate (47.6% vs 50.1% regular teams). The 2.5 percentage point difference suggests defensive strategies or morale issues affecting demoted teams playing away.

- **Home Team Demoted** (p=0.0277): Home teams that were demoted show lower Over 2.5 rate (47.8% vs 50.1% regular teams). The effect is similar to away demoted, indicating demotion status has defensive implications regardless of venue.

**Borderline/Not Significant:**
- **Away Team Promoted** (p=0.0712): Shows higher Over 2.5 rate (51.8% vs 49.9%) but just misses significance at α=0.05. May reflect attacking ambition of newly promoted teams.

- **Home Team Promoted** (p=0.4339): No meaningful difference (50.8% vs 49.9%). Home promoted teams do not show distinct scoring patterns.

**Conclusion:**  
- **Weekend effect** is the most robust categorical predictor with clear practical significance.
- **Demotion features** (both home and away) show consistent negative effects on Over 2.5 rate, suggesting these teams adopt more conservative tactics.
- **Promotion features** show weaker/inconsistent effects - away promoted is borderline while home promoted shows no effect.
- All five expected boolean features were detected and analyzed: `is_weekend`, `home_promoted`, `home_demoted`, `away_promoted`, `away_demoted`.
- Individual effects remain modest (all within ±3 percentage points of baseline 50%), but may prove valuable in ensemble models.

## 4.4 Extended feature df engineering

Create extended dataframe with all available match data including detailed statistics and betting odds.

In [None]:
# EXTENDED DATASET: all available match data including detailed statistics
def _present_in_matches(candidates):
    """Keep, in order, the candidate column names that exist in all_matches."""
    return [name for name in candidates if name in all_matches.columns]


def _coverage_pct(column):
    """Percentage of rows in all_matches with a non-missing value in column."""
    return (all_matches[column].notna().sum() / len(all_matches)) * 100


# Core match info followed by raw match statistics
extended_core_features = _present_in_matches([
    'Div', 'Season', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
    'Attendance', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HHW', 'AHW', 'HC', 'AC',
    'HF', 'AF', 'HFKC', 'AFKC', 'HO', 'AO', 'HY', 'AY', 'HR', 'AR', 'HBP', 'ABP',
])

# Engineered features already present on all_matches.
# Note: additional features (time-based + historical) are merged from df_basic later.
extended_engineered_features = _present_in_matches([
    'total_goals', 'ht_total_goals', 'second_half_goals',  # Goal-based
    'home_shot_accuracy', 'away_shot_accuracy', 'total_shots', 'total_shots_on_target',  # Shot-based
    'shot_dominance', 'corner_dominance', 'total_fouls', 'total_cards', 'card_intensity',  # Game dynamics
    'league_tier', 'month', 'is_weekend',  # Date/league features
    'over_2_5',  # Target variable
])

# Betting odds: a column qualifies when its name contains one of these codes
# and at least 10% of its values are present.
_bookmaker_codes = ('B365', 'BW', 'PS', 'IW', 'LB', 'WH', 'SJ', 'VC', 'BF', '1XB')
betting_features = [
    column for column in all_matches.columns
    if any(code in column for code in _bookmaker_codes) and _coverage_pct(column) >= 10
]

# Combine the three groups, dropping repeated names but keeping first-seen order
all_extended_features = []
_already_listed = set()
for column in extended_core_features + extended_engineered_features + betting_features:
    if column not in _already_listed:
        _already_listed.add(column)
        all_extended_features.append(column)

# Create extended dataframe, ordered by division and date with a fresh index
df_extended = all_matches[all_extended_features].copy()
df_extended = df_extended.sort_values(['Div', 'Date']).reset_index(drop=True)

print(f"Extended dataframe created")
print(f"Shape: {df_extended.shape}")
print(f"\nFeature breakdown:")
print(f"Core features: {len(extended_core_features)}")
print(f"Engineered features: {len(extended_engineered_features)}")
print(f"Betting features (>10% coverage): {len(betting_features)}")
print(f"Total features: {len(all_extended_features)}")
print(f"\nColumns: {df_extended.columns.tolist()[:20]}...")  # Show first 20
df_extended.head()

### 4.4.1 Merging engineered features from df_basic

Add all engineered features from df_basic

In [None]:
time_based_features = [
    'home_days_since_last', 'away_days_since_last',
    'home_goals_ma5', 'away_goals_ma5', 'home_conceded_ma5', 'away_conceded_ma5',
    'home_promoted', 'away_promoted', 'home_demoted', 'away_demoted'
]

# Historical position features (the substring match also selects
# `position_pct_diff` when it exists)
historical_position_cols = [col for col in df_basic.columns if 'position_pct_' in col]

# Historical goal pattern features (overall + home-specific + away-specific, counts + percentages)
historical_goal_cols = [col for col in df_basic.columns if any(
    f'{p}_{t}plus' in col for p in ['scored', 'conceded', 'total'] for t in [2, 3]
)]

# Derived features from Section 4.3.5.
# The substring patterns alone never match names such as
# 'combined_attack_strength' (no leading '_' before 'combined', no trailing '_'
# after 'strength'), so the derived columns are also listed by name.
derived_feature_names = [
    'position_pct_diff', 'combined_attack_strength', 'combined_defense_weakness',
    'home_form_trend', 'away_form_trend', 'rest_days_advantage', 'season_progress'
]
already_selected = set(time_based_features + historical_position_cols + historical_goal_cols)
derived_features = [
    col for col in df_basic.columns
    if col not in already_selected and (
        col in derived_feature_names
        or any(pattern in col for pattern in ['_strength_', '_combined_'])
    )
]

features_to_merge = time_based_features + historical_position_cols + historical_goal_cols + derived_features

print(f"Features to merge from df_basic: {len(features_to_merge)}")
print(f"Time-based features: {len(time_based_features)}")
print(f"Position percentiles: {len(historical_position_cols)}")
print(f"Goal patterns (overall + home/away, counts + pct): {len(historical_goal_cols)}")
print(f"Derived features: {len(derived_features)}")

df_basic_sorted = df_basic.sort_values(['Div', 'Date']).reset_index(drop=True)
df_extended_sorted = df_extended.sort_values(['Div', 'Date']).reset_index(drop=True)

# Rows are merged by position, so both frames must describe the same match on
# every row. Check the length first: comparing Series of different length raises.
match_check = len(df_basic_sorted) == len(df_extended_sorted) and bool((
    (df_basic_sorted['Div'] == df_extended_sorted['Div']) &
    (df_basic_sorted['Date'] == df_extended_sorted['Date']) &
    (df_basic_sorted['HomeTeam'] == df_extended_sorted['HomeTeam']) &
    (df_basic_sorted['AwayTeam'] == df_extended_sorted['AwayTeam'])
).all())

if match_check:
    print(f"\nRow alignment verified - safe to merge features")

    # Write into the frame whose row order was just verified, then make it the
    # working df_extended so the merge cannot be misaligned.
    for col in features_to_merge:
        df_extended_sorted[col] = df_basic_sorted[col].values
    df_extended = df_extended_sorted

    print(f"\n{len(features_to_merge)} features added to df_extended")
    print(f"- {len(time_based_features)} time-based features")
    print(f"- {len(historical_position_cols)} position percentiles")
    print(f"- {len(historical_goal_cols)} goal pattern features")
    print(f"- {len(derived_features)} derived features")
    print(f"\nFinal df_extended shape: {df_extended.shape}")
else:
    print("⚠ Row mismatch detected - cannot safely merge features")

### 4.4.2. More features for df_extended

**Extended Statistics - Moving Averages & Seasonal Patterns**

Calculate MA5 and historical patterns for match statistics (shots, corners, fouls, cards, etc.) following the same approach as goal-based features.

 **Data Leakage Prevention:**
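- MA5 calculations average only a team's five preceding matches at the same venue (`values[i-5:i]` on date-sorted rows), so the current match is excluded
- NaN values are preserved where a team has fewer than five earlier matches at that venue (no fillna with overall means)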
- Same fix applied in Section 4.3.2 (basic df)

In [None]:
"""
Calculate moving averages and seasonal history for extended match statistics.
Following the same pattern as goal-based features in df_basic.

MA5 Features: High + Medium Priority (Shots, Shots on Target, Corners, Fouls, Yellow Cards)
Seasonal Patterns: High Priority Only (Shots, Shots on Target, Corners)
"""

print("=" * 70)
print("SECTION 4.4.2: EXTENDED STATS MOVING AVERAGES & SEASONAL PATTERNS")
print("=" * 70)

# ============================================================
# STEP 1: 5-Match Moving Averages (High + Medium Priority)
# ============================================================

print("\n1. CALCULATING 5-MATCH MOVING AVERAGES")
print("-" * 70)

# Statistics to calculate MA5 for: (raw column, feature stem) -> venue.
# The output column is named '<venue>_<stem>_ma5'.
ma5_stats_config = {
    # High Priority - Shooting
    ('HS', 'shots'): 'home',
    ('AS', 'shots'): 'away',
    ('HST', 'shots_target'): 'home',
    ('AST', 'shots_target'): 'away',
    # High Priority - Attacking Pressure
    ('HC', 'corners'): 'home',
    ('AC', 'corners'): 'away',
    # Medium Priority - Discipline
    ('HF', 'fouls'): 'home',
    ('AF', 'fouls'): 'away',
    ('HY', 'yellows'): 'home',
    ('AY', 'yellows'): 'away',
}

# Initialize MA5 columns
print(f"Initializing {len(ma5_stats_config)} MA5 columns...")
for (raw_col, stat_name), team_type in ma5_stats_config.items():
    ma5_col = f'{team_type}_{stat_name}_ma5'
    df_extended[ma5_col] = np.nan

# Calculate moving averages - optimized by pre-sorting
print("Calculating moving averages (optimized groupby approach)...")
ordered_matches = df_extended.sort_values(['Div', 'Date'])
# Original row labels in sorted order: position k of the sorted copy corresponds
# to label original_labels[k] of df_extended, whatever df_extended's own order is.
# Assumes df_extended has unique row labels.
original_labels = ordered_matches.index
df_extended_sorted = ordered_matches.reset_index(drop=True)

for (raw_col, stat_name), team_type in ma5_stats_config.items():
    if raw_col not in df_extended.columns:
        continue

    ma5_col = f'{team_type}_{stat_name}_ma5'
    team_col = 'HomeTeam' if team_type == 'home' else 'AwayTeam'

    print(f"Processing {ma5_col}...")

    # One group per (division, team at this venue); rows stay in date order
    for (div, team), group in df_extended_sorted.groupby(['Div', team_col]):
        if len(group) <= 5:
            continue  # Need at least 5 previous matches

        positions = group.index.to_numpy()
        values = group[raw_col].values

        # Mean of the 5 matches before match i (current match excluded);
        # NaN entries are ignored, and an all-NaN window stays NaN.
        ma5_values = [np.nanmean(values[i - 5:i]) for i in range(5, len(values))]
        df_extended.loc[original_labels[positions[5:]], ma5_col] = ma5_values

# NO DATA LEAKAGE: NaN values preserved where fewer than 5 earlier matches exist
print("\nMA5 Summary:")
for (raw_col, stat_name), team_type in ma5_stats_config.items():
    ma5_col = f'{team_type}_{stat_name}_ma5'
    if ma5_col in df_extended.columns:
        missing = df_extended[ma5_col].isna().sum()
        missing_pct = (missing / len(df_extended)) * 100
        mean_val = df_extended[ma5_col].mean()
        print(f"{ma5_col:25s}: missing={missing:5d} ({missing_pct:4.1f}%), mean={mean_val:6.2f}")

print(f"\nAdded {len(ma5_stats_config)} moving average features")
print(f"(NaN values preserved for early-season matches - no data leakage)")

# ============================================================
# STEP 2: Seasonal Historical Patterns (High Priority Only)
# ============================================================

print("\n2. CALCULATING SEASONAL HISTORICAL PATTERNS")
print("-" * 70)
print("High priority features only: Shots, Shots on Target, Corners")

# Define thresholds for HIGH PRIORITY stats only
stat_thresholds = {
    'shots': [10, 15],           # 10+ shots, 15+ shots
    'shots_target': [5, 8],      # 5+ on target, 8+ on target
    'corners': [6, 10],          # 6+ corners, 10+ corners
}

# Season info
# season_to_order ranks the seasons actually present in the data;
# past_seasons / season_years are the fixed seasons features are built for
# (season_years supplies the suffix used in the feature column names).
unique_seasons = sorted(df_extended['Season'].unique())
season_to_order = {s: i for i, s in enumerate(unique_seasons)}
past_seasons = ['2019/2020', '2020/2021', '2021/2022', '2022/2023', '2023/2024', '2024/2025']
season_years = [2019, 2020, 2021, 2022, 2023, 2024]

# Map stat names to columns
stat_to_home_col = {'shots': 'HS', 'shots_target': 'HST', 'corners': 'HC'}
stat_to_away_col = {'shots': 'AS', 'shots_target': 'AST', 'corners': 'AC'}

# Pre-calculate seasonal statistics for all teams (OPTIMIZATION)
# Result: team_season_stats[(season, div, team)] -> dict with, per stat and
# threshold, "<stat>_<thr>plus_count/pct" over all of the team's matches and
# "home_..." / "away_..." variants restricted to its home / away matches.
print("Pre-calculating team-season statistics...")
team_season_stats = {}
divisions = df_extended['Div'].unique()  # loop-invariant, computed once

for season in past_seasons:
    season_data = df_extended[df_extended['Season'] == season]

    for div in divisions:
        season_div_data = season_data[season_data['Div'] == div]

        if len(season_div_data) == 0:
            continue

        # Get all teams in this season/division
        all_teams = set(season_div_data['HomeTeam'].unique()) | set(season_div_data['AwayTeam'].unique())
        # Stand-in for a stat column that is absent: all NaN, so it never
        # satisfies a ">= threshold" test (matches the skip-on-missing rule).
        no_values = pd.Series(np.nan, index=season_div_data.index)

        for team in all_teams:
            home_mask = season_div_data['HomeTeam'] == team
            away_mask = season_div_data['AwayTeam'] == team
            overall_mask = home_mask | away_mask

            home_matches = season_div_data[home_mask]
            away_matches = season_div_data[away_mask]
            total = int(overall_mask.sum())

            # Calculate stats for each threshold
            stats = {}
            for stat_name, thresholds in stat_thresholds.items():
                h_col = stat_to_home_col[stat_name]
                a_col = stat_to_away_col[stat_name]
                has_h_col = h_col in season_div_data.columns
                has_a_col = a_col in season_div_data.columns

                # The team's own value of this stat in each of its matches:
                # the home column where it hosted, the away column otherwise.
                # Built once per stat instead of re-walking the rows with
                # iterrows() for every threshold.
                home_side_values = season_div_data[h_col] if has_h_col else no_values
                away_side_values = season_div_data[a_col] if has_a_col else no_values
                team_values = home_side_values.where(home_mask, away_side_values)[overall_mask]

                for threshold in thresholds:
                    # Overall stats (NaN compares False, so missing values are not counted)
                    count = int((team_values >= threshold).sum())
                    stats[f'{stat_name}_{threshold}plus_count'] = count
                    stats[f'{stat_name}_{threshold}plus_pct'] = count / total if total > 0 else 0

                    # Home-only stats
                    if has_h_col:
                        home_count = (home_matches[h_col] >= threshold).sum()
                        home_total = len(home_matches)
                        stats[f'home_{stat_name}_{threshold}plus_count'] = home_count
                        stats[f'home_{stat_name}_{threshold}plus_pct'] = home_count / home_total if home_total > 0 else 0

                    # Away-only stats
                    if has_a_col:
                        away_count = (away_matches[a_col] >= threshold).sum()
                        away_total = len(away_matches)
                        stats[f'away_{stat_name}_{threshold}plus_count'] = away_count
                        stats[f'away_{stat_name}_{threshold}plus_pct'] = away_count / away_total if away_total > 0 else 0

            team_season_stats[(season, div, team)] = stats

print(f"Pre-calculated stats for {len(team_season_stats)} team-season combinations")

# Initialize seasonal pattern columns
# For each stat / threshold / season-year there are four feature families:
#   home_*, away_*            -> the team's record over all of its matches
#   home_home_*, away_away_*  -> the record restricted to the same venue role
# Each family gets a count column (starting at 0) and a pct column (starting
# as NaN).
print("Initializing seasonal pattern columns...")
total_columns = 0
for stat_name, thresholds in stat_thresholds.items():
    for threshold in thresholds:
        for year in season_years:
            for prefix in ('home', 'away', 'home_home', 'away_away'):
                feature_stem = f'{prefix}_{stat_name}_{threshold}plus'
                df_extended[f'{feature_stem}_count_{year}'] = 0
                df_extended[f'{feature_stem}_pct_{year}'] = np.nan
                total_columns += 2

print(f"Created {total_columns} seasonal pattern columns")

# Populate features using pre-calculated statistics
print("\nPopulating seasonal pattern features (using lookups)...")

# For every (side, year): which df_extended column receives which entry of a
# team's season-stats dict, plus the fallback used when that entry is absent.
seasonal_targets = {}
for year in season_years:
    for side in ('home', 'away'):
        specs = []
        for stat_name, thresholds in stat_thresholds.items():
            for threshold in thresholds:
                stem = f'{stat_name}_{threshold}plus'
                specs += [
                    # Overall record of the team in that past season
                    (f'{side}_{stem}_count_{year}', f'{stem}_count', 0),
                    (f'{side}_{stem}_pct_{year}', f'{stem}_pct', np.nan),
                    # Record restricted to the same venue role (home/away)
                    (f'{side}_{side}_{stem}_count_{year}', f'{side}_{stem}_count', 0),
                    (f'{side}_{side}_{stem}_pct_{year}', f'{side}_{stem}_pct', np.nan),
                ]
        seasonal_targets[(side, year)] = specs

# Work on plain Python lists seeded with the current column contents and write
# each column back once at the end. This replaces one DataFrame.loc cell write
# per feature per row, and fills rows by position, so it does not depend on
# what the index labels look like.
seasonal_buffers = {
    target_col: df_extended[target_col].tolist()
    for specs in seasonal_targets.values()
    for target_col, _, _ in specs
}

n_matches = len(df_extended)
match_keys = zip(
    df_extended['Season'].tolist(),
    df_extended['Div'].tolist(),
    df_extended['HomeTeam'].tolist(),
    df_extended['AwayTeam'].tolist(),
)

for pos, (current_season, div, home_team, away_team) in enumerate(match_keys):
    current_season_order = season_to_order.get(current_season, 999)

    for past_season, year in zip(past_seasons, season_years):
        # Only seasons strictly before the match's own season may contribute.
        # Compare season orders directly rather than a position in
        # past_seasons against a data-derived order: the two only coincide
        # when the data contains exactly the seasons listed in past_seasons.
        # A season absent from the data has no stats, so skipping it is safe.
        if season_to_order.get(past_season, 999) >= current_season_order:
            continue

        for side, team in (('home', home_team), ('away', away_team)):
            team_stats = team_season_stats.get((past_season, div, team))
            if team_stats is None:
                continue  # team did not play in this division that season
            for target_col, stats_key, fallback in seasonal_targets[(side, year)]:
                seasonal_buffers[target_col][pos] = team_stats.get(stats_key, fallback)

    # Progress indicator (by row position)
    if (pos + 1) % 10000 == 0:
        print(f"Processed {pos + 1}/{n_matches} matches...")

for target_col, column_values in seasonal_buffers.items():
    df_extended[target_col] = column_values

# The original literal was broken across two physical lines (a syntax error);
# the intended leading blank line is written as an escape instead.
print(f"\nPopulated {total_columns} seasonal pattern features")

# ============================================================
# Summary
# ============================================================

# Recap of what section 4.4.2 added, printed line by line.
section_rule = "=" * 70
ma5_feature_count = len(ma5_stats_config)

summary_lines = [
    "\n" + section_rule,
    "SUMMARY - SECTION 4.4.2 FEATURE ENGINEERING",
    section_rule,
    f"Moving Average Features (MA5): {ma5_feature_count}",
    f"- High Priority: Shots, Shots on Target, Corners",
    f"- Medium Priority: Fouls, Yellow Cards",
    f"\nSeasonal Pattern Features: {total_columns}",
    f"- High Priority Only: Shots, Shots on Target, Corners",
    f"- Overall + Home/Away context for each stat",
    f"- {len(stat_thresholds)} stat types × 2 thresholds × 6 seasons",
    f"\nTotal New Features: {ma5_feature_count + total_columns}",
    f"Final df_extended shape: {df_extended.shape}",
    section_rule,
]
for summary_line in summary_lines:
    print(summary_line)

### 4.4.3. Betting odds

In [None]:
"""
Betting Odds Feature Engineering - Section 4.4.3

Goals:
1. Merge BetVictor (BV) and VC Bet odds (same company, rebranded)
2. Calculate minimum odds for each betting category
3. Count number of bookmakers per betting category
4. Remove individual bookmaker columns, keep only aggregates (Max/Avg/Min/Count)

Betting Categories:
- Match Odds (H/D/A): 1X2 pre-match
- Match Odds Closing (CH/CD/CA): 1X2 closing
- Over/Under (>2.5/<2.5): Total goals pre-match
- Over/Under Closing (C>2.5/C<2.5): Total goals closing
- Asian Handicap (AHH/AHA): Pre-match handicap
- Asian Handicap Closing (CAHH/CAHA): Closing handicap
"""

print("=" * 70)
print("SECTION 4.4.3: BETTING ODDS FEATURE ENGINEERING")
print("=" * 70)

# STEP 1: Add Max/Avg columns from all_matches if missing

print("\n1. ADDING MAX/AVG AGGREGATE COLUMNS FROM SOURCE DATA")
print("-" * 70)

# Market aggregate columns (names starting with Max/Avg) that exist in the raw
# data but have not been carried into df_extended yet.
columns_already_present = df_extended.columns
max_avg_cols_to_add = [
    col
    for col in all_matches.columns
    if col.startswith(('Max', 'Avg')) and col not in columns_already_present
]

if not max_avg_cols_to_add:
    print("All Max/Avg columns already present")
else:
    print(f"Adding {len(max_avg_cols_to_add)} Max/Avg columns:")
    for col in sorted(max_avg_cols_to_add):
        # Column assignment aligns all_matches to df_extended on the index.
        df_extended[col] = all_matches[col]
        print(f"+ {col}")

# STEP 2: Merge BetVictor and VC Bet (Same Company)


print("\n2. MERGING BETVICTOR (BV) AND VC BET ODDS")
print("-" * 70)

# BV column -> VC column carrying the same quote (1X2 opening and closing)
bv_vc_pairs = {
    'BVH': 'VCH', 'BVD': 'VCD', 'BVA': 'VCA',
    'BVCH': 'VCCH', 'BVCD': 'VCCD', 'BVCA': 'VCCA',
}

# NOTE(review): a pair is merged only when BOTH columns exist, and 'VC' is not
# among the prefixes in the `bookmakers` list of Step 3, so VC* columns are
# neither used on their own nor removed by Step 6 - confirm this is intended.
merged_count = 0
for bv_col, vc_col in bv_vc_pairs.items():
    if bv_col not in df_extended.columns or vc_col not in df_extended.columns:
        continue

    gaps_before = df_extended[bv_col].isna().sum()
    # Keep the BV quote where present, fall back to the VC quote otherwise.
    df_extended[bv_col] = df_extended[bv_col].fillna(df_extended[vc_col])
    filled = gaps_before - df_extended[bv_col].isna().sum()

    if filled > 0:
        print(f"{bv_col}: filled {filled} values from {vc_col}")
        merged_count += 1

if merged_count == 0:
    print(f"No VC columns found or already merged")

print(f" BetVictor and VC Bet odds merged")

# ============================================================
# STEP 3: Define Bookmaker Columns by Category
# ============================================================

print("\n3. IDENTIFYING BOOKMAKER COLUMNS BY CATEGORY")
print("-" * 70)

# Define bookmaker prefixes (excluding aggregates like Max, Avg, Bb)
bookmakers = ['1XB', 'B365', 'BF', 'BFD', 'BMGM', 'BV', 'BS', 'BW', 'CL', 
              'GB', 'IW', 'LB', 'PS', 'PSH', 'PSD', 'PSA', 'PH', 'PD', 'PA',
              'SO', 'SB', 'SJ', 'SY', 'WH']

# One row per betting category: (name, column suffixes, count column).
# The minimum-odds column of a suffix is always "Min" + suffix.
category_specs = [
    ('match_odds_open', ['H', 'D', 'A'], 'NumBookmakers_MatchOdds'),
    ('match_odds_closing', ['CH', 'CD', 'CA'], 'NumBookmakers_MatchOddsClosing'),
    ('over_under_open', ['>2.5', '<2.5'], 'NumBookmakers_OverUnder'),
    ('over_under_closing', ['C>2.5', 'C<2.5'], 'NumBookmakers_OverUnderClosing'),
    ('asian_handicap_open', ['AHH', 'AHA'], 'NumBookmakers_AsianHandicap'),
    ('asian_handicap_closing', ['CAHH', 'CAHA'], 'NumBookmakers_AsianHandicapClosing'),
]

# Define betting categories with their suffixes and column naming
betting_categories = {
    spec_name: {
        'suffixes': spec_suffixes,
        'count_col': spec_count_col,
        'min_cols': [f'Min{spec_suffix}' for spec_suffix in spec_suffixes],
    }
    for spec_name, spec_suffixes, spec_count_col in category_specs
}

# Find bookmaker columns for each category:
# bookmaker_cols_by_category[category][suffix] -> list of "<bookmaker><suffix>"
# column names that actually exist in df_extended, in `bookmakers` order.
bookmaker_cols_by_category = {}

for category_name, category_info in betting_categories.items():
    cols_for_suffix = {}

    for suffix in category_info['suffixes']:
        candidate_names = (f"{bookmaker}{suffix}" for bookmaker in bookmakers)
        cols = [name for name in candidate_names if name in df_extended.columns]

        cols_for_suffix[suffix] = cols
        if cols:
            print(f"{category_name:30s} [{suffix:6s}]: {len(cols):2d} bookmakers")

    bookmaker_cols_by_category[category_name] = cols_for_suffix

# ============================================================
# STEP 4: Calculate Min Odds and Bookmaker Counts per Category
# ============================================================

print("\n4. CALCULATING MIN ODDS AND BOOKMAKER COUNTS")
print("-" * 70)

for category_name, category_info in betting_categories.items():
    suffixes = category_info['suffixes']
    count_col = category_info['count_col']
    min_cols = category_info['min_cols']
    cols_by_suffix = bookmaker_cols_by_category[category_name]

    # Minimum quoted odds per outcome (row-wise min over the bookmaker columns;
    # NaN quotes are skipped, an all-NaN row stays NaN).
    for suffix, min_col in zip(suffixes, min_cols):
        bookmaker_cols = cols_by_suffix[suffix]

        if not bookmaker_cols:
            continue

        df_extended[min_col] = df_extended[bookmaker_cols].min(axis=1)

        print(f"Created {min_col:15s} from {len(bookmaker_cols)} bookmakers")

    # Bookmaker count for this category: a bookmaker counts once per match if
    # it quoted at least one outcome (e.g. any of H, D, A for match odds).
    #
    # Columns are attributed to the bookmaker whose prefix + suffix produced
    # the name, not to the first prefix the name happens to start with. With
    # first-prefix matching, 'BFD' + 'H' = 'BFDH' is credited to 'BF' and the
    # two bookmakers collapse into one in the count.
    bookmaker_count = pd.Series(0, index=df_extended.index)
    any_bookmaker_found = False

    for bookmaker in bookmakers:
        own_cols = [
            f"{bookmaker}{suffix}"
            for suffix in suffixes
            if f"{bookmaker}{suffix}" in cols_by_suffix[suffix]
        ]
        if not own_cols:
            continue

        any_bookmaker_found = True
        # Vectorized: True where this bookmaker has at least one non-NaN quote
        quoted = df_extended[own_cols].notna().any(axis=1)
        bookmaker_count = bookmaker_count + quoted.astype(int)

    if any_bookmaker_found:
        df_extended[count_col] = bookmaker_count
        avg_count = df_extended[count_col].mean()
        print(f"Created {count_col:40s} (avg: {avg_count:.1f} bookmakers/match)")

# The original literal was broken across two physical lines (a syntax error);
# the intended leading blank line is written as an escape instead.
print("\nCreated minimum odds and bookmaker counts for all categories")

# ============================================================
# STEP 5: Calculate Disagreement (Max - Min)
# ============================================================

print("\n5. CALCULATING BOOKMAKER DISAGREEMENT (MAX - MIN)")
print("-" * 70)

# New column -> (existing Max column, Min column created in Step 4).
# A pair is only used when both of its columns exist in df_extended.
disagreement_mapping = {
    'DisagreementH': ('MaxH', 'MinH'),
    'DisagreementD': ('MaxD', 'MinD'),
    'DisagreementA': ('MaxA', 'MinA'),
    'DisagreementCH': ('MaxCH', 'MinCH'),
    'DisagreementCD': ('MaxCD', 'MinCD'),
    'DisagreementCA': ('MaxCA', 'MinCA'),
    'Disagreement>2.5': ('Max>2.5', 'Min>2.5'),
    'Disagreement<2.5': ('Max<2.5', 'Min<2.5'),
    'DisagreementC>2.5': ('MaxC>2.5', 'MinC>2.5'),
    'DisagreementC<2.5': ('MaxC<2.5', 'MinC<2.5'),
    'DisagreementAHH': ('MaxAHH', 'MinAHH'),
    'DisagreementAHA': ('MaxAHA', 'MinAHA'),
    'DisagreementCAHH': ('MaxCAHH', 'MinCAHH'),
    'DisagreementCAHA': ('MaxCAHA', 'MinCAHA'),
}

for disagreement_col, (max_col, min_col) in disagreement_mapping.items():
    if max_col in df_extended.columns and min_col in df_extended.columns:
        # Spread between the highest and lowest quote for the same outcome
        df_extended[disagreement_col] = df_extended[max_col] - df_extended[min_col]
        avg_disagreement = df_extended[disagreement_col].mean()
        print(f"{disagreement_col:25s} = {max_col:12s} - {min_col:12s} (avg: {avg_disagreement:.3f})")

# The original literal was broken across two physical lines (a syntax error);
# the intended leading blank line is written as an escape instead.
print("\nCreated disagreement features for available Max/Min pairs")

# ============================================================
# STEP 6: Remove Individual Bookmaker Columns
# ============================================================

print("\n6. REMOVING INDIVIDUAL BOOKMAKER COLUMNS")
print("-" * 70)

# A column is treated as an individual bookmaker column when its name starts
# with any entry of `bookmakers`, unless it starts with one of the prefixes of
# the features engineered above.
# NOTE(review): this is a pure name-prefix test, so short entries such as
# 'PA', 'PD', 'PH' or 'SO' would also remove any unrelated column whose name
# begins with those letters - verify against the actual column list.
bookmaker_name_prefixes = tuple(bookmakers)
engineered_name_prefixes = ('Num', 'Min', 'Disagreement')

individual_bookmaker_cols = [
    col
    for col in df_extended.columns
    if col.startswith(bookmaker_name_prefixes)
    and not col.startswith(engineered_name_prefixes)
]

print(f"Removing {len(individual_bookmaker_cols)} individual bookmaker columns...")
df_extended = df_extended.drop(columns=individual_bookmaker_cols)

print(f" Removed individual bookmaker columns")

# ============================================================
# STEP 7: Summary
# ============================================================

odds_rule = "=" * 70
print("\n" + odds_rule)
print("BETTING ODDS FEATURE ENGINEERING COMPLETE")
print(odds_rule)

# Count final betting features: any column whose name contains one of the
# aggregate / engineered markers below.
betting_name_markers = ['Max', 'Min', 'Avg', 'Disagreement', 'NumBookmakers', 'Bb']
betting_features_remaining = [
    col
    for col in df_extended.columns
    if any(marker in col for marker in betting_name_markers)
]

# Break the total down by feature family.
aggregate_odds = [
    c for c in betting_features_remaining
    if c.startswith(('Max', 'Avg', 'Min'))
    and 'Disagreement' not in c
    and 'NumBookmakers' not in c
]
disagreement_features = [c for c in betting_features_remaining if 'Disagreement' in c]
bookmaker_count_features = [c for c in betting_features_remaining if 'NumBookmakers' in c]
betbrain_features = [c for c in betting_features_remaining if c.startswith('Bb')]

print(f"\nFinal betting features: {len(betting_features_remaining)}")
print(f"- Aggregate odds (Max/Avg/Min): {len(aggregate_odds)}")
print(f"- Disagreement features: {len(disagreement_features)}")
print(f"- Bookmaker count features: {len(bookmaker_count_features)}")
print(f"- BetBrain features: {len(betbrain_features)}")

print(f"\nFinal df_extended shape: {df_extended.shape}")
print(odds_rule)

## 4.5 Final feature dataframe

Combine all features into the final dataframe.

In [None]:
# Overview of the engineered baseline dataframe while the post-match columns
# (FTHG, FTAG, FTR, total_goals, ...) are still present; section 4.6 removes them.
print("Final engineered dataframe (with all features):")
print(f"Shape: {df_basic.shape}")

print(f"\nColumns:")
print(df_basic.columns.tolist())

print(f"\nMissing values:")
total_missing_cells = df_basic.isnull().sum().sum()
print(total_missing_cells)

print(f"\nSample:")
# Preview only the columns that exist (some may be absent depending on the
# earlier processing steps).
preview_candidates = [
    'Date', 'HomeTeam', 'AwayTeam', 'total_goals', 'over_2_5',
    'home_days_since_last', 'home_goals_ma5', 'home_promoted', 'is_weekend',
]
display_cols = [col for col in preview_candidates if col in df_basic.columns]
df_basic[display_cols].head(10)

#### **Added Features**

**Time-based metrics:**  
- `home_days_since_last`, `away_days_since_last`: Days since each team’s previous match (average ≈ 9.5 days)

**Recent performance (5-match moving averages):**  
- `home_goals_ma5`, `away_goals_ma5`: Average goals scored in the last 5 matches  
- `home_conceded_ma5`, `away_conceded_ma5`: Average goals conceded in the last 5 matches  

**League transitions:**  
- `home_promoted`, `away_promoted`, `home_demoted`, `away_demoted`: Indicators of team movement between divisions  

#### **Main Observations** 
   - All features show very weak correlations (|r| < 0.02) with the Over/Under 2.5 goals target.  

   - Chi-square tests are insignificant (p > 0.05), suggesting no strong individual relationships. 

   - League level shows the strongest (though still small) correlation at −0.012, indicating that lower leagues may have slightly fewer high-scoring games.

   - The home team’s recent attacking form has a weak positive correlation (0.010), but it’s negligible in isolation.

   - Newly promoted or relegated teams do not exhibit consistent differences in total goals per match.

   - Rest days and weekend scheduling have no measurable effect on goal totals.

#### **Implications for Modeling**

While each feature provides limited predictive power on its own, they may still add value when used together in non-linear models such as tree-based or ensemble methods.  

Weak linear relationships are expected, as the Over/Under 2.5 target is a roughly balanced binary outcome, making individual predictors inherently limited in isolation.

## 4.6 Remove Post-Match Features to Prevent Data Leakage

Now that all historical features have been calculated from past match data, we must remove all columns that contain post-match information or direct derivatives of the target variable. This ensures that our feature matrices contain only genuine pre-match inputs.

In [None]:
# Columns of the BASELINE dataset (df_basic) that are only known once the
# match has been played and therefore must not be used as features.
post_match_basic = [
    'FTHG', 'FTAG', 'FTR',  # Final match results
    'total_goals',  # Direct derivative of the target (FTHG + FTAG)
    'HTHG', 'HTAG', 'HTR',  # Half-time results (if present)
]

# Only drop what is actually there, keeping the order of the list above.
basic_column_names = set(df_basic.columns)
cols_to_drop_basic = [col for col in post_match_basic if col in basic_column_names]

basic_rule = "=" * 70
print(basic_rule)
print("BASELINE DATASET (df_basic) - Removing Post-Match Features")
print(basic_rule)
print(f"Shape before: {df_basic.shape}")
print(f"Columns to remove: {cols_to_drop_basic}")

df_basic = df_basic.drop(columns=cols_to_drop_basic)

print(f"Shape after: {df_basic.shape}")
print(f"\nRemaining columns:")
print(df_basic.columns.tolist())
print(f"\nBaseline dataset cleaned - only pre-match features remain")
print(basic_rule)

In [None]:
# Columns to remove from the EXTENDED dataset (df_extended): match results,
# in-game statistics, features derived from them, and the MA5 columns built
# from in-game statistics.
match_result_cols = ['FTHG', 'FTAG', 'FTR']
half_time_cols = ['HTHG', 'HTAG', 'HTR']
shot_cols = ['HS', 'AS', 'HST', 'AST']
corner_cols = ['HC', 'AC']
foul_cols = ['HF', 'AF']
card_cols = ['HY', 'AY', 'HR', 'AR']
other_in_game_cols = ['HHW', 'AHW', 'HFKC', 'AFKC', 'HO', 'AO', 'HBP', 'ABP']
derived_post_match_cols = [
    'total_goals', 'ht_total_goals', 'second_half_goals',
    'home_shot_accuracy', 'away_shot_accuracy', 'total_shots', 'total_shots_on_target',
    'shot_dominance', 'corner_dominance', 'total_fouls', 'total_cards', 'card_intensity',
]
# NOTE(review): these ten columns are the MA5 features created in section
# 4.4.2, whose windows exclude the current match. Dropping them here removes
# those features from df_extended - confirm this is intended.
in_game_ma5_cols = [
    'home_shots_ma5', 'away_shots_ma5', 'home_shots_target_ma5', 'away_shots_target_ma5',
    'home_corners_ma5', 'away_corners_ma5', 'home_fouls_ma5', 'away_fouls_ma5',
    'home_yellows_ma5', 'away_yellows_ma5',
]

post_match_extended = [
    *match_result_cols,
    *half_time_cols,
    *shot_cols,
    *corner_cols,
    *foul_cols,
    *card_cols,
    *other_in_game_cols,
    *derived_post_match_cols,
    *in_game_ma5_cols,
]

# Only drop what is actually there, keeping the order of the list above.
extended_column_names = set(df_extended.columns)
cols_to_drop_extended = [col for col in post_match_extended if col in extended_column_names]

extended_rule = "=" * 70
print(extended_rule)
print("EXTENDED DATASET (df_extended) - Removing Post-Match Features")
print(extended_rule)
print(f"Shape before: {df_extended.shape}")
print(f"Columns to remove ({len(cols_to_drop_extended)}): {cols_to_drop_extended}")

df_extended = df_extended.drop(columns=cols_to_drop_extended)

print(f"Shape after: {df_extended.shape}")
print(f"\nRemaining columns:")
print(df_extended.columns.tolist())
print(f"\nExtended dataset cleaned - only pre-match features remain")
print(extended_rule)

### Summary of Data Leakage Prevention

**What was removed:**
- **Match results:** FTHG, FTAG, FTR, HTHG, HTAG, HTR
- **In-game statistics:** All shots, corners, fouls, cards
- **Derived features:** total_goals, shot_accuracy, dominance metrics, card_intensity, etc.

**What remains:**
- **Pre-match identifiers:** Div, Season, Date, Time, HomeTeam, AwayTeam
- **Pre-match context:** league_tier, month, is_weekend, Attendance, Referee
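- **Historical features:** MA5 features that are not in the removal lists above (the shots, shots-on-target, corners, fouls and yellow-card MA5 columns are dropped from df_extended), position percentiles, goal patterns from PAST seasons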
- **Time-based features:** days_since_last, promoted/demoted indicators
- **Betting odds:** Pre-match odds from bookmakers
- **Target variable:** over_2_5 (kept for modeling)

**Key principle:** All features in df_basic and df_extended are now calculated exclusively from:
1. Information available BEFORE the match starts (dates, teams, league info)
2. Historical data from PREVIOUS matches only (MA5, seasonal patterns, positions)
3. Pre-match betting odds

This ensures no information from the current match leaks into the predictive features.

## 5 Train/Validation/Test Split by Season

Create temporal splits to prevent data leakage. We'll use:
- **Training:** 3 oldest seasons (2019/2020, 2020/2021, 2021/2022)
- **Validation:** 1 middle season (2022/2023)
- **Testing:** 2 most recent seasons (2023/2024, 2024/2025)

This ensures the model is trained on past data and evaluated on future data, mimicking real-world deployment.

In [None]:
# Seasons in chronological order, cut into three consecutive windows:
# the first three train the model, the fourth validates it, the last two test it.
seasons_in_order = [
    '2019/2020', '2020/2021', '2021/2022',
    '2022/2023',
    '2023/2024', '2024/2025',
]
train_seasons = seasons_in_order[:3]
val_seasons = seasons_in_order[3:4]
test_seasons = seasons_in_order[4:]

split_rule = "=" * 70
print(split_rule)
print("TRAIN/VALIDATION/TEST SPLIT BY SEASON")
print(split_rule)

# Function to perform split for any dataframe
def split_by_season(df, train_seasons, val_seasons, test_seasons, dataset_name="Dataset"):
    """
    Split a match dataframe into train / validation / test parts by season.

    Each part contains the rows whose 'Season' is in the corresponding list,
    sorted by 'Date' (oldest first) with a fresh 0..n-1 index. Rows whose
    season appears in none of the lists are left out of every part. The input
    dataframe is not modified. A short report (shapes, date ranges and, when an
    'over_2_5' column exists, the mean of that column per part) is printed.

    The sort is stable, so rows sharing a date keep their input order. The
    default sort algorithm gives no such guarantee, and downstream code that
    cuts the training rows into consecutive folds depends on the exact order.

    Parameters
    ----------
    df : DataFrame
        Must contain 'Season' and 'Date' columns.
    train_seasons, val_seasons, test_seasons : list
        Season labels assigned to each part.
    dataset_name : str
        Label used in the printed report.

    Returns
    -------
    df_train, df_val, df_test : DataFrame
    """

    # Check which seasons exist in the data
    available_seasons = df['Season'].unique()
    print(f"\n{dataset_name}:")
    print(f"Available seasons: {sorted(available_seasons)}")

    # Surface requested seasons that match no rows instead of silently
    # returning an empty (or smaller than expected) part.
    known_seasons = set(available_seasons)
    absent_seasons = [
        season
        for season in (*train_seasons, *val_seasons, *test_seasons)
        if season not in known_seasons
    ]
    if absent_seasons:
        print(f"WARNING: requested seasons not present in data: {absent_seasons}")

    def _chronological_subset(seasons):
        """Return the rows of `df` in `seasons`, oldest first, renumbered from 0."""
        subset = df[df['Season'].isin(seasons)]
        # sort_values returns a new frame, so `df` itself is left untouched.
        return subset.sort_values('Date', kind='mergesort').reset_index(drop=True)

    df_train = _chronological_subset(train_seasons)
    df_val = _chronological_subset(val_seasons)
    df_test = _chronological_subset(test_seasons)

    # Report split sizes
    report_parts = (
        ("Training", train_seasons, df_train),
        ("Validation", val_seasons, df_val),
        ("Test", test_seasons, df_test),
    )
    for part_label, part_seasons, part_frame in report_parts:
        print(f"{part_label} seasons: {part_seasons}")
        print(f"Shape: {part_frame.shape}")
        print(f"Date range: {part_frame['Date'].min()} to {part_frame['Date'].max()}")

    # Check for target distribution
    if 'over_2_5' in df.columns:
        train_rate = df_train['over_2_5'].mean()
        val_rate = df_val['over_2_5'].mean()
        test_rate = df_test['over_2_5'].mean()
        print(f"Over 2.5 rates: Train={train_rate:.3f}, Val={val_rate:.3f}, Test={test_rate:.3f}")

    return df_train, df_val, df_test

# Both datasets are cut with the same season lists so that their train,
# validation and test parts cover identical periods.
season_lists = (train_seasons, val_seasons, test_seasons)

# Split baseline dataset
df_basic_train, df_basic_val, df_basic_test = split_by_season(
    df_basic, *season_lists, "BASELINE (df_basic)"
)

# Split extended dataset
df_extended_train, df_extended_val, df_extended_test = split_by_season(
    df_extended, *season_lists, "EXTENDED (df_extended)"
)

closing_rule = "=" * 70
print("\n" + closing_rule)
print("Temporal split completed successfully")
print("Data sorted chronologically and ready for target encoding")
print(closing_rule)


# 6.0 Categorical Encoding

## Overview

We need to encode categorical variables before they can be used in machine learning models. We use different encoding strategies based on cardinality:

### 1. Target Encoding (K-Fold) - For High-Cardinality Variables
**Variables:** HomeTeam, AwayTeam, Referee

**Why Target Encoding?**
- **High cardinality**: These variables have many unique values (hundreds of teams/referees)
- **One-hot encoding** would create too many features (curse of dimensionality)
- **Label encoding** would impose arbitrary ordering
- **Target encoding** captures the relationship with the target variable

**K-Fold Cross-Validation Approach:**
1. Split training data into K folds (K=5)
2. For each fold:
   - Calculate target mean for each category using OTHER folds
   - Apply smoothing: `(count × mean + smoothing × global_mean) / (count + smoothing)`
   - Encode the current fold using these statistics
3. For validation/test sets: use full training set statistics

**Smoothing Parameter:**
- Prevents overfitting to rare categories
- Higher smoothing = more regularization (closer to global mean)
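- We use `smoothing=10.0` as a reasonable default; note that the `target_encode_with_kfold` signature itself defaults to `smoothing=1.0`, so this value has to be passed explicitly at the call site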

### 2. One-Hot Encoding - For Low-Cardinality Variable
**Variable:** Div (League)

**Why One-Hot Encoding?**
- Low cardinality (~10 unique leagues)
- Creates binary indicator columns for each league
- No ordinality assumption
- Standard approach for categorical variables with few categories

### Variables NOT Encoded (Excluded from Features)
- **Season**: Used for temporal train/test split - must exclude to prevent leakage
- **Date/Time**: Temporal metadata (could extract cyclical features if needed)
- **Original categorical columns**: HomeTeam, AwayTeam, Referee, Div (replaced by encoded versions)

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

def target_encode_with_kfold(train_data, val_data, test_data, cat_column, target_column='over_2_5', n_folds=5, smoothing=1.0):
    """
    Target-encode one categorical column using out-of-fold statistics.

    Training rows are encoded out-of-fold: the training set is cut into
    ``n_folds`` contiguous, unshuffled blocks (the same partition that
    ``KFold(n_splits=n_folds, shuffle=False)`` produces) and every block is
    encoded with smoothed target means computed on all OTHER blocks, so a
    row's own target never feeds its own category statistic.

    NOTE: this is *not* a "past data only" encoding. For every block except
    the last one, the other blocks contain rows that come later in
    ``train_data``; with chronologically sorted data, later matches therefore
    still influence the encoding of earlier ones. The smoothing prior
    (``global_mean``) is also computed on the whole training set. Switch to an
    expanding-window scheme if strict temporal causality is required.

    Validation and test rows are encoded with statistics from the full
    training set. Categories without a statistic (unseen or missing) receive
    the global training mean.

    Parameters:
    -----------
    train_data : DataFrame
        Training data (must include both the categorical column and target)
    val_data : DataFrame
        Validation data (must include the categorical column)
    test_data : DataFrame
        Test data (must include the categorical column)
    cat_column : str
        Name of the categorical column to encode
    target_column : str
        Name of the target column
    n_folds : int
        Number of contiguous folds used for the training set (at least 2 and
        at most ``len(train_data)``)
    smoothing : float
        Smoothing parameter to handle rare categories (higher = more
        regularization towards the global mean)

    Returns:
    --------
    train_encoded : numpy.ndarray
        Out-of-fold encoded values, positionally aligned with ``train_data``
    val_encoded, test_encoded : Series
        Encoded values carrying the index of ``val_data`` / ``test_data``

    Raises:
    -------
    ValueError
        If ``n_folds`` is smaller than 2 or larger than the number of
        training rows.
    """
    n_rows = len(train_data)
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")
    if n_folds > n_rows:
        raise ValueError(
            f"Cannot split {n_rows} training rows into {n_folds} folds"
        )

    # Global mean of target (smoothing prior and fallback for unseen categories)
    global_mean = train_data[target_column].mean()

    def _smoothed_means(frame):
        # (count * mean + smoothing * global_mean) / (count + smoothing)
        stats = frame.groupby(cat_column)[target_column].agg(['mean', 'count'])
        return (
            (stats['count'] * stats['mean'] + smoothing * global_mean)
            / (stats['count'] + smoothing)
        )

    train_categories = train_data[cat_column]
    train_encoded = np.zeros(n_rows)

    # Contiguous, unshuffled folds: the first n_rows % n_folds blocks get one
    # extra row, exactly like KFold(shuffle=False). No random_state is
    # involved (KFold rejects random_state when shuffle=False).
    for held_out in np.array_split(np.arange(n_rows), n_folds):
        other_rows = np.ones(n_rows, dtype=bool)
        other_rows[held_out] = False

        fold_means = _smoothed_means(train_data.iloc[other_rows])
        train_encoded[held_out] = (
            train_categories.iloc[held_out]
            .map(fold_means)
            .fillna(global_mean)
            .values
        )

    # For validation and test sets, use full training set statistics
    full_means = _smoothed_means(train_data)
    val_encoded = val_data[cat_column].map(full_means).fillna(global_mean)
    test_encoded = test_data[cat_column].map(full_means).fillna(global_mean)

    return train_encoded, val_encoded, test_encoded


print("=" * 70)
print("CATEGORICAL ENCODING")
print("=" * 70)

# Work on copies so the un-encoded split frames stay untouched
df_basic_train_encoded = df_basic_train.copy()
df_basic_val_encoded = df_basic_val.copy()
df_basic_test_encoded = df_basic_test.copy()

df_extended_train_encoded = df_extended_train.copy()
df_extended_val_encoded = df_extended_val.copy()
df_extended_test_encoded = df_extended_test.copy()

# Part 1: TARGET ENCODING for high-cardinality variables
print("\n" + "=" * 70)
print("PART 1: TARGET ENCODING (K-FOLD) FOR HIGH-CARDINALITY VARIABLES")
print("=" * 70)

categorical_vars = ['HomeTeam', 'AwayTeam', 'Referee']

# (label, raw train/val/test used for reporting, encoded train/val/test that
# receive the new columns). The encoded frames are modified in place below.
encoding_targets = (
    (
        "BASELINE",
        (df_basic_train, df_basic_val, df_basic_test),
        (df_basic_train_encoded, df_basic_val_encoded, df_basic_test_encoded),
    ),
    (
        "EXTENDED",
        (df_extended_train, df_extended_val, df_extended_test),
        (df_extended_train_encoded, df_extended_val_encoded, df_extended_test_encoded),
    ),
)

# Encode each categorical variable for both datasets
for cat_var in categorical_vars:
    print(f"\n{cat_var}:")
    print("-" * 70)

    # Presence is checked on the baseline training frame only; if it is
    # missing there, the variable is skipped for both datasets.
    if cat_var not in df_basic_train.columns:
        print(f"{cat_var} not found in dataset, skipping...")
        continue

    for dataset_label, raw_splits, encoded_splits in encoding_targets:
        raw_train, raw_val, raw_test = raw_splits
        enc_train, enc_val, enc_test = encoded_splits

        print(f"{dataset_label}:")
        print(f"Unique values in train: {raw_train[cat_var].nunique()}")
        print(f"Unique values in val: {raw_val[cat_var].nunique()}")
        print(f"Unique values in test: {raw_test[cat_var].nunique()}")

        train_enc, val_enc, test_enc = target_encode_with_kfold(
            enc_train,
            enc_val,
            enc_test,
            cat_var,
            n_folds=5,
            smoothing=10.0
        )

        # Add encoded columns
        enc_train[f'{cat_var}_encoded'] = train_enc
        enc_val[f'{cat_var}_encoded'] = val_enc
        enc_test[f'{cat_var}_encoded'] = test_enc

        print(f"Encoded range: [{train_enc.min():.4f}, {train_enc.max():.4f}]")
        print(f"Encoded mean: {train_enc.mean():.4f}")

# The original literal was split over two physical lines (unterminated
# string); the line break is written as an escape instead.
print("\nTarget encoding completed")


# Part 2: ONE-HOT ENCODING for low-cardinality variable (Div)
print("\n" + "=" * 70)
print("PART 2: ONE-HOT ENCODING FOR DIV (LEAGUE)")
print("=" * 70)

# Check Div cardinality
print(f"\nDiv (League) Analysis:")
print(f"BASELINE - Unique leagues in train: {df_basic_train['Div'].nunique()}")
print(f"BASELINE - Leagues: {sorted(df_basic_train['Div'].unique())}")
print(f"\nEXTENDED - Unique leagues in train: {df_extended_train['Div'].nunique()}")
print(f"EXTENDED - Leagues: {sorted(df_extended_train['Div'].unique())}")


def _one_hot_div(train_df, val_df, test_df):
    """Return Div dummies for train/val/test aligned to one shared, sorted column list."""
    dummies = [
        pd.get_dummies(frame['Div'], prefix='Div', drop_first=False)
        for frame in (train_df, val_df, test_df)
    ]
    # Union over all three splits, so a league absent from one split still
    # gets an all-zero column there and the three matrices share one layout.
    shared_cols = sorted(set().union(*(d.columns for d in dummies)))
    aligned = [d.reindex(columns=shared_cols, fill_value=0) for d in dummies]
    return aligned[0], aligned[1], aligned[2], shared_cols


# One-hot encode Div for BASELINE dataset
print(f"\nOne-hot encoding Div for BASELINE:")
div_train_baseline, div_val_baseline, div_test_baseline, all_div_cols = _one_hot_div(
    df_basic_train_encoded, df_basic_val_encoded, df_basic_test_encoded
)

# Add to dataframes
df_basic_train_encoded = pd.concat([df_basic_train_encoded, div_train_baseline], axis=1)
df_basic_val_encoded = pd.concat([df_basic_val_encoded, div_val_baseline], axis=1)
df_basic_test_encoded = pd.concat([df_basic_test_encoded, div_test_baseline], axis=1)

print(f"Created {len(all_div_cols)} dummy columns: {all_div_cols}")

# One-hot encode Div for EXTENDED dataset
print(f"\nOne-hot encoding Div for EXTENDED:")
div_train_extended, div_val_extended, div_test_extended, all_div_cols_ext = _one_hot_div(
    df_extended_train_encoded, df_extended_val_encoded, df_extended_test_encoded
)

# Add to dataframes
df_extended_train_encoded = pd.concat([df_extended_train_encoded, div_train_extended], axis=1)
df_extended_val_encoded = pd.concat([df_extended_val_encoded, div_val_extended], axis=1)
df_extended_test_encoded = pd.concat([df_extended_test_encoded, div_test_extended], axis=1)

print(f"Created {len(all_div_cols_ext)} dummy columns: {all_div_cols_ext}")

# The original literal was split over two physical lines (unterminated
# string); the line break is written as an escape instead.
print("\nOne-hot encoding completed")


# Part 3: DROP ORIGINAL CATEGORICAL COLUMNS
print("\n" + "=" * 70)
print("PART 3: REMOVING ORIGINAL CATEGORICAL COLUMNS")
print("=" * 70)

# Drop original categorical columns since we now have encoded versions
cols_to_drop = ['HomeTeam', 'AwayTeam', 'Referee', 'Div']

# BASELINE datasets: which columns to drop is decided on the training frame;
# the same columns are then removed from validation and test.
present_basic = [col for col in cols_to_drop if col in df_basic_train_encoded.columns]
df_basic_train_encoded = df_basic_train_encoded.drop(columns=present_basic)
df_basic_val_encoded = df_basic_val_encoded.drop(columns=present_basic)
df_basic_test_encoded = df_basic_test_encoded.drop(columns=present_basic)
for col in present_basic:
    print(f"Dropped {col} from BASELINE datasets")

# EXTENDED datasets
present_extended = [col for col in cols_to_drop if col in df_extended_train_encoded.columns]
df_extended_train_encoded = df_extended_train_encoded.drop(columns=present_extended)
df_extended_val_encoded = df_extended_val_encoded.drop(columns=present_extended)
df_extended_test_encoded = df_extended_test_encoded.drop(columns=present_extended)
for col in present_extended:
    print(f"Dropped {col} from EXTENDED datasets")

# The original literal was split over two physical lines (unterminated
# string); the line break is written as an escape instead.
print("\nOriginal categorical columns removed")

# Summary: report the shape of every encoded split for both datasets
print("\n" + "=" * 70)
print("CATEGORICAL ENCODING SUMMARY")
print("=" * 70)

summary_groups = (
    (
        f"BASELINE datasets:",
        (df_basic_train_encoded, df_basic_val_encoded, df_basic_test_encoded),
    ),
    (
        f"\nEXTENDED datasets:",
        (df_extended_train_encoded, df_extended_val_encoded, df_extended_test_encoded),
    ),
)
for heading, split_frames in summary_groups:
    print(heading)
    for split_name, split_frame in zip(("Train", "Val", "Test"), split_frames):
        print(f"{split_name} shape: {split_frame.shape}")

print("=" * 70)


## 6.1 Create Feature Matrices with Encoded Variables

Now prepare the final feature matrices including the encoded categorical variables.

In [None]:
# Build the model inputs: every column of the encoded frames that is not a
# raw categorical, temporal, or target column becomes a feature.
print("=" * 70)
print("PREPARING FINAL FEATURE MATRICES WITH ENCODED VARIABLES")
print("=" * 70)

# Raw (un-encoded) columns plus the target; the encoded replacements stay in
exclude_cols = ['Div', 'Season', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'Referee', 'over_2_5']

# ---- BASELINE ----
feature_cols_basic = list(
    filter(lambda name: name not in exclude_cols, df_basic_train_encoded.columns)
)
print(f"\nBASELINE:")
print(f"Total features: {len(feature_cols_basic)}")
print(f"Encoded features: {[col for col in feature_cols_basic if '_encoded' in col]}")

# The feature list comes from the training frame and is applied to all splits
X_basic_train, X_basic_val, X_basic_test = (
    split[feature_cols_basic].copy()
    for split in (df_basic_train_encoded, df_basic_val_encoded, df_basic_test_encoded)
)
y_basic_train, y_basic_val, y_basic_test = (
    split['over_2_5'].copy()
    for split in (df_basic_train_encoded, df_basic_val_encoded, df_basic_test_encoded)
)

for split_tag, matrix in (("train", X_basic_train), ("val", X_basic_val), ("test", X_basic_test)):
    print(f"X_{split_tag} shape: {matrix.shape}")

# ---- EXTENDED ----
feature_cols_extended = list(
    filter(lambda name: name not in exclude_cols, df_extended_train_encoded.columns)
)
print(f"\nEXTENDED:")
print(f"Total features: {len(feature_cols_extended)}")
print(f"Encoded features: {[col for col in feature_cols_extended if '_encoded' in col]}")

X_extended_train, X_extended_val, X_extended_test = (
    split[feature_cols_extended].copy()
    for split in (df_extended_train_encoded, df_extended_val_encoded, df_extended_test_encoded)
)
y_extended_train, y_extended_val, y_extended_test = (
    split['over_2_5'].copy()
    for split in (df_extended_train_encoded, df_extended_val_encoded, df_extended_test_encoded)
)

for split_tag, matrix in (("train", X_extended_train), ("val", X_extended_val), ("test", X_extended_test)):
    print(f"X_{split_tag} shape: {matrix.shape}")

# Sanity check: no object/category columns may be left in the training matrices
print(f"\nVerification:")
categorical_basic = X_basic_train.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_extended = X_extended_train.select_dtypes(include=['object', 'category']).columns.tolist()

for dataset_tag, leftovers in (("BASELINE", categorical_basic), ("EXTENDED", categorical_extended)):
    if not leftovers:
        print(f"{dataset_tag}: All features are numeric")
    else:
        print(f"WARNING: Categorical features remain in {dataset_tag}: {leftovers}")

print("\n" + "=" * 70)
print("Final feature matrices ready for modeling")
print("=" * 70)

# 7.0 Save Preprocessed Data to Pickle Files

Save the final preprocessed datasets (both baseline and extended) as pickle files for easy loading in the modeling notebook.

**What's being saved:**
- Feature matrices: `X_train`, `X_val`, `X_test`
- Target vectors: `y_train`, `y_val`, `y_test`
- Feature names: List of all feature column names
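- Metadata: Information about the preprocessing (dates, sizes, encoded features, etc.) — note that the dictionaries written in the cell below do not yet contain a metadata entry; they hold only the matrices, targets, and feature names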

**Output files:**
- `processed/baseline_preprocessed.pkl` - Baseline feature set
- `processed/extended_preprocessed.pkl` - Extended feature set with rolling statistics

In [None]:
import pickle
import os
from datetime import datetime

# Make sure the target folder exists before anything is written
output_dir = 'processed'
os.makedirs(output_dir, exist_ok=True)


# ---- BASELINE: bundle matrices, targets and feature names into one pickle ----

print("\nSaving BASELINE dataset...")

baseline_data = dict(
    X_train=X_basic_train,
    X_val=X_basic_val,
    X_test=X_basic_test,
    y_train=y_basic_train,
    y_val=y_basic_val,
    y_test=y_basic_test,
    feature_names=feature_cols_basic,
)

baseline_filename = f'{output_dir}/baseline_preprocessed.pkl'
with open(baseline_filename, 'wb') as f:
    pickle.dump(baseline_data, f)

print(f"Saved to: {baseline_filename}")
for split_label, matrix in (("Train", X_basic_train), ("Val", X_basic_val), ("Test", X_basic_test)):
    print(f"{split_label} shape: {matrix.shape}")
print(f"Features: {len(feature_cols_basic)}")


# ---- EXTENDED: same layout as the baseline bundle ----

print("\nSaving EXTENDED dataset...")

extended_data = dict(
    X_train=X_extended_train,
    X_val=X_extended_val,
    X_test=X_extended_test,
    y_train=y_extended_train,
    y_val=y_extended_val,
    y_test=y_extended_test,
    feature_names=feature_cols_extended,
)

extended_filename = f'{output_dir}/extended_preprocessed.pkl'
with open(extended_filename, 'wb') as f:
    pickle.dump(extended_data, f)

