# 🧠 Football Match Outcome Predictor — First Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the raw match table, parsing the "date" column as datetimes.
raw = pd.read_csv("../data/raw/matches.csv", parse_dates=["date"])
# Keep only rows flagged pre_vig_ok that also have a recorded full-time result.
usable = raw["pre_vig_ok"].eq(True) & raw["fulltime_result"].notna()
df = raw[usable]
print(f"Loaded {len(df)} valid matches.")

Loaded 9410 valid matches.


In [None]:
# Encode the full-time result as an integer class: H -> 0, D -> 1, A -> 2.
label_map = {"H": 0, "D": 1, "A": 2}
target = df["fulltime_result"].map(label_map)
# Any result outside H/D/A maps to NaN; those rows are dropped.
# The labels are attached with assign() on the filtered frame (which returns a
# new DataFrame) instead of writing a column into a frame that was itself
# produced by boolean filtering. They are cast to int because a mapped column
# that contained NaN comes back as float.
has_label = target.notna()
df = df.loc[has_label].assign(target=target[has_label].astype(int))

In [None]:
# Model inputs are the three pre-match probability columns (one per outcome);
# the label is the encoded "target" column.
features = ["pre_p_H", "pre_p_D", "pre_p_A"]
X, y = df[features], df["target"]

In [None]:
# Time-based holdout: matches before the cutoff train the model, matches on or
# after it are held out for evaluation.
split_date = "2018-07-01"
df_sorted = df.sort_values("date")
is_before_cutoff = df_sorted["date"] < split_date
is_from_cutoff = df_sorted["date"] >= split_date
train_df, test_df = df_sorted[is_before_cutoff], df_sorted[is_from_cutoff]

X_train, y_train = train_df[features], train_df["target"]
X_test, y_test = test_df[features], test_df["target"]

print(f"Train: {len(X_train)} rows, Test: {len(X_test)} rows")

In [None]:
# Gradient-boosted trees with scikit-learn's default hyperparameters.
# The fixed seed makes the fitted model, and therefore the metrics computed
# from it, reproducible across notebook reruns.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Class probabilities for the held-out matches; column j belongs to
# model.classes_[j].
probs = model.predict_proba(X_test)
# Translate the most probable column back into its class label, instead of
# assuming the column index and the label value coincide.
preds = model.classes_[np.argmax(probs, axis=1)]

print("Accuracy:", accuracy_score(y_test, preds))
# Pass the label set explicitly so the probability columns are matched to the
# right classes even if some class never occurs in y_test.
print("Log Loss:", log_loss(y_test, probs, labels=model.classes_))
# labels=[0, 1, 2] pins each report row to the H/D/A -> 0/1/2 encoding, so the
# display names cannot shift if a class is absent from the test split.
print(
    classification_report(
        y_test, preds, labels=[0, 1, 2], target_names=["Home", "Draw", "Away"]
    )
)

In [None]:
# Put the market's pre-match probabilities next to the model's predicted
# probabilities for the test matches, then draw one scatter panel per outcome.
outcomes = ["H", "D", "A"]
compare_df = test_df[["pre_p_H", "pre_p_D", "pre_p_A"]].copy()
compare_df[["model_p_H", "model_p_D", "model_p_A"]] = probs

plt.figure(figsize=(12, 4))
for panel, outcome in enumerate(outcomes, start=1):
    market_p = compare_df[f"pre_p_{outcome}"]
    model_p = compare_df[f"model_p_{outcome}"]

    plt.subplot(1, 3, panel)
    sns.scatterplot(x=market_p, y=model_p, alpha=0.5)
    # Dashed diagonal: points on it are where model and market agree exactly.
    plt.plot([0, 1], [0, 1], "--", color="gray")
    plt.xlabel("Market pre_p_" + outcome)
    plt.ylabel("Model prob_" + outcome)
    plt.title(outcome)
plt.tight_layout()
plt.show()