In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

df = pd.read_csv('E0.csv')

# ============================================================
# Step 1: Calculate Form (rolling points over last 5 matches)
# ============================================================
def get_points(row, venue):
    if venue == "home":
        if row["FTHG"] > row["FTAG"]:
            return 3
        elif row["FTHG"] == row["FTAG"]:
            return 1
        else:
            return 0
    else:  # away
        if row["FTAG"] > row["FTHG"]:
            return 3
        elif row["FTAG"] == row["FTHG"]:
            return 1
        else:
            return 0

# Match points
df["HOME_POINTS"] = df.apply(lambda row: get_points(row, "home"), axis=1)
df["AWAY_POINTS"] = df.apply(lambda row: get_points(row, "away"), axis=1)

# Rolling form (last 5 matches)
df["HOME_FORM"] = (
    df.groupby("HomeTeam")["HOME_POINTS"]
      .transform(lambda x: x.rolling(5, min_periods=1).sum())
)

df["AWAY_FORM"] = (
    df.groupby("AwayTeam")["AWAY_POINTS"]
      .transform(lambda x: x.rolling(5, min_periods=1).sum())
)

# ============================================================
# Step 2: Average Goals (last 5 matches)
# ============================================================
df["HOME_GOALS"] = df["FTHG"]
df["AWAY_GOALS"] = df["FTAG"]

# Rolling average goals scored
df["HOME_AVG_GOALS"] = (
    df.groupby("HomeTeam")["HOME_GOALS"]
      .transform(lambda x: x.rolling(5, min_periods=1).mean())
)

df["AWAY_AVG_GOALS"] = (
    df.groupby("AwayTeam")["AWAY_GOALS"]
      .transform(lambda x: x.rolling(5, min_periods=1).mean())
)

# ============================================================
# Step 3: Create Target Variables (Markets)
# ============================================================
def get_hda(row):
    if row["FTHG"] > row["FTAG"]:
        return 1   # Home win
    elif row["FTHG"] == row["FTAG"]:
        return 0   # Draw
    else:
        return -1  # Away win

df["HDA"] = df.apply(get_hda, axis=1)
df["OVER25"] = ((df["FTHG"] + df["FTAG"]) > 2.5).astype(int)
df["BTTS"] = ((df["FTHG"] > 0) & (df["FTAG"] > 0)).astype(int)
df["DOUBLE_CHANCE"] = ((df["FTHG"] >= df["FTAG"])).astype(int)

# ============================================================
# Step 4: Train Models for Each Market
# ============================================================
features = ["HOME_FORM", "AWAY_FORM", "HOME_AVG_GOALS", "AWAY_AVG_GOALS"]

markets = {
    "HDA": df["HDA"],
    "OVER25": df["OVER25"],
    "BTTS": df["BTTS"],
    "DOUBLE_CHANCE": df["DOUBLE_CHANCE"],
}

models = {}

for market, target in markets.items():
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], target, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"===== {market} =====")
    print(classification_report(y_test, y_pred))

    # Save model
    joblib.dump(model, f"MODEL_{market}.pkl")
    models[market] = model


===== HDA =====
              precision    recall  f1-score   support

          -1       0.58      0.47      0.52        15
           0       0.40      0.20      0.27        10
           1       0.69      0.89      0.77        27

    accuracy                           0.63        52
   macro avg       0.56      0.52      0.52        52
weighted avg       0.60      0.63      0.60        52

===== OVER25 =====
              precision    recall  f1-score   support

           0       0.27      0.22      0.24        18
           1       0.62      0.68      0.65        34

    accuracy                           0.52        52
   macro avg       0.44      0.45      0.45        52
weighted avg       0.50      0.52      0.51        52

===== BTTS =====
              precision    recall  f1-score   support

           0       0.47      0.43      0.45        21
           1       0.64      0.68      0.66        31

    accuracy                           0.58        52
   macro avg       0.5

In [5]:
print(df.head(20))

   Div        Date   Time          HomeTeam          AwayTeam  FTHG  FTAG FTR  \
0   E0  11/08/2023  20:00           Burnley          Man City     0     3   A   
1   E0  12/08/2023  12:30           Arsenal     Nott'm Forest     2     1   H   
2   E0  12/08/2023  15:00       Bournemouth          West Ham     1     1   D   
3   E0  12/08/2023  15:00          Brighton             Luton     4     1   H   
4   E0  12/08/2023  15:00           Everton            Fulham     0     1   A   
5   E0  12/08/2023  15:00  Sheffield United    Crystal Palace     0     1   A   
6   E0  12/08/2023  17:30         Newcastle       Aston Villa     5     1   H   
7   E0  13/08/2023  14:00         Brentford         Tottenham     2     2   D   
8   E0  13/08/2023  16:30           Chelsea         Liverpool     1     1   D   
9   E0  14/08/2023  20:00        Man United            Wolves     1     0   H   
10  E0  18/08/2023  19:45     Nott'm Forest  Sheffield United     2     1   H   
11  E0  19/08/2023  15:00   