In [1]:
import pandas as pd
import numpy as np

In [2]:
def compute_head_to_head(row, matches):
    relevant_matches = matches[(
        ((matches['HomeTeam'] == row['HomeTeam']) & (matches['AwayTeam'] == row['AwayTeam'])) |
        ((matches['HomeTeam'] == row['AwayTeam']) & (matches['AwayTeam'] == row['HomeTeam']))
    )]
    
    home_wins = (
        ((relevant_matches['HomeTeam'] == row['HomeTeam']) & (relevant_matches['FTR'] == 'H')) |
        ((relevant_matches['AwayTeam'] == row['HomeTeam']) & (relevant_matches['FTR'] == 'A'))
    ).sum()
    
    away_wins = (
        ((relevant_matches['HomeTeam'] == row['AwayTeam']) & (relevant_matches['FTR'] == 'H')) |
        ((relevant_matches['AwayTeam'] == row['AwayTeam']) & (relevant_matches['FTR'] == 'A'))
    ).sum()
    
    return (home_wins - away_wins) / max(len(relevant_matches), 1)

In [3]:
def compute_directional_head_to_head(row, matches):
    relevant_matches = matches[(matches['HomeTeam'] == row['HomeTeam']) & (matches['AwayTeam'] == row['AwayTeam'])]
    home_wins = (relevant_matches['FTR'] == 'H').sum()
    away_wins = (relevant_matches['FTR'] == 'A').sum()
    return (home_wins - away_wins) / max(len(relevant_matches), 1)

In [4]:
def replace_second_h(stat):
    if stat.count('H') > 1:
        h_index = stat.find('H', stat.find('H') + 1)
        return stat[:h_index] + 'A' + stat[h_index + 1:]
    return stat.replace('H', 'A')

def compute_recent_stats(team, matches, stat, num_games=5):
    team_matches = matches[(matches['HomeTeam'] == team) | (matches['AwayTeam'] == team)].tail(num_games)
    home_stats = team_matches[team_matches['HomeTeam'] == team][stat].sum()
    away_stats = team_matches[team_matches['AwayTeam'] == team][replace_second_h(stat)].sum()
    total_stats = home_stats + away_stats
    return total_stats / num_games if len(team_matches) > 0 else np.nan

In [5]:
def compute_points_per_game(team, matches, num_games=5):
    team_matches = matches[(matches['HomeTeam'] == team) | (matches['AwayTeam'] == team)].tail(num_games)
    points = 0
    for _, match in team_matches.iterrows():
        if match['HomeTeam'] == team:
            if match['FTR'] == 'H':
                points += 3
            elif match['FTR'] == 'D':
                points += 1
        elif match['AwayTeam'] == team:
            if match['FTR'] == 'A':
                points += 3
            elif match['FTR'] == 'D':
                points += 1
    return points / num_games if len(team_matches) > 0 else np.nan

In [6]:
def compute_features(df):
    df = df.copy()

    # Head-to-Head Features
    df['GeneralHeadToHead'] = df.apply(lambda row: compute_head_to_head(row, df[:row.name]), axis=1)
    df['DirectionalHeadToHead'] = df.apply(lambda row: compute_directional_head_to_head(row, df[:row.name]), axis=1)

    # Recent Performance Metrics
    stats_to_average = ['HS', 'HST', 'HC', 'FTHG', 'HTHG', 'HF', 'HY', 'HR']
    for stat in stats_to_average:
        df[f'Home_{stat}_Avg'] = df.apply(lambda row: compute_recent_stats(row['HomeTeam'], df[:row.name], stat), axis=1)
        df[f'Away_{replace_second_h(stat)}_Avg'] = df.apply(lambda row: compute_recent_stats(row['AwayTeam'], df[:row.name], stat), axis=1)

    # Points Per Game Feature
    df['Home_PPG'] = df.apply(lambda row: compute_points_per_game(row['HomeTeam'], df[:row.name]), axis=1)
    df['Away_PPG'] = df.apply(lambda row: compute_points_per_game(row['AwayTeam'], df[:row.name]), axis=1)

    # Fill Default Values
    df.fillna({
        'GeneralHeadToHead': 0,
        'DirectionalHeadToHead': 0,
        'Home_HS_Avg': 14,
        'Away_AS_Avg': 11,
        'Home_HST_Avg': 6,
        'Away_AST_Avg': 5,
        'Home_HC_Avg': 6,
        'Away_AC_Avg': 5,
        'Home_FTHG_Avg': 2,
        'Away_FTAG_Avg': 1,
        'Home_HTHG_Avg': 1,
        'Away_HTAG_Avg': 1,
        'Home_HF_Avg': 11,
        'Away_AF_Avg': 12,
        'Home_HY_Avg': 1,
        'Away_AY_Avg': 2,
        'Home_HR_Avg': 0,
        'Away_AR_Avg': 0,
        'Home_PPG': 1.5,
        'Away_PPG': 1.2
    }, inplace=True)

    return df

In [7]:
df = pd.read_csv('../data/epl-training.csv')
df.drop(index=5700, inplace=True)
df = compute_features(df)

In [31]:
new_df = df.drop(columns=['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR'])

In [25]:
train_size = int(0.9*len(new_df))
X_train = new_df.drop(columns=['FTR']).iloc[:train_size]
y_train = new_df['FTR'].iloc[:train_size]
X_test = new_df.drop(columns=['FTR']).iloc[train_size:]
y_test = new_df['FTR'].iloc[train_size:]

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000, class_weight={'H': 1, 'D': 2, 'A': 1})
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6670281995661606
              precision    recall  f1-score   support

           A       0.73      0.63      0.68       287
           D       0.36      0.45      0.40       204
           H       0.81      0.80      0.80       431

    accuracy                           0.67       922
   macro avg       0.64      0.62      0.63       922
weighted avg       0.69      0.67      0.68       922



In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

target_mapping = {'H': 0, 'D': 1, 'A': 2}
y_train = y_train.map(target_mapping)
y_test = y_test.map(target_mapping)

y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

model = Sequential()

model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dropout(0.3))  # To prevent overfitting

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

class_weight = {0: 1, 1: 1, 2: 1}
history = model.fit(X_train_scaled, y_train_cat, validation_data=(X_test_scaled, y_test_cat), epochs=50, batch_size=32)

loss, accuracy = model.evaluate(X_test_scaled, y_test_cat)
print(f'Accuracy: {accuracy:.2f}')

y_pred_proba = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_proba, axis=1)

inverse_target_mapping = {0: 'H', 1: 'D', 2: 'A'}
y_pred_labels = [inverse_target_mapping[val] for val in y_pred]
y_test_labels = [inverse_target_mapping[val] for val in y_test]

print(classification_report(y_test_labels, y_pred_labels))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 806us/step - accuracy: 0.4774 - loss: 1.0561 - val_accuracy: 0.6475 - val_loss: 0.8080
Epoch 2/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step - accuracy: 0.5476 - loss: 0.9643 - val_accuracy: 0.6692 - val_loss: 0.7783
Epoch 3/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 496us/step - accuracy: 0.5498 - loss: 0.9478 - val_accuracy: 0.6844 - val_loss: 0.7619
Epoch 4/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489us/step - accuracy: 0.5579 - loss: 0.9330 - val_accuracy: 0.6920 - val_loss: 0.7690
Epoch 5/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us/step - accuracy: 0.5429 - loss: 0.9453 - val_accuracy: 0.6931 - val_loss: 0.7608
Epoch 6/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498us/step - accuracy: 0.5631 - loss: 0.9335 - val_accuracy: 0.6920 - val_loss: 0.7392
Epoch 7/50
[1m260/260[0m 

In [29]:
# Final Test Set
df_test = pd.read_csv('../data/epl-test.csv')

test_features = []

for index, row in df_test.iterrows():
    # Use the full training data (df) to compute the necessary stats for the current row
    matches = df

    # Head-to-Head Features
    general_head_to_head = compute_head_to_head(row, matches)
    directional_head_to_head = compute_directional_head_to_head(row, matches)

    # Recent Performance Metrics
    stats_to_average = ['HS', 'HST', 'HC', 'FTHG', 'HTHG', 'HF', 'HY', 'HR']
    home_stats = {}
    away_stats = {}
    for stat in stats_to_average:
        home_stats[f'Home_{stat}_Avg'] = compute_recent_stats(row['HomeTeam'], matches, stat)
        away_stats[f'Away_{replace_second_h(stat)}_Avg'] = compute_recent_stats(row['AwayTeam'], matches, stat)

    # Points Per Game Feature
    home_ppg = compute_points_per_game(row['HomeTeam'], matches)
    away_ppg = compute_points_per_game(row['AwayTeam'], matches)

    # Assuming `home_stats` and `away_stats` are dictionaries
    home_stats_keys = list(home_stats.keys())
    away_stats_keys = list(away_stats.keys())

    # Create interleaved keys
    interleaved_keys = [val for pair in zip(home_stats_keys, away_stats_keys) for val in pair]

    # Create interleaved dictionary
    interleaved_stats = {key: home_stats[key] if key in home_stats else away_stats[key] for key in interleaved_keys}

    # Rebuild features_dict with interleaved stats
    features_dict = {
        'GeneralHeadToHead': general_head_to_head,
        'DirectionalHeadToHead': directional_head_to_head,
        **interleaved_stats,
        'Home_PPG': home_ppg,
        'Away_PPG': away_ppg
    }

    test_features.append(features_dict)

# Convert test features into DataFrame
df_test_features = pd.DataFrame(test_features)

# Fill Default Values
df_test_features.fillna({
    'GeneralHeadToHead': 0,
    'DirectionalHeadToHead': 0,
    'Home_HS_Avg': 14,
    'Away_AS_Avg': 11,
    'Home_HST_Avg': 6,
    'Away_AST_Avg': 5,
    'Home_HC_Avg': 6,
    'Away_AC_Avg': 5,
    'Home_FTHG_Avg': 2,
    'Away_FTAG_Avg': 1,
    'Home_HTHG_Avg': 1,
    'Away_HTAG_Avg': 1,
    'Home_HF_Avg': 11,
    'Away_AF_Avg': 12,
    'Home_HY_Avg': 1,
    'Away_AY_Avg': 2,
    'Home_HR_Avg': 0,
    'Away_AR_Avg': 0,
    'Home_PPG': 1.5,
    'Away_PPG': 1.2
}, inplace=True)

X_test_new = df_test_features

X_test_scaled_new = scaler.transform(X_test_new)

y_pred_proba_new = model.predict(X_test_scaled_new)

y_pred_new = np.argmax(y_pred_proba_new, axis=1)

inverse_target_mapping = {0: 'H', 1: 'D', 2: 'A'}
y_pred_labels_new = [inverse_target_mapping[val] for val in y_pred_new]

df_test['Predicted_FTR'] = y_pred_labels_new

print(df_test[['Date', 'HomeTeam', 'AwayTeam', 'Predicted_FTR']])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
        Date           HomeTeam        AwayTeam Predicted_FTR
0  01-Feb-25    AFC Bournemouth       Liverpool             H
1  01-Feb-25            Arsenal        Man City             H
2  01-Feb-25          Brentford           Spurs             H
3  01-Feb-25            Chelsea        West Ham             H
4  01-Feb-25            Everton  Leicester City             H
5  01-Feb-25       Ipswich Town     Southampton             H
6  01-Feb-25            Man Utd  Crystal Palace             A
7  01-Feb-25          Newcastle          Fulham             H
8  01-Feb-25  Nottingham Forest        Brighton             H
9  01-Feb-25             Wolves     Aston Villa             H
