# Predicting Match Result for the Upcoming Gameweek

The aim of the machine learning models is now to predict the results of the upcoming matches. Because these matches have not been played yet, no match statistics exist for them. Each team's recent form, together with its average statistics over the season so far, is therefore used as the key indicator for predicting performance.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from logistic_regression_predict import predict_logreg

## Computing form and season averages

In [2]:
# Read the 2024/25 match table and turn the "Date" column into datetimes.
form_csv_path = "../Data/API-Football_FORM/API-Football-Data-2024:25_FORM.csv"
df = pd.read_csv(form_csv_path)
df["Date"] = pd.to_datetime(df["Date"])

In [3]:
# Convert percentage strings (e.g. "55%") to fractions (0.55) in every
# object-typed column whose name mentions "Possession".
possession_cols = [
    name for name in df.columns
    if "Possession" in name and df[name].dtype == object
]
for name in possession_cols:
    df[name] = df[name].str.rstrip('%').astype(float) / 100

In [4]:
def get_latest_form(df):
    """Return each team's form figures taken from its last row in ``df``.

    Rows are scanned from the bottom of the frame upwards. The first row in
    which a team is seen (as home or away side) supplies that team's values,
    read from the ``Home_*`` or ``Away_*`` columns matching the side it played
    on. Scanning stops as soon as every team has been found.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with "Home Team" and "Away Team" columns plus
        ``Home_<stat>`` / ``Away_<stat>`` columns for each form statistic
        listed in ``form_cols`` below. The function does not sort the rows,
        so the frame must already be ordered oldest to newest.

    Returns
    -------
    pandas.DataFrame
        One row per team, indexed by "Team", with one column per form
        statistic. Teams appear in the order they were found (most recent
        match first). An empty input gives an empty frame with the same
        columns.
    """
    form_cols = [
        "Points", "Goals_Scored", "Goals_Conceded",
        "Wins", "Draws", "Losses", "Form_Score"
    ]

    # Loop-invariant, so compute it once instead of on every scanned row.
    team_count = len(pd.unique(df[["Home Team", "Away Team"]].values.ravel()))

    latest_form = {}

    # Positional access keeps the scan correct even if index labels repeat.
    for pos in range(len(df) - 1, -1, -1):
        row = df.iloc[pos]
        for side in ("Home", "Away"):
            team = row[f"{side} Team"]
            if team not in latest_form:
                latest_form[team] = {col: row[f"{side}_{col}"] for col in form_cols}

        if len(latest_form) == team_count:
            break

    return pd.DataFrame(
        list(latest_form.values()),
        index=pd.Index(list(latest_form), name="Team"),
        columns=form_cols,
    )

In [5]:
# Latest form figures per team (one row per team, indexed by team name).
form_df = get_latest_form(df)
print(form_df)

                   Points  Goals_Scored  Goals_Conceded  Wins  Draws  Losses  \
Team                                                                           
Leicester             0.0           0.0            13.0   0.0    0.0     5.0   
Manchester United     5.0           6.0             8.0   1.0    2.0     2.0   
Fulham                9.0           7.0             7.0   3.0    0.0     2.0   
Tottenham            10.0           9.0             4.0   3.0    1.0     1.0   
Arsenal               8.0           8.0             3.0   2.0    2.0     1.0   
Chelsea               9.0           8.0             6.0   3.0    0.0     2.0   
Bournemouth           4.0           6.0             8.0   1.0    1.0     3.0   
Brentford             7.0           6.0             4.0   2.0    1.0     2.0   
Ipswich               1.0           5.0            11.0   0.0    1.0     4.0   
Nottingham Forest     7.0          12.0             6.0   2.0    1.0     2.0   
Manchester City       6.0           6.0 

In [6]:
def compute_season_averages(df):
    """Average each team's match statistics over all of its rows in ``df``.

    For every team, the values it recorded as the home side (``Home <stat>``)
    and as the away side (``Away <stat>``) are pooled and averaged. A
    statistic is only included when both its home and away columns exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with "Home Team" and "Away Team" columns and numeric
        ``Home <stat>`` / ``Away <stat>`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per team, indexed by "Team", with one column per available
        statistic, in the order of ``stat_fields``. Teams are listed in order
        of first appearance. An empty input gives an empty frame instead of
        raising.
    """
    stat_fields = [
        "Shots on Goal", "Corner Kicks", "Ball Possession",
        "Yellow Cards", "Red Cards", "Offsides", "expected_goals"
    ]
    # Column availability does not depend on the team, so check it once.
    available = [
        field for field in stat_fields
        if f"Home {field}" in df.columns and f"Away {field}" in df.columns
    ]
    teams = pd.unique(df[["Home Team", "Away Team"]].values.ravel())

    # One list of per-team means per statistic; the team label lives in the
    # index rather than being mixed into the numeric values.
    averages = {field: [] for field in available}
    for team in teams:
        home_matches = df[df["Home Team"] == team]
        away_matches = df[df["Away Team"] == team]
        for field in available:
            pooled = pd.concat(
                [home_matches[f"Home {field}"], away_matches[f"Away {field}"]],
                ignore_index=True,
            )
            averages[field].append(pooled.mean())

    return pd.DataFrame(averages, index=pd.Index(teams, name="Team"))

In [7]:
# Per-team season averages of the match statistics, indexed by team name.
avg_df = compute_season_averages(df)

In [8]:
# Combine form and season averages on the shared team index (inner join, so
# only teams present in both frames are kept), then save one CSV per gameweek.
team_features = pd.merge(form_df, avg_df, left_index=True, right_index=True)

features_dir = "Prediction_Features"
os.makedirs(features_dir, exist_ok=True)

current_gw = "30"
features_csv = f"Prediction_Features/Team_Form_And_Averages_2024_25_gw_{current_gw}.csv"
team_features.to_csv(features_csv)

## Predicting result

In [9]:
# Load the fitted one-hot encoder and the trained logistic regression model.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
encoder_path = "../Pre-Processing/Encoders_final/onehot_encoder.pkl"
model_path = "../Models/saved_models_result/logistic_regression_model_l2.pkl"

with open(encoder_path, "rb") as encoder_file:
    ohe = pickle.load(encoder_file)

with open(model_path, "rb") as model_file:
    logreg_model = pickle.load(model_file)

In [11]:
# Column names when training the model.
# Form columns use an underscore prefix ("Home_Points"); match-statistic
# columns use a space ("Home Shots on Goal").
_form_metrics = [
    "Points", "Goals_Scored", "Goals_Conceded",
    "Wins", "Draws", "Losses", "Form_Score",
]
_match_stats = [
    "Shots on Goal", "Corner Kicks", "Ball Possession",
    "Yellow Cards", "Red Cards", "Offsides", "expected_goals",
]

included_cols = (
    ["Home Team", "Away Team"]
    + [f"Home_{metric}" for metric in _form_metrics]
    + [f"Away_{metric}" for metric in _form_metrics]
    # Match statistics are listed pairwise: home column, then away column.
    + [f"{side} {stat}" for stat in _match_stats for side in ("Home", "Away")]
)

# Per-team feature name -> model column name when the team plays at home.
home_mapping = {
    **{metric: f"Home_{metric}" for metric in _form_metrics},
    **{stat: f"Home {stat}" for stat in _match_stats},
}

# Per-team feature name -> model column name when the team plays away.
away_mapping = {
    **{metric: f"Away_{metric}" for metric in _form_metrics},
    **{stat: f"Away {stat}" for stat in _match_stats},
}

In [12]:
# Reload the saved per-team features for the chosen gameweek, indexed by team.
current_gw = "30"
features_csv = f"Prediction_Features/Team_Form_And_Averages_2024_25_gw_{current_gw}.csv"
team_df = pd.read_csv(features_csv, index_col="Team")
print(team_df)

                   Points  Goals_Scored  Goals_Conceded  Wins  Draws  Losses  \
Team                                                                           
Leicester             0.0           0.0            13.0   0.0    0.0     5.0   
Manchester United     5.0           6.0             8.0   1.0    2.0     2.0   
Fulham                9.0           7.0             7.0   3.0    0.0     2.0   
Tottenham            10.0           9.0             4.0   3.0    1.0     1.0   
Arsenal               8.0           8.0             3.0   2.0    2.0     1.0   
Chelsea               9.0           8.0             6.0   3.0    0.0     2.0   
Bournemouth           4.0           6.0             8.0   1.0    1.0     3.0   
Brentford             7.0           6.0             4.0   2.0    1.0     2.0   
Ipswich               1.0           5.0            11.0   0.0    1.0     4.0   
Nottingham Forest     7.0          12.0             6.0   2.0    1.0     2.0   
Manchester City       6.0           6.0 

In [13]:
# Fixture to predict; the feature row starts with just the two team names.
home_team = "Nottingham Forest"
away_team = "Manchester United"

row = dict([("Home Team", home_team), ("Away Team", away_team)])

In [14]:
# Look up each side's features by team name (the index of team_df).
home_row = team_df.loc[home_team]
away_row = team_df.loc[away_team]

# Copy the features into the fixture row under the model's column names:
# all home columns first, then all away columns.
row.update({dst: home_row[src] for src, dst in home_mapping.items()})
row.update({dst: away_row[src] for src, dst in away_mapping.items()})

print(row)

{'Home Team': 'Nottingham Forest', 'Away Team': 'Manchester United', 'Home_Points': 7.0, 'Home_Goals_Scored': 12.0, 'Home_Goals_Conceded': 6.0, 'Home_Wins': 2.0, 'Home_Draws': 1.0, 'Home_Losses': 2.0, 'Home_Form_Score': 10.0, 'Home Shots on Goal': 4.586206896551724, 'Home Corner Kicks': 4.275862068965517, 'Home Ball Possession': 0.3962068965517241, 'Home Yellow Cards': 2.206896551724138, 'Home Red Cards': 0.0689655172413793, 'Home Offsides': 2.310344827586207, 'Home expected_goals': 1.2879310344827586, 'Away_Points': 5.0, 'Away_Goals_Scored': 6.0, 'Away_Goals_Conceded': 8.0, 'Away_Wins': 1.0, 'Away_Draws': 2.0, 'Away_Losses': 2.0, 'Away_Form_Score': 4.0, 'Away Shots on Goal': 4.551724137931035, 'Away Corner Kicks': 4.9655172413793105, 'Away Ball Possession': 0.526551724137931, 'Away Yellow Cards': 2.206896551724138, 'Away Red Cards': 0.1034482758620689, 'Away Offsides': 1.896551724137931, 'Away expected_goals': 1.3510344827586207}


In [15]:
# Wrap the single fixture's feature dict in a one-row DataFrame.
df_input = pd.DataFrame([row])

In [16]:
categorical_cols = ["Home Team", "Away Team"]

# Numeric features pass through unchanged.
df_num = df_input.drop(columns=categorical_cols)

# One-hot encode the two team-name columns with the loaded encoder.
# NOTE(review): assumes ohe.transform returns a dense array - confirm the
# encoder was not fitted with sparse output.
df_cat = df_input[categorical_cols]
encoded_cols = ohe.get_feature_names_out(categorical_cols)
arr_encoded = ohe.transform(df_cat)
df_cat_encoded = pd.DataFrame(
    data=arr_encoded,
    index=df_input.index,
    columns=encoded_cols,
)

# Reorder to the model's feature names; any column the model expects but
# this row lacks is filled with 0.
model_cols = logreg_model.feature_names_in_
df_encoded = pd.concat([df_num, df_cat_encoded], axis=1).reindex(
    columns=model_cols, fill_value=0
)

In [17]:
# Predicted class and per-class probabilities for the single fixture.
# NOTE(review): probability columns should follow logreg_model.classes_ -
# confirm the order before labelling them home/draw/away.
y_pred = logreg_model.predict(df_encoded)
y_probs = logreg_model.predict_proba(df_encoded)

## Testing the prediction functions

In [18]:
#===== PREDICTING MATCH RESULT FOR PL GW 30 =====#
# predict_logreg (imported from logistic_regression_predict) returns four
# values, unpacked here as a printable message and the home / draw / away
# probabilities.
result_message, home_prob, draw_prob, away_prob = predict_logreg("Nottingham Forest", "Manchester United", 30)

print(result_message)

Match: Nottingham Forest vs Manchester United
Prediction: Draw


In [22]:
def predict_whole_gameweek(gw: int, matches: list):
    """Predict and print the result of every fixture in one gameweek.

    Parameters
    ----------
    gw : int
        Gameweek number, passed through to ``predict_logreg``.
    matches : list
        ``(home_team, away_team)`` pairs, one per fixture.

    Returns
    -------
    list of tuple
        One ``(home_team, away_team, result_message, home_prob, draw_prob,
        away_prob)`` tuple per fixture, in the order given in ``matches``.
    """
    all_predictions = []

    for (home_team, away_team) in matches:
        nice_prediction, home_prob, draw_prob, away_prob = predict_logreg(home_team, away_team, gw)

        # The message from predict_logreg already contains its own
        # "Match: ..." and "Prediction: ..." lines, so print it as-is.
        # Adding a second header produced "Prediction: Match: ...".
        print(str(nice_prediction).rstrip())
        print(f"Probabilities -> Home: {home_prob:.3f}, Draw: {draw_prob:.3f}, Away: {away_prob:.3f}")
        print("#==========#")

        all_predictions.append(
            (home_team, away_team, nice_prediction, home_prob, draw_prob, away_prob)
        )

    return all_predictions

In [24]:
# Premier League gameweek 30 fixtures as (home team, away team) pairs.
matches_gw30 = [
    ("Arsenal", "Fulham"),
    ("Wolves", "West Ham"),
    ("Nottingham Forest", "Manchester United"),
    ("Bournemouth", "Ipswich"),
    ("Brighton", "Aston Villa"),
    ("Manchester City", "Leicester"),
    ("Newcastle", "Brentford"),
    ("Southampton", "Crystal Palace"),
    ("Liverpool", "Everton"),
    ("Chelsea", "Tottenham"),
]

# Predict every fixture and keep the per-match results for later use.
predictions_gw30 = predict_whole_gameweek(gw=30, matches=matches_gw30)

Match: Arsenal vs Fulham
Prediction: Match: Arsenal vs Fulham
Prediction: Arsenal Win
Probabilities -> Home: 0.622, Draw: 0.288, Away: 0.089
Match: Wolves vs West Ham
Prediction: Match: Wolves vs West Ham
Prediction: Wolves Win
Probabilities -> Home: 0.425, Draw: 0.273, Away: 0.302
Match: Nottingham Forest vs Manchester United
Prediction: Match: Nottingham Forest vs Manchester United
Prediction: Draw
Probabilities -> Home: 0.286, Draw: 0.415, Away: 0.299
Match: Bournemouth vs Ipswich
Prediction: Match: Bournemouth vs Ipswich
Prediction: Bournemouth Win
Probabilities -> Home: 0.591, Draw: 0.320, Away: 0.089
Match: Brighton vs Aston Villa
Prediction: Match: Brighton vs Aston Villa
Prediction: Brighton Win
Probabilities -> Home: 0.528, Draw: 0.333, Away: 0.139
Match: Manchester City vs Leicester
Prediction: Match: Manchester City vs Leicester
Prediction: Manchester City Win
Probabilities -> Home: 0.899, Draw: 0.090, Away: 0.010
Match: Newcastle vs Brentford
Prediction: Match: Newcastle vs