In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

## Add form features to the per-season datasets (2010/11–2020/21 and 2024/25)

In [11]:
# Relative paths to the per-season API-Football CSV files, 2010/11 through 2020/21.
# NOTE(review): none of these names is referenced elsewhere in the visible notebook;
# the form-feature cell rebuilds the same paths from its own season list and reads
# them as *input*, despite the "output" in these names -- confirm they are still needed.
csv_output_file_2010 = "../data/API-Football/API-Football-Data-2010:11.csv"
csv_output_file_2011 = "../data/API-Football/API-Football-Data-2011:12.csv"
csv_output_file_2012 = "../data/API-Football/API-Football-Data-2012:13.csv"
csv_output_file_2013 = "../data/API-Football/API-Football-Data-2013:14.csv"
csv_output_file_2014 = "../data/API-Football/API-Football-Data-2014:15.csv"
csv_output_file_2015 = "../data/API-Football/API-Football-Data-2015:16.csv"
csv_output_file_2016 = "../data/API-Football/API-Football-Data-2016:17.csv"
csv_output_file_2017 = "../data/API-Football/API-Football-Data-2017:18.csv"
csv_output_file_2018 = "../data/API-Football/API-Football-Data-2018:19.csv"
csv_output_file_2019 = "../data/API-Football/API-Football-Data-2019:20.csv"
csv_output_file_2020 = "../data/API-Football/API-Football-Data-2020:21.csv"

In [None]:
def preprocess_dates(df):
    """Return a chronologically ordered copy of ``df`` with a parsed ``Date`` column.

    The ``Date`` column is converted with ``pd.to_datetime``, rows are sorted by
    it in ascending order, and the index is renumbered from 0. The caller's
    frame is not modified.
    """
    with_parsed_dates = df.assign(Date=pd.to_datetime(df["Date"]))
    chronological = with_parsed_dates.sort_values("Date")
    return chronological.reset_index(drop=True)

def compute_team_form(df, team_name, current_date, current_match_id, window=7):
    """Summarise a team's form over its most recent played matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with the columns "Home Team", "Away Team", "Date",
        "Fixture ID", "Home Team Goals" and "Away Team Goals". "Date" must be
        comparable with ``current_date``.
    team_name
        Team whose form is computed (matched against both team columns).
    current_date
        Only matches strictly earlier than this date are considered.
    current_match_id
        Fixture ID of the match being described; it is always excluded.
    window : int, default 7
        Maximum number of most recent matches to look at.

    Returns
    -------
    dict
        Keys "Points", "Goals Scored", "Goals Conceded", "Wins", "Draws",
        "Losses" and "Form Score". When fewer than ``window`` matches are
        available the totals are scaled by ``window / matches_found``; when
        none are available every value is zero.
    """
    involves_team = (df["Home Team"] == team_name) | (df["Away Team"] == team_name)
    is_earlier = (df["Date"] < current_date) & (df["Fixture ID"] != current_match_id)
    # A fixture without a recorded score (e.g. postponed) carries no form
    # information. Left in, its NaN goals would fail both the ">" and "=="
    # comparisons, be tallied as a loss, and turn the goal totals into NaN.
    has_score = df["Home Team Goals"].notna() & df["Away Team Goals"].notna()

    past_matches = (
        df[involves_team & is_earlier & has_score]
        .sort_values("Date", ascending=False)
        .head(window)
    )

    match_count = len(past_matches)
    if match_count == 0:
        # No previously played match (e.g. first match of the season) — no form
        return {
            "Points": 0,
            "Goals Scored": 0,
            "Goals Conceded": 0,
            "Wins": 0,
            "Draws": 0,
            "Losses": 0,
            "Form Score": 0.0
        }

    # View every match from the team's side: pick the home or away goal column per row.
    is_home = past_matches["Home Team"] == team_name
    team_goals = past_matches["Home Team Goals"].where(is_home, past_matches["Away Team Goals"])
    opponent_goals = past_matches["Away Team Goals"].where(is_home, past_matches["Home Team Goals"])

    goals_scored = team_goals.sum()
    goals_conceded = opponent_goals.sum()
    wins = int((team_goals > opponent_goals).sum())
    draws = int((team_goals == opponent_goals).sum())
    # Scores are guaranteed present here, so anything not won or drawn was lost.
    losses = match_count - wins - draws
    points = 3 * wins + draws

    # Scale the totals up so a short history is expressed per full 'window' of matches
    scale = window / match_count
    form_score = (points + (goals_scored - goals_conceded) * 0.5) * scale

    return {
        "Points": points * scale,
        "Goals Scored": goals_scored * scale,
        "Goals Conceded": goals_conceded * scale,
        "Wins": wins * scale,
        "Draws": draws * scale,
        "Losses": losses * scale,
        "Form Score": form_score
    }

def add_form_features(df, window=5):
    """Append pre-match form columns for both teams to every match row.

    The frame is first passed through ``preprocess_dates``. For each match,
    ``compute_team_form`` is called once for the home team and once for the
    away team, and the resulting keys are stored as ``Home_<key>`` /
    ``Away_<key>`` columns with spaces replaced by underscores.

    Returns the date-sorted frame with the form columns appended on the right.
    """
    def _prefixed(side, form):
        # "Goals Scored" -> "Home_Goals_Scored" / "Away_Goals_Scored"
        return {f"{side}_{key.replace(' ', '_')}": value for key, value in form.items()}

    df = preprocess_dates(df)
    features = []

    for _, fixture in df.iterrows():
        match_date = fixture["Date"]
        fixture_id = fixture["Fixture ID"]

        home_form = compute_team_form(df, fixture["Home Team"], match_date, fixture_id, window=window)
        away_form = compute_team_form(df, fixture["Away Team"], match_date, fixture_id, window=window)

        # Home columns first, then away columns, one dict per match row.
        features.append({**_prefixed("Home", home_form), **_prefixed("Away", away_form)})

    form_df = pd.DataFrame(features)
    return pd.concat([df.reset_index(drop=True), form_df], axis=1)


In [13]:
# Seasons whose raw CSV gets form features added in this cell.
seasons = [
    "2010:11", "2011:12", "2012:13", "2013:14", "2014:15", "2015:16",
    "2016:17", "2017:18", "2018:19", "2019:20", "2020:21", "2024:25",
]

# NOTE(review): the input path uses "../data" while the output path uses "../Data";
# these are different directories on a case-sensitive filesystem -- confirm which
# spelling matches the repository.
for season in seasons:
    input_file = f"../data/API-Football/API-Football-Data-{season}.csv"
    output_file = f"../Data/API-Football_FORM/API-Football-Data-{season}_FORM.csv"

    # Read the raw season, append the form columns, and write the enriched copy.
    df = add_form_features(pd.read_csv(input_file))
    df.to_csv(output_file, index=False)

    print(f"Processed and saved: {output_file}")

Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2010:11_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2011:12_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2012:13_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2013:14_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2014:15_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2015:16_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2016:17_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2017:18_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2018:19_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2019:20_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2020:21_FORM.csv
Processed and saved: ../Data/API-Football_FORM/API-Football-Data-2024:25_FORM.csv


# Dataset Preprocessing

### Load and combine datasets

In [14]:
# One form-enriched CSV per season, oldest first.
form_csv_paths = [
    "../Data/API-Football_FORM/API-Football-Data-2010:11_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2011:12_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2012:13_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2013:14_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2014:15_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2015:16_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2016:17_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2017:18_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2018:19_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2019:20_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2020:21_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2021:22_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2022:23_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2023:24_FORM.csv",
    "../Data/API-Football_FORM/API-Football-Data-2024:25_FORM.csv",
]

season_frames = [pd.read_csv(csv_path) for csv_path in form_csv_paths]

# Keep the per-season names available for any cell that refers to them directly.
(
    df_2010, df_2011, df_2012, df_2013, df_2014,
    df_2015, df_2016, df_2017, df_2018, df_2019,
    df_2020, df_2021, df_2022, df_2023, df_2024,
) = season_frames

# Stack all seasons into one frame with a fresh 0..n-1 index.
df_all = pd.concat(season_frames, ignore_index=True)

In [15]:
# Convert percentage strings such as "55%" into fractions (0.55) in every column
# whose name contains "Possession".
# The guard is "not numeric" rather than dtype == 'object': text columns can also be
# stored in pandas' dedicated string dtypes, which an equality test against 'object'
# would skip and leave unconverted. Already-numeric columns are left untouched, so
# re-running the cell is harmless.
for col in df_all.columns:
    if "Possession" in col and not pd.api.types.is_numeric_dtype(df_all[col]):
        df_all[col] = df_all[col].str.rstrip('%').astype(float) / 100.0

### Label Targets

In [16]:
def encode_result(row):
    """Map a match row to its outcome class.

    Returns 0 when the home team scored more goals, 1 when both teams scored
    the same number, and 2 otherwise.

    NOTE(review): a row with a missing (NaN) goal value fails both comparisons
    and therefore also yields 2 -- confirm such rows cannot reach this function.
    """
    home_goals = row["Home Team Goals"]
    away_goals = row["Away Team Goals"]

    if home_goals > away_goals:
        return 0
    if home_goals == away_goals:
        return 1
    return 2

# Add the target column: one outcome class per match row, as returned by encode_result.
df_all["MatchResult"] = df_all.apply(encode_result, axis=1)

### Feature Selection

In [17]:
# Team identifiers (encoded in the cells below).
team_cols = ["Home Team", "Away Team"]

# Pre-match form columns for the home side.
home_form_cols = [
    "Home_Points", "Home_Goals_Scored", "Home_Goals_Conceded",
    "Home_Wins", "Home_Draws", "Home_Losses", "Home_Form_Score",
]

# Pre-match form columns for the away side.
away_form_cols = [
    "Away_Points", "Away_Goals_Scored", "Away_Goals_Conceded",
    "Away_Wins", "Away_Draws", "Away_Losses", "Away_Form_Score",
]

# Per-match statistics, listed as home/away pairs.
match_stat_cols = [
    "Home Shots on Goal", "Away Shots on Goal",
    "Home Corner Kicks", "Away Corner Kicks",
    "Home Ball Possession", "Away Ball Possession",
    "Home Yellow Cards", "Away Yellow Cards",
    "Home Red Cards", "Away Red Cards",
    "Home Offsides", "Away Offsides",
    "Home expected_goals", "Away expected_goals",
]

included_cols = [*team_cols, *home_form_cols, *away_form_cols, *match_stat_cols]

# Independent copy so later encoding steps never write back into df_all.
df_features = df_all.loc[:, included_cols].copy()

### Encoding

In [18]:
#===== LABEL ENCODING =====#

label_encoded = df_features.copy()
label_encoders = {}
# Create the directory the encoders are written to below; without it the
# open(..., "wb") call raises FileNotFoundError on a fresh checkout.
os.makedirs("Encoders_final", exist_ok=True)

# A separate encoder is fitted per column, so each column gets its own
# team -> integer mapping.
# NOTE(review): the two mappings only coincide if both columns contain exactly
# the same set of teams -- confirm, or fit one shared encoder.
for col in ["Home Team", "Away Team"]:
    le = LabelEncoder()
    label_encoded[col + "_LabelEnc"] = le.fit_transform(label_encoded[col])
    label_encoders[col] = le

    # Save LabelEncoder
    with open(f"Encoders_final/{col.replace(' ', '_')}_label_encoder.pkl", "wb") as f:
        pickle.dump(le, f)

# Drop the original team-name columns now that their encoded versions exist
label_encoded = label_encoded.drop(columns=["Home Team", "Away Team"])

In [19]:
#===== ONE-HOT ENCODING =====#

# handle_unknown='ignore' is passed so that transforming a team name not seen
# during fit does not raise; sparse_output=False returns a dense array that
# can be wrapped in a DataFrame directly.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = onehot_encoder.fit_transform(df_features[["Home Team", "Away Team"]])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehot_encoder.get_feature_names_out(["Home Team", "Away Team"]),
    index=df_features.index
)

# Combine one-hot with numeric stats
onehot_encoded = pd.concat([df_features.drop(columns=["Home Team", "Away Team"]), encoded_df], axis=1)

# Save OneHotEncoder (make sure the target directory exists so this cell does
# not depend on another cell having created it)
os.makedirs("Encoders_final", exist_ok=True)
with open("Encoders_final/onehot_encoder.pkl", "wb") as f:
    pickle.dump(onehot_encoder, f)

### Final Setup

In [20]:
# Make sure the export directory exists; to_csv does not create missing folders.
os.makedirs("../Data/FINAL_FULL", exist_ok=True)

# Full combined dataset (all original columns plus MatchResult).
df_all.to_csv("../Data/FINAL_FULL/PL_dataset_2010-2025.csv", index=False)

# Feature tables in the two team encodings.
# NOTE(review): both are derived from df_features, whose column list does not
# include MatchResult, so these "Training" files carry no target column --
# confirm the label is joined back in downstream.
label_encoded.to_csv("../Data/FINAL_FULL/Football-Training-2010_2025_LABELENC.csv", index=False)
onehot_encoded.to_csv("../Data/FINAL_FULL/Football-Training-2010_2025_ONEHOT.csv", index=False)