In [22]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib
import numpy as np
import pickle

# Form Calculation

In [23]:
# Path to the 2024/25 season file used for the per-team form table below.
# The directory is spelled "Data" (capital D) like every other path in this
# notebook; the previous lowercase "data" spelling only resolves on
# case-insensitive file systems.
csv_file_recent = "../Data/PL/2024:25.csv"

df = pd.read_csv(csv_file_recent)

### Function with Team Input

In [24]:
def get_recent_form(team_name, df, n=5, points_weight=1.0, goal_diff_weight=0.5):
    """
    Summarise a team's form over its last n matches.

    The last n rows of df that involve the team are used, so df is expected
    to be ordered oldest-to-newest.

    Parameters:
    - team_name (str): Name of the team as it appears in HomeTeam / AwayTeam.
    - df (DataFrame): Match data with columns HomeTeam, AwayTeam, FTHG, FTAG and FTR.
    - n (int): Maximum number of most recent matches to consider.
    - points_weight (float): Multiplier applied to the points total in the form score.
    - goal_diff_weight (float): Multiplier applied to the goal difference in the form score.

    Returns:
    - dict: Team name, number of matches considered, points, goals scored,
      goals conceded, goal difference, wins, draws, losses and the form score
      (points * points_weight + goal difference * goal_diff_weight).
    """
    involves_team = (df['HomeTeam'] == team_name) | (df['AwayTeam'] == team_name)
    recent = df[involves_team].tail(n)

    points = goals_scored = goals_conceded = 0
    wins = draws = losses = 0

    for _, match in recent.iterrows():
        # Read the score from the team's point of view; FTR equals 'H' for a
        # home win and 'A' for an away win.
        if match['HomeTeam'] == team_name:
            scored, conceded, winning_code = match['FTHG'], match['FTAG'], 'H'
        else:
            scored, conceded, winning_code = match['FTAG'], match['FTHG'], 'A'

        goals_scored += scored
        goals_conceded += conceded

        outcome = match['FTR']
        if outcome == winning_code:
            points += 3
            wins += 1
        elif outcome == 'D':
            points += 1
            draws += 1
        else:
            # Any other result code is counted as a defeat.
            losses += 1

    goal_difference = goals_scored - goals_conceded
    form_score = (points * points_weight) + (goal_difference * goal_diff_weight)

    return {
        'Team': team_name,
        'Matches Considered': len(recent),
        'Points': points,
        'Goals Scored': goals_scored,
        'Goals Conceded': goals_conceded,
        'Goal Difference': goal_difference,
        'Wins': wins,
        'Draws': draws,
        'Losses': losses,
        'Form Score': form_score
    }

In [25]:
# Spot check: form of a single club over its five most recent matches.
example_team = "Liverpool"
recent_form = get_recent_form(team_name=example_team, df=df, n=5)

recent_form

{'Team': 'Liverpool',
 'Matches Considered': 5,
 'Points': 13,
 'Goals Scored': 11,
 'Goals Conceded': 4,
 'Goal Difference': 7,
 'Wins': 4,
 'Draws': 1,
 'Losses': 0,
 'Form Score': 16.5}

### Calculate Form for All Teams
Calculate the form of every team over its previous 5 games. The results are then stored in `team_recent_form.csv`.

In [26]:
# Every club that appears as either the home or the away side, in alphabetical order.
all_teams = sorted(set(df['HomeTeam']).union(df['AwayTeam']))
print(all_teams)

['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Ipswich', 'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle', "Nott'm Forest", 'Southampton', 'Tottenham', 'West Ham', 'Wolves']


In [27]:
# One form summary (last 5 matches) per club, collected into a single table.
form_data = [get_recent_form(team, df, n=5) for team in all_teams]
form_df = pd.DataFrame(form_data)

# Written relative to the current working directory, without the index column.
form_file_path = "team_recent_form.csv"
form_df.to_csv(form_file_path, index=False)

### Calculate form for all fixtures in dataset

In [28]:
# API-Football exports, one CSV per season, all in the same directory.
_api_football_dir = "../Data/API-Football"
csv_file_2021 = _api_football_dir + "/API-Football-Data-2021:22.csv"
csv_file_2022 = _api_football_dir + "/API-Football-Data-2022:23.csv"
csv_file_2023 = _api_football_dir + "/API-Football-Data-2023:24.csv"

In [None]:
def preprocess_dates(df):
    """
    Return a chronologically ordered copy of df with "Date" parsed to datetimes.

    Parameters:
    - df (DataFrame): Table with a "Date" column. Not modified.

    Returns:
    - DataFrame: New frame sorted by "Date" with a fresh 0..n-1 index.
    """
    parsed = df.assign(Date=pd.to_datetime(df["Date"]))
    return parsed.sort_values("Date").reset_index(drop=True)

def compute_team_form(df, team_name, current_date, current_match_id, window=5):
    """
    Compute a team's form going into a fixture, from its most recent earlier matches.

    NOTE: a later cell of this notebook defines another compute_team_form for
    a different column layout; whichever definition ran last is the active one.

    Parameters:
    - df (DataFrame): Matches with columns "Home Team", "Away Team", "Date",
      "Fixture ID", "Home Team Goals" and "Away Team Goals".
    - team_name (str): Team to evaluate.
    - current_date: Only matches strictly before this date are used.
    - current_match_id: Fixture ID of the current match, excluded explicitly.
    - window (int): Maximum number of most recent matches to use.

    Returns:
    - dict: "Points", "Goals Scored", "Goals Conceded", "Wins", "Draws",
      "Losses" and "Form Score". All zero when the team has no earlier match;
      otherwise every value is multiplied by window / matches_found, so a
      shorter history is extrapolated to a full window.
    """
    involves_team = (df["Home Team"] == team_name) | (df["Away Team"] == team_name)
    is_earlier = df["Date"] < current_date
    is_other_fixture = df["Fixture ID"] != current_match_id

    history = (
        df[involves_team & is_earlier & is_other_fixture]
        .sort_values("Date", ascending=False)
        .head(window)
    )

    played = len(history)
    if played == 0:
        # No earlier match for this team: report neutral form.
        return {
            "Points": 0,
            "Goals Scored": 0,
            "Goals Conceded": 0,
            "Wins": 0,
            "Draws": 0,
            "Losses": 0,
            "Form Score": 0.0
        }

    points = wins = draws = losses = 0
    goals_for = goals_against = 0

    for _, fixture in history.iterrows():
        # Read the score from the team's point of view.
        if fixture["Home Team"] == team_name:
            scored, conceded = fixture["Home Team Goals"], fixture["Away Team Goals"]
        else:
            scored, conceded = fixture["Away Team Goals"], fixture["Home Team Goals"]

        goals_for += scored
        goals_against += conceded

        if scored > conceded:
            wins += 1
            points += 3
        elif scored == conceded:
            draws += 1
            points += 1
        else:
            losses += 1

    # Extrapolate a partial history to the size of a full window.
    factor = window / played

    return {
        "Points": points * factor,
        "Goals Scored": goals_for * factor,
        "Goals Conceded": goals_against * factor,
        "Wins": wins * factor,
        "Draws": draws * factor,
        "Losses": losses * factor,
        "Form Score": (points + (goals_for - goals_against) * 0.5) * factor
    }

def add_form_features(df, window=7):
    """
    Append pre-match form columns for both sides of every fixture.

    Dates are parsed and rows sorted chronologically via preprocess_dates,
    then compute_team_form is evaluated once per side per fixture with the
    given window (this function's default of 7 is always passed through).

    NOTE: a later cell of this notebook defines another add_form_features for
    a different column layout; whichever definition ran last is the active one.

    Parameters:
    - df (DataFrame): Matches with "Home Team", "Away Team", "Date" and
      "Fixture ID" plus the columns compute_team_form needs.
    - window (int): Number of previous matches forwarded to compute_team_form.

    Returns:
    - DataFrame: The date-sorted matches with "Home_*" and "Away_*" columns
      appended (metric names with spaces replaced by underscores).
    """
    df = preprocess_dates(df)

    def _prefixed(side, form):
        # "Goals Scored" -> "Home_Goals_Scored" / "Away_Goals_Scored"
        return {f"{side}_{key.replace(' ', '_')}": value for key, value in form.items()}

    features = []
    for _, fixture in df.iterrows():
        match_date = fixture["Date"]
        fixture_id = fixture["Fixture ID"]

        home_form = compute_team_form(df, fixture["Home Team"], match_date, fixture_id, window=window)
        away_form = compute_team_form(df, fixture["Away Team"], match_date, fixture_id, window=window)

        features.append({**_prefixed("Home", home_form), **_prefixed("Away", away_form)})

    form_df = pd.DataFrame(features)
    return pd.concat([df.reset_index(drop=True), form_df], axis=1)



In [30]:
# Historical seasons: load each raw export and attach the pre-match form features.
df_2021 = add_form_features(pd.read_csv(csv_file_2021))
df_2022 = add_form_features(pd.read_csv(csv_file_2022))
df_2023 = add_form_features(pd.read_csv(csv_file_2023))

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing (same guard as the encoder cell).
form_output_dir = "../Data/API-Football_FORM"
os.makedirs(form_output_dir, exist_ok=True)

df_2021.to_csv(f"{form_output_dir}/API-Football-Data-2021:22_FORM.csv", index=False)
df_2022.to_csv(f"{form_output_dir}/API-Football-Data-2022:23_FORM.csv", index=False)
df_2023.to_csv(f"{form_output_dir}/API-Football-Data-2023:24_FORM.csv", index=False)


## Calculate form for the current season

In [31]:
df_2024 = pd.read_csv("../Data/PL/2024:25.csv")

In [32]:
def clean_dataset(df):
    """
    Strip betting-odds columns, parse dates and attach a fixture identifier.

    Parameters:
    - df (DataFrame): Raw season table with a "Date" column holding day-first
      date strings. Not modified.

    Returns:
    - DataFrame: New frame without the columns whose names start with a known
      betting prefix, with "Date" parsed to datetimes (unparseable values
      become NaT) and a "Fixture ID" column that is unique per row.
    """
    # A tuple so it can be handed to str.startswith directly.
    betting_prefixes = (
        "B365", "BF", "BS", "BW", "GB", "IW", "LB", "PS", "SO", "SB", "SJ", "SY", "VC",
        "WH", "1XB", "Max", "Avg", "BFE", "P>", "P<", "PC", "AH", "PA"
    )
    columns_to_drop = [col for col in df.columns if col.startswith(betting_prefixes)]
    df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

    # Parse date
    df_cleaned["Date"] = pd.to_datetime(df_cleaned["Date"], dayfirst=True, errors="coerce")

    # Synthetic fixture IDs. They must be unique because the form calculation
    # uses the ID to exclude the current match; randomly drawn IDs can collide
    # (silently excluding an unrelated fixture) and change on every run.
    # Sequential IDs are unique and reproducible.
    first_id = 1_000_000
    df_cleaned["Fixture ID"] = np.arange(first_id, first_id + len(df_cleaned))

    return df_cleaned


In [None]:
# Step 2: Form of one team going into one fixture
def compute_team_form(df, team_name, current_date, current_match_id, window=7, points_weight=1.0, goal_diff_weight=0.5):
    """
    Compute a team's form going into a fixture, from its most recent earlier matches.

    Parameters:
    - df (DataFrame): Matches with columns HomeTeam, AwayTeam, Date,
      Fixture ID, FTHG, FTAG and FTR.
    - team_name (str): Team to evaluate.
    - current_date: Only matches strictly before this date are used.
    - current_match_id: Fixture ID of the current match, excluded explicitly.
    - window (int): Number of most recent matches to use.
    - points_weight (float): Multiplier applied to points in the form score.
    - goal_diff_weight (float): Multiplier applied to goal difference in the form score.

    Returns:
    - dict: 'Points', 'Goals Scored', 'Goals Conceded', 'Wins', 'Draws',
      'Losses' and 'Form Score'. All zero when the team has no earlier match.
      With fewer earlier matches than the window, every value is multiplied
      by window / matches_found; otherwise values are returned unscaled.
    """
    involves_team = (df['HomeTeam'] == team_name) | (df['AwayTeam'] == team_name)
    is_earlier = df['Date'] < current_date
    is_other_fixture = df['Fixture ID'] != current_match_id

    history = df[involves_team & is_earlier & is_other_fixture].sort_values("Date", ascending=True)
    played = len(history)

    if played < window and played == 0:
        # No earlier match for this team: report neutral form.
        return {
            'Points': 0, 'Goals Scored': 0, 'Goals Conceded': 0,
            'Wins': 0, 'Draws': 0, 'Losses': 0, 'Form Score': 0.0
        }

    # A short history is extrapolated to a full window; a full one is left as is.
    scale = window / played if played < window else 1
    # tail(window) returns the whole history when it is shorter than the window.
    recent = history.tail(window)

    points = wins = draws = losses = 0
    goals_scored = goals_conceded = 0

    for _, match in recent.iterrows():
        # Read the score from the team's point of view; FTR equals 'H' for a
        # home win and 'A' for an away win.
        if match['HomeTeam'] == team_name:
            scored, conceded, winning_code = match['FTHG'], match['FTAG'], 'H'
        else:
            scored, conceded, winning_code = match['FTAG'], match['FTHG'], 'A'

        goals_scored += scored
        goals_conceded += conceded

        outcome = match['FTR']
        if outcome == winning_code:
            points += 3
            wins += 1
        elif outcome == 'D':
            points += 1
            draws += 1
        else:
            # Any other result code is counted as a defeat.
            losses += 1

    goal_diff = goals_scored - goals_conceded
    form_score = (points * points_weight + goal_diff * goal_diff_weight) * scale

    return {
        'Points': points * scale,
        'Goals Scored': goals_scored * scale,
        'Goals Conceded': goals_conceded * scale,
        'Wins': wins * scale,
        'Draws': draws * scale,
        'Losses': losses * scale,
        'Form Score': form_score
    }

# Step 3: Attach form features to every fixture
def add_form_features(df, window=5):
    """
    Append pre-match form columns for both sides of every fixture.

    Rows are sorted by "Date" (which must already be parsed), then
    compute_team_form is evaluated once per side per fixture with the given
    window (this function's default of 5 is always passed through).

    Parameters:
    - df (DataFrame): Matches with HomeTeam, AwayTeam, Date and "Fixture ID"
      plus the columns compute_team_form needs.
    - window (int): Number of previous matches forwarded to compute_team_form.

    Returns:
    - DataFrame: The date-sorted matches with "Home_*" and "Away_*" columns
      appended (metric names with spaces replaced by underscores).
    """
    df = df.sort_values("Date").reset_index(drop=True)

    def _prefixed(side, form):
        # "Goals Scored" -> "Home_Goals_Scored" / "Away_Goals_Scored"
        return {f"{side}_{key.replace(' ', '_')}": value for key, value in form.items()}

    features = []
    for _, fixture in df.iterrows():
        match_date = fixture["Date"]
        fixture_id = fixture["Fixture ID"]

        home_form = compute_team_form(df, fixture["HomeTeam"], match_date, fixture_id, window=window)
        away_form = compute_team_form(df, fixture["AwayTeam"], match_date, fixture_id, window=window)

        features.append({**_prefixed("Home", home_form), **_prefixed("Away", away_form)})

    form_df = pd.DataFrame(features)
    return pd.concat([df.reset_index(drop=True), form_df], axis=1)

In [34]:
# Current season: strip the odds columns, then attach pre-match form features.
df_2024_cleaned = clean_dataset(df_2024)

# The window is passed explicitly. The historical seasons were processed by the
# earlier add_form_features definition, whose default window is 7, whereas the
# definition active at this point defaults to 5. Relying on the default would
# give the current season form features on a different scale from the
# historical ones.
# NOTE(review): if a 5-match window is the intended one, change the historical
# cell instead and regenerate those files.
df_2024_with_form = add_form_features(df_2024_cleaned, window=7)
df_2024_with_form.to_csv("../Data/PL/2024:25_FORM.csv", index=False)

# Dataset Preprocessing

### Load and combine all datasets

In [72]:
# Load the per-season tables that already carry the form features.
# NOTE(review): the 2024:25 file read here is not written anywhere in this
# notebook (the current-season cell writes to ../Data/PL/ instead) — confirm
# where it comes from and that its columns match the other seasons.
_form_dir = "../Data/API-Football_FORM"
df_2021 = pd.read_csv(_form_dir + "/API-Football-Data-2021:22_FORM.csv")
df_2022 = pd.read_csv(_form_dir + "/API-Football-Data-2022:23_FORM.csv")
df_2023 = pd.read_csv(_form_dir + "/API-Football-Data-2023:24_FORM.csv")
df_2024 = pd.read_csv(_form_dir + "/API-Football-Data-2024:25_FORM.csv")

# Stack the seasons into one table with a fresh running index.
season_frames = [df_2021, df_2022, df_2023, df_2024]
df_all = pd.concat(season_frames, ignore_index=True)

In [73]:
# Possession columns stored as text (object dtype) are converted to fractions:
# the trailing '%' is stripped and the number divided by 100.
possession_text_cols = [
    name for name in df_all.columns
    if "Possession" in name and df_all[name].dtype == 'object'
]
for col in possession_text_cols:
    df_all[col] = df_all[col].str.rstrip('%').astype(float) / 100.0


### Label the targets

In [74]:
def encode_result(row):
    """
    Encode the outcome of a match as an integer class.

    Parameters:
    - row: Mapping-like object with "Home Team Goals" and "Away Team Goals".

    Returns:
    - int: 0 if the home side scored more, 1 if the scores are equal,
      2 in every other case.
    """
    home_goals = row["Home Team Goals"]
    away_goals = row["Away Team Goals"]

    if home_goals > away_goals:
        return 0
    return 1 if home_goals == away_goals else 2

# Add the target column: encode_result is applied to each row (axis=1) of the
# combined table and its return value stored as "MatchResult".
df_all["MatchResult"] = df_all.apply(encode_result, axis=1)

In [75]:
# Normalize possession columns (e.g., "70%" → 0.70).
# Safe to run more than once: a column that has already been converted is no
# longer of object dtype and is therefore skipped.
for col in [name for name in df_all.columns if "Possession" in name]:
    if df_all[col].dtype != 'object':
        continue
    df_all[col] = df_all[col].str.rstrip('%').astype(float) / 100.0


### Feature Selection

In [77]:
# Columns kept for modelling: the three categorical identifiers, the form
# metrics of both sides, and the per-match statistics of both sides.
# An earlier variant of this list also contained "Home Team Goals" and
# "Away Team Goals"; they are left out here (presumably because MatchResult
# is derived directly from them — confirm).
_form_metrics = [
    "Points", "Goals_Scored", "Goals_Conceded",
    "Wins", "Draws", "Losses", "Form_Score",
]
_match_stats = [
    "Shots on Goal", "Corner Kicks", "Ball Possession",
    "Yellow Cards", "Red Cards", "Offsides",
]

included_cols = (
    ["Home Team", "Away Team", "Referee"]
    + [f"Home_{metric}" for metric in _form_metrics]
    + [f"Away_{metric}" for metric in _form_metrics]
    # Statistics are listed pairwise: home column first, then the away column.
    + [f"{side} {stat}" for stat in _match_stats for side in ("Home", "Away")]
)

# Keep only the selected columns; .copy() makes df_features independent of df_all.
# NOTE(review): "MatchResult" is not in included_cols, so df_features (and the
# encoded tables built from it) carries no target column — confirm the label
# is attached downstream.
df_features = df_all[included_cols].copy()

### Encoding

In [78]:
#===== LABEL ENCODING =====#
# Each categorical column gets its own LabelEncoder, fitted independently, so
# the integer assigned to a value is specific to that column.

categorical_cols = ["Home Team", "Away Team", "Referee"]

label_encoded = df_features.copy()
label_encoders = {}
os.makedirs("Encoders", exist_ok=True)

for col in categorical_cols:
    le = LabelEncoder()
    label_encoded[f"{col}_LabelEnc"] = le.fit_transform(label_encoded[col])
    label_encoders[col] = le

    # Persist the fitted encoder so the same mapping can be reloaded later.
    encoder_path = f"Encoders/{col.replace(' ', '_')}_label_encoder.pkl"
    with open(encoder_path, "wb") as f:
        pickle.dump(le, f)

# Only the encoded versions of the categorical columns are kept.
label_encoded = label_encoded.drop(columns=categorical_cols)

In [79]:
#===== ONE-HOT ENCODING =====#
# One indicator column per distinct value of each categorical column. The
# encoder is configured with handle_unknown='ignore' and dense output.

onehot_cols = ["Home Team", "Away Team", "Referee"]

onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = onehot_encoder.fit_transform(df_features[onehot_cols])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehot_encoder.get_feature_names_out(onehot_cols),
    # Same index as the source rows so the concat below aligns row by row.
    index=df_features.index
)

# Remaining (non-categorical) columns first, indicator columns after them.
remaining_features = df_features.drop(columns=onehot_cols)
onehot_encoded = pd.concat([remaining_features, encoded_df], axis=1)

# Persist the fitted encoder alongside the label encoders.
with open("Encoders/onehot_encoder.pkl", "wb") as f:
    pickle.dump(onehot_encoder, f)

### Final Setup

In [80]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing (same guard as the encoder cell).
final_dir = "../Data/FINAL"
os.makedirs(final_dir, exist_ok=True)

# Full combined table, then the label-encoded and one-hot-encoded feature tables.
df_all.to_csv(f"{final_dir}/PL_dataset_2021-2024.csv", index=False)
label_encoded.to_csv(f"{final_dir}/Football-Training-2021_2024_LABELENC.csv", index=False)
onehot_encoded.to_csv(f"{final_dir}/Football-Training-2021_2024_ONEHOT.csv", index=False)