# COMP0036: Beat the Bookie

### Table of Contents

1. [Introduction](#1-Introduction)

2. [Data Import](#2-Data-Import)

3. [Data Transformation & Exploration](#3-Data-Transformation-&-Exploration)

4. [Methodology Overview](#4-Methodology-Overview)

5. [Model Training & Validation](#5-Model-Training-&-Validation) 

6. [Results](#6-Results)

7. [Final Predictions on Test Set](#7-Final-Predictions-on-Test-Set)

### 1. Introduction

### 2. Data Import

In [None]:
import pandas as pd

# Widen the printed output and remove the column cap so wide frames show in full
for option_name, option_value in (('display.width', 1000), ('display.max_columns', None)):
    pd.set_option(option_name, option_value)

# Read the raw training matches from disk
df = pd.read_csv('../data/epl-training.csv')

# Display the first rows as the cell output
df.head()


### 3. Data Transformation & Exploration

#### 3.1. Data Cleaning

In [None]:
# Expected size of the dataset: N_SEASONS seasons of MATCHES_PER_SEASON matches
N_SEASONS = 24
MATCHES_PER_SEASON = 380
EXPECTED_LENGTH = N_SEASONS * MATCHES_PER_SEASON

# Report how far the raw row count is from the expected one
print(f"Length of DataFrame before cleaning: {len(df)}")
print(f"This is {len(df) - EXPECTED_LENGTH} more than the expected length of {EXPECTED_LENGTH}.")

# Drop rows in which every column is NaN
df.dropna(how="all", inplace=True)

# A row counts as a duplicate when its first three columns repeat an earlier row
duplicates = df.iloc[:, :3].duplicated()

if not duplicates.any():
    print("No duplicates found in the first three columns.")
else:
    # Index labels of the repeated rows (the first occurrence is kept)
    duplicate_indices = duplicates.index[duplicates].tolist()
    print(f"Duplicate rows found at indices: {duplicate_indices}")
    print(f"Total duplicates: {len(duplicate_indices)}")

    # Remove the repeated rows by label
    df.drop(index=duplicate_indices, inplace=True)
    print(f"Duplicate rows removed. New length: {len(df)}")

# Report the size after cleaning
print(f"Final length of the cleaned DataFrame: {len(df)}")


#### 3.2. Web Scraping
```py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_market_val(year_start, year_end):
    """
    Scrape Transfermarkt market value data for the Premier League.

    One request is made per season in ``range(year_start, year_end)``, so
    ``year_end`` itself is excluded. The 'Year', 'Club' and 'TMV' values of
    all seasons are combined and written to
    'Engineered Data/Final Data/marketval.csv'.

    Raises:
        requests.HTTPError: if a season page answers with an error status.
        ValueError: if a season page has no table with class 'items'.
    """

    # Browser-like user agent to avoid 403 Forbidden errors
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        )
    }

    # One frame per season; concatenated once at the end instead of inside the loop
    season_frames = []

    for year in range(year_start, year_end):
        print(f"Processing season: {year}")

        # Build the Transfermarkt URL for the given season
        url = f"https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id={year}#google_vignette"

        # The timeout keeps a stalled connection from hanging the run indefinitely
        response = requests.get(url, headers=headers, timeout=30)
        print(f"Status code: {response.status_code}")

        # Fail here with a clear HTTP error rather than later while parsing an error page
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the table with class 'items' and its body
        table = soup.find('table', class_='items')
        body = table.find('tbody') if table is not None else None
        if body is None:
            raise ValueError(f"No market value table found for season {year}")

        data_list = []
        for row in body.find_all('tr'):
            columns = row.find_all('td')
            # Rows with fewer than 7 cells cannot hold the club (cell 1) and value (cell 6)
            if len(columns) < 7:
                continue
            data_list.append({
                'Year': year,
                'Club': columns[1].text.strip(),
                'TMV': columns[6].text.strip()[1:],  # Remove the currency symbol (e.g., '£')
            })

        season_df = pd.DataFrame(data_list, columns=['Year', 'Club', 'TMV'])

        # Drop rows whose scraped fields are both blank. 'Year' is always set,
        # so it must not take part in the emptiness check.
        blank_row = (season_df['Club'] == '') & (season_df['TMV'] == '')
        season_frames.append(season_df[~blank_row])

        # Sleep to avoid hitting the server too frequently
        time.sleep(5)

    # An empty season range still produces an (empty) output file
    if season_frames:
        combined_df = pd.concat(season_frames, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    # Save the final DataFrame to a CSV file
    combined_df.to_csv('Engineered Data/Final Data/marketval.csv', index=False)
    print("Data saved to 'Engineered Data/Final Data/marketval.csv'.")

# Example usage
get_market_val(2000, 2025)

```

#### 3.3. Exploring Scraped Data

##### 3.3.1. 14-Day Match Density and Match Attendance

In [None]:
import pandas as pd
from datetime import timedelta

# Competition tag -> scraped fixture file
scraped_sources = {
    'EPL': '../scraped-data/combined-epl.csv',
    'FA': '../scraped-data/combined-fa-e.csv',
    'EFL': '../scraped-data/combined-efl-e.csv',
    'UCL': '../scraped-data/combined-ucl-e.csv',
    'UEL': '../scraped-data/combined-uel-e.csv',
}

# Read every file and prepend a 'df name' column holding its competition tag
tagged_frames = []
for source_tag, source_path in scraped_sources.items():
    source_frame = pd.read_csv(source_path)
    source_frame.insert(0, 'df name', source_tag)
    tagged_frames.append(source_frame)
EPL_S, FA_S, EFL_S, UCL_S, UEL_S = tagged_frames

# Stack all competitions into a single frame with a fresh index
combined = pd.concat(tagged_frames, ignore_index=True)

# Parse the DD/MM/YYYY date strings of both frames with an explicit format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
combined['Date'] = pd.to_datetime(combined['Date'], format='%d/%m/%Y')

# Order the combined fixtures by date, then by home team (both ascending)
combined = combined.sort_values(['Date', 'HomeTeam'])

# Define a function to count matches for a given team in the prior 14 days
def calculate_matches(team, match_date):
    """
    Count the fixtures in `combined` that involve `team` (home or away) and
    fall in the 14 days before `match_date`. The start of the window is
    inclusive; `match_date` itself is excluded.
    """
    window_start = match_date - timedelta(days=14)
    fixture_dates = combined['Date']
    involves_team = combined['HomeTeam'].eq(team) | combined['AwayTeam'].eq(team)
    inside_window = fixture_dates.ge(window_start) & fixture_dates.lt(match_date)
    return int((involves_team & inside_window).sum())

# Independent copy of the EPL fixtures only (avoids SettingWithCopyWarning)
combined_epl = combined[combined['df name'] == 'EPL'].copy()

# Fixtures each side played over the previous 14 days, across all competitions
for density_col, team_col in (('H14', 'HomeTeam'), ('A14', 'AwayTeam')):
    combined_epl[density_col] = [
        calculate_matches(team, match_date)
        for team, match_date in zip(combined_epl[team_col], combined_epl['Date'])
    ]

# Left-join the density and attendance columns onto the main frame
merge_keys = ['Date', 'HomeTeam', 'AwayTeam']
scraped_features = combined_epl[merge_keys + ['H14', 'A14', 'Attendance']]
df = pd.merge(df, scraped_features, how='left', on=merge_keys)

# Store the date as a DD/MM/YYYY string again
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')

# Display the first rows as the cell output
df.head()


##### 3.3.2. Referee Strictness

In [None]:
# Function to standardize referee names based on index ranges
def StandardNames(index, name):
    """
    Standardize a referee name; the rule applied depends on the row index.

    * index <= 379: first letter of the first token + the second token
    * 380 to 549: periods removed, first letter of the first token + last token
    * 550 to 759: commas/periods removed, first letter of the second token
      + the first token (swaps the order)
    * 1855 to 1863: last character of the first token + the second token
    * any other index: the name is returned unchanged

    A name with fewer than two tokens is returned unchanged. Non-string
    values (e.g. NaN for a missing referee) are also returned unchanged
    instead of raising on `.split()`.
    """
    # Missing referees reach this function as non-strings; pass them through
    if not isinstance(name, str):
        return name

    # First 380 rows: split and keep first initial + second token
    if index <= 379:
        parts = name.split()
        if len(parts) > 1:
            return f"{parts[0][0]} {parts[1]}"
        return name

    # Rows 380 to 549: remove periods, then keep first initial + last token
    if index <= 549:
        parts = name.replace('.', '').split()
        if len(parts) > 1:
            return f"{parts[0][0]} {parts[-1]}"
        return name

    # Rows 550 to 759: remove commas/periods, swap order, keep initial + first token
    if index <= 759:
        parts = name.replace(',', '').replace('.', '').split()
        if len(parts) > 1:
            return f"{parts[1][0]} {parts[0]}"
        return name

    # Rows 1855 to 1863: keep last character of the first token + second token
    if 1855 <= index <= 1863:
        parts = name.split()
        if len(parts) > 1:
            return f"{parts[0][-1]} {parts[1]}"
        return name

    # Default: do not modify the name
    return name

# Standardize every referee name according to its row label
df['Referee'] = [
    StandardNames(row_label, referee)
    for row_label, referee in zip(df.index, df['Referee'])
]

# Truncated spellings that remain after standardization -> full spelling
name_corrections = {
    "D Gallaghe": "D Gallagher",
    "D Gallagh": "D Gallagher"
}

# Replace the listed spellings; every other value is kept as it is
df['Referee'] = df['Referee'].map(lambda name: name_corrections.get(name, name))

# Per-referee tallies, keyed by the standardized referee name
refs = df['Referee'].unique()
Y = {ref: 0 for ref in refs}           # yellow cards shown
R = {ref: 0 for ref in refs}           # red cards shown
MatchCount = {ref: 0 for ref in refs}  # matches officiated
strictness = {}

# Count yellow cards, red cards and appearances for every referee
# NOTE(review): the tallies use every match in the frame, including matches
# played after the row the value is later attached to — confirm that is intended.
for index, row in df.iterrows():
    ref = row['Referee']
    if pd.notna(ref):
        Y[ref] += row['AY'] + row['HY']  # Away + home yellow cards
        # Red cards come from the red-card columns; the previous version summed
        # the yellow-card columns here, which made the weighted term below a
        # plain multiple of the yellow count.
        # Assumes the data carries 'AR'/'HR' next to 'AY'/'HY' — TODO confirm.
        R[ref] += row['AR'] + row['HR']
        MatchCount[ref] += 1

# Strictness = (yellows + 3 * reds) per match officiated; 0 for referees with no match
for ref in refs:
    if MatchCount[ref] > 0:
        strictness[ref] = (Y[ref] + 3 * R[ref]) / MatchCount[ref]
    else:
        strictness[ref] = 0

# Attach each referee's strictness to the matches they officiated
df['Strictness'] = df['Referee'].map(strictness)

# Preview the updated DataFrame
df.head()


##### 3.3.3. Standings

In [None]:
from tqdm import tqdm

# Register `progress_apply` on pandas objects
tqdm.pandas()

# Replace the H/D/A code in FTR with the winner's name, or 'Draw'
result_code = df['FTR']
df['FTR'] = (
    df['AwayTeam']
    .where(result_code != 'H', df['HomeTeam'])  # home win -> home team name
    .mask(result_code == 'D', 'Draw')           # draw -> 'Draw'
)

# Season start year: blocks of 380 consecutive row labels, counted from 2000
df['Season'] = 2000 + df.index // 380

# Round number in 1..38: blocks of 10 consecutive row labels, wrapping every 38 blocks
roundindex = df.index // 10
df['Round'] = (roundindex % 38) + 1

# Cumulative points a team holds going into a given round
def get_pts(team, season, current_round):
    """
    Returns the points accumulated by 'team' in 'season' over the rounds
    BEFORE 'current_round' (rounds 1 .. current_round - 1). The current round
    itself is not counted, so round 1 always yields 0.

    Reads the module-level `df`: 'FTR' must hold the winning team's name or
    'Draw', and 'Season'/'Round' identify the fixture. Per round the team
    scores 3 if any of its fixtures in that round is a win, otherwise 1 if
    any is a draw, otherwise 0.
    """
    # Nothing has been played before round 1 (also stops invalid rounds below 1)
    if current_round <= 1:
        return 0

    # One filter over the team's earlier fixtures, instead of one recursive
    # call (and one full-frame scan) per round
    earlier = df[
        (df['Season'] == season)
        & df['Round'].between(1, current_round - 1)
        & ((df['HomeTeam'] == team) | (df['AwayTeam'] == team))
    ]
    if earlier.empty:
        return 0

    # 3 for a win, 1 for a draw, 0 otherwise; the best result per round counts
    match_pts = (earlier['FTR'] == team).astype(int) * 3 + (earlier['FTR'] == 'Draw').astype(int)
    return int(match_pts.groupby(earlier['Round']).max().sum())

# Points each side holds going into the match (one progress bar per column)
for points_col, team_col in (('Hpts', 'HomeTeam'), ('Apts', 'AwayTeam')):
    df[points_col] = df.progress_apply(
        lambda row, team_col=team_col: get_pts(row[team_col], row['Season'], row['Round']),
        axis=1
    )


def _to_result_code(row):
    """Map the winner name stored in FTR back to 'H', 'D' or 'A'."""
    outcome = row['FTR']
    if outcome == row['HomeTeam']:
        return 'H'
    if outcome == 'Draw':
        return 'D'
    return 'A'


# Restore the original H/D/A encoding of FTR
df['FTR'] = df.progress_apply(_to_result_code, axis=1)

# Display the first rows as the cell output
df.head()


##### 3.3.4. Other Features

In [None]:
from collections import defaultdict, deque

# Temporary per-match points columns (3 win / 1 draw / 0 otherwise) derived from FTR
home_points_by_result = {'H': 3, 'D': 1}
away_points_by_result = {'A': 3, 'D': 1}
df['HomePoints'] = df['FTR'].map(lambda code: home_points_by_result.get(code, 0))
df['AwayPoints'] = df['FTR'].map(lambda code: away_points_by_result.get(code, 0))

# Number of most recent matches kept for the rolling form metrics
form_window = 10

# Running per-team state, updated match by match
team_strength_stats = {}      # team -> {'points': total points, 'games': matches played}
team_goals_scored = {}        # team -> total goals scored
team_goals_conceded = {}      # team -> total goals conceded
team_form_points = {}         # team -> deque of points from the last `form_window` matches
team_form_goal_diff = {}      # team -> deque of goal differences from the last `form_window` matches
team_win_streak = {}          # team -> current number of consecutive wins
# (team, opponent) -> number of meetings plus the team's home wins / away wins against that opponent
h2h_record = defaultdict(lambda: {"matches": 0, "home_wins": 0, "away_wins": 0})

# --- Helper Functions ---

# Overall team strength
def get_team_strength(team):
    """Average points per recorded game; 0 when the team has no recorded game."""
    stats = team_strength_stats.get(team)
    if stats is None or stats['games'] <= 0:
        return 0
    return stats['points'] / stats['games']

def update_team_strength(team, points):
    """Add one game worth `points` to the team's totals."""
    stats = team_strength_stats.setdefault(team, {'points': 0, 'games': 0})
    stats['points'] += points
    stats['games'] += 1

# Goal scoring rate
def get_goal_scoring_rate(team):
    """Goals scored per recorded game; 0 when nothing has been recorded."""
    if team not in team_goals_scored:
        return 0
    games = team_strength_stats[team]['games']
    return team_goals_scored[team] / games if games > 0 else 0

def update_goal_scoring_rate(team, goals):
    """Add `goals` to the team's running total of goals scored."""
    team_goals_scored[team] = team_goals_scored.get(team, 0) + goals

# Defensive strength
def get_defensive_strength(team):
    """Goals conceded per recorded game; 0 when nothing has been recorded."""
    if team not in team_goals_conceded:
        return 0
    games = team_strength_stats[team]['games']
    return team_goals_conceded[team] / games if games > 0 else 0

def update_defensive_strength(team, goals_conceded):
    """Add `goals_conceded` to the team's running total of goals conceded."""
    team_goals_conceded[team] = team_goals_conceded.get(team, 0) + goals_conceded

# Team form (last `form_window` matches)
def get_form_points(team):
    """Mean points over the stored recent matches; 0 when none are stored."""
    recent = team_form_points.get(team)
    return sum(recent) / len(recent) if recent else 0

def update_form_points(team, points):
    """Append `points`; the deque discards the oldest entry once it is full."""
    try:
        recent = team_form_points[team]
    except KeyError:
        recent = team_form_points[team] = deque(maxlen=form_window)
    recent.append(points)

def get_goal_diff_form(team):
    """Sum of the stored recent goal differences; 0 when none are stored."""
    return sum(team_form_goal_diff.get(team, ()))

def update_goal_diff_form(team, goal_diff):
    """Append `goal_diff`; the deque discards the oldest entry once it is full."""
    try:
        recent = team_form_goal_diff[team]
    except KeyError:
        recent = team_form_goal_diff[team] = deque(maxlen=form_window)
    recent.append(goal_diff)

def get_win_streak(team):
    """Current run of consecutive wins; 0 for a team not seen yet."""
    return team_win_streak[team] if team in team_win_streak else 0

def update_win_streak(team, result):
    """Extend the streak when `result` is 3 (a win); any other result resets it to 0."""
    team_win_streak[team] = (team_win_streak.get(team, 0) + 1) if result == 3 else 0

# H2H functions
def calculate_h2h_win_rate(team, opponent, is_home):
    """
    Share of the recorded meetings with `opponent` that `team` won at the
    given venue (home wins when `is_home`, away wins otherwise); 0 when the
    two have no recorded meeting.
    """
    # NOTE(review): the denominator counts meetings at BOTH venues while the
    # numerator is venue-specific — confirm this is the intended definition.
    record = h2h_record[(team, opponent)]
    meetings = record['matches']
    if meetings <= 0:
        return 0
    return record['home_wins' if is_home else 'away_wins'] / meetings

def update_h2h_record(home_team, away_team, result):
    """Record one meeting for both teams and credit the winner ('H' or 'A'), if any."""
    home_view = h2h_record[(home_team, away_team)]
    away_view = h2h_record[(away_team, home_team)]
    home_view['matches'] += 1
    away_view['matches'] += 1
    if result == 'H':
        home_view['home_wins'] += 1
    elif result == 'A':
        away_view['away_wins'] += 1

# --- Output column -> values collected row by row (key order = column order) ---
pre_match_metrics = {
    column: [] for column in (
        'HTS', 'ATS',
        'HGSR', 'AGSR',
        'Home_DS', 'Away_DS',
        'Home_Form_Points', 'Away_Form_Points',
        'Home_Goal_Diff_Form', 'Away_Goal_Diff_Form',
        'Home_Win_Streak', 'Away_Win_Streak',
        'Home_H2H_Win_Rate', 'Away_H2H_Win_Rate',
    )
}

# --- Main loop: walk the matches in frame order ---
for _, match in df.iterrows():
    home_team, away_team = match['HomeTeam'], match['AwayTeam']

    # Snapshot every metric BEFORE this match is added to the running state,
    # so a row only ever sees the matches that precede it in the frame
    snapshot = {
        'HTS': get_team_strength(home_team),
        'ATS': get_team_strength(away_team),
        'HGSR': get_goal_scoring_rate(home_team),
        'AGSR': get_goal_scoring_rate(away_team),
        'Home_DS': get_defensive_strength(home_team),
        'Away_DS': get_defensive_strength(away_team),
        'Home_Form_Points': get_form_points(home_team),
        'Away_Form_Points': get_form_points(away_team),
        'Home_Goal_Diff_Form': get_goal_diff_form(home_team),
        'Away_Goal_Diff_Form': get_goal_diff_form(away_team),
        'Home_Win_Streak': get_win_streak(home_team),
        'Away_Win_Streak': get_win_streak(away_team),
        'Home_H2H_Win_Rate': calculate_h2h_win_rate(home_team, away_team, is_home=True),
        'Away_H2H_Win_Rate': calculate_h2h_win_rate(away_team, home_team, is_home=False),
    }
    for column, value in snapshot.items():
        pre_match_metrics[column].append(value)

    # Now add this match's outcome to the running state
    home_goals, away_goals = match['FTHG'], match['FTAG']
    home_pts, away_pts = match['HomePoints'], match['AwayPoints']

    update_team_strength(home_team, home_pts)
    update_team_strength(away_team, away_pts)
    update_goal_scoring_rate(home_team, home_goals)
    update_goal_scoring_rate(away_team, away_goals)
    update_defensive_strength(home_team, away_goals)
    update_defensive_strength(away_team, home_goals)

    update_form_points(home_team, home_pts)
    update_form_points(away_team, away_pts)
    update_goal_diff_form(home_team, home_goals - away_goals)
    update_goal_diff_form(away_team, away_goals - home_goals)
    update_win_streak(home_team, home_pts)
    update_win_streak(away_team, away_pts)

    # Head-to-head record uses the H/D/A result code
    update_h2h_record(home_team, away_team, match['FTR'])

# Write the collected metrics back as new columns
for column, values in pre_match_metrics.items():
    df[column] = values

# The per-match points columns were only needed inside the loop
df.drop(columns=['HomePoints', 'AwayPoints'], inplace=True)

# Display the first rows as the cell output
df.head()


##### 3.3.5. Team Market Value, Match Possession, Set Piece

In [None]:
import pandas as pd

# Read the season-level scraped tables
MarketVal = pd.read_csv('../scraped-data/market-values.csv')
Posession = pd.read_csv('../scraped-data/possession-data.csv')
SetPiece = pd.read_csv('../scraped-data/set-piece.csv')

# Team name as spelled in the scraped tables -> name used in the match data
Alterations = {
    'Manchester City': 'Man City',
    'Arsenal FC': 'Arsenal',
    'Chelsea FC': 'Chelsea',
    'Liverpool FC': 'Liverpool',
    'Manchester United': 'Man United',
    'Tottenham Hotspur': 'Tottenham',
    'Newcastle United': 'Newcastle',
    'Brighton & Hove Albion': 'Brighton',
    'West Ham United': 'West Ham',
    'Nottingham Forest': "Nott'm Forest",
    'Brentford FC': 'Brentford',
    'Wolverhampton Wanderers': 'Wolves',
    'AFC Bournemouth': 'Bournemouth',
    'Everton FC': 'Everton',
    'Fulham FC': 'Fulham',
    'Southampton FC': 'Southampton',
    'Leicester City': 'Leicester',
    'Ipswich Town': 'Ipswich',
    'West Bromwich Albion': 'West Brom',
    'Queens Park Rangers': 'QPR',
    'Hull City': 'Hull',
    'Stoke City': 'Stoke',
    'Swansea City': 'Swansea',
    'Manchester Utd': 'Man United',
    'Newcastle Utd': 'Newcastle',
    "Nott'ham Forest": "Nott'm Forest",
    "Luton Town": "Luton",
    'Sheffield Utd': 'Sheffield United',
    'Leeds United': 'Leeds',
    'Norwich City': 'Norwich',
    'Cardiff City': 'Cardiff',
    'Birmingham City': 'Birmingham',
    'Blackburn Rovers': 'Blackburn',
    'Blackpool FC': 'Blackpool',
    'Bolton Wanderers': 'Bolton',
    'Bradford City': 'Bradford',
    'Burnley FC': 'Burnley',
    'Charlton Athletic': 'Charlton',
    'Coventry City': 'Coventry',
    'Derby County': 'Derby',
    'Huddersfield Town': 'Huddersfield',
    'Middlesbrough FC': 'Middlesbrough',
    'Portsmouth FC': 'Portsmouth',
    'Reading FC': 'Reading',
    'Sunderland AFC': 'Sunderland',
    'Watford FC': 'Watford',
    'Wigan Athletic': 'Wigan',
}

# Rename the teams in each table's name column; unlisted names are kept as they are
for scraped_frame, name_col in ((MarketVal, 'Club'), (Posession, 'Team'), (SetPiece, 'Team')):
    scraped_frame[name_col] = scraped_frame[name_col].map(
        lambda name: Alterations.get(name, name)
    )

# Distinct team names per dataset, in order of first appearance
cleaneplteams = df['HomeTeam'].unique().tolist()
MarketValteams = MarketVal['Club'].unique().tolist()
Posessionteams = Posession['Team'].unique().tolist()
setPieceteams = SetPiece['Team'].unique().tolist()

print(f"Number of teams in the clean EPL data: {len(cleaneplteams)}")
print(f"Number of teams in the MarketVal data:   {len(MarketValteams)}")
print(f"Number of teams in the Possession data:  {len(Posessionteams)}")
print(f"Number of teams in the SetPiece data:    {len(setPieceteams)}")

# Names present on one side but not the other (sets are used for membership only)
epl_name_set = set(cleaneplteams)
market_name_set = set(MarketValteams)
uniqueepl = sorted(team for team in cleaneplteams if team not in market_name_set)
UniqueMarketVal = sorted(team for team in MarketValteams if team not in epl_name_set)
UniquePosession = [team for team in Posessionteams if team not in epl_name_set]
UniqueSetPiece = [team for team in setPieceteams if team not in epl_name_set]

# Apply the name corrections once more after the check
for scraped_frame, name_col in ((MarketVal, 'Club'), (Posession, 'Team'), (SetPiece, 'Team')):
    scraped_frame[name_col] = scraped_frame[name_col].map(
        lambda name: Alterations.get(name, name)
    )

def _tmv_to_number(val):
    """
    Convert a market value string to a number expressed in millions:
    a trailing 'bn' is dropped and the number multiplied by 1000, a trailing
    'm' is dropped. Any other value (including non-strings) is returned as is.
    """
    if isinstance(val, str):
        if val.endswith('bn'):
            return float(val[:-2]) * 1000
        if val.endswith('m'):
            return float(val[:-1])
    return val


# Turn the TMV strings into numbers
MarketVal['TMV'] = MarketVal['TMV'].apply(_tmv_to_number)

# Attach each side's market value for the season (home first, then away)
for team_col, value_col in (('HomeTeam', 'HTV($m)'), ('AwayTeam', 'ATV($m)')):
    df = df.merge(
        MarketVal.rename(columns={'Club': team_col, 'TMV': value_col, 'Year': 'Season'}),
        how='left',
        on=[team_col, 'Season']
    )

# Possession: first four characters of 'year' as an int, 'Poss' divided by 100
Posession['year'] = Posession['year'].map(lambda season_label: season_label[:4]).astype(int)
Posession['Poss'] = Posession['Poss'] / 100
Posession = Posession[['Team', 'Poss', 'year']]

# Attach each side's possession value for the season
for team_col, value_col in (('HomeTeam', 'HTPos_avg'), ('AwayTeam', 'ATPos_avg')):
    df = df.merge(
        Posession.rename(columns={'Team': team_col, 'Poss': value_col, 'year': 'Season'}),
        how='left',
        on=[team_col, 'Season']
    )

# Set pieces: drop rows containing NaN, first four characters of 'Season' as an int
SetPiece = SetPiece.dropna()
SetPiece['Season'] = SetPiece['Season'].map(lambda season_label: season_label[:4]).astype(int)
SetPiece = SetPiece[['Season', 'Team', 'Set Piece Efficiency (%)', 'Penalty Efficiency (%)']]

# Attach each side's set piece and penalty efficiency for the season
set_piece_targets = (
    ('HomeTeam', 'HSPE (%)', 'HPE (%)'),
    ('AwayTeam', 'ASPE (%)', 'APE (%)'),
)
for team_col, set_piece_col, penalty_col in set_piece_targets:
    df = df.merge(
        SetPiece.rename(columns={
            'Team': team_col,
            'Set Piece Efficiency (%)': set_piece_col,
            'Penalty Efficiency (%)': penalty_col
        }),
        how='left',
        on=[team_col, 'Season']
    )

# Display the first rows as the cell output
df.head()


#### 3.4. Data Transformation

##### 3.4.1. Splitting Date into Day, Month, Year, and Day of Week

In [None]:
# Parse the DD/MM/YYYY strings once
match_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Calendar parts; the day of week is shifted so Monday is 1 and Sunday is 7
day = match_dates.dt.day
month = match_dates.dt.month
year = match_dates.dt.year
dow_num = match_dates.dt.dayofweek + 1

# Replace the 'Date' column with its parts, placed at the front of the frame
del df['Date']
date_parts = [('Day', day), ('Month', month), ('Year', year), ('DayOfWeek', dow_num)]
for position, (part_name, part_values) in enumerate(date_parts):
    df.insert(position, part_name, part_values)

# Display the first rows as the cell output
df.head()


##### 3.4.2. Imputing Possession Averages

In [None]:
from xgboost import XGBRegressor
import numpy as np

def xgb_impute_pos_avg(df):
    """
    Fill the missing entries of 'HTPos_avg' and 'ATPos_avg' with XGBRegressor
    predictions, one model per column, trained on the rows where that column
    is known.

    Before anything is filled, 'HTPos_avg_missing' and 'ATPos_avg_missing'
    (1 = value was missing, 0 = value was present) are added. The frame is
    modified in place and also returned.
    """
    predictors = [
        'HS', 'AS', 'HST', 'AST',
        'Hpts', 'Apts',
        'Home_Form_Points', 'Away_Form_Points'
    ]
    targets = ("HTPos_avg", "ATPos_avg")

    # Record which rows lacked each value before any of them is filled
    for target in targets:
        df[f"{target}_missing"] = df[target].isna().astype(int)

    for target in targets:
        to_fill = df[target].isna()

        # Nothing to impute for this column
        if not to_fill.any():
            continue

        known = ~to_fill
        model = XGBRegressor(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        )

        # Learn the column from the rows where it is known ...
        model.fit(df.loc[known, predictors], df.loc[known, target])

        # ... and write predictions into the rows where it is missing
        df.loc[to_fill, target] = model.predict(df.loc[to_fill, predictors])

    return df

# Fill the missing possession averages and add the *_missing indicator columns
df = xgb_impute_pos_avg(df)
# Preview the updated DataFrame
df.head()

##### 3.4.3. Imputing HSPE and ASPE

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Define a function to impute missing values using a RandomForestRegressor
def random_forest_impute(df, target_col, feature_cols):
    """
    Fill the missing entries of 'target_col' with predictions from a
    RandomForestRegressor trained on 'feature_cols', using the rows where
    'target_col' is present.

    The frame is modified in place and returned. When 'target_col' has no
    missing entry a message is printed and the frame is returned untouched.
    """
    needs_value = df[target_col].isna()

    if not needs_value.any():
        print(f"No missing values for {target_col}; skipping RF imputation.")
        return df

    has_value = ~needs_value

    forest = RandomForestRegressor(
        n_estimators=500,
        max_depth=20,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    )

    # Learn the target from the rows where it is known
    forest.fit(df.loc[has_value, feature_cols], df.loc[has_value, target_col])

    # Write predictions into the rows where it is missing
    df.loc[needs_value, target_col] = forest.predict(df.loc[needs_value, feature_cols])

    return df

# Predictors used for the set piece efficiency imputation
rf_features = [
    'Hpts', 'Apts',
    'Home_Form_Points', 'Away_Form_Points',
    'Home_H2H_Win_Rate', 'Away_H2H_Win_Rate',
    'HTS', 'ATS'
]

# Indicator column -> column whose missing entries it records
set_piece_flags = {
    "HSPE_missing": "HSPE (%)",
    "ASPE_missing": "ASPE (%)",
}

# Record the gaps of both columns first, before either of them is filled
for flag_col, efficiency_col in set_piece_flags.items():
    df[flag_col] = df[efficiency_col].isnull().astype(int)

# Fill 'HSPE (%)', then 'ASPE (%)'
for efficiency_col in set_piece_flags.values():
    df = random_forest_impute(
        df=df,
        target_col=efficiency_col,
        feature_cols=rf_features
    )

# Display the first rows as the cell output
df.head()

##### 3.4.4. Imputing HPE and APE

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Flag rows with missing 'HPE (%)' or 'APE (%)' before any value is filled
df["HPE_missing"] = df["HPE (%)"].isnull().astype(int)
df["APE_missing"] = df["APE (%)"].isnull().astype(int)

# Define features for RandomForest imputation
rf_features_for_hpe_ape = [
    'Hpts', 'Apts',
    'Home_Form_Points', 'Away_Form_Points',
    'Home_H2H_Win_Rate', 'Away_H2H_Win_Rate',
    'HTS', 'ATS'
]

# Impute HPE (%) using a random forest
df = random_forest_impute(
    df=df,
    target_col='HPE (%)',
    feature_cols=rf_features_for_hpe_ape
)

# Impute APE (%) using a random forest
df = random_forest_impute(
    df=df,
    target_col='APE (%)',
    feature_cols=rf_features_for_hpe_ape
)

# Combine imputed columns + original features into one list
impute_cols = rf_features_for_hpe_ape + ["HPE (%)", "APE (%)"]
impute_cols = list(dict.fromkeys(impute_cols))  # Remove duplicates, maintain order

# Create copies for iterative imputation
iter_data = df[impute_cols].copy()
original_features = df[impute_cols].copy()

# Configure IterativeImputer with a RandomForestRegressor
# NOTE(review): both target columns were already filled by the random forest
# calls above, so this pass may have nothing left to impute — confirm it is
# still needed.
iter_imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=500,
        max_depth=20,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    ),
    max_iter=5,
    random_state=42
)

# Fit and transform the data with IterativeImputer
imputed_array = iter_imputer.fit_transform(iter_data)

# The imputer returns a bare array. Rebuild the frame on df's own index:
# column assignment aligns on index labels, so a fresh 0..n-1 index would
# misplace values (or leave NaN) whenever df's index is not 0..n-1.
imputed_iter_df = pd.DataFrame(imputed_array, columns=impute_cols, index=df.index)

# Update 'HPE (%)' and 'APE (%)' in the original df
df['HPE (%)'] = imputed_iter_df['HPE (%)']
df['APE (%)'] = imputed_iter_df['APE (%)']

# Revert other features back to original (if the imputer changed them)
for col in set(impute_cols) - set(["HPE (%)", "APE (%)"]):
    df[col] = original_features[col]

# Preview the updated DataFrame
df.head()

##### 3.4.5. Imputing HTV and ATV

In [None]:
from xgboost import XGBRegressor

# Record which rows lack a market value before anything is filled
for flag_col, value_col in (("HTV_missing", "HTV($m)"), ("ATV_missing", "ATV($m)")):
    df[flag_col] = df[value_col].isnull().astype(int)

# Predictors used for the market value imputation
valuation_features = [
    "Season", "Round",
    "Hpts", "Apts",
    "Home_Form_Points", "Away_Form_Points",
    "Home_Win_Streak", "Away_Win_Streak",
    "Home_H2H_Win_Rate", "Away_H2H_Win_Rate"
]

def xgb_impute(df, target_col, feature_cols):
    """
    Fills the gaps in 'target_col' with predictions from an XGBRegressor.

    The regressor is fitted on the rows where 'target_col' is present, using
    'feature_cols' as predictors, and its predictions are written into the
    rows where 'target_col' is absent. 'df' is modified in place and is also
    returned; when nothing is missing it is returned untouched.
    """
    needs_fill = df[target_col].isna()

    # Nothing to impute: hand the frame back as it came in
    if not needs_fill.any():
        return df

    observed = ~needs_fill

    regressor = XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )

    # Learn the target from the rows that have it
    regressor.fit(
        df.loc[observed, feature_cols],
        df.loc[observed, target_col],
    )

    # Predict for the remaining rows and write the values into df
    df.loc[needs_fill, target_col] = regressor.predict(df.loc[needs_fill, feature_cols])

    return df

# Fill the gaps in both valuation columns, HTV($m) first and then ATV($m)
df = xgb_impute(df, "HTV($m)", valuation_features)
df = xgb_impute(df, "ATV($m)", valuation_features)

# Preview the updated DataFrame
df.head()

In [None]:
rolling_window = 5


def _prior_rolling_mean(series):
    """Mean of up to `rolling_window` preceding values; the current row is excluded by shift(1)."""
    return series.shift(1).rolling(window=rolling_window, min_periods=1).mean()


# Final row order (unchanged): grouped by home team, then by Season and Round
df_sorted = df.sort_values(by=['HomeTeam', 'Season', 'Round']).reset_index(drop=True)

# View of the same rows ordered by Season and Round only. The rolling windows
# are computed on this view so that "previous matches" means previous in time
# for away teams as well: in df_sorted the rows of one away team are ordered
# by opponent name, which would mix future matches into the window.
# NOTE(review): assumes Season and Round sort chronologically - confirm.
df_chrono = df_sorted.sort_values(by=['Season', 'Round'], kind='stable')

# New feature -> (team column to group by, source column to average)
rolling_sources = {
    'Home_Rolling_Points': ('HomeTeam', 'Hpts'),
    'Away_Rolling_Points': ('AwayTeam', 'Apts'),
    'Home_Rolling_Possession': ('HomeTeam', 'HTPos_avg'),
    'Away_Rolling_Possession': ('AwayTeam', 'ATPos_avg'),
}

for feature, (team_col, value_col) in rolling_sources.items():
    # transform() keeps df_chrono's index, so the assignment aligns every
    # value with the right row of df_sorted despite the different row order
    df_sorted[feature] = df_chrono.groupby(team_col)[value_col].transform(_prior_rolling_mean)

# List of new rolling feature columns
rolling_features = [
    'Home_Rolling_Points', 'Away_Rolling_Points',
    'Home_Rolling_Possession', 'Away_Rolling_Possession'
]

# Fill NaN values (a team's first match has no history) with the mean of each
# column. Assigning the result back, rather than calling fillna(inplace=True)
# on a column selection, also works when pandas Copy-on-Write is active.
df_sorted[rolling_features] = df_sorted[rolling_features].fillna(
    df_sorted[rolling_features].mean()
)

df = df_sorted.copy()
df.head()


##### 3.4.6. One-Hot Encoding FTR, HomeTeam, and AwayTeam

In [None]:
# Replace the result column and both team columns by indicator columns;
# each source column name is used as the prefix of its indicator columns
categorical_cols = ['FTR', 'HomeTeam', 'AwayTeam']
df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)

# Preview the encoded DataFrame
df.head()

##### 3.4.7. Tidying Up the DataFrame

In [None]:
# Columns that are removed from the modelling table
columns_to_drop = [
    'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR',
    'Referee',
    'HS', 'AS', 'HST', 'AST',
    'HC', 'AC', 'HF', 'AF',
    'HY', 'AY', 'HR', 'AR',
    'Attendance',
]

# drop() returns a new DataFrame, so df itself keeps all of its columns
df_copy = df.drop(columns=columns_to_drop)

# Display the first few rows
df_copy.head()


### 4. Methodology Overview

In [351]:
# The three one-hot result columns are the targets; everything else is a feature
target_columns = ['FTR_A', 'FTR_D', 'FTR_H']
y = df_copy[target_columns]
X = df_copy.drop(columns=target_columns)


### 5. Model Training & Validation

In [352]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Train/test split.
# The split is done on the unscaled features so that the scaler fitted below
# never sees validation or test rows. With the same row count and
# random_state, the rows in each subset are the same as when the split was
# done after scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42
)

# Data scaling: the mean and variance are learned from the training rows only
# and then applied unchanged to the validation and test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# X remains available as a scaled array for the later cells that read it
X = scaler.transform(X)

### 5.1 Other Approaches Explored

#### DNN

In [None]:

# # Compute class weights
# class_weights = compute_class_weight(
#     class_weight='balanced',
#     classes=np.arange(y.shape[1]),
#     y=np.argmax(y, axis=1)
# )
# class_weights = dict(enumerate(class_weights))

def compute_dnn():
    """
    Train a dense neural network on the notebook's train/validation split and
    report its performance on the test split.

    Reads X_train, y_train, X_val, y_val, X_test and y_test from the notebook's
    global scope (y_* are the one-hot result columns). The function plots the
    training curves, prints the test accuracy, the confusion matrix and the
    classification report, and returns None.
    """
    # Imported once at the top of the function: a function-level import makes
    # the name local for the whole body, so it must precede every use.
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.metrics import confusion_matrix, classification_report

    # One-hot labels as plain float arrays. np.argmax on a DataFrame is not
    # reliable across numpy/pandas versions, and boolean dummy columns become
    # numeric here.
    y_train_arr = np.asarray(y_train, dtype="float32")
    y_val_arr = np.asarray(y_val, dtype="float32")
    y_test_arr = np.asarray(y_test, dtype="float32")

    # Define the DNN model: three regularised blocks followed by a small head
    model = Sequential([
        Dense(512, kernel_regularizer=l2(0.01)),  # L2 regularization
        BatchNormalization(),
        LeakyReLU(alpha=0.1),
        Dropout(0.5),

        Dense(256, kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        LeakyReLU(alpha=0.1),
        Dropout(0.5),

        Dense(128, kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        LeakyReLU(alpha=0.1),
        Dropout(0.5),

        Dense(64, activation='relu'),
        # Output layer: one softmax unit per result class
        Dense(y_train_arr.shape[1], activation='softmax')
    ])

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.00005),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Set up callbacks, both driven by the validation loss
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=10,
        verbose=1
    )

    # Train the model
    history = model.fit(
        X_train,
        y_train_arr,
        epochs=100,
        batch_size=32,
        validation_data=(X_val, y_val_arr),
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Visualize training and validation accuracy and log loss side by side
    plt.figure(figsize=(14, 6))

    # Subplot 1: Accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Training and Validation Accuracy")
    plt.grid(True)

    # Subplot 2: Log Loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Log Loss')
    plt.plot(history.history['val_loss'], label='Validation Log Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Log Loss')
    plt.ylim(0, 10)  # Adjust if your log loss range differs
    plt.legend()
    plt.title("Training and Validation Log Loss")
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test, y_test_arr, verbose=0)
    print(f"Test Accuracy: {test_accuracy:.2f}")

    # Generate predictions on the test set
    y_pred = model.predict(X_test)

    # Convert predicted probabilities and true labels into class indices
    # (the index is the position of the column in y)
    y_pred_original = np.argmax(y_pred, axis=1)
    y_test_original = np.argmax(y_test_arr, axis=1)

    # Display confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test_original, y_pred_original))

    # Display classification report
    print("\nClassification Report:")
    print(classification_report(y_test_original, y_pred_original, digits=4))

compute_dnn()


#### K-Means

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.optimize import linear_sum_assignment

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

# Scatter Plot of the Clusters with PCA  
def plot_clusters(X, y_pred, kmeans):
    """
    Scatter-plot the samples in the plane of their first two principal
    components, one colour per distinct value of y_pred, with the K-Means
    centroids overlaid.

    Parameters:
        X: feature matrix the PCA is fitted on.
        y_pred: one group id per row of X (e.g. the K-Means cluster labels).
        kmeans: fitted KMeans model; its cluster_centers_ are projected with
            the same PCA and drawn as red crosses.
    """
    y_pred = np.asarray(y_pred)

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    plt.figure(figsize=(10, 8))
    for cluster in np.unique(y_pred):
        cluster_points = X_pca[y_pred == cluster]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster}")

    # Plot centroids
    centroids = pca.transform(kmeans.cluster_centers_)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x', s=200, label='Centroids')

    plt.title("K-Means Clustering Results with PCA")

    # Use the labels given to scatter() above. A hard-coded ["H", "D", "A"]
    # list would rename the groups regardless of what the ids mean and would
    # leave the centroid markers out of the legend.
    plt.legend()
    plt.show()



# Evaluate Performance
def evaluate_performance(X, y_true, y_pred, kmeans):
    """
    Print a short notice and draw the PCA cluster plot for X, y_pred and kmeans.

    y_true is accepted but is not used.
    """
    print("Visualizing Clusters...")
    plot_clusters(X=X, y_pred=y_pred, kmeans=kmeans)
    
def compute_knn():
    """
    Cluster the matches with K-Means and score the clusters as if they were
    class predictions.

    Despite its name this function runs K-Means, not k-nearest neighbours;
    the name is kept so existing calls keep working. It reads the global
    feature matrix X and the one-hot label frame y, prints a classification
    report for the test split, shows a PCA plot of the training clusters and
    returns None.
    """
    y_single = y.idxmax(axis=1).map({'FTR_A': 'Away', 'FTR_D': 'Draw', 'FTR_H': 'Home'})
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_single)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded
    )
    # The validation part is not used, but the split is kept so that K-Means
    # is fitted on the same training rows as before.
    X_train, _, y_train, _ = train_test_split(
        X_train, y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train
    )

    # Standardise with statistics learned from the training rows only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize K-Means with number of clusters equal to the number of unique classes
    n_clusters = len(le.classes_)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_train)
    cluster_labels = kmeans.labels_

    # Rows are true classes, columns are clusters; `labels` pins the matrix
    # to n_clusters x n_clusters.
    cont_matrix = confusion_matrix(y_train, cluster_labels, labels=np.arange(n_clusters))
    # Apply the Hungarian algorithm to find the cluster -> class assignment
    # that maximises the number of agreeing training samples
    row_ind, col_ind = linear_sum_assignment(-cont_matrix)
    cluster_to_label = {int(cluster): int(label) for cluster, label in zip(col_ind, row_ind)}

    # Translate the test clusters into class ids; -1 marks an unmapped cluster
    test_cluster_labels = kmeans.predict(X_test)
    test_predicted_labels = np.array(
        [cluster_to_label.get(int(cluster), -1) for cluster in test_cluster_labels]
    )
    valid_indices = test_predicted_labels != -1

    print("\nClassification Report:")
    print(classification_report(
        le.inverse_transform(y_test[valid_indices]),
        le.inverse_transform(test_predicted_labels[valid_indices]),
        zero_division=0,
        digits=4
    ))

    # Plot the training points coloured by the cluster K-Means assigned them to
    evaluate_performance(X_train, y_train, cluster_labels, kmeans)


compute_knn()

#### SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


def compute_svm():
    """
    Train an RBF-kernel SVM on a stratified 75/25 split and print its
    classification report and confusion matrix for the held-out part.

    Reads the global feature matrix X and the one-hot label frame y.
    Returns None.
    """
    y_single = y.idxmax(axis=1).map({'FTR_A': 'Away', 'FTR_D': 'Draw', 'FTR_H': 'Home'})

    # Encode the string labels as integers
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_single)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=0.25,
        random_state=42,
        stratify=y_encoded
    )

    # Standardize features with statistics learned from the training rows
    # only, so nothing about the test rows reaches the model
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train SVM model
    svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
    svm_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = svm_model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=le.classes_, digits=4)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Display performance
    print("Classification Report:\n", report)
    # Rows and columns follow le.classes_
    print("\nConfusion Matrix:")
    print(conf_matrix)

compute_svm()


### 5.2 Chosen Approach - Gradient Boosted Decision Tree

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

# Integer class ids for XGBoost, one per one-hot result column
# (FTR_A -> 0, FTR_D -> 1, FTR_H -> 2)
label_mapping = {'FTR_A': 0, 'FTR_D': 1, 'FTR_H': 2}


def one_hot_to_single(y):
    """Collapse the one-hot FTR_* columns into a Series of integer class ids."""
    return y.idxmax(axis=1).map(label_mapping)


# Convert the one-hot labels of every split
y_train_single = one_hot_to_single(y_train)
y_val_single = one_hot_to_single(y_val)
y_test_single = one_hot_to_single(y_test)

# Class weighting was explored (scikit-learn 'balanced' weights, and manually
# adjusted weights {0: 1.07, 1: 1.63, 2: 0.91} turned into per-sample weights)
# but is not used by the model below.

# Initialise model
xgb_clf = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    # With several metrics, XGBoost uses the LAST one for early stopping,
    # so mlogloss is listed last.
    eval_metric=['merror', 'mlogloss'],
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,

    # Lower learning_rate means the model trains more slowly but can generalize better
    learning_rate=0.001,

    # Upper bound on boosting rounds; early stopping may end training sooner.
    # NOTE(review): 100 rounds is few for a learning rate this small.
    n_estimators=100,

    # Reduce max_depth to reduce model complexity
    max_depth=4,

    # Increase min_child_weight to require more samples at leaf nodes
    min_child_weight=5,

    # Increase gamma if you want to further penalize splits
    gamma=0.5,

    # Increase regularization to penalize large leaf weights
    reg_alpha=2.0,
    reg_lambda=15.0,

    # Stop when the validation metric has not improved for 5 rounds.
    # The parameter is `early_stopping_rounds`; `early_stopping` is not an
    # XGBClassifier parameter, so with that name no early stopping took place.
    early_stopping_rounds=5,

    booster='dart',          # Enable dropout-based boosting
    sample_type='uniform',   # How to sample trees to drop (can be 'uniform' or 'weighted')
    normalize_type='tree',   # How to normalize tree weight (can be 'tree' or 'forest')
    rate_drop=0.1,           # Dropout rate for trees (tune as needed)
    skip_drop=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,   # further reduce the chance of overfitting
    colsample_bynode=0.8,
    grow_policy='lossguide',    # or 'depthwise'
    max_leaves=32,
)

# Train the classifier.
# eval_set order matters: 'validation_0' is the training set and
# 'validation_1' the validation set; the last entry drives early stopping.
xgb_clf.fit(
    X_train,
    y_train_single,
    eval_set=[(X_train, y_train_single), (X_val, y_val_single)],
    verbose=True
)



In [None]:
# Metric history recorded per boosting round during fit()
evals_result = xgb_clf.evals_result()
train_history = evals_result['validation_0']
val_history = evals_result['validation_1']

# Log loss curves
train_logloss = train_history['mlogloss']
val_logloss = val_history['mlogloss']

# Error-rate curves, turned into accuracy (accuracy = 1 - error)
train_merror = train_history['merror']
val_merror = val_history['merror']
train_accuracy = [1 - error for error in train_merror]
val_accuracy = [1 - error for error in val_merror]

# One x position per boosting round, counted from 1
num_rounds = len(train_logloss)
rounds = range(1, num_rounds + 1)

# Each panel: (metric name, training curve, validation curve, y-axis limits)
panels = [
    ('Accuracy', train_accuracy, val_accuracy, (0, 1)),
    ('Log Loss', train_logloss, val_logloss, (0, 10)),
]

plt.figure(figsize=(14, 6))

for position, (metric, train_curve, val_curve, y_limits) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(rounds, train_curve, label=f'Training {metric}')
    plt.plot(rounds, val_curve, label=f'Validation {metric}')
    plt.xlabel('Boosting Rounds')
    plt.ylabel(metric)
    plt.ylim(*y_limits)
    plt.title(f'XGBoost Training and Validation {metric}')
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Class names in label order. label_mapping assigns FTR_A -> 0, FTR_D -> 1,
# FTR_H -> 2, and FTR_A is the away win, FTR_H the home win.
class_names = ['Away Win', 'Draw', 'Home Win']

# Make predictions on the test set
y_pred_xgb = xgb_clf.predict(X_test)

# Calculate test accuracy
test_accuracy_xgb = accuracy_score(y_test_single, y_pred_xgb)
print(f"XGBoost Test Accuracy: {test_accuracy_xgb:.2f}")

# Generate classification report
print("\nXGBoost Classification Report:")
print(classification_report(
    y_test_single,
    y_pred_xgb,
    target_names=['FTR_A (Away Win)', 'FTR_D (Draw)', 'FTR_H (Home Win)'],
    digits=4
))

# Generate confusion matrix (rows: actual class, columns: predicted class)
print("\nXGBoost Confusion Matrix:")
conf_matrix = confusion_matrix(y_test_single, y_pred_xgb)
print(conf_matrix)

# Visualize the confusion matrix
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('XGBoost Confusion Matrix')
plt.show()

### 6. Results

### 7. Final Predictions on Test Set

In [359]:
# Code for Section 7. Final Predictions on Test Set