In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [34]:
import os
import glob

dataset_dir = "../dataset"

pattern = os.path.join(dataset_dir, "*.csv")
files = glob.glob(pattern)


season_dfs = {}
for path in files:
    fname= os.path.basename(path)
    if not (fname.startswith("Season ")or fname.startswith("20")or fname.startswith("epl-2024-2025")):
        continue
    if fname.startswith("Season "):
        raw = fname[len("Season"):-4]
        season = raw[:4]+ "_" + raw[4:]
    elif fname.startswith("epl-2024-2025"):
        season="2024_25"
    else:
        raw = fname[:-4]
        season = raw[:4] + " " + raw[4:]
    season_dfs[season] = pd.read_csv(path)

    combined = pd.concat([df.assign(season=season) for season,df in season_dfs.items()], ignore_index=True)

In [35]:
combined.to_csv("../dataset/epl_all_seasons_2009_25.csv", index=False)

In [17]:
df = pd.read_csv("../dataset/epl_all_seasons_2009_25.csv")

In [18]:
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,season
0,E0,11/8/2023,20:00,Burnley,Man City,0.0,3.0,A,0.0,2.0,...,8.0,11.0,8.0,6.0,5.0,0.0,0.0,1.0,0.0,2023 2024
1,E0,12/8/2023,12:30,Arsenal,Nott'm Forest,2.0,1.0,H,2.0,0.0,...,2.0,12.0,12.0,8.0,3.0,2.0,2.0,0.0,0.0,2023 2024
2,E0,12/8/2023,15:00,Bournemouth,West Ham,1.0,1.0,D,0.0,0.0,...,3.0,9.0,14.0,10.0,4.0,1.0,4.0,0.0,0.0,2023 2024
3,E0,12/8/2023,15:00,Brighton,Luton,4.0,1.0,H,1.0,0.0,...,3.0,11.0,12.0,6.0,7.0,2.0,2.0,0.0,0.0,2023 2024
4,E0,12/8/2023,15:00,Everton,Fulham,0.0,1.0,A,0.0,0.0,...,2.0,12.0,6.0,10.0,4.0,0.0,2.0,0.0,0.0,2023 2024


In [22]:
df.isnull().sum()

Div            1
Date           1
Time        3801
HomeTeam       1
AwayTeam       1
FTHG           1
FTAG           1
FTR            1
HTHG           1
HTAG           1
HTR            1
Referee        1
HS             1
AS             1
HST            1
AST            1
HF             1
AF             1
HC             1
AC             1
HY             1
AY             1
HR             1
AR             1
season         0
dtype: int64

In [19]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("\nNumeric summary:\n", df[numeric_cols].describe())

cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    print(f"\nValue counts for {col}:\n", df[col].value_counts().head(5))



Numeric summary:
               FTHG         FTAG         HTHG         HTAG           HS  \
count  6080.000000  6080.000000  6080.000000  6080.000000  6080.000000   
mean      1.574836     1.227138     0.702632     0.544572    14.090461   
std       1.320990     1.185766     0.851381     0.755951     5.585452   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       1.000000     0.000000     0.000000     0.000000    10.000000   
50%       1.000000     1.000000     0.000000     0.000000    13.000000   
75%       2.000000     2.000000     1.000000     1.000000    17.000000   
max       9.000000     9.000000     5.000000     5.000000    43.000000   

                AS          HST          AST           HF           AF  \
count  6080.000000  6080.000000  6080.000000  6080.000000  6080.000000   
mean     11.405592     5.569572     4.522862    10.583882    11.063487   
std       4.926033     3.206666     2.752403     3.438344     3.607934   
min       0.000000

In [37]:
df["season"].value_counts().sort_index()


season
 200_92010    380
 201_02011    380
 201_12012    380
 201_22013    380
 201_32014    380
 201_42015    381
 201_52016    380
 201_62017    380
 201_72018    380
 201_82019    380
 201_92020    380
 202_02021    380
 202_12022    380
 202_22023    380
2023 2024     380
2024_25       380
Name: count, dtype: int64

In [33]:
all_teams = df["HomeTeam"].unique()
team_home_stats = []
team_away_stats =[]
for team in all_teams:
    home_games = df[df["HomeTeam"] == team]
    total_home_games = len(home_games)
    wins = home_games[home_games["FTR"] == "H"].shape[0]
    draws = home_games[home_games["FTR"] == "D"].shape[0]
    losses = home_games[home_games["FTR"] == "A"].shape[0]
    played_against = [home_games["AwayTeam"]]
    
    goals_for = home_games["FTHG"].sum()
    goals_against = home_games["FTAG"].sum()
    
    team_home_stats.append({
        "team": team,
        "home_games": total_home_games,
        "home_wins": wins,
        "home_draws": draws,
        "home_losses": losses,
        "goals_for_at_home": goals_for,
        "goals_against_at_home": goals_against,
        
    })

    away_games = df[df["AwayTeam"] == team]
    total_away_games = len(away_games)

    
    away_wins   = away_games[away_games["FTR"] == "A"].shape[0]
    away_draws  = away_games[away_games["FTR"] == "D"].shape[0]
    away_losses = away_games[away_games["FTR"] == "H"].shape[0]
    
    goals_for_away     = away_games["FTAG"].sum()
    goals_against_away = away_games["FTHG"].sum()

    team_away_stats.append({
        "team": team,
        "away_games": total_away_games,
        "away_wins": away_wins,
        "away_draws": away_draws,
        "away_losses": away_losses,
        "goals_for_away": goals_for_away,
        "goals_against_away": goals_against_away,
        
    })


In [34]:
team_home_Stats= pd.DataFrame(team_home_stats)
team_away_Stats= pd.DataFrame(team_away_stats)

In [38]:
df = df.copy()
df["home_points"] = df["FTR"].map({"H": 3, "D": 1, "A": 0})
df["away_points"] = df["FTR"].map({"A": 3, "D": 1, "H": 0})

home_df = df[["season", "Date", "HomeTeam", "home_points"]].rename(
    columns={"HomeTeam": "team", "home_points": "points", "Date": "date"}
)
away_df = df[["season", "Date", "AwayTeam", "away_points"]].rename(
    columns={"AwayTeam": "team", "away_points": "points", "Date": "date"}
)
long = pd.concat([home_df, away_df], ignore_index=True).sort_values(["team", "season", "date"])

long["rolling_5pts"] = (
    long
    .groupby(["team", "season"])["points"]
    .rolling(window=5, min_periods=1)
    .sum()
    .reset_index(level=[0,1], drop=True)
)
