In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [12]:
import os
import glob

dataset_dir = "../dataset"

pattern = os.path.join(dataset_dir, "*.csv")
files = glob.glob(pattern)


season_dfs = {}
for path in files:
    fname= os.path.basename(path)
    if not (fname.startswith("Season ")or fname.startswith("20")or fname.startswith("epl-2024-2025")):
        continue
    if fname.startswith("Season "):
        raw = fname[len("Season"):-4]
        season = raw[:4]+ "_" + raw[4:]
    elif fname.startswith("epl-2024-2025"):
        season="2024_25"
    else:
        raw = fname[:-4]
        season = raw[:4] + " " + raw[4:]
    season_dfs[season] = pd.read_csv(path)

    combined = pd.concat([df.assign(season=season) for season,df in season_dfs.items()], ignore_index=True)

In [None]:
combined.to_csv("../dataset/epl_all_seasons_2009_25.csv", index=False)

In [17]:
df = pd.read_csv("../dataset/epl_all_seasons_2009_25.csv")

In [3]:
df.isnull().sum()

Div         0
Date        0
Time        0
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
HTHG        0
HTAG        0
HTR         0
Referee     0
HS          0
AS          0
HST         0
AST         0
HF          0
AF          0
HC          0
AC          0
HY          0
AY          0
HR          0
AR          0
dtype: int64

In [4]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("\nNumeric summary:\n", df[numeric_cols].describe())

cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    print(f"\nValue counts for {col}:\n", df[col].value_counts().head(5))



Numeric summary:
              FTHG        FTAG        HTHG        HTAG          HS          AS  \
count  380.000000  380.000000  380.000000  380.000000  380.000000  380.000000   
mean     1.513158    1.421053    0.752632    0.610526   13.752632   12.165789   
std      1.277917    1.189922    0.875867    0.828831    5.588045    5.409987   
min      0.000000    0.000000    0.000000    0.000000    2.000000    1.000000   
25%      1.000000    1.000000    0.000000    0.000000   10.000000    9.000000   
50%      1.000000    1.000000    1.000000    0.000000   13.000000   11.500000   
75%      2.000000    2.000000    1.000000    1.000000   17.000000   15.000000   
max      7.000000    6.000000    4.000000    5.000000   36.000000   37.000000   

              HST         AST          HF          AF          HC          AC  \
count  380.000000  380.000000  380.000000  380.000000  380.000000  380.000000   
mean     4.834211    4.265789   10.789474   11.276316    5.426316    4.871053   
std     

In [5]:
df.head(2)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,16/08/2024,20:00,Man United,Fulham,1,0,H,0,0,...,5,2,12,10,7,8,2,3,0,0
1,E0,17/08/2024,12:30,Ipswich,Liverpool,0,2,A,0,0,...,2,5,9,18,2,10,3,1,0,0


In [6]:
all_teams = df["HomeTeam"].unique()
team_home_stats = []
team_away_stats =[]
for team in all_teams:
    home_games = df[df["HomeTeam"] == team]
    total_home_games = len(home_games)
    wins = home_games[home_games["FTR"] == "H"].shape[0]
    draws = home_games[home_games["FTR"] == "D"].shape[0]
    losses = home_games[home_games["FTR"] == "A"].shape[0]
    played_against = [home_games["AwayTeam"]]
    goals_for = home_games["FTHG"].sum()
    goals_against = home_games["FTAG"].sum()
    
    team_home_stats.append({
        "team": team,
        "home_games": total_home_games,
        "home_wins": wins,
        "home_draws": draws,
        "home_losses": losses,
        "goals_for_at_home": goals_for,
        "goals_against_at_home": goals_against,
    })

    away_games = df[df["AwayTeam"] == team]
    total_away_games = len(away_games)

    
    away_wins   = away_games[away_games["FTR"] == "A"].shape[0]
    away_draws  = away_games[away_games["FTR"] == "D"].shape[0]
    away_losses = away_games[away_games["FTR"] == "H"].shape[0]

    
    goals_for_away     = away_games["FTAG"].sum()
    goals_against_away = away_games["FTHG"].sum()

    team_away_stats.append({
        "team": team,
        "away_games": total_away_games,
        "away_wins": away_wins,
        "away_draws": away_draws,
        "away_losses": away_losses,
        "goals_for_away": goals_for_away,
        "goals_against_away": goals_against_away,
    })


In [7]:
team_home_Stats= pd.DataFrame(team_home_stats)
team_away_Stats= pd.DataFrame(team_away_stats)

In [8]:
team_home_Stats

Unnamed: 0,team,home_games,home_wins,home_draws,home_losses,goals_for_at_home,goals_against_at_home
0,Man United,19,7,3,9,23,28
1,Ipswich,19,1,4,14,14,44
2,Arsenal,19,11,6,2,35,17
3,Everton,19,5,9,5,26,23
4,Newcastle,19,12,2,5,40,20
5,Nott'm Forest,19,9,5,5,26,16
6,West Ham,19,5,5,9,23,34
7,Brentford,19,9,4,6,40,35
8,Chelsea,19,12,5,2,35,18
9,Leicester,19,4,3,12,15,34


In [9]:
team_away_Stats

Unnamed: 0,team,away_games,away_wins,away_draws,away_losses,goals_for_away,goals_against_away
0,Man United,19,4,6,9,21,26
1,Ipswich,19,3,6,10,22,38
2,Arsenal,19,9,8,2,34,17
3,Everton,19,6,6,7,16,21
4,Newcastle,19,8,4,7,28,27
5,Nott'm Forest,19,10,3,6,32,30
6,West Ham,19,6,5,8,23,28
7,Brentford,19,7,4,8,26,22
8,Chelsea,19,8,4,7,29,25
9,Leicester,19,2,4,13,18,46


In [10]:
X_train, X_test , Y_train, Y_test = train_test_split()

ValueError: At least one array required as input