In [30]:
import pandas as pd
from datetime import datetime
import os

In [89]:
# === CONFIG ===
# Labels stamped onto every row and reused when building the output filename
LEAGUE_NAME = "Premier League"
SEASON = "2017/18"

# CSV fetched by the extract step.
# NOTE(review): the "1718" path segment presumably encodes the season -- keep it
# in sync with SEASON when pointing this notebook at another year.
DATA_URL = "https://www.football-data.co.uk/mmz4281/1718/E0.csv"

# Folder the CSV is written to; created up front, no error if it already exists
CSV_DIR = "../data/raw"
os.makedirs(name=CSV_DIR, exist_ok=True)

In [90]:
# === 1. EXTRACT ===
# Load the match CSV at DATA_URL straight into a DataFrame (network access required).
# The progress message previously contained a mis-encoded emoji (UTF-8 bytes read
# as cp1252); it is restored to the intended character here.
print("📥 Downloading data...")
df = pd.read_csv(DATA_URL)

📥 Downloading data...


In [91]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,E0,11/08/2017,Arsenal,Leicester,4,3,H,2,2,D,...,2.32,21,-1.0,1.91,1.85,2.1,2.02,1.49,4.73,7.25
1,E0,12/08/2017,Brighton,Man City,0,2,A,0,0,D,...,2.27,20,1.5,1.95,1.91,2.01,1.96,11.75,6.15,1.29
2,E0,12/08/2017,Chelsea,Burnley,2,3,A,0,3,A,...,2.23,20,-1.75,2.03,1.97,1.95,1.9,1.33,5.4,12.25
3,E0,12/08/2017,Crystal Palace,Huddersfield,0,3,A,0,2,A,...,1.72,18,-0.75,2.1,2.05,1.86,1.83,1.79,3.56,5.51
4,E0,12/08/2017,Everton,Stoke,1,0,H,1,0,H,...,1.76,19,-0.75,1.94,1.9,2.01,1.98,1.82,3.49,5.42


In [92]:
# === 2. TRANSFORM ===
# Rename the source columns to snake_case, keep only the match, shot and odds
# columns, tag every row with league/season, and parse the match date.
print("🧹 Cleaning data...")

# Source column name -> analysis-friendly name
column_renames = {
    'Date': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'FTHG': 'home_goals',
    'FTAG': 'away_goals',
    'FTR': 'result',
    'HS': 'home_shots',
    'AS': 'away_shots',
    'HST': 'home_shots_on_target',
    'AST': 'away_shots_on_target',
    'B365H': 'odds_home',
    'B365D': 'odds_draw',
    'B365A': 'odds_away'
}

# Columns retained for analysis, in output order (post-rename names)
keep_cols = [
    'date', 'home_team', 'away_team', 'home_goals', 'away_goals', 'result',
    'home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target',
    'odds_home', 'odds_draw', 'odds_away'
]

# rename -> select -> assign produces a brand-new frame in one chain. Adding
# columns through .assign() (instead of item assignment on the column subset)
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
#
# Dates in the source are day-first (e.g. 11/08/2017 is 11 August), so
# dayfirst=True must always apply. The previous fallback dropped it when parsing
# failed, which could silently swap day and month; errors='coerce' instead turns
# anything unparseable into NaT so it can be reported below.
df = df.rename(columns=column_renames)[keep_cols].assign(
    league=LEAGUE_NAME,
    season=SEASON,
    date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True, errors='coerce'),
)

# Surface bad or missing dates instead of hiding them
n_bad_dates = int(df['date'].isna().sum())
if n_bad_dates:
    print(f"⚠️ {n_bad_dates} row(s) have a missing or unparseable date (set to NaT)")

🧹 Cleaning data...


In [93]:
# Check column dtypes and non-null counts of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  380 non-null    datetime64[ns]
 1   home_team             380 non-null    object        
 2   away_team             380 non-null    object        
 3   home_goals            380 non-null    int64         
 4   away_goals            380 non-null    int64         
 5   result                380 non-null    object        
 6   home_shots            380 non-null    int64         
 7   away_shots            380 non-null    int64         
 8   home_shots_on_target  380 non-null    int64         
 9   away_shots_on_target  380 non-null    int64         
 10  odds_home             380 non-null    float64       
 11  odds_draw             380 non-null    float64       
 12  odds_away             380 non-null    float64       
 13  league              

In [94]:
# === 3. SAVE RAW CSV ===
# Build the output filename from the config values. "/" in SEASON would act as a
# path separator and the space in LEAGUE_NAME is awkward in paths, so both are
# replaced before joining with CSV_DIR.
season_fmt = SEASON.replace("/", "-")
league_fmt = LEAGUE_NAME.replace(" ", "_")
csv_filename = os.path.join(CSV_DIR, f"{league_fmt}_{season_fmt}.csv")

# index=False: do not write the row index as an extra column
df.to_csv(csv_filename, index=False)
# The confirmation message previously contained a mis-encoded emoji (UTF-8 bytes
# read as cp1252); it is restored to the intended character here.
print(f"💾 Saved raw matches to {csv_filename}")

💾 Saved raw matches to ../data/raw\Premier_League_2017-18.csv


In [95]:
# Read back the file written by the save step to verify the round trip.
# csv_filename is reused instead of repeating the path as a literal, so this cell
# keeps working when LEAGUE_NAME, SEASON or CSV_DIR change. CSV stores dates as
# text, so 'date' is re-parsed to restore the datetime dtype.
premier_df = pd.read_csv(csv_filename, parse_dates=['date'])

In [96]:
# Preview the first five rows of the frame read back from disk
premier_df.head(n=5)

Unnamed: 0,date,home_team,away_team,home_goals,away_goals,result,home_shots,away_shots,home_shots_on_target,away_shots_on_target,odds_home,odds_draw,odds_away,league,season
0,2017-08-11,Arsenal,Leicester,4,3,H,27,6,10,3,1.53,4.5,6.5,Premier League,2017/18
1,2017-08-12,Brighton,Man City,0,2,A,6,14,2,4,11.0,5.5,1.33,Premier League,2017/18
2,2017-08-12,Chelsea,Burnley,2,3,A,19,10,6,5,1.25,6.5,15.0,Premier League,2017/18
3,2017-08-12,Crystal Palace,Huddersfield,0,3,A,14,8,4,6,1.83,3.6,5.0,Premier League,2017/18
4,2017-08-12,Everton,Stoke,1,0,H,9,9,4,1,1.7,3.8,5.75,Premier League,2017/18
