# 02 - Data Preparation

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
pd.set_option('display.max_columns',None)

## Reading Data

In [3]:
data = os.path.join('..','data', 'processed','all_concat_football_data.csv')
data

'../data/processed/all_concat_football_data.csv'

In [4]:
df = pd.read_csv(data)
df.head(30)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Season
0,13/08/2005,Aston Villa,Bolton,2.0,2.0,D,2.0,2.0,D,M Riley,3.0,13.0,2.0,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,2.3,3.25,3.0,2005
1,13/08/2005,Everton,Man United,0.0,2.0,A,0.0,1.0,A,G Poll,10.0,12.0,5.0,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,5.0,3.4,1.72,2005
2,13/08/2005,Fulham,Birmingham,0.0,0.0,D,0.0,0.0,D,R Styles,15.0,7.0,7.0,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,2.37,3.25,2.87,2005
3,13/08/2005,Man City,West Brom,0.0,0.0,D,0.0,0.0,D,C Foy,15.0,13.0,8.0,3.0,13.0,11.0,3.0,6.0,2.0,3.0,0.0,0.0,1.72,3.4,5.0,2005
4,13/08/2005,Middlesbrough,Liverpool,0.0,0.0,D,0.0,0.0,D,M Halsey,4.0,16.0,2.0,7.0,17.0,11.0,5.0,0.0,2.0,3.0,1.0,0.0,2.87,3.2,2.4,2005
5,13/08/2005,Portsmouth,Tottenham,0.0,2.0,A,0.0,1.0,A,B Knight,11.0,11.0,7.0,6.0,13.0,23.0,7.0,2.0,0.0,2.0,0.0,0.0,2.75,3.2,2.5,2005
6,13/08/2005,Sunderland,Charlton,1.0,3.0,A,1.0,1.0,D,H Webb,12.0,14.0,8.0,4.0,15.0,17.0,5.0,5.0,2.0,0.0,0.0,1.0,2.37,3.2,2.9,2005
7,13/08/2005,West Ham,Blackburn,3.0,1.0,H,0.0,1.0,A,A Wiley,13.0,11.0,5.0,5.0,11.0,14.0,2.0,6.0,0.0,1.0,0.0,1.0,2.5,3.2,2.75,2005
8,14/08/2005,Arsenal,Newcastle,2.0,0.0,H,0.0,0.0,D,S Bennett,15.0,2.0,12.0,1.0,15.0,17.0,8.0,3.0,0.0,1.0,0.0,1.0,1.4,4.2,8.0,2005
9,14/08/2005,Wigan,Chelsea,0.0,1.0,A,0.0,0.0,D,M Clattenburg,12.0,20.0,5.0,9.0,14.0,7.0,2.0,6.0,1.0,0.0,0.0,0.0,10.0,4.5,1.33,2005


In [5]:
def fixing_columns_teams_referees(df):
    # Lowercase all columns and replace spaces with underscores
    df.columns = df.columns.str.lower().str.replace(' ', '_')
      
    # Defining the renaming dictionary
    columns = {
        'hometeam': 'home_team',
        'awayteam': 'away_team',
        'fthg': 'home_total_goals',
        'ftag': 'away_total_goals',
        'ftr': 'full_time_result',
        'hthg': 'home_half_goals',
        'htag': 'away_half_goals',
        'htr': 'half_time_result',
        'hs': 'home_total_shots',
        'as': 'away_total_shots',
        'hst': 'home_shots_on_target',
        'ast': 'away_shots_on_target',
        'hf': 'home_fouls',
        'af': 'away_fouls',
        'hc': 'home_corners',
        'ac': 'away_corners',
        'hy': 'home_yellow_cards',
        'ay': 'away_yellow_cards',
        'hr': 'home_red_cards',
        'ar': 'away_red_cards',
        'b365h': 'market_home_odds',
        'b365d': 'market_draw_odds',
        'b365a': 'market_away_odds'
    }
    
    df.rename(columns=columns, inplace=True)
    
    for col in ['home_team', 'away_team']:
        if col in df.columns:
            df[col] = df[col].str.lower().str.replace("'", "")  # Remove apostrophe specifically
            

    if 'referee' in df.columns:
        df['referee'] = df['referee'].str.lower().replace(' ', '_')
    
    return df


In [6]:
df = fixing_columns_teams_referees(df)
df.head(3)

Unnamed: 0,date,home_team,away_team,home_total_goals,away_total_goals,full_time_result,home_half_goals,away_half_goals,half_time_result,referee,home_total_shots,away_total_shots,home_shots_on_target,away_shots_on_target,home_fouls,away_fouls,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,market_home_odds,market_draw_odds,market_away_odds,season
0,13/08/2005,aston villa,bolton,2.0,2.0,D,2.0,2.0,D,m riley,3.0,13.0,2.0,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,2.3,3.25,3.0,2005
1,13/08/2005,everton,man united,0.0,2.0,A,0.0,1.0,A,g poll,10.0,12.0,5.0,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,5.0,3.4,1.72,2005
2,13/08/2005,fulham,birmingham,0.0,0.0,D,0.0,0.0,D,r styles,15.0,7.0,7.0,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,2.37,3.25,2.87,2005


In [7]:
df.columns

Index(['date', 'home_team', 'away_team', 'home_total_goals',
       'away_total_goals', 'full_time_result', 'home_half_goals',
       'away_half_goals', 'half_time_result', 'referee', 'home_total_shots',
       'away_total_shots', 'home_shots_on_target', 'away_shots_on_target',
       'home_fouls', 'away_fouls', 'home_corners', 'away_corners',
       'home_yellow_cards', 'away_yellow_cards', 'home_red_cards',
       'away_red_cards', 'market_home_odds', 'market_draw_odds',
       'market_away_odds', 'season'],
      dtype='object')

In [8]:
df['home_team'].value_counts().to_frame().reset_index().rename(columns={'index': 'team', 'home_team': 'count'}).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43
count,tottenham,man united,everton,man city,liverpool,chelsea,arsenal,west ham,newcastle,aston villa,fulham,crystal palace,southampton,sunderland,west brom,stoke,leicester,wolves,burnley,wigan,bournemouth,brighton,watford,blackburn,swansea,bolton,norwich,portsmouth,hull,middlesbrough,birmingham,sheffield united,brentford,qpr,reading,leeds,nottm forest,charlton,huddersfield,cardiff,derby,blackpool,luton,ipswich
count,372,372,371,371,371,371,371,353,333,315,258,220,219,209,209,190,182,181,171,152,143,143,133,133,133,133,114,95,95,95,76,76,68,57,57,57,48,38,38,38,19,19,19,11


## Exploratory data analysis

### Missing Values

In [9]:
## checking for missing values
df.isnull().sum()

date                    0
home_team               0
away_team               0
home_total_goals        0
away_total_goals        0
full_time_result        0
home_half_goals         0
away_half_goals         0
half_time_result        0
referee                 0
home_total_shots        0
away_total_shots        0
home_shots_on_target    0
away_shots_on_target    0
home_fouls              0
away_fouls              0
home_corners            0
away_corners            0
home_yellow_cards       0
away_yellow_cards       0
home_red_cards          0
away_red_cards          0
market_home_odds        0
market_draw_odds        0
market_away_odds        0
season                  0
dtype: int64

In [10]:
# checking row with NaN values
df[df.isnull().any(axis=1)]

Unnamed: 0,date,home_team,away_team,home_total_goals,away_total_goals,full_time_result,home_half_goals,away_half_goals,half_time_result,referee,home_total_shots,away_total_shots,home_shots_on_target,away_shots_on_target,home_fouls,away_fouls,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,market_home_odds,market_draw_odds,market_away_odds,season


In [11]:
# removing missing row and reset index
df = df.dropna().reset_index(drop=True)

### Duplicates

In [12]:
# checking for duplicates
df.duplicated().sum()

np.int64(0)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7429 entries, 0 to 7428
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  7429 non-null   object 
 1   home_team             7429 non-null   object 
 2   away_team             7429 non-null   object 
 3   home_total_goals      7429 non-null   float64
 4   away_total_goals      7429 non-null   float64
 5   full_time_result      7429 non-null   object 
 6   home_half_goals       7429 non-null   float64
 7   away_half_goals       7429 non-null   float64
 8   half_time_result      7429 non-null   object 
 9   referee               7429 non-null   object 
 10  home_total_shots      7429 non-null   float64
 11  away_total_shots      7429 non-null   float64
 12  home_shots_on_target  7429 non-null   float64
 13  away_shots_on_target  7429 non-null   float64
 14  home_fouls            7429 non-null   float64
 15  away_fouls           

In [14]:
def feature_engineering(df):
  # ---------------- Feature Engineering ----------------

  # ---------------- Goal Difference ----------------

  # Goal Difference
  df['goal_difference'] = df['home_total_goals'] - df['away_total_goals']

  # Aggregated Match Statistics
  df['total_shots'] = df['home_total_shots'] + df['away_total_shots']
  df['total_shots_on_target'] = df['home_shots_on_target'] + df['away_shots_on_target']
  df['total_fouls'] = df['home_fouls'] + df['away_fouls']
  df['total_corners'] = df['home_corners'] + df['away_corners']
  df['home_shot_accuracy'] = df['home_shots_on_target'] / df['home_total_shots'].replace(0, 1)
  df['away_shot_accuracy'] = df['away_shots_on_target'] / df['away_total_goals'].replace(0, 1)

  # Time-Based Features
  df['original_date'] = df['date']
  df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y', errors='coerce')
  df['date'] = df['date'].combine_first(pd.to_datetime(df['original_date'], format='%d/%m/%Y', errors='coerce'))
  df['date'] = df['date'].dt.strftime('%d/%m/%y')
  df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y')
  df.drop(columns=['original_date'], inplace=True)
  df['day_of_week'] = df['date'].dt.dayofweek
  df['month'] = df['date'].dt.month
  df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 6.0)
  df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 6.0)
  df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12.0)
  df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12.0)

  # Team-Based Features
  df['ratio_h_a_shots'] = df['home_total_shots'] / df['away_total_shots'].replace(0, 1)
  df['ratio_h_a_fouls'] = df['home_fouls'] / df['away_fouls'].replace(0, 1)
  df['ratio_a_h_shots'] = df['away_total_shots'] / df['home_total_shots'].replace(0, 1)
  df['ratio_a_h_fouls'] = df['away_fouls'] / df['home_fouls'].replace(0, 1)

  # Betting Odds-Based Features
  df['implied_home_win_prob'] = 1 / df['market_home_odds']
  df['implied_draw_prob'] = 1 / df['market_draw_odds']
  df['implied_away_win_prob'] = 1 / df['market_away_odds']
  total_prob = df['implied_home_win_prob'] + df['implied_draw_prob'] + df['implied_away_win_prob']
  df['implied_home_win_prob'] /= total_prob
  df['implied_draw_prob'] /= total_prob
  df['implied_away_win_prob'] /= total_prob

  # Rolling Averages
  features = ['home_total_goals', 'away_total_goals', 'home_total_shots', 'away_total_shots', 
        'home_shots_on_target', 'away_shots_on_target', 'home_fouls', 'away_fouls',
        'home_corners', 'away_corners', 'home_yellow_cards', 'away_yellow_cards',
        'home_red_cards', 'away_red_cards', 'home_shot_accuracy', 'away_shot_accuracy',
        'ratio_h_a_shots', 'ratio_h_a_fouls', 'ratio_a_h_shots', 
        'ratio_a_h_fouls', 'goal_difference']
  new_columns = []
  for i in [3, 5]:
    for feature in features:
      home_rolling = (
        df.sort_values(['season', 'home_team', 'date'])
          .groupby(['season', 'home_team'])[feature]
          .apply(lambda x: x.shift(1).rolling(window=i).mean())
          .reset_index(level=[0,1], drop=True)
          .fillna(0)
      )
      away_rolling = (
        df.sort_values(['season', 'away_team', 'date'])
          .groupby(['season', 'away_team'])[feature]
          .apply(lambda x: x.shift(1).rolling(window=i).mean())
          .reset_index(level=[0,1], drop=True)
          .fillna(0)
      )
      new_columns.append(home_rolling.rename(f'home_roll_{i}_avg_{feature}'))
      new_columns.append(away_rolling.rename(f'away_roll_{i}_avg_{feature}'))
  df = pd.concat([df] + new_columns, axis=1)

  # Cumulative Points
  home_points = df['full_time_result'].apply(lambda x: 3 if x == 'H' else (1 if x == 'D' else 0))
  away_points = df['full_time_result'].apply(lambda x: 3 if x == 'A' else (1 if x == 'D' else 0))
  df = pd.concat([df, home_points.rename('home_points'), away_points.rename('away_points')], axis=1)
  df['home_cumulative_points'] = df.groupby(['season', 'home_team'])['home_points'].transform('cumsum')
  df['away_cumulative_points'] = df.groupby(['season', 'away_team'])['away_points'].transform('cumsum')
  df.drop(columns=['home_points', 'away_points'], inplace=True)
  
  
  # removing uneccessary columns
  columns_to_drop = ['date', 'home_total_goals',
      'away_total_goals', 'home_half_goals',
      'away_half_goals', 'half_time_result', 'home_total_shots',
      'away_total_shots', 'home_shots_on_target', 'away_shots_on_target',
      'home_fouls', 'away_fouls', 'home_corners', 'away_corners',
      'home_yellow_cards', 'away_yellow_cards', 'home_red_cards',
      'away_red_cards','goal_difference','total_shots',
      'total_shots_on_target','total_fouls','total_corners','home_shot_accuracy',
      'away_shot_accuracy','ratio_h_a_shots','ratio_h_a_fouls',
      'ratio_a_h_shots','ratio_a_h_fouls','referee','market_home_odds',
      'market_draw_odds','market_away_odds']

  df_preparation = df.drop(columns=columns_to_drop)

  return df_preparation


df = feature_engineering(df)

df.head()

Unnamed: 0,home_team,away_team,full_time_result,season,day_of_week,month,day_of_week_sin,day_of_week_cos,month_sin,month_cos,implied_home_win_prob,implied_draw_prob,implied_away_win_prob,home_roll_3_avg_home_total_goals,away_roll_3_avg_home_total_goals,home_roll_3_avg_away_total_goals,away_roll_3_avg_away_total_goals,home_roll_3_avg_home_total_shots,away_roll_3_avg_home_total_shots,home_roll_3_avg_away_total_shots,away_roll_3_avg_away_total_shots,home_roll_3_avg_home_shots_on_target,away_roll_3_avg_home_shots_on_target,home_roll_3_avg_away_shots_on_target,away_roll_3_avg_away_shots_on_target,home_roll_3_avg_home_fouls,away_roll_3_avg_home_fouls,home_roll_3_avg_away_fouls,away_roll_3_avg_away_fouls,home_roll_3_avg_home_corners,away_roll_3_avg_home_corners,home_roll_3_avg_away_corners,away_roll_3_avg_away_corners,home_roll_3_avg_home_yellow_cards,away_roll_3_avg_home_yellow_cards,home_roll_3_avg_away_yellow_cards,away_roll_3_avg_away_yellow_cards,home_roll_3_avg_home_red_cards,away_roll_3_avg_home_red_cards,home_roll_3_avg_away_red_cards,away_roll_3_avg_away_red_cards,home_roll_3_avg_home_shot_accuracy,away_roll_3_avg_home_shot_accuracy,home_roll_3_avg_away_shot_accuracy,away_roll_3_avg_away_shot_accuracy,home_roll_3_avg_ratio_h_a_shots,away_roll_3_avg_ratio_h_a_shots,home_roll_3_avg_ratio_h_a_fouls,away_roll_3_avg_ratio_h_a_fouls,home_roll_3_avg_ratio_a_h_shots,away_roll_3_avg_ratio_a_h_shots,home_roll_3_avg_ratio_a_h_fouls,away_roll_3_avg_ratio_a_h_fouls,home_roll_3_avg_goal_difference,away_roll_3_avg_goal_difference,home_roll_5_avg_home_total_goals,away_roll_5_avg_home_total_goals,home_roll_5_avg_away_total_goals,away_roll_5_avg_away_total_goals,home_roll_5_avg_home_total_shots,away_roll_5_avg_home_total_shots,home_roll_5_avg_away_total_shots,away_roll_5_avg_away_total_shots,home_roll_5_avg_home_shots_on_target,away_roll_5_avg_home_shots_on_target,home_roll_5_avg_away_shots_on_target,away_roll_5_avg_away_shots_on_target,home_roll_5_avg_home_fouls,away_roll_5_avg_home_fouls,home_roll_5_avg_away_fouls,away_roll_5_avg_away_fouls,home_roll_5_avg_home_corners,away_roll_5_avg_home_corners,home_roll_5_avg_away_corners,away_roll_5_avg_away_corners,home_roll_5_avg_home_yellow_cards,away_roll_5_avg_home_yellow_cards,home_roll_5_avg_away_yellow_cards,away_roll_5_avg_away_yellow_cards,home_roll_5_avg_home_red_cards,away_roll_5_avg_home_red_cards,home_roll_5_avg_away_red_cards,away_roll_5_avg_away_red_cards,home_roll_5_avg_home_shot_accuracy,away_roll_5_avg_home_shot_accuracy,home_roll_5_avg_away_shot_accuracy,away_roll_5_avg_away_shot_accuracy,home_roll_5_avg_ratio_h_a_shots,away_roll_5_avg_ratio_h_a_shots,home_roll_5_avg_ratio_h_a_fouls,away_roll_5_avg_ratio_h_a_fouls,home_roll_5_avg_ratio_a_h_shots,away_roll_5_avg_ratio_a_h_shots,home_roll_5_avg_ratio_a_h_fouls,away_roll_5_avg_ratio_a_h_fouls,home_roll_5_avg_goal_difference,away_roll_5_avg_goal_difference,home_cumulative_points,away_cumulative_points
0,aston villa,bolton,D,2005,5,8,-0.866025,0.5,-0.866025,-0.5,0.404145,0.28601,0.309845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
1,everton,man united,A,2005,5,8,-0.866025,0.5,-0.866025,-0.5,0.185958,0.273467,0.540575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3
2,fulham,birmingham,D,2005,5,8,-0.866025,0.5,-0.866025,-0.5,0.391387,0.285412,0.323201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
3,man city,west brom,D,2005,5,8,-0.866025,0.5,-0.866025,-0.5,0.540575,0.273467,0.185958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
4,middlesbrough,liverpool,D,2005,5,8,-0.866025,0.5,-0.866025,-0.5,0.323341,0.289997,0.386662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1


In [15]:
print(len(df.columns))

df.columns

99


Index(['home_team', 'away_team', 'full_time_result', 'season', 'day_of_week',
       'month', 'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos',
       'implied_home_win_prob', 'implied_draw_prob', 'implied_away_win_prob',
       'home_roll_3_avg_home_total_goals', 'away_roll_3_avg_home_total_goals',
       'home_roll_3_avg_away_total_goals', 'away_roll_3_avg_away_total_goals',
       'home_roll_3_avg_home_total_shots', 'away_roll_3_avg_home_total_shots',
       'home_roll_3_avg_away_total_shots', 'away_roll_3_avg_away_total_shots',
       'home_roll_3_avg_home_shots_on_target',
       'away_roll_3_avg_home_shots_on_target',
       'home_roll_3_avg_away_shots_on_target',
       'away_roll_3_avg_away_shots_on_target', 'home_roll_3_avg_home_fouls',
       'away_roll_3_avg_home_fouls', 'home_roll_3_avg_away_fouls',
       'away_roll_3_avg_away_fouls', 'home_roll_3_avg_home_corners',
       'away_roll_3_avg_home_corners', 'home_roll_3_avg_away_corners',
       'away_roll_3_avg

## Saving stats for current season

In [16]:
# current season 2024/2025
teams_stats_2024 = df[df['season'] == 2024]

# This will be useful for the fnal model to get the averages nedded for the predictions and will simplify the input data
teams_stats_2024.to_csv(os.path.join('..','data', 'processed','teams_stats_2024.csv'), index=False)

In [17]:
# I don't need the season column anymore
df = df.drop(columns=['season'])

## Saving stats for all seasons

In [18]:
# save the prepared data
df.to_csv(os.path.join('..','data','processed','prepared_football_data.csv'), index=False)