In [1]:
import pandas as pd

In [2]:
# Load the raw match log, using the first CSV column as the row index.
raw_matches_path = "../data/raw/matches.csv"
matches = pd.read_csv(raw_matches_path, index_col=0)

In [3]:
# Peek at the first five rows to sanity-check the load.
matches.head(n=5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,Match Report,,13.0,1.0,17.8,0.0,0,0,2022,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,Match Report,,25.0,9.0,16.8,1.0,0,1,2022,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,Match Report,,9.0,4.0,17.2,1.0,0,0,2022,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,Match Report,,17.0,4.0,14.7,0.0,0,0,2022,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,Match Report,,16.0,5.0,15.8,0.0,0,0,2022,Liverpool


In [4]:
# Inspect the dtype pandas inferred for each column before any cleaning.
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [5]:
# Clean the data
# Drop unnecessary columns ("comp" and "notes") from the frame in place.
matches.drop(columns=["comp", "notes"], inplace=True)

In [6]:
# Convert data types: parse "date" into datetimes and coerce "gf" to numeric.
matches = matches.assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    gf=lambda frame: pd.to_numeric(frame["gf"]),
)

In [7]:
# Order the matches newest-first. Only the "date" column is used as the
# sort key; kick-off time is not considered.
matches = matches.sort_values("date", ascending=False)

In [11]:
# Build per-team rolling "form" features: the mean of each team's most recent
# matches (up to 10) for goals scored, shots, and the shots / shots-on-target ratio.

# Sort chronologically so every rolling window walks forward in time.
matches = matches.sort_values(by='date')

# Reset to a fresh 0..n-1 index. The rolling results are aligned back onto the
# frame by index label, so the labels must be unique to avoid issues with
# duplicate indices.
matches.reset_index(drop=True, inplace=True)


def _team_rolling_mean(frame, column, window=10):
    """Return the per-team rolling mean of ``column``, indexed like ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'team' column and ``column``. Rows are expected to be
        in chronological order with a unique index.
    column : str
        Name of the numeric column to average.
    window : int, default 10
        Maximum number of rows per window. Because ``min_periods=1`` is used,
        a team's earliest rows are averaged over however many rows exist so far.

    Returns
    -------
    pandas.Series
        Rolling means with the 'team' group level dropped from the index, so
        the result can be assigned straight back onto ``frame``.
    """
    rolled = frame.groupby('team')[column].rolling(window=window, min_periods=1).mean()
    return rolled.reset_index(level=0, drop=True)


# NOTE(review): each window includes the current match's own value. If these
# features are later used to predict that same match's result, this leaks the
# outcome -- confirm whether the window should exclude the current row.
matches['gf_avg_last_10'] = _team_rolling_mean(matches, 'gf')
matches['sh_avg_last_10'] = _team_rolling_mean(matches, 'sh')

# Shots taken per shot on target. A match with zero shots on target would
# otherwise divide by zero and store inf, which is not a usable ratio for the
# averaged and binned features derived from this column; mask those rows to
# NaN (missing) instead.
shots_on_target = matches['sot'].where(matches['sot'] != 0)
matches['sh_sot_ratio'] = matches['sh'] / shots_on_target
matches['sh_sot_ratio_avg_last_10'] = _team_rolling_mean(matches, 'sh_sot_ratio')

In [12]:
# Categorize data: turn text columns into integer codes.
# Each source column is cast to a pandas categorical and its integer codes
# are stored under the paired name (missing values receive code -1).
category_code_columns = (
    ("result", "target"),
    ("venue", "venue_code"),
    ("opponent", "opp_code"),
)
for source_col, code_col in category_code_columns:
    matches[code_col] = matches[source_col].astype("category").cat.codes

# Kick-off hour: strip everything from the first ":" onward, then cast to int.
kickoff_hour_text = matches["time"].str.replace(":.+", "", regex=True)
matches["hour"] = kickoff_hour_text.astype("int")

# Day of week taken from the parsed date (pandas convention: Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek

In [13]:
# Bucket each rolling average into quintiles. With labels=False, pd.qcut
# returns the integer bin number (0-4) instead of an interval label.
quintile_code_columns = {
    'gf_avg_category_codes': 'gf_avg_last_10',
    'sh_avg_category_codes': 'sh_avg_last_10',
    'sh_sot_ratio_avg_category_codes': 'sh_sot_ratio_avg_last_10',
}
for code_col, average_col in quintile_code_columns.items():
    matches[code_col] = pd.qcut(matches[average_col], q=5, labels=False)

In [10]:
# Output the cleaned data: write the frame to a new CSV file, omitting the
# row index from the file.
cleaned_matches_path = '../data/processed/cleaned_matches.csv'
matches.to_csv(cleaned_matches_path, index=False)