# LaLiga Matchday Prediction Model - Preprocessing

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Load data

In [2]:
# Match-level LaLiga data, read from the project's interim data folder.
raw_matchdata_path = 'data/interim/laliga_2017_2025_matchdata.csv'
raw_data = pd.read_csv(raw_matchdata_path)

In [3]:
raw_data.head()

Unnamed: 0,Season,Wk,Day,Date,Time,Home Position,Home,xG,Score,xG.1,Away,Away Position,Attendance,Venue,Referee
0,2017-2018,1,Fri,2017-08-18,20:15,0,Leganes,1.3,1–0,1.1,Alaves,0,9231.0,Estadio Municipal de Butarque,José Munuera
1,2017-2018,1,Fri,2017-08-18,22:15,0,Valencia,1.4,1–0,0.2,Las Palmas,0,35971.0,Estadio de Mestalla,Jesús Gil
2,2017-2018,1,Sat,2017-08-19,18:15,0,Celta Vigo,1.8,2–3,2.1,Real Sociedad,0,16961.0,Estadio de Balaídos,Antonio Matéu Lahoz
3,2017-2018,1,Sat,2017-08-19,20:15,0,Girona,2.2,2–2,0.7,Atletico Madrid,0,11511.0,Estadi Municipal de Montilivi,Juan Martínez
4,2017-2018,1,Sat,2017-08-19,22:15,0,Sevilla,2.4,1–1,1.3,Espanyol,0,30487.0,Estadio Ramón Sánchez Pizjuán,Alejandro Hernández


In [4]:
raw_data.describe(include='all')

Unnamed: 0,Season,Wk,Day,Date,Time,Home Position,Home,xG,Score,xG.1,Away,Away Position,Attendance,Venue,Referee
count,3040,3040.0,3040,3040,3040,3040.0,3040,3040.0,3040,3040.0,3040,3040.0,2552.0,3040,3040
unique,8,,7,1078,26,,29,,45,,29,,,40,40
top,2017-2018,,Sun,2023-06-04,21:00,,Valencia,,1–1,,Real Sociedad,,,Estadio de Mestalla,Pablo González
freq,380,,1156,10,747,,152,,386,,152,,,152,157
mean,,19.5,,,,10.319737,,1.460263,,1.110033,,10.127632,27758.926332,,
std,,10.96766,,,,5.945547,,0.812278,,0.704539,,5.920736,18575.621864,,
min,,1.0,,,,0.0,,0.0,,0.0,,0.0,13.0,,
25%,,10.0,,,,5.0,,0.9,,0.6,,5.0,14008.25,,
50%,,19.5,,,,10.0,,1.3,,1.0,,10.0,19895.0,,
75%,,29.0,,,,15.0,,1.9,,1.5,,15.0,39573.75,,


In [5]:
# Work on an independent (deep) copy so raw_data stays untouched by later cells.
df = raw_data.copy(deep=True)

In [6]:
df.columns.values

array(['Season', 'Wk', 'Day', 'Date', 'Time', 'Home Position', 'Home',
       'xG', 'Score', 'xG.1', 'Away', 'Away Position', 'Attendance',
       'Venue', 'Referee'], dtype=object)

In [7]:
# Columns that are not carried forward. Note that a numeric 'Day' is re-derived
# from 'Date' later in the notebook.
unused_columns = ['Season', 'Day', 'Attendance', 'Venue', 'Referee']
df = df.drop(columns=unused_columns)

In [8]:
df

Unnamed: 0,Wk,Date,Time,Home Position,Home,xG,Score,xG.1,Away,Away Position
0,1,2017-08-18,20:15,0,Leganes,1.3,1–0,1.1,Alaves,0
1,1,2017-08-18,22:15,0,Valencia,1.4,1–0,0.2,Las Palmas,0
2,1,2017-08-19,18:15,0,Celta Vigo,1.8,2–3,2.1,Real Sociedad,0
3,1,2017-08-19,20:15,0,Girona,2.2,2–2,0.7,Atletico Madrid,0
4,1,2017-08-19,22:15,0,Sevilla,2.4,1–1,1.3,Espanyol,0
...,...,...,...,...,...,...,...,...,...,...
3035,38,2025-05-24,21:00,8,Rayo Vallecano,2.2,0–0,0.6,Mallorca,10
3036,38,2025-05-24,16:15,2,Real Madrid,2.6,2–0,0.8,Real Sociedad,11
3037,38,2025-05-25,21:00,4,Athletic Club,1.2,0–3,3.5,Barcelona,1
3038,38,2025-05-25,14:00,15,Girona,0.0,0–4,2.7,Atletico Madrid,3


In [9]:
# Copy the expected-goals columns under explicit home/away names
# (the originals, 'xG' and 'xG.1', are still present after this cell).
df = df.assign(xG_Home=df['xG'], xG_Away=df['xG.1'])

In [10]:
# The renamed copies xG_Home / xG_Away replace these two columns.
df = df.drop(['xG', 'xG.1'], axis=1)

In [11]:
df.head()

Unnamed: 0,Wk,Date,Time,Home Position,Home,Score,Away,Away Position,xG_Home,xG_Away
0,1,2017-08-18,20:15,0,Leganes,1–0,Alaves,0,1.3,1.1
1,1,2017-08-18,22:15,0,Valencia,1–0,Las Palmas,0,1.4,0.2
2,1,2017-08-19,18:15,0,Celta Vigo,2–3,Real Sociedad,0,1.8,2.1
3,1,2017-08-19,20:15,0,Girona,2–2,Atletico Madrid,0,2.2,0.7
4,1,2017-08-19,22:15,0,Sevilla,1–1,Espanyol,0,2.4,1.3


In [12]:
# Inspect the distinct score strings before parsing them.
# Note: the separator between the two goal counts is an en dash (–), not a hyphen.
df['Score'].unique()

array(['1–0', '2–3', '2–2', '1–1', '0–0', '2–0', '0–3', '0–1', '2–1',
       '3–0', '0–2', '1–5', '1–2', '5–0', '2–4', '3–1', '1–3', '6–1',
       '3–3', '0–4', '4–1', '4–0', '4–4', '3–2', '3–6', '2–5', '0–5',
       '3–5', '6–0', '7–1', '1–4', '4–2', '5–1', '5–2', '6–3', '5–4',
       '8–2', '2–6', '3–4', '4–3', '1–6', '5–3', '6–2', '0–7', '7–0'],
      dtype=object)

In [13]:
# Split "home–away" score strings on the en dash and store both sides as integers.
score_parts = df['Score'].str.split('–', expand=True)
score_parts.columns = ['G_Home', 'G_Away']  # fails loudly unless exactly two parts exist
df[['G_Home', 'G_Away']] = score_parts.astype(int)

In [14]:
df.head()

Unnamed: 0,Wk,Date,Time,Home Position,Home,Score,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away
0,1,2017-08-18,20:15,0,Leganes,1–0,Alaves,0,1.3,1.1,1,0
1,1,2017-08-18,22:15,0,Valencia,1–0,Las Palmas,0,1.4,0.2,1,0
2,1,2017-08-19,18:15,0,Celta Vigo,2–3,Real Sociedad,0,1.8,2.1,2,3
3,1,2017-08-19,20:15,0,Girona,2–2,Atletico Madrid,0,2.2,0.7,2,2
4,1,2017-08-19,22:15,0,Sevilla,1–1,Espanyol,0,2.4,1.3,1,1


In [15]:
# The raw score string is now redundant with G_Home / G_Away.
df = df.drop(columns='Score')

In [16]:
df.dtypes

Wk                 int64
Date              object
Time              object
Home Position      int64
Home              object
Away              object
Away Position      int64
xG_Home          float64
xG_Away          float64
G_Home             int64
G_Away             int64
dtype: object

In [17]:
# Convert the date strings into datetime values so the .dt accessors can be used.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [18]:
# Calendar month of the match (1-12), appended as a new column.
df = df.assign(Month=df['Date'].dt.month)

In [19]:
# Day of the week as an integer, Monday=0 ... Sunday=6.
df['Day'] = df['Date'].dt.dayofweek

In [20]:
df

Unnamed: 0,Wk,Date,Time,Home Position,Home,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,Day
0,1,2017-08-18,20:15,0,Leganes,Alaves,0,1.3,1.1,1,0,8,4
1,1,2017-08-18,22:15,0,Valencia,Las Palmas,0,1.4,0.2,1,0,8,4
2,1,2017-08-19,18:15,0,Celta Vigo,Real Sociedad,0,1.8,2.1,2,3,8,5
3,1,2017-08-19,20:15,0,Girona,Atletico Madrid,0,2.2,0.7,2,2,8,5
4,1,2017-08-19,22:15,0,Sevilla,Espanyol,0,2.4,1.3,1,1,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,38,2025-05-24,21:00,8,Rayo Vallecano,Mallorca,10,2.2,0.6,0,0,5,5
3036,38,2025-05-24,16:15,2,Real Madrid,Real Sociedad,11,2.6,0.8,2,0,5,5
3037,38,2025-05-25,21:00,4,Athletic Club,Barcelona,1,1.2,3.5,0,3,5,6
3038,38,2025-05-25,14:00,15,Girona,Atletico Madrid,3,0.0,2.7,0,4,5,6


In [21]:
# Parse the HH:MM kick-off strings, then keep only the time-of-day component
# (datetime.time objects, so the column dtype stays object).
kickoff_parsed = pd.to_datetime(df['Time'], format='%H:%M')
df['Time'] = kickoff_parsed.dt.time

In [22]:
df.dtypes

Wk                        int64
Date             datetime64[ns]
Time                     object
Home Position             int64
Home                     object
Away                     object
Away Position             int64
xG_Home                 float64
xG_Away                 float64
G_Home                    int64
G_Away                    int64
Month                     int32
Day                       int32
dtype: object

In [23]:
# 'Time' holds datetime.time objects at this point; their string form is HH:MM:SS.
# Re-parse them as datetimes so the .dt accessors are available (the date part is
# just a placeholder and is never used).
df['Time_dt'] = pd.to_datetime(df['Time'].astype(str), format='%H:%M:%S')

# Helper columns (dropped again later in the notebook).
df['Hour'] = df['Time_dt'].dt.hour
df['Minute'] = df['Time_dt'].dt.minute
df['Minutes_Since_Midnight'] = df['Hour'] * 60 + df['Minute']

# Cyclical encoding: place the kick-off time on the unit circle (1440 minutes per
# day) so that times just before and just after midnight end up close together.
# numpy is already imported as np in the first cell; no local re-import is needed.
kickoff_angle = 2 * np.pi * df['Minutes_Since_Midnight'] / 1440
df['Time_Sin'] = np.sin(kickoff_angle)
df['Time_Cos'] = np.cos(kickoff_angle)


In [24]:
df.head()

Unnamed: 0,Wk,Date,Time,Home Position,Home,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,Day,Time_dt,Hour,Minute,Minutes_Since_Midnight,Time_Sin,Time_Cos
0,1,2017-08-18,20:15:00,0,Leganes,Alaves,0,1.3,1.1,1,0,8,4,1900-01-01 20:15:00,20,15,1215,-0.83147,0.55557
1,1,2017-08-18,22:15:00,0,Valencia,Las Palmas,0,1.4,0.2,1,0,8,4,1900-01-01 22:15:00,22,15,1335,-0.442289,0.896873
2,1,2017-08-19,18:15:00,0,Celta Vigo,Real Sociedad,0,1.8,2.1,2,3,8,5,1900-01-01 18:15:00,18,15,1095,-0.997859,0.065403
3,1,2017-08-19,20:15:00,0,Girona,Atletico Madrid,0,2.2,0.7,2,2,8,5,1900-01-01 20:15:00,20,15,1215,-0.83147,0.55557
4,1,2017-08-19,22:15:00,0,Sevilla,Espanyol,0,2.4,1.3,1,1,8,5,1900-01-01 22:15:00,22,15,1335,-0.442289,0.896873


In [25]:
# Only Time_Sin / Time_Cos are kept from the time-encoding step.
df = df.drop(['Time_dt', 'Hour', 'Minute', 'Minutes_Since_Midnight'], axis='columns')

In [26]:
df.head()

Unnamed: 0,Wk,Date,Time,Home Position,Home,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,Day,Time_Sin,Time_Cos
0,1,2017-08-18,20:15:00,0,Leganes,Alaves,0,1.3,1.1,1,0,8,4,-0.83147,0.55557
1,1,2017-08-18,22:15:00,0,Valencia,Las Palmas,0,1.4,0.2,1,0,8,4,-0.442289,0.896873
2,1,2017-08-19,18:15:00,0,Celta Vigo,Real Sociedad,0,1.8,2.1,2,3,8,5,-0.997859,0.065403
3,1,2017-08-19,20:15:00,0,Girona,Atletico Madrid,0,2.2,0.7,2,2,8,5,-0.83147,0.55557
4,1,2017-08-19,22:15:00,0,Sevilla,Espanyol,0,2.4,1.3,1,1,8,5,-0.442289,0.896873


In [27]:
# Month, Day, Time_Sin and Time_Cos now stand in for the raw date and time columns.
raw_datetime_columns = ['Date', 'Time']
df = df.drop(columns=raw_datetime_columns)

In [28]:
df.head()

Unnamed: 0,Wk,Home Position,Home,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,Day,Time_Sin,Time_Cos
0,1,0,Leganes,Alaves,0,1.3,1.1,1,0,8,4,-0.83147,0.55557
1,1,0,Valencia,Las Palmas,0,1.4,0.2,1,0,8,4,-0.442289,0.896873
2,1,0,Celta Vigo,Real Sociedad,0,1.8,2.1,2,3,8,5,-0.997859,0.065403
3,1,0,Girona,Atletico Madrid,0,2.2,0.7,2,2,8,5,-0.83147,0.55557
4,1,0,Sevilla,Espanyol,0,2.4,1.3,1,1,8,5,-0.442289,0.896873


In [29]:
df['Home'].unique()

array(['Leganes', 'Valencia', 'Celta Vigo', 'Girona', 'Sevilla',
       'Athletic Club', 'Barcelona', 'Deportivo La Coruna', 'Levante',
       'Malaga', 'Real Betis', 'Real Sociedad', 'Alaves', 'Las Palmas',
       'Eibar', 'Espanyol', 'Getafe', 'Real Madrid', 'Villarreal',
       'Atletico Madrid', 'Rayo Vallecano', 'Valladolid', 'Huesca',
       'Mallorca', 'Granada', 'Osasuna', 'Cadiz', 'Elche', 'Almeria'],
      dtype=object)

In [30]:
df['Away'].unique()

array(['Alaves', 'Las Palmas', 'Real Sociedad', 'Atletico Madrid',
       'Espanyol', 'Getafe', 'Real Betis', 'Real Madrid', 'Villarreal',
       'Eibar', 'Celta Vigo', 'Barcelona', 'Malaga',
       'Deportivo La Coruna', 'Athletic Club', 'Leganes', 'Sevilla',
       'Valencia', 'Levante', 'Girona', 'Valladolid', 'Huesca',
       'Rayo Vallecano', 'Osasuna', 'Granada', 'Mallorca', 'Cadiz',
       'Elche', 'Almeria'], dtype=object)

In [31]:
# Integer code per team = position in this list (0-based).
# Only ever append new teams at the end: inserting or reordering would change
# every subsequent code.
_teams_in_code_order = [
    'Alaves', 'Las Palmas', 'Real Sociedad', 'Atletico Madrid', 'Espanyol',
    'Getafe', 'Real Betis', 'Real Madrid', 'Villarreal', 'Eibar',
    'Celta Vigo', 'Barcelona', 'Malaga', 'Deportivo La Coruna',
    'Athletic Club', 'Leganes', 'Sevilla', 'Valencia', 'Levante',
    'Girona', 'Valladolid', 'Huesca', 'Rayo Vallecano', 'Osasuna',
    'Granada', 'Mallorca', 'Cadiz', 'Elche', 'Almeria',
    'Real Oviedo',
]
team_map = {team_name: code for code, team_name in enumerate(_teams_in_code_order)}

In [32]:
def map_team(team):
    """Return the integer code for ``team`` from ``team_map``.

    Names that are missing from ``team_map`` all share a single fallback code,
    ``len(team_map) + 1``.
    """
    fallback_code = len(team_map) + 1
    if team in team_map:
        return team_map[team]
    return fallback_code

# Encode both sides of every fixture with the same mapping.
df['Home_code'] = df['Home'].map(map_team)
df['Away_code'] = df['Away'].map(map_team)

In [34]:
df.head()

Unnamed: 0,Wk,Home Position,Home,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,Day,Time_Sin,Time_Cos,Home_code,Away_code
0,1,0,Leganes,Alaves,0,1.3,1.1,1,0,8,4,-0.83147,0.55557,15,0
1,1,0,Valencia,Las Palmas,0,1.4,0.2,1,0,8,4,-0.442289,0.896873,17,1
2,1,0,Celta Vigo,Real Sociedad,0,1.8,2.1,2,3,8,5,-0.997859,0.065403,10,2
3,1,0,Girona,Atletico Madrid,0,2.2,0.7,2,2,8,5,-0.83147,0.55557,19,3
4,1,0,Sevilla,Espanyol,0,2.4,1.3,1,1,8,5,-0.442289,0.896873,16,4


## Binary encoding team names

In [44]:
def binary_encode(number, num_bits=5):
    """Encode a non-negative integer as a fixed-width list of bits.

    Parameters
    ----------
    number : int
        Value to encode; must satisfy ``0 <= number < 2 ** num_bits``.
    num_bits : int, default 5
        Width of the returned list (zero-padded on the left).

    Returns
    -------
    list of int
        ``num_bits`` zeros/ones, most significant bit first.

    Raises
    ------
    ValueError
        If ``number`` is negative or does not fit in ``num_bits`` bits. Without
        this check an oversized value would silently yield a longer list and
        break the fixed-width column layout built from the result.
    """
    if number < 0 or number >= 2 ** num_bits:
        raise ValueError(
            f"number must be in [0, {2 ** num_bits - 1}] for num_bits={num_bits}, got {number}"
        )
    # bin() yields e.g. '0b1111'; strip the '0b' prefix and left-pad with zeros.
    return [int(bit) for bit in bin(number)[2:].zfill(num_bits)]

In [45]:
# Expand each team code into its list of bits and build the bit columns with one
# DataFrame construction per side (instead of creating a Series per row via
# .apply(pd.Series)). The original row index is kept so the result lines up with df.
df_encoded_home = pd.DataFrame(df['Home_code'].map(binary_encode).tolist(), index=df.index)
df_encoded_away = pd.DataFrame(df['Away_code'].map(binary_encode).tolist(), index=df.index)

In [47]:
# Name the bit columns <side>_code_<position>; position 0 is the first element of
# each encoded list.
home_bit_names = ['Home_code_' + str(pos) for pos in range(df_encoded_home.shape[1])]
away_bit_names = ['Away_code_' + str(pos) for pos in range(df_encoded_away.shape[1])]
df_encoded_home.columns = home_bit_names
df_encoded_away.columns = away_bit_names

In [49]:
# Attach the home and away bit columns to the right of the existing features.
frames_side_by_side = [df, df_encoded_home, df_encoded_away]
df_encoded = pd.concat(frames_side_by_side, axis='columns')

In [51]:
df_encoded.head()

Unnamed: 0,Wk,Home Position,Home,Away,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,...,Home_code_0,Home_code_1,Home_code_2,Home_code_3,Home_code_4,Away_code_0,Away_code_1,Away_code_2,Away_code_3,Away_code_4
0,1,0,Leganes,Alaves,0,1.3,1.1,1,0,8,...,0,1,1,1,1,0,0,0,0,0
1,1,0,Valencia,Las Palmas,0,1.4,0.2,1,0,8,...,1,0,0,0,1,0,0,0,0,1
2,1,0,Celta Vigo,Real Sociedad,0,1.8,2.1,2,3,8,...,0,1,0,1,0,0,0,0,1,0
3,1,0,Girona,Atletico Madrid,0,2.2,0.7,2,2,8,...,1,0,0,1,1,0,0,0,1,1
4,1,0,Sevilla,Espanyol,0,2.4,1.3,1,1,8,...,1,0,0,0,0,0,0,1,0,0


In [52]:
df_encoded.columns.values

array(['Wk', 'Home Position', 'Home', 'Away', 'Away Position', 'xG_Home',
       'xG_Away', 'G_Home', 'G_Away', 'Month', 'Day', 'Time_Sin',
       'Time_Cos', 'Home_code', 'Away_code', 'Home_code_0', 'Home_code_1',
       'Home_code_2', 'Home_code_3', 'Home_code_4', 'Away_code_0',
       'Away_code_1', 'Away_code_2', 'Away_code_3', 'Away_code_4'],
      dtype=object)

In [53]:
# Team names and their integer codes are fully represented by the bit columns.
team_identity_columns = ['Home', 'Away', 'Home_code', 'Away_code']
df_encoded = df_encoded.drop(columns=team_identity_columns)

In [56]:
df_encoded

Unnamed: 0,Wk,Home Position,Away Position,xG_Home,xG_Away,G_Home,G_Away,Month,Day,Time_Sin,...,Home_code_0,Home_code_1,Home_code_2,Home_code_3,Home_code_4,Away_code_0,Away_code_1,Away_code_2,Away_code_3,Away_code_4
0,1,0,0,1.3,1.1,1,0,8,4,-0.831470,...,0,1,1,1,1,0,0,0,0,0
1,1,0,0,1.4,0.2,1,0,8,4,-0.442289,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,1.8,2.1,2,3,8,5,-0.997859,...,0,1,0,1,0,0,0,0,1,0
3,1,0,0,2.2,0.7,2,2,8,5,-0.831470,...,1,0,0,1,1,0,0,0,1,1
4,1,0,0,2.4,1.3,1,1,8,5,-0.442289,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,38,8,10,2.2,0.6,0,0,5,5,-0.707107,...,1,0,1,1,0,1,1,0,0,1
3036,38,2,11,2.6,0.8,2,0,5,5,-0.896873,...,0,0,1,1,1,0,0,0,1,0
3037,38,4,1,1.2,3.5,0,3,5,6,-0.707107,...,0,1,1,1,0,0,1,0,1,1
3038,38,15,3,0.0,2.7,0,4,5,6,-0.500000,...,1,0,0,1,1,0,0,0,1,1


In [57]:
df_encoded.columns.values

array(['Wk', 'Home Position', 'Away Position', 'xG_Home', 'xG_Away',
       'G_Home', 'G_Away', 'Month', 'Day', 'Time_Sin', 'Time_Cos',
       'Home_code_0', 'Home_code_1', 'Home_code_2', 'Home_code_3',
       'Home_code_4', 'Away_code_0', 'Away_code_1', 'Away_code_2',
       'Away_code_3', 'Away_code_4'], dtype=object)

In [58]:
# Reorder: calendar/time features, then the home-side block, the result columns
# in the middle, and the away-side block last.
calendar_columns = ['Month', 'Day', 'Time_Sin', 'Time_Cos', 'Wk']
home_columns = ['Home Position',
                'Home_code_0', 'Home_code_1', 'Home_code_2', 'Home_code_3', 'Home_code_4',
                'xG_Home']
goal_columns = ['G_Home', 'G_Away']
away_columns = ['xG_Away',
                'Away_code_0', 'Away_code_1', 'Away_code_2', 'Away_code_3', 'Away_code_4',
                'Away Position']
df_encoded = df_encoded[calendar_columns + home_columns + goal_columns + away_columns]

In [61]:
df_encoded

Unnamed: 0,Month,Day,Time_Sin,Time_Cos,Wk,Home Position,Home_code_0,Home_code_1,Home_code_2,Home_code_3,Home_code_4,xG_Home,G_Home,G_Away,xG_Away,Away_code_0,Away_code_1,Away_code_2,Away_code_3,Away_code_4,Away Position
0,8,4,-0.831470,0.555570,1,0,0,1,1,1,1,1.3,1,0,1.1,0,0,0,0,0,0
1,8,4,-0.442289,0.896873,1,0,1,0,0,0,1,1.4,1,0,0.2,0,0,0,0,1,0
2,8,5,-0.997859,0.065403,1,0,0,1,0,1,0,1.8,2,3,2.1,0,0,0,1,0,0
3,8,5,-0.831470,0.555570,1,0,1,0,0,1,1,2.2,2,2,0.7,0,0,0,1,1,0
4,8,5,-0.442289,0.896873,1,0,1,0,0,0,0,2.4,1,1,1.3,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,5,5,-0.707107,0.707107,38,8,1,0,1,1,0,2.2,0,0,0.6,1,1,0,0,1,10
3036,5,5,-0.896873,-0.442289,38,2,0,0,1,1,1,2.6,2,0,0.8,0,0,0,1,0,11
3037,5,6,-0.707107,0.707107,38,4,0,1,1,1,0,1.2,0,3,3.5,0,1,0,1,1,1
3038,5,6,-0.500000,-0.866025,38,15,1,0,0,1,1,0.0,0,4,2.7,0,0,0,1,1,3


In [66]:
# Separate (deep) copy so adding the target column leaves df_encoded unchanged.
df_with_targets = df_encoded.copy(deep=True)

In [67]:
# Binary target: 1 when the home side scored more goals than the away side,
# 0 otherwise (so draws and away wins are both labelled 0).
# Vectorized comparison instead of a row-by-row apply; same values, int64 dtype.
home_win = df_with_targets['G_Home'] > df_with_targets['G_Away']
df_with_targets['Target'] = home_win.astype('int64')

In [68]:
# Final table under its export name; a deep copy keeps df_with_targets independent.
df_preprocessed = df_with_targets.copy(deep=True)

In [69]:
df_preprocessed.head()

Unnamed: 0,Month,Day,Time_Sin,Time_Cos,Wk,Home Position,Home_code_0,Home_code_1,Home_code_2,Home_code_3,Home_code_4,xG_Home,G_Home,G_Away,xG_Away,Away_code_0,Away_code_1,Away_code_2,Away_code_3,Away_code_4,Away Position,Target
0,8,4,-0.83147,0.55557,1,0,0,1,1,1,1,1.3,1,0,1.1,0,0,0,0,0,0,1
1,8,4,-0.442289,0.896873,1,0,1,0,0,0,1,1.4,1,0,0.2,0,0,0,0,1,0,1
2,8,5,-0.997859,0.065403,1,0,0,1,0,1,0,1.8,2,3,2.1,0,0,0,1,0,0,0
3,8,5,-0.83147,0.55557,1,0,1,0,0,1,1,2.2,2,2,0.7,0,0,0,1,1,0,0
4,8,5,-0.442289,0.896873,1,0,1,0,0,0,0,2.4,1,1,1.3,0,0,1,0,0,0,0


In [71]:
# Write the preprocessed table to the working directory; the positional index is
# not written to the file.
preprocessed_csv_path = 'Laliga_fixtures_preprocessed.csv'
df_preprocessed.to_csv(preprocessed_csv_path, index=False)