<a href="https://colab.research.google.com/github/Tommy-Las/WatfordFC/blob/main/Data_cleaning_transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages and data

In [1]:
import pandas as pd
# Turn off warnings
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

In [65]:
# Mount drive

from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

Import GPS data

In [2]:
# file_path = '/content/drive/MyDrive/WATFORD FC/Datos GPS/GPS 2018-2023.xlsx'
file_path = 'data/GPS 2018-2023_NoContact.xlsx'
df_gps = pd.read_excel(file_path)

Import speed data

In [3]:
# file_path = '/content/drive/MyDrive/WATFORD FC/Datos GPS/max_speed.xlsx'
file_path = 'data/max_speed.xlsx'
df_speed = pd.read_excel(file_path)

Import wellbeing data


In [None]:
#file_path = '/content/drive/MyDrive/WATFORD FC/Datos Wellbeing/wellbeing ssp.xlsx'
#df_wellbeing = pd.read_excel(file_path)

# Merge Sprint values

The sprint values are separated in different rows per session,

we want to merge in into a single row per session.

## Transform data before merging sprint rows

In [4]:
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123180 entries, 0 to 123179
Data columns (total 11 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   DATE                                         123180 non-null  object 
 1   ID                                           123180 non-null  int64  
 2   Player Position                              123155 non-null  object 
 3   Max Speed                                    116475 non-null  float64
 4   Sprints                                      113560 non-null  float64
 5   MINUTES                                      113654 non-null  float64
 6   Season                                       123180 non-null  object 
 7   Max Speed Season                             123180 non-null  float64
 8   Avg Speed Season                             122357 non-null  float64
 9   % Max Speed                                  122357 non-nul

We want to remove rows after 08-07-2021 since we don't have data for sprints and speed

In [5]:
from datetime import date
# Convert DATE column values into Pandas datetime object
df_speed['DATE'] = pd.to_datetime(df_speed['DATE'], dayfirst=True)

# Filter rows after date: '2021-08-07'
date_filter = pd.Timestamp('2021-08-07')
df_speed = df_speed[df_speed['DATE'] > date_filter]

# Filter rows after date: '2023-05-11'
date_filter = pd.Timestamp('2023-05-11')
df_speed = df_speed[df_speed['DATE'] < date_filter]

date_filter = pd.Timestamp('2021-06-28')
df_gps = df_gps[df_gps['DATE'] > date_filter]

In [6]:
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55390 entries, 58257 to 113646
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         55390 non-null  datetime64[ns]
 1   ID                                           55390 non-null  int64         
 2   Player Position                              55365 non-null  object        
 3   Max Speed                                    55377 non-null  float64       
 4   Sprints                                      55362 non-null  float64       
 5   MINUTES                                      55389 non-null  float64       
 6   Season                                       55390 non-null  object        
 7   Max Speed Season                             55390 non-null  float64       
 8   Avg Speed Season                             55390 non-null  float64       


Replace NULL sprint values to 0

In [7]:
# Replace NULL values from Sprint to 0
df_speed['Sprints'] = df_speed['Sprints'].fillna(0)

In [8]:
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55390 entries, 58257 to 113646
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         55390 non-null  datetime64[ns]
 1   ID                                           55390 non-null  int64         
 2   Player Position                              55365 non-null  object        
 3   Max Speed                                    55377 non-null  float64       
 4   Sprints                                      55390 non-null  float64       
 5   MINUTES                                      55389 non-null  float64       
 6   Season                                       55390 non-null  object        
 7   Max Speed Season                             55390 non-null  float64       
 8   Avg Speed Season                             55390 non-null  float64       


## Merge sprint values

Print values to verify later

In [9]:
df_speed.sort_values(by='DATE', ascending=False).head(15)

Unnamed: 0,DATE,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
58257,2023-05-10,23085,CENTRE MIDFIELDER,22.91,3.0,30.19,2022-2023,32.12,20.304388,71.326276,12.832753
58258,2023-05-10,87583,CENTRE MIDFIELDER,24.7,20.0,40.53,2022-2023,34.27,21.189939,72.074701,16.564751
58259,2023-05-09,96811,CENTRE MIDFIELDER,32.19,10.0,70.51,2022-2023,36.04,20.882454,89.317425,54.148548
58260,2023-05-09,89871,STRIKER,25.46,0.0,79.87,2022-2023,34.12,21.753402,74.618992,17.039162
58300,2023-05-08,55555,FULL BACK,30.47,18.0,72.62,2022-2023,34.94,22.07125,87.20664,38.052897
58289,2023-05-08,48692,CENTRE BACK,32.56,35.0,69.15,2022-2023,36.21,21.02927,89.919912,54.831811
58290,2023-05-08,87583,CENTRE MIDFIELDER,22.69,11.0,34.13,2022-2023,34.27,21.189939,66.209513,7.079117
58291,2023-05-08,37537,CENTRE MIDFIELDER,28.65,5.0,47.05,2022-2023,34.09,21.974456,84.042241,30.378655
58292,2023-05-08,37537,CENTRE MIDFIELDER,30.59,3.0,22.1,2022-2023,34.09,21.974456,89.73306,39.207088
58293,2023-05-08,37537,CENTRE MIDFIELDER,30.59,8.0,69.15,2022-2023,34.09,21.974456,89.73306,39.207088


Do a group by player and date, and select the maximum value for each session

In [10]:
df_speed = df_speed.groupby(['DATE', 'ID']).agg('max').reset_index()

In [11]:
df_speed.sort_values(by='DATE', ascending=False).head(15)

Unnamed: 0,DATE,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
8262,2023-05-10,87583,CENTRE MIDFIELDER,24.7,20.0,40.53,2022-2023,34.27,21.189939,72.074701,16.564751
8261,2023-05-10,23085,CENTRE MIDFIELDER,22.91,3.0,30.19,2022-2023,32.12,20.304388,71.326276,12.832753
8260,2023-05-09,96811,CENTRE MIDFIELDER,32.19,10.0,70.51,2022-2023,36.04,20.882454,89.317425,54.148548
8259,2023-05-09,89871,STRIKER,25.46,0.0,79.87,2022-2023,34.12,21.753402,74.618992,17.039162
8249,2023-05-08,55555,FULL BACK,30.47,18.0,72.62,2022-2023,34.94,22.07125,87.20664,38.052897
8240,2023-05-08,10103,FULL BACK,21.24,1.0,44.84,2022-2023,34.95,22.20223,60.772532,-4.333932
8241,2023-05-08,12086,WINGER,35.04,41.0,88.27,2022-2023,35.44,21.901184,98.871332,59.991349
8242,2023-05-08,19817,FULL BACK,31.68,0.0,128.04,2022-2023,35.77,21.60546,88.565837,46.629599
8243,2023-05-08,21079,WINGER,34.51,51.0,88.04,2022-2023,36.97,22.708654,93.345956,51.968499
8244,2023-05-08,23085,CENTRE MIDFIELDER,23.71,7.0,58.52,2022-2023,32.12,20.304388,73.816936,16.772788


# Handle duplicates for GPS Data

In [12]:
# Define columns to sum
columns_to_sum = ['Total D', '>19.8', '> 25 Km/h', 'ACC', 'DEC']

# Define columns to select the first value
columns_to_first = ['DATE', 'Column2', 'PLAYER', 'Injury', 'season', 'LEAGUE', 'preseason-season', 'MANAGER']

# Group by the duplicate subset and aggregate
df_gps_aggregated = (
    df_gps[df_gps.duplicated(subset=['PLAYER', 'DATE'], keep=False)]
    .groupby(['PLAYER', 'DATE'], as_index=False)
    .agg({**{col: 'sum' for col in columns_to_sum},
          **{col: 'first' for col in columns_to_first}})
)

# Ensure non-duplicated rows are preserved by combining them back
df_gps_combined = pd.concat([
    df_gps[~df_gps.duplicated(subset=['PLAYER', 'DATE'], keep=False)],
    df_gps_aggregated
], ignore_index=True)

# Merge GPS and Speed dataframes

## Prepare the data before the merge

Verify column names for both Dataframes

In [13]:
df_gps_combined.columns

Index(['DATE', 'Column2', 'PLAYER', 'Injury', 'season', 'LEAGUE',
       'preseason-season', 'MANAGER', 'Total D', '>19.8', '> 25 Km/h', 'ACC',
       'DEC'],
      dtype='object')

In [14]:
df_speed.columns

Index(['DATE', 'ID', 'Player Position', 'Max Speed', 'Sprints', 'MINUTES',
       'Season', 'Max Speed Season', 'Avg Speed Season', '% Max Speed',
       '%Speed diference against max. Speed average'],
      dtype='object')

Change datatypes before the merge

In [15]:
# Convert DATE column from GPS df into datetime pandas obkect
df_gps_combined['DATE'] = pd.to_datetime(df_gps_combined['DATE'], dayfirst=True)

# Drop NULL values for 'PLAYER'
df_gps_combined = df_gps_combined.dropna(subset=['PLAYER'])

# Convert ID and PLAYER columns to the same data type - integers
df_gps_combined['PLAYER'] = df_gps_combined['PLAYER'].astype(int)
df_speed['ID'] = df_speed['ID'].astype(int)

## Do the merge of both dataframes

In [16]:
# Perform an inner join on matching DATE and PLAYER/ID values
df_merged = df_gps_combined.merge(df_speed, left_on=['DATE', 'PLAYER'], right_on=['DATE', 'ID'], how='inner')

In [17]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 23 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         7835 non-null   datetime64[ns]
 1   Column2                                      7835 non-null   object        
 2   PLAYER                                       7835 non-null   int64         
 3   Injury                                       7835 non-null   float64       
 4   season                                       7835 non-null   object        
 5   LEAGUE                                       7835 non-null   object        
 6   preseason-season                             7835 non-null   object        
 7   MANAGER                                      7835 non-null   object        
 8   Total D                                      7835 non-null   object        
 9

Verify class imbalances

In [18]:
print("No Injury: " + str(df_merged[df_merged["Injury"] == 0].shape[0]))
print("Injury: " + str(df_merged[df_merged["Injury"] == 1].shape[0]))
print(f"%: {((df_merged[df_merged['Injury'] == 1].shape[0] / df_merged.shape[0]) * 100):.2f}%")

No Injury: 7798
Injury: 37
%: 0.47%


# Merge df with 'weight' in wellbeing dataframe

In [None]:
# df_wellbeing = df_wellbeing[["Weight", "Date", "PLAYER"]]
# df_wellbeing.info()

In [None]:
# # Sort by PLAYER and DATE to ensure correct order for backfilling
# df_wellbeing = df_wellbeing.sort_values(by=['PLAYER', 'Date'])

# # Fill the NULL values in 'Weight' with the previous valid value for each player
# df_wellbeing['Weight'] = df_wellbeing.groupby('PLAYER')['Weight'].bfill()
# df_wellbeing['Weight'] = df_wellbeing.groupby('PLAYER')['Weight'].ffill()

# df_wellbeing.info()

In [None]:
# # Merge the DataFrames
# df_merged = df_merged.merge(df_wellbeing, left_on=['DATE', 'PLAYER'], right_on=['Date', 'PLAYER'], how='left')

# # Sort by PLAYER and DATE to ensure chronological order
# df_merged = df_merged.sort_values(by=['PLAYER', 'DATE'])

# # Back-fill null values in 'Weight' for each PLAYER
# df_merged['Weight'] = df_merged.groupby('PLAYER')['Weight'].bfill()
# df_merged['Weight'] = df_merged.groupby('PLAYER')['Weight'].ffill()

# # Optional: Drop the 'Date' column from the right DataFrame if not needed
# df_merged = df_merged.drop(columns=['Date'])

# # Display the final DataFrame
# df_merged.info()

In [None]:
#df_merged.groupby('PLAYER')['Weight'].apply(lambda x: x.isnull().sum())

# Transform numeric columns to the same data types

In [19]:
cols = ['Total D', '>19.8', '> 25 Km/h', 'ACC',
       'DEC', 'ID', 'Max Speed', 'Sprints', 'MINUTES', 'Max Speed Season',
       'Avg Speed Season', '% Max Speed',
       '%Speed diference against max. Speed average']

df_merged[cols] = df_merged[cols].astype(float)

In [20]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 23 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         7835 non-null   datetime64[ns]
 1   Column2                                      7835 non-null   object        
 2   PLAYER                                       7835 non-null   int64         
 3   Injury                                       7835 non-null   float64       
 4   season                                       7835 non-null   object        
 5   LEAGUE                                       7835 non-null   object        
 6   preseason-season                             7835 non-null   object        
 7   MANAGER                                      7835 non-null   object        
 8   Total D                                      7835 non-null   float64       
 9

# Values that are 0 for ACC, DEC, Max Speed

In [21]:
# Count of rows containing 0 for each column
zero_counts = (df_merged == 0).sum()

# Creating a DataFrame for the results
result_df = pd.DataFrame({
    "Column Name": zero_counts.index,
    "Number of Rows with 0": zero_counts.values
})

result_df

Unnamed: 0,Column Name,Number of Rows with 0
0,DATE,0
1,Column2,382
2,PLAYER,0
3,Injury,7798
4,season,0
5,LEAGUE,0
6,preseason-season,0
7,MANAGER,0
8,Total D,3
9,>19.8,420


In [22]:
# Columns to impute
metrics_imputation_zeroes = ['Total D', 'ACC', 'DEC', 'Max Speed', 'MINUTES']

# Columns to use for KNN calculation
columns_for_knn = ['Total D', '>19.8', '> 25 Km/h', 'ACC', 'DEC', 'ID',
                   'Max Speed', 'Sprints', 'MINUTES', '% Max Speed']

# Replace 0 with NaN in the columns to be imputed
df_merged[metrics_imputation_zeroes] = df_merged[metrics_imputation_zeroes].replace(0, np.nan)

# Ensure the columns used for KNN calculation are numeric
columns_for_knn_numeric = df_merged[columns_for_knn].select_dtypes(include=[np.number]).columns.tolist()

# Initialize the KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)  # Adjust n_neighbors if needed

# Perform KNN imputation using the broader set of columns for calculation
imputed_values = knn_imputer.fit_transform(df_merged[columns_for_knn_numeric])

# Update only the specified columns to impute
df_merged[metrics_imputation_zeroes] = imputed_values[:,
    [columns_for_knn_numeric.index(col) for col in metrics_imputation_zeroes]]

In [23]:
# Count of rows containing 0 for each column
zero_counts = (df_merged == 0).sum()

# Creating a DataFrame for the results
result_df = pd.DataFrame({
    "Column Name": zero_counts.index,
    "Number of Rows with 0": zero_counts.values
})

result_df

Unnamed: 0,Column Name,Number of Rows with 0
0,DATE,0
1,Column2,382
2,PLAYER,0
3,Injury,7798
4,season,0
5,LEAGUE,0
6,preseason-season,0
7,MANAGER,0
8,Total D,0
9,>19.8,420


In [24]:
df_merged[df_merged['% Max Speed'] == 0]

Unnamed: 0,DATE,Column2,PLAYER,Injury,season,LEAGUE,preseason-season,MANAGER,Total D,>19.8,> 25 Km/h,ACC,DEC,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
3268,2022-09-12,M-1,37990,0.0,2022-2023,CHAMPIONSHIP,SEASON,EDWARDS,5215.0,549.0,50.0,51.0,18.0,37990.0,FULL BACK,32.174,0.0,74.792,2022-2023,37.28,22.669514,0.0,-100.0
5185,2022-04-02,MD,65042,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,9.4,7.6,65042.0,CENTRE MIDFIELDER,23.592,0.0,5.0,2021-2022,34.92,20.048756,0.0,-100.0
5187,2022-04-02,MD,23085,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,3.6,6.2,23085.0,CENTRE MIDFIELDER,17.784,0.0,5.0,2021-2022,30.73,20.278653,0.0,-100.0
5191,2022-04-02,MD,17316,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,6.2,5.6,17316.0,WINGER,21.586,0.0,5.0,2021-2022,33.73,20.441103,0.0,-100.0
5195,2022-04-02,MD,42579,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,23.0,12.6,42579.0,CENTRE BACK,21.274,0.0,5.0,2021-2022,29.76,19.989098,0.0,-100.0
5199,2022-04-02,MD,23081,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,3.6,6.2,23081.0,CENTRE BACK,17.784,0.0,5.0,2021-2022,34.94,20.537766,0.0,-100.0
5201,2022-04-02,MD,37709,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,14.8,6.8,37709.0,CENTRE BACK,21.886,0.0,5.0,2021-2022,36.56,22.374467,0.0,-100.0
6701,2021-11-24,M-5,10452,0.0,2021-2022,PREMIER LEAGUE,SEASON,RANIERI,2354.4,0.0,0.0,18.8,18.0,10452.0,FULL BACK,13.458,0.0,15.0,2021-2022,34.54,22.291234,0.0,-100.0
6831,2021-11-07,MD,21079,0.0,2021-2022,PREMIER LEAGUE,SEASON,RANIERI,10554.0,734.0,361.0,95.0,108.0,21079.0,WINGER,35.094,75.0,100.45,2021-2022,36.54,21.962986,0.0,-100.0
7400,2022-10-19,MD (AWAY),77765,0.0,2022-2023,CHAMPIONSHIP,SEASON,BILIC,1646.0,77.0,1.0,13.0,23.0,77765.0,CENTRE MIDFIELDER,20.33,2.0,18.0,2022-2023,32.08,21.29152,0.0,-100.0


In [25]:
# Calculate '% Max Speed' for all rows
df_merged['% Max Speed'] = (df_merged['Max Speed'] / df_merged['Max Speed Season']) * 100

# Calculate '% Speed difference against max. Speed average' for all rows
df_merged['%Speed diference against max. Speed average'] = (
    (df_merged['Max Speed'] - df_merged['Avg Speed Season']) / df_merged['Avg Speed Season']
) * 100

In [26]:
df_merged[(df_merged['PLAYER'] == 65042) & (df_merged['DATE'] == '2022-04-02')]

Unnamed: 0,DATE,Column2,PLAYER,Injury,season,LEAGUE,preseason-season,MANAGER,Total D,>19.8,> 25 Km/h,ACC,DEC,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
5185,2022-04-02,MD,65042,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,9.4,7.6,65042.0,CENTRE MIDFIELDER,23.592,0.0,5.0,2021-2022,34.92,20.048756,67.560137,17.673139


# Reduce data volume/size

Drop rows from preseason since its not the same intensity as regular season


In [None]:
# df_merged = df_merged[df_merged['preseason-season'] != 'PRESEASON']

# df_merged.reset_index(drop=True, inplace=True)

Class imbalance

In [None]:
# print("No Injury: " + str(df_merged[df_merged["Injury"] == 0].shape[0]))
# print("Injury: " + str(df_merged[df_merged["Injury"] == 1].shape[0]))
# print(f"%: {((df_merged[df_merged['Injury'] == 1].shape[0] / df_merged.shape[0]) * 100):.2f}%")

## Players who have never been injured

In [27]:
# Group by ID and sum the Injury column
injury_counts = df_merged.groupby('ID').agg({'Injury': 'sum'})

# Get the IDs where the sum of Injury is greater than 0
ids_to_remove = injury_counts[injury_counts['Injury'] == 0].index.tolist()

# Remove these IDs from the original DataFrame
df_filtered = df_merged[~df_merged['ID'].isin(ids_to_remove)]

# df_filtered = df_merged.copy()

In [28]:
print("No Injury: " + str(df_filtered[df_filtered["Injury"] == 0].shape[0]))
print("Injury: " + str(df_filtered[df_filtered["Injury"] == 1].shape[0]))
print(f"%: {((df_filtered[df_filtered['Injury'] == 1].shape[0] / df_filtered.shape[0]) * 100):.2f}%")

No Injury: 4686
Injury: 37
%: 0.78%


Verify at least one player had an injury

In [29]:
df_filtered.groupby('ID').agg({'Injury': 'sum'})

Unnamed: 0_level_0,Injury
ID,Unnamed: 1_level_1
10103.0,4.0
10452.0,2.0
12086.0,1.0
17316.0,2.0
18096.0,3.0
20083.0,1.0
21079.0,2.0
23081.0,2.0
23085.0,2.0
25467.0,1.0


# Change column names

In [30]:
column_rename_dict = {
    'Column2': 'Microcycle',
    'DATE': 'DATE',
    'ID': 'PlayerID',
    'Total D': 'TD',
    '>19.8': 'HSR',
    '> 25 Km/h': '+25 Km/h',
    'ACC': 'ACC',
    'DEC': 'DEC',
    'Max Speed': 'Max Speed',
    'Max Speed Season': 'Max Speed Season',
    'Avg Speed Season': 'Avg Speed Season',
    '% Max Speed': '% Max Speed',
    '%Speed diference against max. Speed average': 'Speed Diff Max Avg',
    'Injury': 'Injury',
    'MINUTES': 'Mins',
    'Sprints': 'Sprints',
    'Total D_Rel': 'TD_Rel',
    '>19.8_Rel': 'HSR_Rel',
    '> 25 Km/h_Rel': '+25 Km/h_Rel',
    'ACC_Rel': 'ACC_Rel',
    'DEC_Rel': 'DEC_Rel',
    'Sprints_Rel': 'Sprints_Rel'
}

df_filtered.rename(columns=column_rename_dict, inplace=True)

df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4723 entries, 0 to 7802
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4723 non-null   datetime64[ns]
 1   Microcycle          4723 non-null   object        
 2   PLAYER              4723 non-null   int64         
 3   Injury              4723 non-null   float64       
 4   season              4723 non-null   object        
 5   LEAGUE              4723 non-null   object        
 6   preseason-season    4723 non-null   object        
 7   MANAGER             4723 non-null   object        
 8   TD                  4723 non-null   float64       
 9   HSR                 4723 non-null   float64       
 10  +25 Km/h            4723 non-null   float64       
 11  ACC                 4723 non-null   float64       
 12  DEC                 4723 non-null   float64       
 13  PlayerID            4723 non-null   float64       
 1

# Drop unnecesary columns

In [31]:
df_filtered = df_filtered.drop(columns=['season', 'LEAGUE', 'MANAGER', 'PLAYER', 'Player Position', 'Season'])

# Calculate relative values

We want the max and avg values for players that played over 85 in a Match Day

Remove spaces in 'Microcycle' column

In [37]:
df_filtered['Microcycle'] = df_filtered['Microcycle'].str.replace(' ', '', regex=False)
df_filtered["Microcycle"].unique()

array(['M+2', 'M+1', 'MD', 'M-1', 'M-2', 'M-3', 'M-5', 'M+3', 'M-4',
       'MD(AWAY)', 'MD(HOME)', nan], dtype=object)

In [36]:
def calculate_relative_values(df_original, metrics):
    # Array that holds match days
    microcycle_values = ['MD', 'MD(HOME)', 'MD(AWAY)']

    # Do a copy of the original DF
    df_copy = df_original.copy()

    # Filter players with +85 mins and match day
    df_filtered_85_md = df_copy[
        (df_copy['Mins'] > 85) &
        (df_copy['Microcycle'].isin(microcycle_values))
    ]

    # Players who never completed 85 mins in matchday
    df_filtered_85_only_mins = df_copy[df_copy['Mins'] > 85]

    # Combine both DataFrames
    df_filtered_85 = pd.concat([df_filtered_85_md, df_filtered_85_only_mins])

    # Temporary dictionaries to hold max and avg values per player
    player_max = {}
    player_avg = {}

    # Calculate max and avg for each player
    for player in df_filtered_85['PlayerID'].unique():
        player_data = df_filtered_85[df_filtered_85['PlayerID'] == player]
        player_max[player] = player_data[metrics].max()
        player_avg[player] = player_data[metrics].mean()

    # Add relative values to the original DataFrame
    for metric in metrics:

        df_original[f"{metric}_Rel"] = np.nan

        for player in df_original['PlayerID'].unique():
            if player in player_max and player in player_avg:
                max_value = player_max[player][metric]
                avg_value = player_avg[player][metric]

                # Compute relative value
                df_original.loc[df_original['PlayerID'] == player, f"{metric}_Rel"] = (
                    (df_original.loc[df_original['PlayerID'] == player, metric] * 100) /
                    ((max_value + avg_value) / 2)
                ).round(2)

    return df_original

Call function to calculate relative values

In [38]:
metrics_rel = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC']

df_rel = calculate_relative_values(df_filtered, metrics_rel)

In [39]:
df_rel.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4723 entries, 0 to 7802
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4723 non-null   datetime64[ns]
 1   Microcycle          4508 non-null   object        
 2   Injury              4723 non-null   float64       
 3   preseason-season    4723 non-null   object        
 4   TD                  4723 non-null   float64       
 5   HSR                 4723 non-null   float64       
 6   +25 Km/h            4723 non-null   float64       
 7   ACC                 4723 non-null   float64       
 8   DEC                 4723 non-null   float64       
 9   PlayerID            4723 non-null   float64       
 10  Max Speed           4723 non-null   float64       
 11  Sprints             4723 non-null   float64       
 12  Mins                4723 non-null   float64       
 13  Max Speed Season    4723 non-null   float64       
 1

# Function for densities

In [None]:
cols_mins = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC', 'Sprints']

# Create new columns by dividing by the 'Minutes' column
for col in cols_mins:
    df_rel[f'{col}/Mins'] = df_rel[col] / df_rel['Mins']

In [None]:
df_rel['Sprints/Mins'] = df_rel['Sprints/Mins'].fillna(0)

# Find Max and Avg values before current date

In [41]:
def calculate_max_avg(df, metrics):

  # Create copies of the original DataFrame to ensure no unintended changes
  df = df.copy()

  # Loop through each metric to calculate max and avg values
  for metric in metrics:
      # Initialize the max and avg columns for the metric
      max_col = f"{metric}_max"
      avg_col = f"{metric}_avg"
      df[max_col] = None
      df[avg_col] = None

      # Iterate through each row to calculate max and avg based on previous dates
      for idx, row in df.iterrows():
          player_id = row['PlayerID']
          current_date = row['DATE']

          # Filter rows for the same player and only for dates before the current row's date
          player_data_before_date = df[
              (df['PlayerID'] == player_id) &
              (df['DATE'] < current_date)
          ]

          # Compute max and avg for the current metric based on the filtered data
          if not player_data_before_date.empty:
              df.at[idx, max_col] = player_data_before_date[metric].max()
              df.at[idx, avg_col] = player_data_before_date[metric].mean()
          else:
          # Fallback: Use max and avg across all rows for the same player
              player_data = df[df['PlayerID'] == player_id]
              df.at[idx, max_col] = player_data[metric].max()
              df.at[idx, avg_col] = player_data[metric].mean()

  return df


In [42]:
cols = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC','TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins']
#df_max_avg = calculate_max_avg(df_rel, cols)
df_max_avg = df_rel.copy()

In [43]:
df_max_avg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4723 entries, 0 to 7802
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4723 non-null   datetime64[ns]
 1   Microcycle          4508 non-null   object        
 2   Injury              4723 non-null   float64       
 3   preseason-season    4723 non-null   object        
 4   TD                  4723 non-null   float64       
 5   HSR                 4723 non-null   float64       
 6   +25 Km/h            4723 non-null   float64       
 7   ACC                 4723 non-null   float64       
 8   DEC                 4723 non-null   float64       
 9   PlayerID            4723 non-null   float64       
 10  Max Speed           4723 non-null   float64       
 11  Sprints             4723 non-null   float64       
 12  Mins                4723 non-null   float64       
 13  Max Speed Season    4723 non-null   float64       
 1

# Function for player loads

In [None]:
#df_max_avg

In [44]:
def calcular_acumulado(df, columnas_calcular, dias):
    # Create an empty list to store processed player DataFrames
    processed_players = []

    # Process each player separately
    for player_id in df['PlayerID'].unique():
        # Filter data for the current player
        player_data = df[df['PlayerID'] == player_id].copy()

        # Create a full date range for the player (from the first to the last recorded date)
        full_date_range = pd.date_range(start=player_data['DATE'].min(), end=player_data['DATE'].max(), freq='D')

        # Set 'DATE' as the index and reindex to fill missing dates with zeros
        player_data = player_data.set_index('DATE').reindex(full_date_range, fill_value=0).reset_index()
        player_data.rename(columns={'index': 'DATE'}, inplace=True)
        player_data['PlayerID'] = player_id

        # Perform rolling calculations for each metric, excluding the current day
        for dia in dias:
            for col in columnas_calcular:
                # Check if the column exists to avoid errors
                if col in player_data.columns:
                    # Rolling sum, mean, std: exclude the current day using shift(1)
                    player_data[f'{col}-{dia}'] = (
                        player_data[col].rolling(window=dia, min_periods=1).sum()
                    )
                    player_data[f'{col}-{dia}-avg'] = (
                        player_data[col].rolling(window=dia, min_periods=1).mean()
                    )
                    player_data[f'{col}-{dia}-std'] = (
                        player_data[col].rolling(window=dia, min_periods=1).std()
                    )

                    # EWMA: Exclude the current day using shift(1)
                    player_data[f'{col}_EWMA-{dia}'] = (
                        player_data[col].ewm(span=dia, adjust=False).mean()
                    )

        # Drop rows where all calculated values are zero (rest days)
        mask_non_zero = (player_data[columnas_calcular].sum(axis=1) > 0)
        player_data = player_data[mask_non_zero]

        # Append the processed player's data to the list
        processed_players.append(player_data)

    # Concatenate all processed player DataFrames into a single DataFrame
    df_resultado = pd.concat(processed_players, ignore_index=True)

    return df_resultado


In [45]:
cols_calculate = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC', 'Sprints', 'Mins']
#columnas_calcular = ['Total D', '>19.8', '> 25 Km/h', 'ACC', 'DEC', 'Sprints'] # Indicamos las columnas que queremos añadir en el df
cumulative_df = calcular_acumulado(df_max_avg, cols_calculate, [7,28]) # loads -1, -3, -7, -21

In [46]:
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4723 entries, 0 to 4722
Data columns (total 78 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4723 non-null   datetime64[ns]
 1   Microcycle          4508 non-null   object        
 2   Injury              4723 non-null   float64       
 3   preseason-season    4723 non-null   object        
 4   TD                  4723 non-null   float64       
 5   HSR                 4723 non-null   float64       
 6   +25 Km/h            4723 non-null   float64       
 7   ACC                 4723 non-null   float64       
 8   DEC                 4723 non-null   float64       
 9   PlayerID            4723 non-null   float64       
 10  Max Speed           4723 non-null   float64       
 11  Sprints             4723 non-null   float64       
 12  Mins                4723 non-null   float64       
 13  Max Speed Season    4723 non-null   float64     

In [47]:
cumulative_df = cumulative_df.dropna(subset=['TD-7', 'TD-7-std'])
cumulative_df.reset_index(drop=True, inplace=True)

In [48]:
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4700 entries, 0 to 4699
Data columns (total 78 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4700 non-null   datetime64[ns]
 1   Microcycle          4485 non-null   object        
 2   Injury              4700 non-null   float64       
 3   preseason-season    4700 non-null   object        
 4   TD                  4700 non-null   float64       
 5   HSR                 4700 non-null   float64       
 6   +25 Km/h            4700 non-null   float64       
 7   ACC                 4700 non-null   float64       
 8   DEC                 4700 non-null   float64       
 9   PlayerID            4700 non-null   float64       
 10  Max Speed           4700 non-null   float64       
 11  Sprints             4700 non-null   float64       
 12  Mins                4700 non-null   float64       
 13  Max Speed Season    4700 non-null   float64     

# Calculate different load metrics

In [49]:
def calculate_metrics_loads(df, metrics):

  # Calculate ACWR, MSWR for each metric
  for metric in metrics:

      # Calculate 7-day and 28-day averages for ACWR
      df[f'{metric}_ACWR'] = df[f'{metric}-7-avg'] / df[f'{metric}-28-avg']

      # Calculate mean and standard deviation for MSWR
      df[f'{metric}_MSWR'] = df[f'{metric}-7-avg'] / df[f'{metric}-7-std']

  return df

In [50]:
cols_calculate = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC']
cumulative_df = calculate_metrics_loads(cumulative_df, cols_calculate)

In [51]:
cumulative_df.drop(columns=['Sprints-28-avg', 'Sprints-28-std', 'DEC-28-avg', 'DEC-28-std', 'ACC-28-avg', 'ACC-28-std','+25 Km/h-28-avg', '+25 Km/h-28-std', 'HSR-28-avg', 'HSR-28-std', 'TD-28-avg', 'TD-28-std',
                            'Sprints-7-avg', 'Sprints-7-std', 'DEC-7-avg', 'DEC-7-std', 'ACC-7-avg', 'ACC-7-std','+25 Km/h-7-avg', '+25 Km/h-7-std', 'HSR-7-avg', 'HSR-7-std', 'TD-7-avg', 'TD-7-std', 'TD-28', 'HSR-28', '+25 Km/h-28', 'ACC-28', 'DEC-28', 'Sprints-28',
                            '+25 Km/h_EWMA-28', 'Sprints_EWMA-28', 'TD_EWMA-28', 'HSR_EWMA-28', 'ACC_EWMA-28', 'DEC_EWMA-28','Sprints_EWMA-7'] , inplace=True)

In [52]:
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4700 entries, 0 to 4699
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4700 non-null   datetime64[ns]
 1   Microcycle          4485 non-null   object        
 2   Injury              4700 non-null   float64       
 3   preseason-season    4700 non-null   object        
 4   TD                  4700 non-null   float64       
 5   HSR                 4700 non-null   float64       
 6   +25 Km/h            4700 non-null   float64       
 7   ACC                 4700 non-null   float64       
 8   DEC                 4700 non-null   float64       
 9   PlayerID            4700 non-null   float64       
 10  Max Speed           4700 non-null   float64       
 11  Sprints             4700 non-null   float64       
 12  Mins                4700 non-null   float64       
 13  Max Speed Season    4700 non-null   float64     

In [53]:
# Exclude 'Microcycle' column for counting NULLs
columns_to_check = cumulative_df.drop(columns=['Microcycle']).columns

# Drop rows where 2 or more null values exist (excluding 'Microcycle')
cumulative_df = cumulative_df[cumulative_df[columns_to_check].notna().sum(axis=1) > (len(columns_to_check) - 1)]

# Display the resulting DataFrame
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4420 entries, 0 to 4699
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4420 non-null   datetime64[ns]
 1   Microcycle          4215 non-null   object        
 2   Injury              4420 non-null   float64       
 3   preseason-season    4420 non-null   object        
 4   TD                  4420 non-null   float64       
 5   HSR                 4420 non-null   float64       
 6   +25 Km/h            4420 non-null   float64       
 7   ACC                 4420 non-null   float64       
 8   DEC                 4420 non-null   float64       
 9   PlayerID            4420 non-null   float64       
 10  Max Speed           4420 non-null   float64       
 11  Sprints             4420 non-null   float64       
 12  Mins                4420 non-null   float64       
 13  Max Speed Season    4420 non-null   float64       
 1

In [54]:
cumulative_df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,DATE,Injury,TD,HSR,+25 Km/h,ACC,DEC,PlayerID,Max Speed,Sprints,Mins,Max Speed Season,Avg Speed Season,% Max Speed,Speed Diff Max Avg,TD_Rel,HSR_Rel,+25 Km/h_Rel,ACC_Rel,DEC_Rel,TD-7,TD_EWMA-7,HSR-7,HSR_EWMA-7,+25 Km/h-7,+25 Km/h_EWMA-7,ACC-7,ACC_EWMA-7,DEC-7,DEC_EWMA-7,Sprints-7,Mins-7,Mins-7-avg,Mins-7-std,Mins_EWMA-7,Mins-28,Mins-28-avg,Mins-28-std,Mins_EWMA-28,TD_ACWR,TD_MSWR,HSR_ACWR,HSR_MSWR,+25 Km/h_ACWR,+25 Km/h_MSWR,ACC_ACWR,ACC_MSWR,DEC_ACWR,DEC_MSWR
count,4420,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0,4420.0
mean,2022-07-22 22:00:06.515837184,0.008371,4906.442149,195.471606,41.49095,48.440414,39.762489,35015.097059,27.526193,14.467421,67.263528,35.11681,21.467554,78.43152,28.247688,49.782072,35.746034,22.763645,55.101665,44.34107,22743.302421,3452.392053,886.76233,134.065868,178.450452,26.902994,226.988543,34.622507,182.738445,27.817875,64.776244,316.127835,45.686411,35.91777,48.258312,1047.084599,39.297594,35.068877,41.259482,1.29686,1.295921,1.31566,0.818534,1.376118,0.592141,1.317989,inf,1.317277,inf
min,2021-08-11 00:00:00,0.0,1.0,0.0,0.0,1.0,1.0,10103.0,1.81,0.0,4.04,29.76,17.940344,5.89001,-91.074358,0.01,0.0,0.0,0.96,0.93,1900.0,640.492729,10.0,2.5,1.0,0.044495,15.0,2.386349,4.0,1.0,0.0,21.57,3.081429,2.064752,6.56529,35.0,1.25,2.064752,2.663997,0.16445,0.377964,0.022619,0.377964,0.003663,0.377964,0.128123,0.377964,0.077266,0.377964
25%,2022-03-04 00:00:00,0.0,3183.25,44.0,0.0,31.0,21.0,18096.0,24.81,3.0,52.17,34.38,20.742552,71.027972,16.118448,32.0,8.32,0.0,35.79,23.84,19032.0,2900.819788,600.75,89.83968,75.0,11.562911,182.0,27.772797,140.0,21.589981,39.0,271.81,39.312143,31.586705,41.437065,797.235,31.104286,32.831023,35.175815,0.949614,0.943256,0.865459,0.639914,0.755082,0.465799,0.938765,0.935361,0.933504,0.834056
50%,2022-07-30 00:00:00,0.0,4438.0,128.0,11.0,47.0,36.0,26485.0,27.79,10.0,68.0,35.07,21.624487,79.41266,29.509026,45.475,23.2,6.535,53.06,39.99,23218.0,3370.376947,842.0,125.681779,147.0,21.286812,229.0,34.155545,186.0,27.482725,65.0,328.245,47.199286,37.005766,48.8479,1152.985,42.736286,36.043506,43.324547,1.102181,1.132633,1.113584,0.754957,1.143286,0.575406,1.118312,1.129915,1.127817,1.017657
75%,2022-12-16 00:00:00,0.0,5995.5,301.25,58.0,65.0,53.0,48692.0,30.66,22.0,84.41,36.56,22.20223,87.548929,42.526793,61.405,56.4025,34.1425,73.79,60.9075,26854.25,3954.281352,1133.25,167.858708,248.0,35.831669,274.0,40.919253,225.0,33.446163,91.0,377.39,54.418571,41.087233,55.925039,1338.505,48.618304,38.793656,48.353158,1.391326,1.356926,1.517677,0.885731,1.709327,0.670409,1.450442,1.323057,1.459125,1.232561
max,2023-05-09 00:00:00,1.0,13416.0,1172.0,429.0,178.0,146.0,89871.0,37.28,78.0,141.81,37.28,23.993491,100.0,82.440073,145.41,279.77,276.45,167.75,142.58,55983.0,7872.073864,2560.0,444.817348,1076.0,166.200211,681.0,101.33383,513.0,80.204649,186.0,583.84,91.21,57.396142,90.48,1824.21,91.21,47.226359,89.951379,4.0,329.582471,4.0,97.345034,4.0,14.142136,4.0,inf,4.0,inf
std,,0.09112,2406.533007,192.922491,64.806127,23.999099,24.845096,22306.980578,4.297703,14.401387,22.736652,1.616108,0.889361,11.935752,19.446512,24.115666,35.056689,33.049248,26.218124,26.915038,6788.242793,935.36535,399.698947,61.805877,136.183783,21.668831,76.424845,10.754141,67.420935,9.772935,37.081294,89.42556,13.000089,7.491266,11.972785,373.377404,13.108493,5.724062,11.422674,0.657054,5.056033,0.762359,1.588541,0.927028,0.306408,0.680823,,0.689268,


Do KNN imputation for EMWA and ACWR values

In [None]:
#cumulative_df.columns

In [None]:
# # Exclude unwanted columns for KNN calculation
# columns_to_exclude = ['DATE', 'Microcycle', 'Injury', 'preseason-season']
# columns_for_knn = [col for col in cumulative_df.columns if col not in columns_to_exclude]

# # Apply KNN Imputation only to selected columns
# imputer = KNNImputer(n_neighbors=3)
# imputed_values = imputer.fit_transform(cumulative_df[columns_for_knn])

# # Replace imputed columns in the original dataframe
# cumulative_df[columns_for_knn] = imputed_values

# cumulative_df.info()

# Do the shift for 7 previous days injury

In [55]:
def shift_injuries_7_days(df):
  copy_df = df.copy()

    # Initialize a new column for the 7-day injury prediction label
  copy_df['Injury_7_day'] = 0

  # Iterate over the df to assign labels to rows in the 7 days before an injury
  for idx, row in copy_df.iterrows():
      if row['Injury'] == 1:
          # Get the player's ID
          player_id = row['PlayerID']
          injury_date = row['DATE']

          # Assign injury label to the previous 7 days for the same player
          # Change 0 to 1 to not include the day of the injury
          for i in range(0, 8):
              prev_date = pd.to_datetime(injury_date) - pd.Timedelta(days=i)
              mask = (copy_df['PlayerID'] == player_id) & (copy_df['DATE'] == prev_date)
              copy_df.loc[mask, 'Injury_7_day'] = 1

  # Drop the original injury rows
  # copy_df = copy_df[copy_df['Injury'] == 0]

  return copy_df

In [56]:
cumulative_df_inj = shift_injuries_7_days(cumulative_df)

In [57]:
cumulative_df_inj

Unnamed: 0,DATE,Microcycle,Injury,preseason-season,TD,HSR,+25 Km/h,ACC,DEC,PlayerID,Max Speed,Sprints,Mins,Max Speed Season,Avg Speed Season,% Max Speed,Speed Diff Max Avg,TD_Rel,HSR_Rel,+25 Km/h_Rel,ACC_Rel,DEC_Rel,TD-7,TD_EWMA-7,HSR-7,HSR_EWMA-7,+25 Km/h-7,+25 Km/h_EWMA-7,ACC-7,ACC_EWMA-7,DEC-7,DEC_EWMA-7,Sprints-7,Mins-7,Mins-7-avg,Mins-7-std,Mins_EWMA-7,Mins-28,Mins-28-avg,Mins-28-std,Mins_EWMA-28,TD_ACWR,TD_MSWR,HSR_ACWR,HSR_MSWR,+25 Km/h_ACWR,+25 Km/h_MSWR,ACC_ACWR,ACC_MSWR,DEC_ACWR,DEC_MSWR,Injury_7_day
0,2021-08-11,M-3,0.0,SEASON,7423.0,426.0,20.0,52.0,38.0,23085.0,26.79,24.0,90.85,30.73,20.278653,87.178653,32.109364,68.55,73.81,26.13,64.23,43.80,13074.0,6094.000000,623.0,254.250000,54.0,30.500000,115.0,60.250000,74.0,36.500000,43.0,171.20,85.600000,7.424621,82.975000,171.20,85.600000,7.424621,81.074138,1.000000,5.217107,1.000000,1.923701,1.000000,2.727412,1.000000,7.392480,1.000000,26.162951,0
1,2021-08-12,M-2,0.0,SEASON,1268.0,0.0,0.0,11.0,5.0,23085.0,16.57,0.0,43.58,30.73,20.278653,53.921250,-18.288460,11.71,0.00,0.00,13.59,5.76,14342.0,4887.500000,623.0,190.687500,54.0,22.875000,126.0,47.937500,79.0,28.625000,43.0,214.78,71.593333,24.821818,73.126250,214.78,71.593333,24.821818,78.488335,1.000000,1.508831,1.000000,0.974045,1.000000,1.053370,1.000000,1.532602,1.000000,1.423250,0
2,2021-08-13,M-1,0.0,SEASON,4135.0,112.0,2.0,47.0,20.0,23085.0,25.47,11.0,58.35,30.73,20.278653,82.883176,25.600056,38.19,19.40,2.61,58.05,23.05,18477.0,4699.375000,735.0,171.015625,56.0,17.656250,173.0,47.703125,99.0,26.468750,54.0,273.13,68.282500,21.321231,69.432187,273.13,68.282500,21.321231,77.099485,1.000000,1.771797,1.000000,1.017839,1.000000,0.870478,1.000000,1.920958,1.000000,1.603462,0
3,2021-08-15,M+1,0.0,SEASON,6379.0,86.0,0.0,49.0,48.0,23085.0,23.37,13.0,79.48,30.73,20.278653,76.049463,15.244339,58.91,14.90,0.00,60.52,55.33,24856.0,4238.148438,821.0,117.696289,56.0,9.931641,222.0,39.083008,147.0,26.888672,67.0,352.61,58.768333,33.491946,58.925605,352.61,58.768333,33.491946,72.313156,1.000000,1.405105,1.000000,0.855593,1.000000,0.647982,1.000000,1.464844,1.000000,1.267713,0
4,2021-08-17,M-4,0.0,SEASON,6287.0,209.0,9.0,62.0,44.0,23085.0,26.80,19.0,83.69,30.73,20.278653,87.211194,32.158677,58.06,36.21,11.76,76.58,50.72,25492.0,3955.708496,833.0,118.454163,31.0,7.836548,221.0,37.484192,155.0,26.124878,67.0,355.95,50.850000,38.250687,54.068153,436.30,54.537500,36.917203,68.454591,0.935482,1.140354,0.924272,0.762941,0.545055,0.581739,0.889336,1.179096,0.927450,1.048327,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4695,2022-05-16,M+1,0.0,SEASON,5436.0,284.0,41.0,69.0,57.0,42579.0,28.31,22.0,71.47,29.76,19.989098,95.127688,41.627200,68.87,76.52,143.23,85.76,78.58,19238.0,2802.919311,589.0,90.668364,41.0,10.352093,231.0,35.853672,174.0,28.127370,41.0,304.27,43.467143,34.799417,43.606948,1037.12,37.040000,39.734446,36.675677,1.171477,1.218111,1.180952,0.731231,1.322581,0.377964,1.157605,1.205526,1.072750,1.107434,0
4696,2022-05-18,M-4,0.0,SEASON,4514.0,68.0,0.0,60.0,40.0,42579.0,23.36,8.0,85.65,29.76,19.989098,78.494624,16.863701,57.19,18.32,0.00,74.57,55.14,21785.0,2705.142112,451.0,68.000955,41.0,5.823052,274.0,35.167690,206.0,25.821645,43.0,366.80,52.400000,36.680632,45.941408,1005.08,35.895714,37.736982,37.698298,1.377098,1.345018,1.018634,0.633106,1.389831,0.377964,1.432305,1.397832,1.362434,1.357026,0
4697,2022-05-19,M-3,0.0,SEASON,4959.0,91.0,1.0,49.0,44.0,42579.0,25.33,9.0,66.66,29.76,19.989098,85.114247,26.719073,62.83,24.52,3.49,60.90,60.65,21321.0,3268.606584,467.0,73.750716,42.0,4.617289,280.0,38.625768,215.0,30.366234,43.0,362.49,51.784286,36.351665,51.121056,985.39,35.192500,36.936941,39.695657,1.327336,1.358465,1.082899,0.652644,1.460870,0.388650,1.442927,1.416873,1.431425,1.375551,0
4698,2022-05-20,M-2,0.0,SEASON,2775.0,1.0,0.0,49.0,24.0,42579.0,19.89,0.0,55.09,29.76,19.989098,66.834677,-0.495761,35.16,0.27,0.00,60.90,33.08,20770.0,3145.204938,455.0,55.563037,42.0,3.462967,281.0,41.219326,208.0,28.774676,41.0,358.03,51.147143,36.231757,52.113292,997.56,35.627143,37.102472,40.757336,1.302194,1.324423,1.068075,0.628829,1.460870,0.388650,1.446217,1.419433,1.396442,1.322393,0


In [58]:
print("No Injury: " + str(cumulative_df_inj[cumulative_df_inj["Injury_7_day"] == 0].shape[0]))
print("Injury: " + str(cumulative_df_inj[cumulative_df_inj["Injury_7_day"] == 1].shape[0]))
print(f"%: {((cumulative_df_inj[cumulative_df_inj['Injury_7_day'] == 1].shape[0] / cumulative_df_inj.shape[0]) * 100):.2f}%")

No Injury: 4219
Injury: 201
%: 4.55%


# Remove first 7 days of a player's data

In [59]:
# Get the minimum date for each player
min_dates = cumulative_df_inj.groupby('PlayerID')['DATE'].transform('min')

# Calculate the cutoff date for each row (min date + 7 days)
cutoff_dates = min_dates + pd.Timedelta(days=7)

# Filter out rows where the date is within the first 7 days for each player
final_df = cumulative_df_inj[cumulative_df_inj['DATE'] >= cutoff_dates]

final_df = cumulative_df_inj.copy()

In [60]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4420 entries, 0 to 4699
Data columns (total 52 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4420 non-null   datetime64[ns]
 1   Microcycle          4215 non-null   object        
 2   Injury              4420 non-null   float64       
 3   preseason-season    4420 non-null   object        
 4   TD                  4420 non-null   float64       
 5   HSR                 4420 non-null   float64       
 6   +25 Km/h            4420 non-null   float64       
 7   ACC                 4420 non-null   float64       
 8   DEC                 4420 non-null   float64       
 9   PlayerID            4420 non-null   float64       
 10  Max Speed           4420 non-null   float64       
 11  Sprints             4420 non-null   float64       
 12  Mins                4420 non-null   float64       
 13  Max Speed Season    4420 non-null   float64       
 1

In [61]:
print("No Injury: " + str(final_df[final_df["Injury_7_day"] == 0].shape[0]))
print("Injury: " + str(final_df[final_df["Injury_7_day"] == 1].shape[0]))
print(f"%: {((final_df[final_df['Injury_7_day'] == 1].shape[0] / final_df.shape[0]) * 100):.2f}%")

No Injury: 4219
Injury: 201
%: 4.55%


# Intensity metric based on Microcycle type

In [62]:
final_df.Microcycle.unique()

array(['M-3', 'M-2', 'M-1', 'M+1', 'M-4', 'MD', 'M+3', 'M-5', 'M+2', nan,
       'MD(AWAY)', 'MD(HOME)'], dtype=object)

In [63]:
# Map Microcycle to categories
final_df['Category'] = final_df['Microcycle'].apply(
    lambda x: 'MATCH' if x in ['MD', 'MD(AWAY)', 'MD(HOME)'] else 'TRAINING'
)

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categories
encoded_categories = encoder.fit_transform(final_df[['Category']])

# Add the encoded columns to the DataFrame with index alignment
encoded_df = pd.DataFrame(
    encoded_categories, 
    columns=encoder.get_feature_names_out(['Category']), 
    index=final_df.index  # Ensure index alignment
)

# Concatenate the original DataFrame and the one-hot encoded DataFrame
encoded_final_df = pd.concat([final_df, encoded_df], axis=1)

In [64]:
encoded_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4420 entries, 0 to 4699
Data columns (total 55 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                4420 non-null   datetime64[ns]
 1   Microcycle          4215 non-null   object        
 2   Injury              4420 non-null   float64       
 3   preseason-season    4420 non-null   object        
 4   TD                  4420 non-null   float64       
 5   HSR                 4420 non-null   float64       
 6   +25 Km/h            4420 non-null   float64       
 7   ACC                 4420 non-null   float64       
 8   DEC                 4420 non-null   float64       
 9   PlayerID            4420 non-null   float64       
 10  Max Speed           4420 non-null   float64       
 11  Sprints             4420 non-null   float64       
 12  Mins                4420 non-null   float64       
 13  Max Speed Season    4420 non-null   float64       
 1

In [65]:
encoded_final_df.tail(20)

Unnamed: 0,DATE,Microcycle,Injury,preseason-season,TD,HSR,+25 Km/h,ACC,DEC,PlayerID,Max Speed,Sprints,Mins,Max Speed Season,Avg Speed Season,% Max Speed,Speed Diff Max Avg,TD_Rel,HSR_Rel,+25 Km/h_Rel,ACC_Rel,DEC_Rel,TD-7,TD_EWMA-7,HSR-7,HSR_EWMA-7,+25 Km/h-7,+25 Km/h_EWMA-7,ACC-7,ACC_EWMA-7,DEC-7,DEC_EWMA-7,Sprints-7,Mins-7,Mins-7-avg,Mins-7-std,Mins_EWMA-7,Mins-28,Mins-28-avg,Mins-28-std,Mins_EWMA-28,TD_ACWR,TD_MSWR,HSR_ACWR,HSR_MSWR,+25 Km/h_ACWR,+25 Km/h_MSWR,ACC_ACWR,ACC_MSWR,DEC_ACWR,DEC_MSWR,Injury_7_day,Category,Category_MATCH,Category_TRAINING
4675,2022-04-12,M-4,0.0,SEASON,5974.0,107.0,2.0,74.0,56.0,42579.0,25.28,13.0,99.74,29.76,19.989098,84.946237,26.468937,75.69,28.83,6.99,91.97,77.2,23393.0,3392.960407,633.0,99.64873,66.0,8.74089,303.0,43.294426,247.0,35.244646,60.0,345.28,49.325714,37.353273,51.85417,1235.98,44.142143,35.595789,43.623621,1.165135,1.373211,0.826911,0.977109,1.970149,0.636709,1.195266,1.330198,1.217649,1.367734,0,TRAINING,0.0,1.0
4676,2022-04-13,M-3,0.0,SEASON,5252.0,335.0,0.0,47.0,56.0,42579.0,24.24,16.0,93.19,29.76,19.989098,81.451613,21.266101,66.54,90.26,0.0,58.42,77.2,25014.0,3857.720306,895.0,158.486548,64.0,6.555668,306.0,44.22082,269.0,40.433484,69.0,379.11,54.158571,40.889068,62.188127,1329.17,47.470357,35.672187,47.041992,1.169398,1.406593,1.053871,0.985,1.910448,0.609782,1.153629,1.342101,1.240489,1.42692,0,TRAINING,0.0,1.0
4677,2022-04-14,M-2,0.0,SEASON,3061.0,2.0,0.0,57.0,44.0,42579.0,21.02,0.0,76.92,29.76,19.989098,70.63172,5.15732,38.78,0.54,0.0,70.84,60.65,23950.0,3658.540229,871.0,119.364911,64.0,4.916751,319.0,47.415615,272.0,41.325113,67.0,394.1,56.3,41.74742,65.871096,1359.31,48.546786,36.102731,49.102544,1.130864,1.350313,1.025007,0.934042,1.910448,0.609782,1.154751,1.382677,1.207011,1.439012,0,TRAINING,0.0,1.0
4678,2022-04-15,M-1,0.0,SEASON,2406.0,11.0,0.0,32.0,28.0,42579.0,22.05,2.0,57.22,29.76,19.989098,74.092742,10.310129,30.48,2.96,0.0,39.77,38.6,22278.0,3345.405172,666.0,92.273683,35.0,3.687563,271.0,43.561711,244.0,37.993835,57.0,402.92,57.56,41.602095,63.708322,1377.7,49.203571,36.086688,49.662369,1.062147,1.252786,0.785377,0.719367,1.044776,0.404226,0.959292,1.316651,1.084927,1.335589,0,TRAINING,0.0,1.0
4679,2022-04-16,MD,0.0,SEASON,1017.0,167.0,0.0,13.0,12.6,42579.0,24.56,10.0,8.25,29.76,19.989098,82.526882,22.866974,12.88,44.99,0.0,16.16,17.37,23295.0,2763.303879,833.0,110.955262,35.0,2.765672,284.0,35.921283,256.6,31.645376,67.0,411.17,58.738571,39.776469,49.843741,1344.2,48.007143,36.889366,46.806343,1.146196,1.416113,0.943909,0.93552,1.044776,0.404226,1.01701,1.511009,1.140191,1.552624,0,MATCH,1.0,0.0
4680,2022-04-18,M+2,0.0,SEASON,4554.0,84.0,1.0,65.0,57.0,42579.0,25.37,11.0,58.62,29.76,19.989098,85.248656,26.919182,57.7,22.63,3.49,80.79,78.58,22264.0,2692.858432,706.0,83.412335,3.0,1.805691,288.0,36.455722,253.6,32.050524,52.0,393.94,56.277143,39.067906,42.692104,1402.82,50.100714,35.708442,44.615677,1.037356,1.437039,0.781406,0.834991,0.088889,0.544705,0.974619,1.501848,1.059758,1.565763,0,TRAINING,0.0,1.0
4681,2022-04-19,M-4,0.0,SEASON,6924.0,292.0,6.0,93.0,84.0,42579.0,26.19,36.0,117.69,29.76,19.989098,88.004032,31.021418,87.72,78.67,20.96,115.59,115.8,23214.0,3750.643824,891.0,135.559251,7.0,2.854268,307.0,50.591791,281.6,45.037893,75.0,411.89,58.841429,42.806343,61.441578,1435.71,51.275357,37.393376,49.655285,1.049552,1.363892,0.934452,0.90326,0.198582,0.447214,1.017399,1.379904,1.127302,1.394759,0,TRAINING,0.0,1.0
4682,2022-04-21,M-2,0.0,SEASON,3985.0,137.0,4.0,38.0,48.0,42579.0,26.5,12.0,86.35,29.76,19.989098,89.045699,32.572264,50.49,36.91,13.97,47.23,66.17,18886.0,3105.987151,691.0,110.502079,11.0,2.605526,241.0,37.957883,229.6,37.333815,71.0,328.13,46.875714,46.015262,56.148388,1343.78,47.992143,37.705171,48.997625,0.90737,1.039364,0.807007,0.908178,0.305556,0.644094,0.885216,0.992922,0.972675,1.035427,0,TRAINING,0.0,1.0
4683,2022-04-22,M-1,0.0,SEASON,3227.0,22.0,0.0,48.0,29.0,42579.0,23.67,3.0,42.92,29.76,19.989098,79.53629,18.414547,40.88,5.93,0.0,59.66,39.98,19707.0,3136.240363,702.0,88.376559,11.0,1.954144,257.0,40.468412,230.6,35.250361,72.0,313.83,44.832857,45.796391,52.841291,1361.58,48.627857,37.454483,48.578479,0.939111,1.083231,0.829542,0.934753,0.307692,0.644094,0.921973,1.048592,0.973818,1.040693,0,TRAINING,0.0,1.0
4684,2022-04-25,M+2,0.0,SEASON,6678.0,162.0,1.0,71.0,62.0,42579.0,25.7,20.0,81.72,29.76,19.989098,86.357527,28.570082,84.61,43.65,3.49,88.24,85.47,20814.0,2992.601403,613.0,77.783861,11.0,1.074405,250.0,34.822611,223.0,30.371246,71.0,328.68,46.954286,48.980623,42.72242,1348.67,48.166786,36.943187,44.84084,0.972958,0.965197,0.799218,0.772624,0.305556,0.644094,0.914913,0.948347,0.94073,0.936647,0,TRAINING,0.0,1.0


# Remove Preseason data

In [66]:
encoded_final_df = encoded_final_df[encoded_final_df['preseason-season'] != 'PRESEASON']

encoded_final_df.reset_index(drop=True, inplace=True)

In [67]:
print("No Injury: " + str(encoded_final_df[encoded_final_df["Injury_7_day"] == 0].shape[0]))
print("Injury: " + str(encoded_final_df[encoded_final_df["Injury_7_day"] == 1].shape[0]))
print(f"%: {((encoded_final_df[encoded_final_df['Injury_7_day'] == 1].shape[0] / encoded_final_df.shape[0]) * 100):.2f}%")

No Injury: 3942
Injury: 197
%: 4.76%


# Export data

In [107]:
encoded_final_df.to_excel('/content/drive/MyDrive/WATFORD FC/Datos GPS/datos_finales_tommy2.xlsx', index=False)

In [68]:
encoded_final_df.to_excel('data/cleaned_data_no_injured.xlsx', index=False)

In [None]:
# result_df.to_csv('/content/drive/MyDrive/WATFORD FC/Datos GPS/datos_finales_tommy.csv', index=False)