<a href="https://colab.research.google.com/github/Tommy-Las/WatfordFC/blob/main/Data_cleaning_transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages and data

In [1]:
import pandas as pd
# Turn off warnings
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [2]:
# Mount drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import GPS data

In [3]:
file_path = '/content/drive/MyDrive/WATFORD FC/Datos GPS/GPS 2018-2023.xlsx'
df_gps = pd.read_excel(file_path)

Import speed data

In [4]:
file_path = '/content/drive/MyDrive/WATFORD FC/Datos GPS/max_speed.xlsx'
df_speed = pd.read_excel(file_path)

Import wellbeing data


In [None]:
#file_path = '/content/drive/MyDrive/WATFORD FC/Datos Wellbeing/wellbeing ssp.xlsx'
#df_wellbeing = pd.read_excel(file_path)

# Merge Sprint values

The sprint values are separated in different rows per session,

we want to merge in into a single row per session.

## Transform data before merging sprint rows

In [5]:
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123180 entries, 0 to 123179
Data columns (total 11 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   DATE                                         123180 non-null  object 
 1   ID                                           123180 non-null  int64  
 2   Player Position                              123155 non-null  object 
 3   Max Speed                                    116475 non-null  float64
 4   Sprints                                      113560 non-null  float64
 5   MINUTES                                      113654 non-null  float64
 6   Season                                       123180 non-null  object 
 7   Max Speed Season                             123180 non-null  float64
 8   Avg Speed Season                             122357 non-null  float64
 9   % Max Speed                                  122357 non-nul

We want to remove rows after 08-07-2021 since we don't have data for sprints and speed

In [6]:
from datetime import date
# Convert DATE column values into Pandas datetime object
df_speed['DATE'] = pd.to_datetime(df_speed['DATE'], dayfirst=True)

# Filter rows after date: '2021-08-07'
date_filter = pd.Timestamp('2021-08-07')
df_speed = df_speed[df_speed['DATE'] > date_filter]

# Filter rows after date: '2023-05-11'
date_filter = pd.Timestamp('2023-05-11')
df_speed = df_speed[df_speed['DATE'] < date_filter]

date_filter = pd.Timestamp('2021-06-28')
df_gps = df_gps[df_gps['DATE'] > date_filter]

In [7]:
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55390 entries, 58257 to 113646
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         55390 non-null  datetime64[ns]
 1   ID                                           55390 non-null  int64         
 2   Player Position                              55365 non-null  object        
 3   Max Speed                                    55377 non-null  float64       
 4   Sprints                                      55362 non-null  float64       
 5   MINUTES                                      55389 non-null  float64       
 6   Season                                       55390 non-null  object        
 7   Max Speed Season                             55390 non-null  float64       
 8   Avg Speed Season                             55390 non-null  float64       


Replace NULL sprint values to 0

In [8]:
# Replace NULL values from Sprint to 0
df_speed['Sprints'] = df_speed['Sprints'].fillna(0)

In [9]:
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55390 entries, 58257 to 113646
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         55390 non-null  datetime64[ns]
 1   ID                                           55390 non-null  int64         
 2   Player Position                              55365 non-null  object        
 3   Max Speed                                    55377 non-null  float64       
 4   Sprints                                      55390 non-null  float64       
 5   MINUTES                                      55389 non-null  float64       
 6   Season                                       55390 non-null  object        
 7   Max Speed Season                             55390 non-null  float64       
 8   Avg Speed Season                             55390 non-null  float64       


## Merge sprint values

Print values to verify later

In [10]:
df_speed.sort_values(by='DATE', ascending=False).head(15)

Unnamed: 0,DATE,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
58257,2023-05-10,23085,CENTRE MIDFIELDER,22.91,3.0,30.19,2022-2023,32.12,20.304388,71.326276,12.832753
58258,2023-05-10,87583,CENTRE MIDFIELDER,24.7,20.0,40.53,2022-2023,34.27,21.189939,72.074701,16.564751
58259,2023-05-09,96811,CENTRE MIDFIELDER,32.19,10.0,70.51,2022-2023,36.04,20.882454,89.317425,54.148548
58260,2023-05-09,89871,STRIKER,25.46,0.0,79.87,2022-2023,34.12,21.753402,74.618992,17.039162
58300,2023-05-08,55555,FULL BACK,30.47,18.0,72.62,2022-2023,34.94,22.07125,87.20664,38.052897
58289,2023-05-08,48692,CENTRE BACK,32.56,35.0,69.15,2022-2023,36.21,21.02927,89.919912,54.831811
58290,2023-05-08,87583,CENTRE MIDFIELDER,22.69,11.0,34.13,2022-2023,34.27,21.189939,66.209513,7.079117
58291,2023-05-08,37537,CENTRE MIDFIELDER,28.65,5.0,47.05,2022-2023,34.09,21.974456,84.042241,30.378655
58292,2023-05-08,37537,CENTRE MIDFIELDER,30.59,3.0,22.1,2022-2023,34.09,21.974456,89.73306,39.207088
58293,2023-05-08,37537,CENTRE MIDFIELDER,30.59,8.0,69.15,2022-2023,34.09,21.974456,89.73306,39.207088


Do a group by player and date, and select the maximum value for each session

In [11]:
df_speed = df_speed.groupby(['DATE', 'ID']).agg('max').reset_index()

In [12]:
df_speed.sort_values(by='DATE', ascending=False).head(15)

Unnamed: 0,DATE,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
8262,2023-05-10,87583,CENTRE MIDFIELDER,24.7,20.0,40.53,2022-2023,34.27,21.189939,72.074701,16.564751
8261,2023-05-10,23085,CENTRE MIDFIELDER,22.91,3.0,30.19,2022-2023,32.12,20.304388,71.326276,12.832753
8260,2023-05-09,96811,CENTRE MIDFIELDER,32.19,10.0,70.51,2022-2023,36.04,20.882454,89.317425,54.148548
8259,2023-05-09,89871,STRIKER,25.46,0.0,79.87,2022-2023,34.12,21.753402,74.618992,17.039162
8249,2023-05-08,55555,FULL BACK,30.47,18.0,72.62,2022-2023,34.94,22.07125,87.20664,38.052897
8240,2023-05-08,10103,FULL BACK,21.24,1.0,44.84,2022-2023,34.95,22.20223,60.772532,-4.333932
8241,2023-05-08,12086,WINGER,35.04,41.0,88.27,2022-2023,35.44,21.901184,98.871332,59.991349
8242,2023-05-08,19817,FULL BACK,31.68,0.0,128.04,2022-2023,35.77,21.60546,88.565837,46.629599
8243,2023-05-08,21079,WINGER,34.51,51.0,88.04,2022-2023,36.97,22.708654,93.345956,51.968499
8244,2023-05-08,23085,CENTRE MIDFIELDER,23.71,7.0,58.52,2022-2023,32.12,20.304388,73.816936,16.772788


# Handle duplicates for GPS Data

In [13]:
# Define columns to sum
columns_to_sum = ['Total D', '>19.8', '> 25 Km/h', 'ACC', 'DEC']

# Define columns to select the first value
columns_to_first = ['DATE', 'Column2', 'PLAYER', 'Injury', 'season', 'LEAGUE', 'preseason-season', 'MANAGER']

# Group by the duplicate subset and aggregate
df_gps_aggregated = (
    df_gps[df_gps.duplicated(subset=['PLAYER', 'DATE'], keep=False)]
    .groupby(['PLAYER', 'DATE'], as_index=False)
    .agg({**{col: 'sum' for col in columns_to_sum},
          **{col: 'first' for col in columns_to_first}})
)

# Ensure non-duplicated rows are preserved by combining them back
df_gps_combined = pd.concat([
    df_gps[~df_gps.duplicated(subset=['PLAYER', 'DATE'], keep=False)],
    df_gps_aggregated
], ignore_index=True)

# Merge GPS and Speed dataframes

## Prepare the data before the merge

Verify column names for both Dataframes

In [14]:
df_gps_combined.columns

Index(['DATE', 'Column2', 'PLAYER', 'Injury', 'season', 'LEAGUE',
       'preseason-season', 'MANAGER', 'Total D', '>19.8', '> 25 Km/h', 'ACC',
       'DEC'],
      dtype='object')

In [15]:
df_speed.columns

Index(['DATE', 'ID', 'Player Position', 'Max Speed', 'Sprints', 'MINUTES',
       'Season', 'Max Speed Season', 'Avg Speed Season', '% Max Speed',
       '%Speed diference against max. Speed average'],
      dtype='object')

Change datatypes before the merge

In [16]:
# Convert DATE column from GPS df into datetime pandas obkect
df_gps_combined['DATE'] = pd.to_datetime(df_gps_combined['DATE'], dayfirst=True)

# Drop NULL values for 'PLAYER'
df_gps_combined = df_gps_combined.dropna(subset=['PLAYER'])

# Convert ID and PLAYER columns to the same data type - integers
df_gps_combined['PLAYER'] = df_gps_combined['PLAYER'].astype(int)
df_speed['ID'] = df_speed['ID'].astype(int)

## Do the merge of both dataframes

In [17]:
# Perform an inner join on matching DATE and PLAYER/ID values
df_merged = df_gps_combined.merge(df_speed, left_on=['DATE', 'PLAYER'], right_on=['DATE', 'ID'], how='inner')

In [18]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 23 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         7835 non-null   datetime64[ns]
 1   Column2                                      7835 non-null   object        
 2   PLAYER                                       7835 non-null   int64         
 3   Injury                                       7835 non-null   float64       
 4   season                                       7835 non-null   object        
 5   LEAGUE                                       7835 non-null   object        
 6   preseason-season                             7835 non-null   object        
 7   MANAGER                                      7835 non-null   object        
 8   Total D                                      7835 non-null   object        
 9

Verify class imbalances

In [19]:
print("No Injury: " + str(df_merged[df_merged["Injury"] == 0].shape[0]))
print("Injury: " + str(df_merged[df_merged["Injury"] == 1].shape[0]))
print(f"%: {((df_merged[df_merged['Injury'] == 1].shape[0] / df_merged.shape[0]) * 100):.2f}%")

No Injury: 7774
Injury: 61
%: 0.78%


# Merge df with 'weight' in wellbeing dataframe

In [None]:
# df_wellbeing = df_wellbeing[["Weight", "Date", "PLAYER"]]
# df_wellbeing.info()

In [None]:
# # Sort by PLAYER and DATE to ensure correct order for backfilling
# df_wellbeing = df_wellbeing.sort_values(by=['PLAYER', 'Date'])

# # Fill the NULL values in 'Weight' with the previous valid value for each player
# df_wellbeing['Weight'] = df_wellbeing.groupby('PLAYER')['Weight'].bfill()
# df_wellbeing['Weight'] = df_wellbeing.groupby('PLAYER')['Weight'].ffill()

# df_wellbeing.info()

In [None]:
# # Merge the DataFrames
# df_merged = df_merged.merge(df_wellbeing, left_on=['DATE', 'PLAYER'], right_on=['Date', 'PLAYER'], how='left')

# # Sort by PLAYER and DATE to ensure chronological order
# df_merged = df_merged.sort_values(by=['PLAYER', 'DATE'])

# # Back-fill null values in 'Weight' for each PLAYER
# df_merged['Weight'] = df_merged.groupby('PLAYER')['Weight'].bfill()
# df_merged['Weight'] = df_merged.groupby('PLAYER')['Weight'].ffill()

# # Optional: Drop the 'Date' column from the right DataFrame if not needed
# df_merged = df_merged.drop(columns=['Date'])

# # Display the final DataFrame
# df_merged.info()

In [None]:
#df_merged.groupby('PLAYER')['Weight'].apply(lambda x: x.isnull().sum())

# Transform numeric columns to the same data types

In [20]:
cols = ['Total D', '>19.8', '> 25 Km/h', 'ACC',
       'DEC', 'ID', 'Max Speed', 'Sprints', 'MINUTES', 'Max Speed Season',
       'Avg Speed Season', '% Max Speed',
       '%Speed diference against max. Speed average']

df_merged[cols] = df_merged[cols].astype(float)

In [21]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 23 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   DATE                                         7835 non-null   datetime64[ns]
 1   Column2                                      7835 non-null   object        
 2   PLAYER                                       7835 non-null   int64         
 3   Injury                                       7835 non-null   float64       
 4   season                                       7835 non-null   object        
 5   LEAGUE                                       7835 non-null   object        
 6   preseason-season                             7835 non-null   object        
 7   MANAGER                                      7835 non-null   object        
 8   Total D                                      7835 non-null   float64       
 9

# Values that are 0 for ACC, DEC, Max Speed

In [22]:
# Count of rows containing 0 for each column
zero_counts = (df_merged == 0).sum()

# Creating a DataFrame for the results
result_df = pd.DataFrame({
    "Column Name": zero_counts.index,
    "Number of Rows with 0": zero_counts.values
})

result_df

Unnamed: 0,Column Name,Number of Rows with 0
0,DATE,0
1,Column2,382
2,PLAYER,0
3,Injury,7774
4,season,0
5,LEAGUE,0
6,preseason-season,0
7,MANAGER,0
8,Total D,3
9,>19.8,420


In [23]:
# Columns to impute
metrics_imputation_zeroes = ['Total D', 'ACC', 'DEC', 'Max Speed', 'MINUTES']

# Columns to use for KNN calculation
columns_for_knn = ['Total D', '>19.8', '> 25 Km/h', 'ACC', 'DEC', 'ID',
                   'Max Speed', 'Sprints', 'MINUTES', '% Max Speed']

# Replace 0 with NaN in the columns to be imputed
df_merged[metrics_imputation_zeroes] = df_merged[metrics_imputation_zeroes].replace(0, np.nan)

# Ensure the columns used for KNN calculation are numeric
columns_for_knn_numeric = df_merged[columns_for_knn].select_dtypes(include=[np.number]).columns.tolist()

# Initialize the KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)  # Adjust n_neighbors if needed

# Perform KNN imputation using the broader set of columns for calculation
imputed_values = knn_imputer.fit_transform(df_merged[columns_for_knn_numeric])

# Update only the specified columns to impute
df_merged[metrics_imputation_zeroes] = imputed_values[:,
    [columns_for_knn_numeric.index(col) for col in metrics_imputation_zeroes]]

In [24]:
# Count of rows containing 0 for each column
zero_counts = (df_merged == 0).sum()

# Creating a DataFrame for the results
result_df = pd.DataFrame({
    "Column Name": zero_counts.index,
    "Number of Rows with 0": zero_counts.values
})

result_df

Unnamed: 0,Column Name,Number of Rows with 0
0,DATE,0
1,Column2,382
2,PLAYER,0
3,Injury,7774
4,season,0
5,LEAGUE,0
6,preseason-season,0
7,MANAGER,0
8,Total D,0
9,>19.8,420


In [25]:
df_merged[df_merged['% Max Speed'] == 0]

Unnamed: 0,DATE,Column2,PLAYER,Injury,season,LEAGUE,preseason-season,MANAGER,Total D,>19.8,> 25 Km/h,ACC,DEC,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
2830,2022-10-19,MD (AWAY),77765,1.0,2022-2023,CHAMPIONSHIP,SEASON,BILIC,1646.0,77.0,1.0,13.0,23.0,77765.0,CENTRE MIDFIELDER,20.33,2.0,18.0,2022-2023,32.08,21.29152,0.0,-100.0
3298,2022-09-12,M-1,37990,0.0,2022-2023,CHAMPIONSHIP,SEASON,EDWARDS,5215.0,549.0,50.0,51.0,18.0,37990.0,FULL BACK,32.174,0.0,74.792,2022-2023,37.28,22.669514,0.0,-100.0
5228,2022-04-02,MD,65042,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,9.4,7.6,65042.0,CENTRE MIDFIELDER,23.592,0.0,5.0,2021-2022,34.92,20.048756,0.0,-100.0
5230,2022-04-02,MD,23085,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,3.6,6.2,23085.0,CENTRE MIDFIELDER,17.784,0.0,5.0,2021-2022,30.73,20.278653,0.0,-100.0
5234,2022-04-02,MD,17316,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,6.2,5.6,17316.0,WINGER,21.586,0.0,5.0,2021-2022,33.73,20.441103,0.0,-100.0
5238,2022-04-02,MD,42579,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,23.0,12.6,42579.0,CENTRE BACK,21.274,0.0,5.0,2021-2022,29.76,19.989098,0.0,-100.0
5242,2022-04-02,MD,23081,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,3.6,6.2,23081.0,CENTRE BACK,17.784,0.0,5.0,2021-2022,34.94,20.537766,0.0,-100.0
5244,2022-04-02,MD,37709,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,14.8,6.8,37709.0,CENTRE BACK,21.886,0.0,5.0,2021-2022,36.56,22.374467,0.0,-100.0
6756,2021-11-24,M-5,10452,0.0,2021-2022,PREMIER LEAGUE,SEASON,RANIERI,2354.4,0.0,0.0,18.8,18.0,10452.0,FULL BACK,13.458,0.0,15.0,2021-2022,34.54,22.291234,0.0,-100.0
6887,2021-11-07,MD,21079,0.0,2021-2022,PREMIER LEAGUE,SEASON,RANIERI,10554.0,734.0,361.0,95.0,108.0,21079.0,WINGER,35.094,75.0,100.45,2021-2022,36.54,21.962986,0.0,-100.0


In [26]:
# Calculate '% Max Speed' for all rows
df_merged['% Max Speed'] = (df_merged['Max Speed'] / df_merged['Max Speed Season']) * 100

# Calculate '% Speed difference against max. Speed average' for all rows
df_merged['%Speed diference against max. Speed average'] = (
    (df_merged['Max Speed'] - df_merged['Avg Speed Season']) / df_merged['Avg Speed Season']
) * 100

In [27]:
df_merged[(df_merged['PLAYER'] == 65042) & (df_merged['DATE'] == '2022-04-02')]

Unnamed: 0,DATE,Column2,PLAYER,Injury,season,LEAGUE,preseason-season,MANAGER,Total D,>19.8,> 25 Km/h,ACC,DEC,ID,Player Position,Max Speed,Sprints,MINUTES,Season,Max Speed Season,Avg Speed Season,% Max Speed,%Speed diference against max. Speed average
5228,2022-04-02,MD,65042,0.0,2021-2022,PREMIER LEAGUE,SEASON,HODGSON,800.0,600.0,0.0,9.4,7.6,65042.0,CENTRE MIDFIELDER,23.592,0.0,5.0,2021-2022,34.92,20.048756,67.560137,17.673139


# Reduce data volume/size

Drop rows from preseason since its not the same intensity as regular season


In [None]:
# df_merged = df_merged[df_merged['preseason-season'] != 'PRESEASON']

# df_merged.reset_index(drop=True, inplace=True)

Class imbalance

In [None]:
# print("No Injury: " + str(df_merged[df_merged["Injury"] == 0].shape[0]))
# print("Injury: " + str(df_merged[df_merged["Injury"] == 1].shape[0]))
# print(f"%: {((df_merged[df_merged['Injury'] == 1].shape[0] / df_merged.shape[0]) * 100):.2f}%")

## Players who have never been injured

In [28]:
# Group by ID and sum the Injury column
injury_counts = df_merged.groupby('ID').agg({'Injury': 'sum'})

# Get the IDs where the sum of Injury is greater than 0
ids_to_remove = injury_counts[injury_counts['Injury'] == 0].index.tolist()

# Remove these IDs from the original DataFrame
#df_filtered = df_merged[~df_merged['ID'].isin(ids_to_remove)]

df_filtered = df_merged.copy()

In [29]:
print("No Injury: " + str(df_filtered[df_filtered["Injury"] == 0].shape[0]))
print("Injury: " + str(df_filtered[df_filtered["Injury"] == 1].shape[0]))
print(f"%: {((df_filtered[df_filtered['Injury'] == 1].shape[0] / df_filtered.shape[0]) * 100):.2f}%")

No Injury: 7774
Injury: 61
%: 0.78%


Verify at least one player had an injury

In [30]:
df_filtered.groupby('ID').agg({'Injury': 'sum'})

Unnamed: 0_level_0,Injury
ID,Unnamed: 1_level_1
10103.0,4.0
10452.0,3.0
11111.0,0.0
11721.0,2.0
11833.0,1.0
12086.0,1.0
12345.0,0.0
17316.0,3.0
18096.0,3.0
19817.0,0.0


# Change column names

In [31]:
column_rename_dict = {
    'Column2': 'Microcycle',
    'DATE': 'DATE',
    'ID': 'PlayerID',
    'Total D': 'TD',
    '>19.8': 'HSR',
    '> 25 Km/h': '+25 Km/h',
    'ACC': 'ACC',
    'DEC': 'DEC',
    'Max Speed': 'Max Speed',
    'Max Speed Season': 'Max Speed Season',
    'Avg Speed Season': 'Avg Speed Season',
    '% Max Speed': '% Max Speed',
    '%Speed diference against max. Speed average': 'Speed Diff Max Avg',
    'Injury': 'Injury',
    'MINUTES': 'Mins',
    'Sprints': 'Sprints',
    'Total D_Rel': 'TD_Rel',
    '>19.8_Rel': 'HSR_Rel',
    '> 25 Km/h_Rel': '+25 Km/h_Rel',
    'ACC_Rel': 'ACC_Rel',
    'DEC_Rel': 'DEC_Rel',
    'Sprints_Rel': 'Sprints_Rel'
}

df_filtered.rename(columns=column_rename_dict, inplace=True)

df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7835 non-null   datetime64[ns]
 1   Microcycle          7835 non-null   object        
 2   PLAYER              7835 non-null   int64         
 3   Injury              7835 non-null   float64       
 4   season              7835 non-null   object        
 5   LEAGUE              7835 non-null   object        
 6   preseason-season    7835 non-null   object        
 7   MANAGER             7835 non-null   object        
 8   TD                  7835 non-null   float64       
 9   HSR                 7835 non-null   float64       
 10  +25 Km/h            7835 non-null   float64       
 11  ACC                 7835 non-null   float64       
 12  DEC                 7835 non-null   float64       
 13  PlayerID            7835 non-null   float64     

# Drop unnecesary columns

In [32]:
df_filtered = df_filtered.drop(columns=['season', 'LEAGUE', 'MANAGER', 'PLAYER', 'Player Position', 'Season'])

# Calculate relative values

We want the max and avg values for players that played over 85 in a Match Day

Remove spaces in 'Microcycle' column

In [34]:
df_filtered['Microcycle'] = df_filtered['Microcycle'].str.replace(' ', '', regex=False)
df_filtered["Microcycle"].unique()

array(['M+2', 'M+1', 'MD', 'M-1', 'M-2', 'M-3', 'M-5', 'M+3', 'M-4',
       'MD(AWAY)', 'MD(HOME)', nan], dtype=object)

In [35]:
def calculate_relative_values(df_original, metrics):
    # Array that holds match days
    microcycle_values = ['MD', 'MD(HOME)', 'MD(AWAY)']

    # Do a copy of the original DF
    df_copy = df_original.copy()

    # Filter players with +85 mins and match day
    df_filtered_85_md = df_copy[
        (df_copy['Mins'] > 85) &
        (df_copy['Microcycle'].isin(microcycle_values))
    ]

    # Players who never completed 85 mins in matchday
    df_filtered_85_only_mins = df_copy[df_copy['Mins'] > 85]

    # Combine both DataFrames
    df_filtered_85 = pd.concat([df_filtered_85_md, df_filtered_85_only_mins])

    # Temporary dictionaries to hold max and avg values per player
    player_max = {}
    player_avg = {}

    # Calculate max and avg for each player
    for player in df_filtered_85['PlayerID'].unique():
        player_data = df_filtered_85[df_filtered_85['PlayerID'] == player]
        player_max[player] = player_data[metrics].max()
        player_avg[player] = player_data[metrics].mean()

    # Add relative values to the original DataFrame
    for metric in metrics:

        df_original[f"{metric}_Rel"] = np.nan

        for player in df_original['PlayerID'].unique():
            if player in player_max and player in player_avg:
                max_value = player_max[player][metric]
                avg_value = player_avg[player][metric]

                # Compute relative value
                df_original.loc[df_original['PlayerID'] == player, f"{metric}_Rel"] = (
                    (df_original.loc[df_original['PlayerID'] == player, metric] * 100) /
                    ((max_value + avg_value) / 2)
                ).round(2)

    return df_original

Call function to calculate relative values

In [36]:
metrics_rel = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC']

df_rel = calculate_relative_values(df_filtered, metrics_rel)

In [37]:
df_rel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7835 non-null   datetime64[ns]
 1   Microcycle          7453 non-null   object        
 2   Injury              7835 non-null   float64       
 3   preseason-season    7835 non-null   object        
 4   TD                  7835 non-null   float64       
 5   HSR                 7835 non-null   float64       
 6   +25 Km/h            7835 non-null   float64       
 7   ACC                 7835 non-null   float64       
 8   DEC                 7835 non-null   float64       
 9   PlayerID            7835 non-null   float64       
 10  Max Speed           7835 non-null   float64       
 11  Sprints             7835 non-null   float64       
 12  Mins                7835 non-null   float64       
 13  Max Speed Season    7835 non-null   float64     

# Function for densities

In [None]:
cols_mins = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC', 'Sprints']

# Create new columns by dividing by the 'Minutes' column
for col in cols_mins:
    df_rel[f'{col}/Mins'] = df_rel[col] / df_rel['Mins']

In [None]:
df_rel['Sprints/Mins'] = df_rel['Sprints/Mins'].fillna(0)

# Find Max and Avg values before current date

In [38]:
def calculate_max_avg(df, metrics):

  # Create copies of the original DataFrame to ensure no unintended changes
  df = df.copy()

  # Loop through each metric to calculate max and avg values
  for metric in metrics:
      # Initialize the max and avg columns for the metric
      max_col = f"{metric}_max"
      avg_col = f"{metric}_avg"
      df[max_col] = None
      df[avg_col] = None

      # Iterate through each row to calculate max and avg based on previous dates
      for idx, row in df.iterrows():
          player_id = row['PlayerID']
          current_date = row['DATE']

          # Filter rows for the same player and only for dates before the current row's date
          player_data_before_date = df[
              (df['PlayerID'] == player_id) &
              (df['DATE'] < current_date)
          ]

          # Compute max and avg for the current metric based on the filtered data
          if not player_data_before_date.empty:
              df.at[idx, max_col] = player_data_before_date[metric].max()
              df.at[idx, avg_col] = player_data_before_date[metric].mean()
          else:
          # Fallback: Use max and avg across all rows for the same player
              player_data = df[df['PlayerID'] == player_id]
              df.at[idx, max_col] = player_data[metric].max()
              df.at[idx, avg_col] = player_data[metric].mean()

  return df


In [39]:
cols = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC','TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins']
#df_max_avg = calculate_max_avg(df_rel, cols)
df_max_avg = df_rel.copy()

In [40]:
df_max_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7835 non-null   datetime64[ns]
 1   Microcycle          7453 non-null   object        
 2   Injury              7835 non-null   float64       
 3   preseason-season    7835 non-null   object        
 4   TD                  7835 non-null   float64       
 5   HSR                 7835 non-null   float64       
 6   +25 Km/h            7835 non-null   float64       
 7   ACC                 7835 non-null   float64       
 8   DEC                 7835 non-null   float64       
 9   PlayerID            7835 non-null   float64       
 10  Max Speed           7835 non-null   float64       
 11  Sprints             7835 non-null   float64       
 12  Mins                7835 non-null   float64       
 13  Max Speed Season    7835 non-null   float64     

# Function for player loads

In [None]:
#df_max_avg

In [81]:
def calcular_acumulado(df, columnas_calcular, dias):
    # Create an empty list to store processed player DataFrames
    processed_players = []

    # Process each player separately
    for player_id in df['PlayerID'].unique():
        # Filter data for the current player
        player_data = df[df['PlayerID'] == player_id].copy()

        # Create a full date range for the player (from the first to the last recorded date)
        full_date_range = pd.date_range(start=player_data['DATE'].min(), end=player_data['DATE'].max(), freq='D')

        # Set 'DATE' as the index and reindex to fill missing dates with zeros
        player_data = player_data.set_index('DATE').reindex(full_date_range, fill_value=0).reset_index()
        player_data.rename(columns={'index': 'DATE'}, inplace=True)
        player_data['PlayerID'] = player_id

        # Perform rolling calculations for each metric, excluding the current day
        for dia in dias:
            for col in columnas_calcular:
                # Check if the column exists to avoid errors
                if col in player_data.columns:
                    # Rolling sum, mean, std: exclude the current day using shift(1)
                    player_data[f'{col}-{dia}'] = (
                        player_data[col].rolling(window=dia, min_periods=1).sum()
                    )
                    player_data[f'{col}-{dia}-avg'] = (
                        player_data[col].rolling(window=dia, min_periods=1).mean()
                    )
                    player_data[f'{col}-{dia}-std'] = (
                        player_data[col].rolling(window=dia, min_periods=1).std()
                    )

                    # EWMA: Exclude the current day using shift(1)
                    player_data[f'{col}_EWMA-{dia}'] = (
                        player_data[col].ewm(span=dia, adjust=False).mean()
                    )

        # Drop rows where all calculated values are zero (rest days)
        mask_non_zero = (player_data[columnas_calcular].sum(axis=1) > 0)
        player_data = player_data[mask_non_zero]

        # Append the processed player's data to the list
        processed_players.append(player_data)

    # Concatenate all processed player DataFrames into a single DataFrame
    df_resultado = pd.concat(processed_players, ignore_index=True)

    return df_resultado


In [82]:
cols_calculate = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC', 'Sprints']
#columnas_calcular = ['Total D', '>19.8', '> 25 Km/h', 'ACC', 'DEC', 'Sprints'] # Indicamos las columnas que queremos añadir en el df
cumulative_df = calcular_acumulado(df_max_avg, cols_calculate, [7,28]) # loads -1, -3, -7, -21

In [83]:
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 70 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7835 non-null   datetime64[ns]
 1   Microcycle          7453 non-null   object        
 2   Injury              7835 non-null   float64       
 3   preseason-season    7835 non-null   object        
 4   TD                  7835 non-null   float64       
 5   HSR                 7835 non-null   float64       
 6   +25 Km/h            7835 non-null   float64       
 7   ACC                 7835 non-null   float64       
 8   DEC                 7835 non-null   float64       
 9   PlayerID            7835 non-null   float64       
 10  Max Speed           7835 non-null   float64       
 11  Sprints             7835 non-null   float64       
 12  Mins                7835 non-null   float64       
 13  Max Speed Season    7835 non-null   float64     

In [84]:
cumulative_df = cumulative_df.dropna(subset=['TD-7', 'TD-7-std'])
cumulative_df.reset_index(drop=True, inplace=True)

In [85]:
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7780 entries, 0 to 7779
Data columns (total 70 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7780 non-null   datetime64[ns]
 1   Microcycle          7400 non-null   object        
 2   Injury              7780 non-null   float64       
 3   preseason-season    7780 non-null   object        
 4   TD                  7780 non-null   float64       
 5   HSR                 7780 non-null   float64       
 6   +25 Km/h            7780 non-null   float64       
 7   ACC                 7780 non-null   float64       
 8   DEC                 7780 non-null   float64       
 9   PlayerID            7780 non-null   float64       
 10  Max Speed           7780 non-null   float64       
 11  Sprints             7780 non-null   float64       
 12  Mins                7780 non-null   float64       
 13  Max Speed Season    7780 non-null   float64     

# Calculate different load metrics

In [86]:
def calculate_metrics_loads(df, metrics):

  # Calculate ACWR, MSWR for each metric
  for metric in metrics:

      # Calculate 7-day and 28-day averages for ACWR
      df[f'{metric}_ACWR'] = df[f'{metric}-7-avg'] / df[f'{metric}-28-avg']

      # Calculate mean and standard deviation for MSWR
      df[f'{metric}_MSWR'] = df[f'{metric}-7-avg'] / df[f'{metric}-7-std']

  return df

In [87]:
cols_calculate = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC']
cumulative_df = calculate_metrics_loads(cumulative_df, cols_calculate)

In [88]:
cumulative_df.drop(columns=['Sprints-28-avg', 'Sprints-28-std', 'DEC-28-avg', 'DEC-28-std', 'ACC-28-avg', 'ACC-28-std','+25 Km/h-28-avg', '+25 Km/h-28-std', 'HSR-28-avg', 'HSR-28-std', 'TD-28-avg', 'TD-28-std',
                            'Sprints-7-avg', 'Sprints-7-std', 'DEC-7-avg', 'DEC-7-std', 'ACC-7-avg', 'ACC-7-std','+25 Km/h-7-avg', '+25 Km/h-7-std', 'HSR-7-avg', 'HSR-7-std', 'TD-7-avg', 'TD-7-std', 'TD-28', 'HSR-28', '+25 Km/h-28', 'ACC-28', 'DEC-28', 'Sprints-28',
                            '+25 Km/h_EWMA-28', 'Sprints_EWMA-28', 'TD_EWMA-28', 'HSR_EWMA-28', 'ACC_EWMA-28', 'DEC_EWMA-28','Sprints_EWMA-7'] , inplace=True)

In [89]:
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7780 entries, 0 to 7779
Data columns (total 43 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7780 non-null   datetime64[ns]
 1   Microcycle          7400 non-null   object        
 2   Injury              7780 non-null   float64       
 3   preseason-season    7780 non-null   object        
 4   TD                  7780 non-null   float64       
 5   HSR                 7780 non-null   float64       
 6   +25 Km/h            7780 non-null   float64       
 7   ACC                 7780 non-null   float64       
 8   DEC                 7780 non-null   float64       
 9   PlayerID            7780 non-null   float64       
 10  Max Speed           7780 non-null   float64       
 11  Sprints             7780 non-null   float64       
 12  Mins                7780 non-null   float64       
 13  Max Speed Season    7780 non-null   float64     

In [90]:
# Exclude 'Microcycle' column for counting NULLs
columns_to_check = cumulative_df.drop(columns=['Microcycle']).columns

# Drop rows where 2 or more null values exist (excluding 'Microcycle')
cumulative_df = cumulative_df[cumulative_df[columns_to_check].notna().sum(axis=1) > (len(columns_to_check) - 1)]

# Display the resulting DataFrame
cumulative_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7384 entries, 0 to 7779
Data columns (total 43 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7384 non-null   datetime64[ns]
 1   Microcycle          7021 non-null   object        
 2   Injury              7384 non-null   float64       
 3   preseason-season    7384 non-null   object        
 4   TD                  7384 non-null   float64       
 5   HSR                 7384 non-null   float64       
 6   +25 Km/h            7384 non-null   float64       
 7   ACC                 7384 non-null   float64       
 8   DEC                 7384 non-null   float64       
 9   PlayerID            7384 non-null   float64       
 10  Max Speed           7384 non-null   float64       
 11  Sprints             7384 non-null   float64       
 12  Mins                7384 non-null   float64       
 13  Max Speed Season    7384 non-null   float64       
 1

In [91]:
cumulative_df.describe()

Unnamed: 0,DATE,Injury,TD,HSR,+25 Km/h,ACC,DEC,PlayerID,Max Speed,Sprints,Mins,Max Speed Season,Avg Speed Season,% Max Speed,Speed Diff Max Avg,TD_Rel,HSR_Rel,+25 Km/h_Rel,ACC_Rel,DEC_Rel,TD-7,TD_EWMA-7,HSR-7,HSR_EWMA-7,+25 Km/h-7,+25 Km/h_EWMA-7,ACC-7,ACC_EWMA-7,DEC-7,DEC_EWMA-7,Sprints-7,TD_ACWR,TD_MSWR,HSR_ACWR,HSR_MSWR,+25 Km/h_ACWR,+25 Km/h_MSWR,ACC_ACWR,ACC_MSWR,DEC_ACWR,DEC_MSWR
count,7384,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0,7384.0
mean,2022-07-24 06:24:22.621885184,0.008261,4871.122671,199.035076,40.256907,49.325116,41.394286,44227.339382,27.496211,12.551637,67.192777,34.895948,21.571852,78.869758,27.520244,50.342622,37.130389,23.470956,56.832053,46.154229,22657.262351,3454.477505,908.068391,137.835908,175.520585,26.514498,232.33806,35.602808,191.467702,29.260244,56.2653,1.296178,1.300529,1.306025,0.837482,1.361916,0.604022,1.314704,inf,1.315493,inf
min,2021-08-11 00:00:00,0.0,1.0,0.0,0.0,1.0,1.0,10103.0,1.81,0.0,3.78,28.34,17.940344,5.89001,-91.074358,0.01,0.0,0.0,0.87,0.83,1017.0,254.284478,10.0,2.5,1.0,0.044495,2.0,0.500612,4.0,1.0,0.0,0.114668,0.377964,0.022619,0.377964,0.003663,0.377964,0.0798,0.377964,0.053435,0.377964
25%,2022-02-26 00:00:00,0.0,3174.0,49.0,0.0,31.461184,22.0,21079.0,25.02,1.0,52.3,33.86,20.940727,71.821944,16.133108,32.5075,9.49,0.0,36.89,25.01,18876.75,2888.166487,612.0,92.916862,80.0,11.936534,184.0,28.382977,145.0,22.326532,17.0,0.942086,0.94734,0.863044,0.645722,0.75484,0.469029,0.938622,0.943683,0.932016,0.845495
50%,2022-08-02 00:00:00,0.0,4424.0,134.0,11.0,47.0,37.0,37990.0,27.775,8.0,67.55,34.95,21.716652,79.807054,28.559599,45.85,24.785,6.83,54.85,41.66,23106.0,3360.728699,867.0,129.072491,147.5,21.716603,233.5,34.991856,193.0,28.666322,57.0,1.09714,1.144895,1.097851,0.763548,1.133583,0.578596,1.105184,1.142051,1.11526,1.032574
75%,2022-12-25 00:00:00,0.0,5938.25,303.0,57.0,65.0,56.0,66786.0,30.48,19.0,83.89,36.27,22.20223,87.649493,41.205313,62.21,57.6325,35.075,75.34,62.75,26752.0,3947.767282,1157.0,172.830845,241.0,35.250154,280.0,42.00709,236.0,35.211873,85.0,1.382225,1.369275,1.473907,0.91466,1.676768,0.679035,1.424362,1.337908,1.442143,1.257532
max,2023-05-09 00:00:00,1.0,13837.0,1610.0,530.0,190.0,171.0,99471.0,37.56,78.0,141.81,37.56,26.496667,100.0,82.440073,176.81,347.32,510.09,282.14,321.43,55983.0,8428.0,2918.0,556.574653,1076.0,169.547887,773.0,112.065049,653.0,96.667821,191.0,4.0,329.582471,4.0,97.345034,4.0,33.234019,4.0,inf,4.0,inf
std,,0.090521,2390.556542,193.586835,62.227027,24.45512,25.758834,27618.179021,4.155647,14.004052,22.505024,1.65532,0.866055,11.757011,18.946697,24.5453,35.991325,34.377118,27.392481,28.400173,7116.111014,975.932158,415.764447,63.462339,129.455658,20.50012,82.686836,11.640157,75.452896,10.838164,39.606476,0.676678,4.032158,0.766786,1.403065,0.921951,0.475685,0.697934,,0.707531,


Do KNN imputation for EMWA and ACWR values

In [None]:
#cumulative_df.columns

In [None]:
# # Exclude unwanted columns for KNN calculation
# columns_to_exclude = ['DATE', 'Microcycle', 'Injury', 'preseason-season']
# columns_for_knn = [col for col in cumulative_df.columns if col not in columns_to_exclude]

# # Apply KNN Imputation only to selected columns
# imputer = KNNImputer(n_neighbors=3)
# imputed_values = imputer.fit_transform(cumulative_df[columns_for_knn])

# # Replace imputed columns in the original dataframe
# cumulative_df[columns_for_knn] = imputed_values

# cumulative_df.info()

# Do the shift for 7 previous days injury

In [92]:
def shift_injuries_7_days(df):
  copy_df = df.copy()

    # Initialize a new column for the 7-day injury prediction label
  copy_df['Injury_7_day'] = 0

  # Iterate over the df to assign labels to rows in the 7 days before an injury
  for idx, row in copy_df.iterrows():
      if row['Injury'] == 1:
          # Get the player's ID
          player_id = row['PlayerID']
          injury_date = row['DATE']

          # Assign injury label to the previous 7 days for the same player
          for i in range(1, 8):
              prev_date = pd.to_datetime(injury_date) - pd.Timedelta(days=i)
              mask = (copy_df['PlayerID'] == player_id) & (copy_df['DATE'] == prev_date)
              copy_df.loc[mask, 'Injury_7_day'] = 1

  # Drop the original injury rows
  copy_df = copy_df[copy_df['Injury'] == 0]

  return copy_df

In [93]:
cumulative_df_inj = shift_injuries_7_days(cumulative_df)

In [94]:
cumulative_df_inj.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7323 entries, 0 to 7779
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7323 non-null   datetime64[ns]
 1   Microcycle          6961 non-null   object        
 2   Injury              7323 non-null   float64       
 3   preseason-season    7323 non-null   object        
 4   TD                  7323 non-null   float64       
 5   HSR                 7323 non-null   float64       
 6   +25 Km/h            7323 non-null   float64       
 7   ACC                 7323 non-null   float64       
 8   DEC                 7323 non-null   float64       
 9   PlayerID            7323 non-null   float64       
 10  Max Speed           7323 non-null   float64       
 11  Sprints             7323 non-null   float64       
 12  Mins                7323 non-null   float64       
 13  Max Speed Season    7323 non-null   float64       
 1

In [98]:
print("No Injury: " + str(cumulative_df_inj[cumulative_df_inj["Injury_7_day"] == 0].shape[0]))
print("Injury: " + str(cumulative_df_inj[cumulative_df_inj["Injury_7_day"] == 1].shape[0]))
print(f"%: {((cumulative_df_inj[cumulative_df_inj['Injury_7_day'] == 1].shape[0] / cumulative_df_inj.shape[0]) * 100):.2f}%")

No Injury: 7055
Injury: 268
%: 3.66%


# Remove first 7 days of a player's data

In [104]:
# Get the minimum date for each player
min_dates = cumulative_df_inj.groupby('PlayerID')['DATE'].transform('min')

# Calculate the cutoff date for each row (min date + 7 days)
cutoff_dates = min_dates + pd.Timedelta(days=7)

# Filter out rows where the date is within the first 7 days for each player
final_df = cumulative_df_inj[cumulative_df_inj['DATE'] >= cutoff_dates]

final_df = cumulative_df_inj.copy()

In [105]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7323 entries, 0 to 7779
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   DATE                7323 non-null   datetime64[ns]
 1   Microcycle          6961 non-null   object        
 2   Injury              7323 non-null   float64       
 3   preseason-season    7323 non-null   object        
 4   TD                  7323 non-null   float64       
 5   HSR                 7323 non-null   float64       
 6   +25 Km/h            7323 non-null   float64       
 7   ACC                 7323 non-null   float64       
 8   DEC                 7323 non-null   float64       
 9   PlayerID            7323 non-null   float64       
 10  Max Speed           7323 non-null   float64       
 11  Sprints             7323 non-null   float64       
 12  Mins                7323 non-null   float64       
 13  Max Speed Season    7323 non-null   float64       
 1

In [106]:
print("No Injury: " + str(final_df[final_df["Injury_7_day"] == 0].shape[0]))
print("Injury: " + str(final_df[final_df["Injury_7_day"] == 1].shape[0]))
print(f"%: {((final_df[final_df['Injury_7_day'] == 1].shape[0] / final_df.shape[0]) * 100):.2f}%")

No Injury: 7055
Injury: 268
%: 3.66%


# Intensity metric based on Microcycle type

# Remove Preseason data

In [102]:
final_df = final_df[final_df['preseason-season'] != 'PRESEASON']

final_df.reset_index(drop=True, inplace=True)

In [103]:
print("No Injury: " + str(final_df[final_df["Injury_7_day"] == 0].shape[0]))
print("Injury: " + str(final_df[final_df["Injury_7_day"] == 1].shape[0]))
print(f"%: {((final_df[final_df['Injury_7_day'] == 1].shape[0] / final_df.shape[0]) * 100):.2f}%")

No Injury: 6568
Injury: 257
%: 3.77%


# Export data

In [107]:
final_df.to_excel('/content/drive/MyDrive/WATFORD FC/Datos GPS/datos_finales_tommy2.xlsx', index=False)

In [None]:
# result_df.to_csv('/content/drive/MyDrive/WATFORD FC/Datos GPS/datos_finales_tommy.csv', index=False)