<a href="https://colab.research.google.com/github/Tommy-Las/WatfordFC/blob/main/ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages and data

In [29]:
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, make_scorer
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
from collections import defaultdict

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [3]:
# file_path = '/content/drive/MyDrive/WATFORD FC/Datos GPS/datos_finales_tommy.xlsx'
file_path = 'datos_finales_tommy.xlsx'
# Load the CSV into a DataFrame
df = pd.read_excel(file_path)

# Display the DataFrame
print(df.head())

        DATE Microcycle  Injury preseason-season      TD    HSR  +25 Km/h  \
0 2021-08-12        M-2       0           SEASON  1268.0    0.0         0   
1 2021-08-13        M-1       0           SEASON  4135.0  112.0         2   
2 2021-08-15        M+1       0           SEASON  6379.0   86.0         0   
3 2021-08-17        M-4       0           SEASON  6287.0  209.0         9   
4 2021-08-18        M-3       0           SEASON  6543.0  166.0         9   

    ACC   DEC  PlayerID  ...   TD_MSWR  HSR_ACWR  HSR_MSWR  +25 Km/h_ACWR  \
0  11.0   5.0     23085  ...  5.217107  1.000000  1.923701       1.000000   
1  47.0  20.0     23085  ...  1.508831  1.000000  0.974045       1.000000   
2  49.0  48.0     23085  ...  1.207543  1.000000  0.832277       1.000000   
3  62.0  44.0     23085  ...  1.140383  1.000000  0.757251       1.000000   
4  39.0  35.0     23085  ...  1.140354  0.924272  0.762941       0.545055   

   +25 Km/h_MSWR  ACC_ACWR  ACC_MSWR  DEC_ACWR   DEC_MSWR  Injury_7_day  


Chequear desbalanceo de clases

In [4]:
print("No Lesión: " + str(df[df["Injury_7_day"] == 0].shape[0]))
print("Lesión: " + str(df[df["Injury_7_day"] == 1].shape[0]))
print(f"Porcentaje de lesionados en el dataset: {((df[df['Injury_7_day'] == 1].shape[0] / df.shape[0]) * 100):.2f}%")

No Lesión: 6297
Lesión: 253
Porcentaje de lesionados en el dataset: 3.86%


## Initialize metrics

In [None]:
metrics = ['ACC', 'HSR', 'TD', '+25 Km/h', 'Sprints',
       'Mins']

metrics_den = ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins']

metrics_rel = ['TD_Rel',
       'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel']

metrics_den_3 = ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3']

metrics_den_7 = ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7']

metrics_den_rel_3 = ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD_Rel',
       'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3']

metrics_den_rel_7 = ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD_Rel',
       'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7']

metrics_3 = ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3']

metrics_7 = ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7']

metrics_rel_3 = ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD_Rel',
       'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3']

metrics_rel_7 = ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD_Rel',
       'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7']

In [None]:
metrics_sets = {
    "metrics_den_3": ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3'],
    "metrics_den_7": ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7'],
    "metrics_den_rel_3": ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD_Rel', 'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3'],
    "metrics_den_rel_7": ['TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'Sprints/Mins', '% Max Speed', 'TD_Rel', 'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7'],
    "metrics_3": ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3'],
    "metrics_7": ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7'],
    "metrics_rel_3": ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD_Rel', 'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'Sprints-3'],
    "metrics_rel_7": ['TD', 'HSR', '+25 Km/h', 'ACC', 'Sprints', '% Max Speed', 'TD_Rel', 'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'Sprints_Rel','TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'Sprints-7']
}

metrics_standarize = ['TD', 'HSR', '+25 Km/h', 'ACC', 'DEC', 'Max Speed', 'Sprints', 'Max Speed Season',
       'Avg Speed Season', '% Max Speed', 'Speed Diff Max Avg', 'TD_Rel',
       'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel', 'DEC_Rel', 'Sprints_Rel',
       'TD/Mins', 'HSR/Mins', '+25 Km/h/Mins', 'ACC/Mins', 'DEC/Mins',
       'Sprints/Mins', 'TD_max', 'TD_avg', 'HSR_max', 'HSR_avg',
       '+25 Km/h_max', '+25 Km/h_avg', 'ACC_max', 'ACC_avg', 'DEC_max',
       'DEC_avg', 'Sprints_max', 'Sprints_avg', 'TD/Mins_max', 'TD/Mins_avg',
       'HSR/Mins_max', 'HSR/Mins_avg', '+25 Km/h/Mins_max',
       '+25 Km/h/Mins_avg', 'ACC/Mins_max', 'ACC/Mins_avg', 'Sprints/Mins_max',
       'Sprints/Mins_avg', 'TD-3', 'HSR-3', '+25 Km/h-3', 'ACC-3', 'DEC-3',
       'Sprints-3', 'TD-7', 'HSR-7', '+25 Km/h-7', 'ACC-7', 'DEC-7',
       'Sprints-7', 'TD-21', 'HSR-21', '+25 Km/h-21', 'ACC-21', 'DEC-21',
       'Sprints-21']

# Correlation of variables

In [88]:
# Calcular la matriz de correlación
correlation_matrix = df[metrics_den_rel_7].corr()

# Mostrar la matriz de correlación
plt.figure(figsize=(18, 9))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Matriz de Correlación")
plt.show()


NameError: name 'metrics_den_rel_7' is not defined

In [89]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

df_with_constant = add_constant(df[metrics_den_rel_3])

# Calcular el VIF para cada columna
vif_data = pd.DataFrame()
vif_data["Variable"] = df_with_constant.columns
vif_data["VIF"] = [variance_inflation_factor(df_with_constant.values, i)
                   for i in range(df_with_constant.shape[1])]

# Mostrar el VIF
print(vif_data)

NameError: name 'metrics_den_rel_3' is not defined

# ML Modelos

## Separate data into X, y and player_ids

In [30]:
metrics = ['TD', 'HSR',
       '+25 Km/h', 'ACC', 'DEC', 'Max Speed', 'Sprints', 'Mins','% Max Speed',
       'Speed Diff Max Avg', 'TD_Rel', 'HSR_Rel', '+25 Km/h_Rel', 'ACC_Rel',
       'DEC_Rel', 'TD-7', 'TD_EWMA-7', 'HSR-7', 'HSR_EWMA-7', '+25 Km/h-7',
       '+25 Km/h_EWMA-7', 'ACC-7', 'ACC_EWMA-7', 'DEC-7', 'DEC_EWMA-7',
       'Sprints-7', 'TD_ACWR', 'TD_MSWR', 'HSR_ACWR', 'HSR_MSWR',
       '+25 Km/h_ACWR', '+25 Km/h_MSWR', 'ACC_ACWR', 'DEC_ACWR']

print(len(metrics))

34


In [31]:
player_ids = df['PlayerID']
X = df[metrics]
y = df['Injury_7_day']

## Feature Engineering

**Feature Selection**

Removes features with low variance

In [7]:
# Initialize VarianceThreshold with a threshold
vt = VarianceThreshold(threshold=0.01)  # Removes features with variance < 0.01

# Apply VarianceThreshold to the entire dataset (X)
X_reduced = vt.fit_transform(X)  # Retains only high-variance features

# Get selected feature names (if X is a DataFrame)
removed_features = X.columns[~vt.get_support()]
print("Removed Features:", removed_features)

Removed Features: Index([], dtype='object')


Based on the response , there is no columns that need to be removed

## SMOTE Technique

In [32]:
# Split the data into training and test sets (test set remains untouched)
X_train, X_test, y_train, y_test, player_ids_train, player_ids_test = train_test_split(X, y, player_ids, test_size=0.2, random_state=42, stratify=y)

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data
X_train , y_train = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:")
print(pd.Series(y_train).value_counts())
print(pd.Series(y_train).value_counts())

Class distribution after SMOTE:
Injury_7_day
0    5038
1    5038
Name: count, dtype: int64
Injury_7_day
0    5038
1    5038
Name: count, dtype: int64


## XGBoost Model

In [None]:
# Base parameters for XGBClassifier
base_params = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.9,
    'eval_metric': 'error',
    'alpha': 5,
    'gamma': 5
}

# Initialize the XGBClassifier
model = XGBClassifier(**base_params)

# Create RFECV
rfecv = RFECV(
    estimator=model,
    step=1,  # Remove one feature at a time
    scoring='roc_auc',  # Use ROC-AUC to evaluate features
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1
)

# Create a pipeline with RFECV and the XGBoost model
pipeline = Pipeline([
    ('feature_selection', rfecv),
    ('model', model)
])

# Parameter grid for GridSearchCV
param_grid = {
    'model__learning_rate': [0.07, 0.09, 0.1],
    'model__max_depth': [3, 4, 5],
    'model__n_estimators': [50, 100, 200],
    'model__scale_pos_weight': [8, 9, 10]
}

# GridSearchCV for hyperparameter tuning with RFECV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV on the full training dataset
grid_search.fit(X_train, y_train)

# Best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_roc_auc = grid_search.best_score_

# Print best results
print(f"Best ROC-AUC (Grid Search with RFECV): {best_roc_auc:.4f}")
print(f"Best Parameters: {best_params}")

# Transform X_train and X_test using the fitted RFECV
selected_features = X_train.columns[rfecv.support_]
X_train_selected = pd.DataFrame(rfecv.transform(X_train), columns=selected_features)
X_test_selected = pd.DataFrame(rfecv.transform(X_test), columns=selected_features)

# Predict probabilities on the test set
y_pred_probs = best_model.named_steps['model'].predict_proba(X_test_selected)[:, 1]
print(f"Test ROC-AUC: {roc_auc_score(y_test, y_pred_probs):.4f}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting estimator with 34 features.Fitting estimator with 34 features.

Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 31 features.
Fi

Perform Recursive Feature Elimination with Cross-Validation (RFECV) to find the best variables

Use GridSearch to find the best hyperparameters

In [21]:
# Base parameters for XGBClassifier
base_params = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.9,
    'eval_metric': 'error',
    'alpha': 5,
    'gamma': 5
}

# Initialize the XGBClassifier
model = XGBClassifier(**base_params)

# Perform RFECV for feature selection
rfecv = RFECV(
    estimator=model,
    step=1,  # Remove one feature at a time
    scoring='roc_auc',  # Use ROC-AUC to evaluate features
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1
)

# Fit RFECV to X_train
rfecv.fit(X_train, y_train)

# Get selected feature names
selected_features = X_train.columns[rfecv.support_]
print(f"Selected Features (RFECV): {selected_features.tolist()}")
print(f"Optimal number of features: {rfecv.n_features_}")

# Transform X_train and X_test to include only selected features
X_train_selected = pd.DataFrame(rfecv.transform(X_train), columns=selected_features)
X_test_selected = pd.DataFrame(rfecv.transform(X_test), columns=selected_features)

# Verify column consistency
assert list(X_train_selected.columns) == list(X_test_selected.columns), "Feature mismatch between training and testing sets."

# Parameter grid for GridSearchCV
param_grid = {
    'learning_rate': [0.07, 0.09, 0.1],
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'scale_pos_weight': [8, 9, 10]
}

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',  # Optimize for ROC-AUC
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV with selected features
grid_search.fit(X_train_selected, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_roc_auc = grid_search.best_score_

# Print best results
print(f"Best ROC-AUC (Grid Search): {best_roc_auc:.4f}")
print(f"Best Parameters: {best_params}")

Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 featu

## Transform test data

In [22]:
X_test = X_test_selected.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
player_ids_test = player_ids_test.reset_index(drop=True)

In [27]:
# Evaluate the best model on the test set
y_probs = best_model.predict_proba(X_test_selected)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_probs)

print(f"Test ROC-AUC: {test_roc_auc:.4f}")

# Convert the probabilities of injury risk (Class 1) to percentages
injury_risk_percentage = y_probs * 100

# Categorize the injury risk percentages
risk_levels = [categorize_risk(percentage) for percentage in injury_risk_percentage]

# Create the `results` DataFrame with only relevant metrics and calculated values
results = pd.DataFrame({
    'Player_ID': player_ids_test,  # Add player IDs
    'Actual_Label': y_test,  # Add actual labels
    '%': injury_risk_percentage,
    'Risk_Level': risk_levels
}).join(X_test)  # Add only relevant max metrics



Test ROC-AUC: 0.6692


In [26]:
def categorize_risk(percentage):
    if percentage < 40:
        return 'Low Risk'
    elif 40 <= percentage < 70:
        return 'Medium Risk'
    else:
        return 'High Risk'

# Define risk level colors
color_mapping = {
    'High Risk': 'FF9999',  # Light Red
    'Medium Risk': 'FFFF99',  # Light Yellow
    'Low Risk': '99FF99'  # Light Green
}

# file_path_export = '/content/drive/MyDrive/WATFORD FC/Datos GPS/results.xlsx'
file_path_export = 'results.xlsx'

wb = Workbook()

# Write `results` to the first sheet
ws = wb.active
ws.title = "Injury Risk Results"
for r_idx, row in enumerate(dataframe_to_rows(results, index=True, header=True), start=1):
    for c_idx, value in enumerate(row, start=1):
        ws.cell(row=r_idx, column=c_idx, value=value)

# Conditional formatting for `results`
risk_level_col_idx = list(results.columns).index("Risk_Level") + 2
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=risk_level_col_idx, max_col=risk_level_col_idx):
    for cell in row:
        risk_level = cell.value
        if risk_level in color_mapping:
            cell.fill = PatternFill(start_color=color_mapping[risk_level], end_color=color_mapping[risk_level], fill_type="solid")

# Save the workbook
wb.save(file_path_export)