In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# ✅ Load dataset
data = pd.read_excel('/content/your_file_converted copy.xlsx')

# Drop irrelevant columns
irrelevant_columns = ['Name', 'Date', 'Attendance', 'Profile', 'week', 'CMJ_Flag', 'CMJ_Flag_check', 'DayOfWeek']
data = data.drop(columns=irrelevant_columns, errors='ignore')

# ✅ Separate features (X) and target (y)
X = data.drop(columns=['TL'], errors='ignore')
y = data['TL']

# ✅ Save original feature names before encoding
original_features = X.columns.tolist()
joblib.dump(original_features, 'original_features.pkl')
print(f"Original Features: {original_features}")

# ✅ Encode categorical variables
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    X[col] = label_encoders[col].fit_transform(X[col])

# ✅ Save label encoders
joblib.dump(label_encoders, 'label_encoders.pkl')

# ✅ Save feature order (to match during prediction)
feature_order = X.columns.tolist()
joblib.dump(feature_order, 'feature_order.pkl')
print(f"Feature Order after Encoding: {feature_order}")

# ✅ Standardize numerical features
scaler_X = StandardScaler()
X_standardized = scaler_X.fit_transform(X.select_dtypes(include=['float64', 'int64']))

# ✅ Save feature scaler
joblib.dump(scaler_X, 'scaler_X.pkl')

# ✅ Scale the target variable (y) to [0, 1]
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# ✅ Save target scaler
joblib.dump(scaler_y, 'scaler_y.pkl')

# ✅ Perform PCA
pca = PCA()
X_pca = pca.fit_transform(X_standardized)

# ✅ Save PCA model
joblib.dump(pca, 'pca_model.pkl')

# ✅ Choose number of components based on 95% variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1

# Apply PCA with chosen components
pca = PCA(n_components=n_components)
X_pca_reduced = pca.fit_transform(X_standardized)
joblib.dump(pca, 'updated_pca_model.pkl')

# ✅ Name PCA components based on original features
pca_feature_names = [f'PC{i+1}' for i in range(n_components)]
pca_components_df = pd.DataFrame(pca.components_, columns=feature_order, index=pca_feature_names)
print("PCA Component Contributions:")
print(pca_components_df)

# ✅ Save PCA feature importance
joblib.dump(pca_components_df, 'pca_feature_importance.pkl')
print(f"Number of PCA Components: {n_components}")

# ✅ Split data into train & test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca_reduced, y_scaled, test_size=0.2, random_state=42)

# ✅ Hyperparameter tuning for RandomForest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# ✅ Best model from hyperparameter tuning
best_rf_model = grid_search.best_estimator_

# ✅ Save trained model
joblib.dump(best_rf_model, 'random_forest_model.pkl')

# ✅ Make predictions & evaluate
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# ✅ Feature importance from Random Forest
feature_importance_rf = best_rf_model.feature_importances_
joblib.dump(feature_importance_rf, 'feature_importance_rf.pkl')
print(f"Feature Importance from Random Forest: {feature_importance_rf}")

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")



Original Features: ['Micro-cycle', 'Position', 'MatchDay', 'Sleep', 'Stress', 'Fatigue', 'Pain', 'Wellness', 'RPE', 'Duration', 'Acute', 'Chronic', 'ACWR', 'Total Duration', 'TD/min', 'Dist MP 20-35W', 'Dist MP 35-55W', 'Dist MP>55 W', 'Distance 14,4-19,8 km/h / min', 'Distance 19,8-25 km/h / min', 'Distance > 25 km/h / min', 'Dist Acc>3 / min', 'Dist Dec <-3 / min', 'Dist Acc 2-3 / min', 'Dist Dec 2-3 / min', 'Dist MP 20-35W / min', 'Dist MP 35-55W / min', 'Dist MP>55 W / min']
Feature Order after Encoding: ['Micro-cycle', 'Position', 'MatchDay', 'Sleep', 'Stress', 'Fatigue', 'Pain', 'Wellness', 'RPE', 'Duration', 'Acute', 'Chronic', 'ACWR', 'Total Duration', 'TD/min', 'Dist MP 20-35W', 'Dist MP 35-55W', 'Dist MP>55 W', 'Distance 14,4-19,8 km/h / min', 'Distance 19,8-25 km/h / min', 'Distance > 25 km/h / min', 'Dist Acc>3 / min', 'Dist Dec <-3 / min', 'Dist Acc 2-3 / min', 'Dist Dec 2-3 / min', 'Dist MP 20-35W / min', 'Dist MP 35-55W / min', 'Dist MP>55 W / min']
PCA Component Contrib

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# ✅ Load dataset
data = pd.read_excel('/content/your_file_converted copy.xlsx')

# Drop irrelevant columns
irrelevant_columns = ['Name', 'Date', 'Attendance', 'Profile', 'week', 'CMJ_Flag', 'CMJ_Flag_check', 'DayOfWeek']
data = data.drop(columns=irrelevant_columns, errors='ignore')

# ✅ Separate features (X) and target (y)
X = data.drop(columns=['TL'], errors='ignore')
y = data['TL']

# ✅ Save original feature names before encoding
original_features = X.columns.tolist()
joblib.dump(original_features, 'original_features.pkl')

# ✅ Encode categorical variables
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    X[col] = label_encoders[col].fit_transform(X[col])
joblib.dump(label_encoders, 'label_encoders.pkl')

# ✅ Save feature order
feature_order = X.columns.tolist()
joblib.dump(feature_order, 'feature_order.pkl')

# ✅ Standardize numerical features
scaler_X = StandardScaler()
X_standardized = scaler_X.fit_transform(X.select_dtypes(include=['float64', 'int64']))
joblib.dump(scaler_X, 'scaler_X.pkl')

# ✅ Scale the target variable (y) to [0, 1]
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
joblib.dump(scaler_y, 'scaler_y.pkl')

# ✅ Perform PCA
pca = PCA()
X_pca = pca.fit_transform(X_standardized)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1

pca = PCA(n_components=n_components)
X_pca_reduced = pca.fit_transform(X_standardized)
joblib.dump(pca, 'updated_pca_model.pkl')

# ✅ Split data
X_train, X_test, y_train, y_test = train_test_split(X_pca_reduced, y_scaled, test_size=0.2, random_state=42)

# ✅ Define models and hyperparameters
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "SVR": SVR()
}

param_grids = {
    "RandomForest": {'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5]},
    "GradientBoosting": {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]},
    "SVR": {'C': [0.1, 1], 'kernel': ['rbf', 'linear']}
}

best_models = {}
results = {}

for name, model in models.items():
    if name in param_grids:
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='r2', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_models[name] = best_model
        joblib.dump(best_model, f'{name}_best_model.pkl')
        print(f"Best parameters for {name}: {grid_search.best_params_}")
    else:
        model.fit(X_train, y_train)
        best_models[name] = model
        joblib.dump(model, f'{name}_model.pkl')

    y_train_pred = best_models[name].predict(X_train)
    y_test_pred = best_models[name].predict(X_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    results[name] = {
        "Train MSE": train_mse,
        "Test MSE": test_mse,
        "Train R2": train_r2,
        "Test R2": test_r2
    }

    print(f"{name} - Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}")
    print(f"{name} - Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}\n")

print("Final Results:", results)


Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
RandomForest - Train MSE: 0.0007, Test MSE: 0.0047
RandomForest - Train R2: 0.9844, Test R2: 0.8987

Best parameters for GradientBoosting: {'learning_rate': 0.1, 'n_estimators': 100}
GradientBoosting - Train MSE: 0.0016, Test MSE: 0.0049
GradientBoosting - Train R2: 0.9659, Test R2: 0.8936

LinearRegression - Train MSE: 0.0093, Test MSE: 0.0073
LinearRegression - Train R2: 0.7978, Test R2: 0.8407

Best parameters for SVR: {'C': 1, 'kernel': 'rbf'}
SVR - Train MSE: 0.0036, Test MSE: 0.0042
SVR - Train R2: 0.9213, Test R2: 0.9094

Final Results: {'RandomForest': {'Train MSE': 0.0007162550760688895, 'Test MSE': 0.004650853086504874, 'Train R2': 0.9844437050942448, 'Test R2': 0.8987012418352572}, 'GradientBoosting': {'Train MSE': 0.0015716736318877027, 'Test MSE': 0.004883659597152485, 'Train R2': 0.9658649281099226, 'Test R2': 0.8936305569560252}, 'LinearRegression': {'Train MSE': 0.00930769

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# ✅ Load dataset
data = pd.read_excel('/content/your_file_converted copy.xlsx')

# Drop irrelevant columns
irrelevant_columns = ['Name', 'Date', 'Attendance', 'Profile', 'week', 'CMJ_Flag', 'CMJ_Flag_check', 'DayOfWeek']
data = data.drop(columns=irrelevant_columns, errors='ignore')

# ✅ Separate features (X) and target (y)
X = data.drop(columns=['TL'], errors='ignore')
y = data['TL']

# ✅ Save original feature names before encoding
original_features = X.columns.tolist()
joblib.dump(original_features, 'original_features.pkl')

# ✅ Encode categorical variables
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    X[col] = label_encoders[col].fit_transform(X[col])
joblib.dump(label_encoders, 'label_encoders.pkl')

# ✅ Save feature order
feature_order = X.columns.tolist()
joblib.dump(feature_order, 'feature_order.pkl')

# ✅ Standardize numerical features
scaler_X = StandardScaler()
X_standardized = scaler_X.fit_transform(X.select_dtypes(include=['float64', 'int64']))
joblib.dump(scaler_X, 'scaler_X.pkl')

# ✅ Scale the target variable (y) to [0, 1]
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
joblib.dump(scaler_y, 'scaler_y.pkl')

# ✅ Perform PCA
pca = PCA()
X_pca = pca.fit_transform(X_standardized)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1

pca = PCA(n_components=n_components)
X_pca_reduced = pca.fit_transform(X_standardized)
joblib.dump(pca, 'updated_pca_model.pkl')

# ✅ Split data
X_train, X_test, y_train, y_test = train_test_split(X_pca_reduced, y_scaled, test_size=0.2, random_state=42)

# ✅ Define models and hyperparameters
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "SVR": SVR()
}

param_grids = {
    "RandomForest": {'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5]},
    "GradientBoosting": {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]},
    "SVR": {'C': [0.1, 1], 'kernel': ['rbf', 'linear']}
}

best_models = {}
results = {}

for name, model in models.items():
    if name in param_grids:
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='r2', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_models[name] = best_model
        joblib.dump(best_model, f'{name}_best_model.pkl')
        print(f"Best parameters for {name}: {grid_search.best_params_}")
    else:
        model.fit(X_train, y_train)
        best_models[name] = model
        joblib.dump(model, f'{name}_model.pkl')

    y_train_pred = best_models[name].predict(X_train)
    y_test_pred = best_models[name].predict(X_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    results[name] = {
        "Train MSE": train_mse,
        "Test MSE": test_mse,
        "Train R2": train_r2,
        "Test R2": test_r2
    }

    print(f"{name} - Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}")
    print(f"{name} - Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}\n")

# ✅ Save the best SVR model explicitly
joblib.dump(best_models["SVR"], "SVR_best_model.pkl")

print("Final Results:", results)



Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
RandomForest - Train MSE: 0.0007, Test MSE: 0.0047
RandomForest - Train R2: 0.9844, Test R2: 0.8987

Best parameters for GradientBoosting: {'learning_rate': 0.1, 'n_estimators': 100}
GradientBoosting - Train MSE: 0.0016, Test MSE: 0.0049
GradientBoosting - Train R2: 0.9659, Test R2: 0.8936

LinearRegression - Train MSE: 0.0093, Test MSE: 0.0073
LinearRegression - Train R2: 0.7978, Test R2: 0.8407

Best parameters for SVR: {'C': 1, 'kernel': 'rbf'}
SVR - Train MSE: 0.0036, Test MSE: 0.0042
SVR - Train R2: 0.9213, Test R2: 0.9094

Final Results: {'RandomForest': {'Train MSE': 0.0007162550760688895, 'Test MSE': 0.004650853086504874, 'Train R2': 0.9844437050942448, 'Test R2': 0.8987012418352572}, 'GradientBoosting': {'Train MSE': 0.0015716736318877027, 'Test MSE': 0.004883659597152485, 'Train R2': 0.9658649281099226, 'Test R2': 0.8936305569560252}, 'LinearRegression': {'Train MSE': 0.00930769