In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the study dataset from CSV; the path is relative to the notebook's working directory.
dataframe_study = pd.read_csv("../datasets/final_df.csv")

In [None]:
# Preview the first rows to sanity-check what was loaded.
dataframe_study.head()

In [None]:
# Remove the two "Unnamed" columns (presumably index columns written by earlier
# CSV exports — TODO confirm). Reassigning instead of mutating in place.
leftover_index_columns = ["Unnamed: 0", "Unnamed: 0_x"]
dataframe_study = dataframe_study.drop(columns=leftover_index_columns)

In [None]:
# List the remaining column labels to decide which ones to keep.
dataframe_study.columns

In [None]:
# Columns that are not used further on: judging by their labels these are the
# per-90 statistics, merge bookkeeping columns and player-name variants.
columns_to_discard = [
    "season_x",
    'Gls90', 'Ast90', 'G+A90', 'G-PK90', 'G+A-PK90',
    'xG90', 'xAG90', 'xG+xAG90', 'npxG90', 'npxG+xAG90',
    '_merge', 'Unnamed: 0_y',
    'short_name', 'long_name', "matched_short_name",
]
dataframe_study = dataframe_study.drop(columns=columns_to_discard)

In [None]:
# Build the working frame without the injury period / missed-matches columns;
# dataframe_study itself is left untouched.
injury_period_columns = ["from", "until", "missed_matches"]
dataframe = dataframe_study.drop(columns=injury_period_columns)

In [None]:
# Derive the numeric target from the first run of digits found in "days_out".
# expand=False yields a Series (the default returns a one-column DataFrame).
days_out_digits = dataframe["days_out"].str.extract(r"(\d+)", expand=False)

# pd.to_numeric(errors="coerce") leaves NaN where no digits were found instead of
# raising like .astype(int) does; those rows are removed by the dropna() that follows.
# When every row has digits the result is int64, exactly as before.
dataframe = (
    dataframe
    .assign(target=pd.to_numeric(days_out_digits, errors="coerce"))
    .drop(columns=["days_out"])
)

In [None]:
# Keep only rows without any missing value; `dataframe` itself is not modified.
df_clean = dataframe.dropna()

In [None]:
# Summary statistics of the cleaned frame (describe() with its default settings).
df_clean.describe()

In [None]:
# How often each target value occurs in the cleaned data.
df_clean["target"].value_counts()

In [None]:
# Keep only rows whose injury label appears more than 5 times in df_clean.
# NOTE(review): the modelling cells further down use df_clean, not df_filtrado,
# so this filter only affects the plots/column lists built from it — confirm intent.
freqs = df_clean["injury"].value_counts()
valid_cats = freqs.loc[freqs > 5].index
is_frequent_injury = df_clean["injury"].isin(valid_cats)
df_filtrado = df_clean[is_frequent_injury]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Distribution of the target on the frequency-filtered frame, drawn on explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df_filtrado["target"], kde=True, color="skyblue", bins=20, ax=ax)
ax.set_title("Histograma de target", fontsize=12)
ax.set_xlabel("target")
ax.set_ylabel("Frecuencia")

In [None]:
# Split column names by dtype family: numeric vs. categorical-like.
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object", "bool", "category"]

num_cols = list(df_filtrado.select_dtypes(include=numeric_dtypes).columns)
cat_cols = list(df_filtrado.select_dtypes(include=categorical_dtypes).columns)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# One histogram (with KDE overlay) per numeric column of df_clean, laid out on a grid.
n = len(num_cols)
ncols = 6
# Ceiling division: `n // ncols + 1` reserved a blank row whenever n was an exact
# multiple of ncols. max(1, ...) keeps the figure valid when there are no columns.
nrows = max(1, -(-n // ncols))

fig, axes = plt.subplots(nrows, ncols, figsize=(30, 5 * nrows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, num_cols):
    sns.histplot(df_clean[col], kde=True, color="skyblue", bins=20, ax=ax)
    ax.set_title(f"Histograma de {col}", fontsize=12)
    ax.set_xlabel(col)
    ax.set_ylabel("Frecuencia")

# Grid cells without a column stay invisible rather than showing empty axes.
for ax in flat_axes[n:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
# Columns that get a log1p transform before scaling.
right_skew_columns = [
    'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt',
    'CrdY', 'CrdR',
    'xG', 'npxG', 'xAG', 'npxG+xAG',
    'PrgC', 'PrgP', 'PrgR',
]

# Columns that are only standard-scaled.
standard_columns = [
    'age', 'born', 'MP', 'Starts', 'Min', '90s',
    'height_cm', 'weight_kg',
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    'movement_acceleration', 'movement_sprint_speed',
    'movement_reactions', 'movement_balance',
    'power_stamina', 'power_strength',
]

In [None]:
import numpy as np

In [None]:
# Log-transform the right-skewed columns, then standardise all numeric features.
# NOTE: this cell is not idempotent — re-running it applies log1p/scaling again.
numeric_feature_columns = standard_columns + right_skew_columns

# The scaler must only learn from the training seasons; fitting it on every row
# leaked the hold-out season's mean/variance into the features.
# Must match the season used for the train/test split further down.
holdout_season = '24/25'
train_mask = df_clean['season'] != holdout_season

# Work on an explicit copy so the column assignments never target a view of `dataframe`.
df_clean = df_clean.copy()
df_clean[right_skew_columns] = np.log1p(df_clean[right_skew_columns])

scaler = StandardScaler()
scaler.fit(df_clean.loc[train_mask, numeric_feature_columns])
df_clean[numeric_feature_columns] = scaler.transform(df_clean[numeric_feature_columns])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Redraw the per-column histograms of df_clean (now log-transformed and scaled).
n = len(num_cols)
ncols = 6
# Ceiling division avoids the blank extra row that `n // ncols + 1` produced when
# n was an exact multiple of ncols; at least one row keeps the figure valid.
nrows = max(1, (n + ncols - 1) // ncols)

fig, axes = plt.subplots(nrows, ncols, figsize=(30, 5 * nrows), squeeze=False)
grid_axes = list(axes.flat)

for position, col in enumerate(num_cols):
    ax = grid_axes[position]
    sns.histplot(df_clean[col], kde=True, color="skyblue", bins=20, ax=ax)
    ax.set_title(f"Histograma de {col}", fontsize=12)
    ax.set_xlabel(col)
    ax.set_ylabel("Frecuencia")

# Hide the trailing grid cells that have nothing to display.
for unused_ax in grid_axes[n:]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
# Remove the extra season columns; the 'season' column is kept.
redundant_season_columns = ['last_season', 'season_y']
df_clean = df_clean.drop(columns=redundant_season_columns)

In [None]:
# Remove 'player_positions' from the modelling frame (reassign rather than mutate).
df_clean = df_clean.drop(columns=['player_positions'])

In [None]:
# Recompute the categorical-like columns now that df_clean has fewer columns, and show them.
categorical_dtypes = ["object", "bool", "category"]
cat_cols = list(df_clean.select_dtypes(include=categorical_dtypes).columns)
cat_cols

In [None]:
# Columns routed to the ordinal encoder.
ordinal_vars = [
    'player',
    'season',
    'nation',
    'injury',
]
# Columns routed to the one-hot encoder.
onehot_vars = [
    'pos',
    'work_rate',
    'body_type',
]
# Name of the column the models predict.
target = 'target'

In [None]:
# Temporal split: season '24/25' is the test set, every other season is training data.
is_holdout_season = df_clean['season'].eq('24/25')
df_test = df_clean.loc[is_holdout_season]
df_train = df_clean.loc[~is_holdout_season]

In [None]:
# Separate the feature columns from the target column for each split.
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_test, y_test = df_test.drop(columns=[target]), df_test[target]

In [None]:
# Numeric features were already log/standard-scaled earlier in the notebook, so they
# only need to be forwarded. Without an explicit entry ColumnTransformer's default
# remainder='drop' silently discarded all of them and the models saw only the
# encoded categorical columns.
numeric_vars = standard_columns + right_skew_columns

preprocessor = ColumnTransformer(
    transformers=[
        # Categories unseen at fit time are encoded as -1 instead of raising.
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_vars),
        # Categories unseen at fit time become an all-zero one-hot row.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), onehot_vars),
        ('num', 'passthrough', numeric_vars),
    ],
    remainder='drop',
)

In [None]:
# Learn the encodings from the training split and encode it.
X_train_p = preprocessor.fit(X_train, y_train).transform(X_train)

In [None]:
# Encode the test split with the encoders already fitted on the training split.
# Re-fitting here (fit_transform) rebuilt the category mappings from test data, so
# ordinal codes and one-hot columns no longer matched what the models were trained on.
X_test_p = preprocessor.transform(X_test)

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the entries
        whose true value is non-zero; ``nan`` when there is no such entry.

    Notes
    -----
    The percentage error is undefined for a zero true value; those entries are
    skipped so that a single zero does not turn the whole metric into ``inf``.
    Inputs are converted to NumPy arrays, so comparison is positional and any
    pandas index is ignored.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    residuals = actual - predicted
    # Give the denominator the same (broadcast) shape as the residuals so one mask fits both.
    denominators = np.broadcast_to(actual, residuals.shape)
    usable = denominators != 0

    if not usable.any():
        return np.nan
    return np.mean(np.abs(residuals[usable] / denominators[usable])) * 100

In [None]:
# Baseline regressor: always predicts the mean of the training target.
dummy = DummyRegressor(strategy="mean")
dummy.fit(X=X_train_p, y=y_train)

### Dummy

In [None]:
# Baseline error on the training split.
y_pred_train = dummy.predict(X_train_p)

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train, mape_train = sqrt(mse_train), mape(y_train, y_pred_train)

print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAPE: {mape_train}")

In [None]:
# Baseline error on the test split.
y_pred_test = dummy.predict(X_test_p)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test, mape_test = sqrt(mse_test), mape(y_test, y_pred_test)

print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAPE: {mape_test}")

In [None]:
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression

### SVR

In [None]:
# Support vector regressor with default hyper-parameters; fit() returns the estimator.
svr = SVR().fit(X_train_p, y_train)
y_pred_train = svr.predict(X_train_p)

# Training-split errors.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train, mape_train = sqrt(mse_train), mape(y_train, y_pred_train)

print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAPE: {mape_train}")

In [None]:
# SVR errors on the test split.
y_pred_test = svr.predict(X_test_p)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test, mape_test = sqrt(mse_test), mape(y_test, y_pred_test)

print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAPE: {mape_test}")

### MLP

In [None]:
# Multi-layer perceptron with two hidden layers (100 and 50 units), seeded for repeatability.
mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    max_iter=1000,
    random_state=42,
).fit(X_train_p, y_train)
y_pred_train = mlp.predict(X_train_p)

# Training-split errors.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train, mape_train = sqrt(mse_train), mape(y_train, y_pred_train)

print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAPE: {mape_train}")

In [None]:
# MLP errors on the test split.
y_pred_test = mlp.predict(X_test_p)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test, mape_test = sqrt(mse_test), mape(y_test, y_pred_test)

print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAPE: {mape_test}")

### GB

In [None]:
# Gradient boosting with default hyper-parameters and a fixed seed.
gb = GradientBoostingRegressor(random_state=42).fit(X_train_p, y_train)
y_pred_train = gb.predict(X_train_p)

# Training-split errors.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train, mape_train = sqrt(mse_train), mape(y_train, y_pred_train)

print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAPE: {mape_train}")

In [None]:
# Gradient boosting errors on the test split.
y_pred_test = gb.predict(X_test_p)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test, mape_test = sqrt(mse_test), mape(y_test, y_pred_test)

print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAPE: {mape_test}")

### Linear regression

In [None]:
# Ordinary least-squares linear regression; fit() returns the estimator.
lr = LinearRegression().fit(X_train_p, y_train)
y_pred_train = lr.predict(X_train_p)

# Training-split errors.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train, mape_train = sqrt(mse_train), mape(y_train, y_pred_train)

print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAPE: {mape_train}")

In [None]:
# Linear regression errors on the test split.
y_pred_test = lr.predict(X_test_p)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test, mape_test = sqrt(mse_test), mape(y_test, y_pred_test)

print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAPE: {mape_test}")

### Random Forest

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
rf = RandomForestRegressor(random_state=42).fit(X_train_p, y_train)
y_pred_train = rf.predict(X_train_p)

# Training-split errors.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train, mape_train = sqrt(mse_train), mape(y_train, y_pred_train)

print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"MAPE: {mape_train}")

In [None]:
# Random forest errors on the test split.
y_pred_test = rf.predict(X_test_p)

mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test, mape_test = sqrt(mse_test), mape(y_test, y_pred_test)

print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"MAPE: {mape_test}")