# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

In [2]:
players = pd.read_csv("../data/players_fe.csv")

# Train-Test Split

In [3]:
X = players.drop(columns=["log_market_value"])
y = players["log_market_value"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# 1️⃣ Pipelines

## Preprocessing: Impute + Scale (num), Impute + OneHot (cat)

In [5]:
cat_cols = X_train.select_dtypes(include=["object","category"]).columns.tolist()
num_cols = X_train.select_dtypes(include=[np.number,"float","int"]).columns.tolist()

In [6]:
numeric_pre = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

In [7]:
categorical_pre = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pre, num_cols),
        ("cat", categorical_pre, cat_cols),
    ],
    remainder="drop"
)


## Drei Modell-Pipelines
- einfach: Ridge (linear, regul.)
- medium: RandomForest
- stark: HistGradientBoosting

In [9]:
pipe_ridge = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", Ridge(random_state=42))
])

In [10]:
pipe_rf = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1))
])

In [11]:
pipe_hgb = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", HistGradientBoostingRegressor(random_state=42))
])

# 2️⃣ Hyperparameter-Tuning

In [None]:
grid_ridge = {
    "model__alpha": [0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0],
}

grid_rf = {
    "model__n_estimators": [300, 400],
    "model__max_depth": [15, 25],
    "model__min_samples_leaf": [1, 3, 5],
    "model__max_features": ["sqrt", 0.8],
}

grid_hgb = {
    "model__learning_rate": [0.03, 0.1],
    "model__max_leaf_nodes": [31, 63, 127],
    "model__max_depth": [None, 10],
    "model__l2_regularization": [0.0, 0.1],
    "model__max_iter": [300],
}

In [13]:
def run_grid(name, pipe, grid):
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring="neg_mean_absolute_error",
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    gs.fit(X_train, y_train)
    # Test-Evaluation
    y_pred = gs.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n{name} — Best params: {gs.best_params_}")
    print(f"{name} — CV best neg-MAE: {gs.best_score_:.4f}")
    print(f"{name} — Test MAE:  {mae:.4f}")
    print(f"{name} — Test RMSE: {rmse:.4f}")
    print(f"{name} — Test R²:   {r2:.4f}")
    return {"name": name, "grid": gs, "mae": mae, "rmse": rmse, "r2": r2}

In [None]:
results = []
results.append(run_grid("Ridge (simple)", pipe_ridge, grid_ridge))
results.append(run_grid("RandomForest (medium)", pipe_rf, grid_rf))
results.append(run_grid("HistGB (strong)", pipe_hgb, grid_hgb))

rank = pd.DataFrame(results).sort_values("mae")
print("\n=== Model ranking (by Test MAE) ===")
rank[["name","mae","rmse","r2"]]


Ridge (simple) — Best params: {'model__alpha': 10.0}
Ridge (simple) — CV best neg-MAE: -0.7766
Ridge (simple) — Test MAE:  0.7631
Ridge (simple) — Test RMSE: 0.9814
Ridge (simple) — Test R²:   0.6066





RandomForest (medium) — Best params: {'model__max_depth': 25, 'model__max_features': 0.8, 'model__min_samples_leaf': 1, 'model__n_estimators': 400}
RandomForest (medium) — CV best neg-MAE: -0.6124
RandomForest (medium) — Test MAE:  0.6012
RandomForest (medium) — Test RMSE: 0.7868
RandomForest (medium) — Test R²:   0.7471





HistGB (strong) — Best params: {'model__l2_regularization': 0.1, 'model__learning_rate': 0.03, 'model__max_depth': None, 'model__max_iter': 300, 'model__max_leaf_nodes': 127}
HistGB (strong) — CV best neg-MAE: -0.5876
HistGB (strong) — Test MAE:  0.5737
HistGB (strong) — Test RMSE: 0.7542
HistGB (strong) — Test R²:   0.7676

=== Model ranking (by Test MAE) ===
                    name       mae      rmse        r2
2        HistGB (strong)  0.573684  0.754216  0.767642
1  RandomForest (medium)  0.601177  0.786824  0.747116
0         Ridge (simple)  0.763061  0.981375  0.606598


# 3️⃣ Evaluation
- Metriken: MAE, RMSE, R²
- Visualisierung: Predicted vs. Actual, Residualplots