In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('../data/transformed_data.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

# Recherche des hyperparamètres

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
def display_train_test_metrics(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    The estimator is fitted in place on ``(X_train, y_train)``; MAE, RMSE and
    R² are then computed on ``(X_test, y_test)`` and printed, one per line.
    Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    # RMSE is derived as the square root of the MSE.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print(f"MAE  : {mae}")
    print(f"RMSE : {rmse}")
    print(f"R²   : {r2}")

In [None]:
def search_hyperparameters(max_depths, min_child_weights, n_estimators, learning_rates, X_train, y_train, X_test, y_test):
    """Run an exhaustive grid search over four XGBoost hyperparameters.

    Every combination of ``max_depths`` x ``min_child_weights`` x
    ``n_estimators`` x ``learning_rates`` is tried, with the last grid varying
    fastest. For each combination the settings are printed, a fresh
    ``xgb.XGBRegressor`` (seeded with ``random_state=42``) is built, and
    ``display_train_test_metrics`` fits it and prints its metrics.

    Parameters
    ----------
    max_depths, min_child_weights, n_estimators, learning_rates : iterable
        Candidate values for the corresponding ``XGBRegressor`` argument.
    X_train, y_train : training split used to fit each candidate.
    X_test, y_test : split on which each candidate is scored.

    Returns
    -------
    None. Results are only printed.

    NOTE(review): candidates are scored on the split passed as ``X_test``;
    if that is the final hold-out set, choosing hyperparameters from these
    numbers gives an optimistic estimate. Consider a validation split.
    """
    # Function-scope import keeps this cell runnable on its own.
    from itertools import product

    # product() snapshots each grid up front, so one-shot iterables work too.
    grid = product(max_depths, min_child_weights, n_estimators, learning_rates)

    for max_depth, min_child_weight, n_trees, learning_rate in grid:
        print(f'max_depth: {max_depth}, min_child_weight: {min_child_weight}, n_estimators: {n_trees}, learning_rate: {learning_rate}')
        xgb_model = xgb.XGBRegressor(
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            n_estimators=n_trees,
            learning_rate=learning_rate,
            random_state=42,
        )
        display_train_test_metrics(xgb_model, X_train, y_train, X_test, y_test)

# Modèle de prédiction de la magnitude

In [None]:
from sklearn.model_selection import train_test_split
# Target is the "mag" column; every other column is used as a feature.
y = df["mag"]
X = df.drop(columns="mag")

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# search_hyperparameters([3, 6, 9], [1, 5, 10], [100, 500, 1000], [0.3, 0.15, 0.05], X_train_1, y_train_1, X_test_1, y_test_1)

In [None]:
# Sanity check: (rows, columns) of the magnitude training features.
X_train_1.shape

In [None]:
# Regressor with library-default hyperparameters; only the seed is pinned.
xgb_model_1 = xgb.XGBRegressor(random_state=42)

In [None]:
# Train the "mag" model on its training split.
xgb_model_1.fit(X_train_1, y_train_1)

## Analyse


In [None]:
# Sanity check: (rows, columns) of the magnitude test features.
X_test_1.shape

In [None]:
# Plot xgboost's built-in feature importance for the "mag" model.
xgb.plot_importance(xgb_model_1)

In [None]:


# Score the "mag" model on its held-out split.
y_pred = xgb_model_1.predict(X_test_1)

print(
    f"MAE  : {mean_absolute_error(y_test_1, y_pred)}",
    f"RMSE : {np.sqrt(mean_squared_error(y_test_1, y_pred))}",
    f"R²   : {r2_score(y_test_1, y_pred)}",
    sep="\n",
)

# Modèle de prédiction de la profondeur (depth)

In [None]:
from sklearn.model_selection import train_test_split
# Target is the "depth" column; every other column is used as a feature.
y = df["depth"]
X = df.drop(columns="depth")

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# search_hyperparameters([3, 6, 9], [1, 5, 10], [100, 500, 1000], [0.3, 0.15, 0.05], X_train_2, y_train_2, X_test_2, y_test_2)

In [None]:
# Sanity check: (rows, columns) of the depth training features.
X_train_2.shape

In [None]:
# Regressor with library-default hyperparameters; only the seed is pinned.
xgb_model_2 = xgb.XGBRegressor(random_state=42)

In [None]:
# Train the "depth" model on its training split.
xgb_model_2.fit(X_train_2, y_train_2)

## Analyse

In [None]:
# Sanity check: (rows, columns) of the depth test features.
X_test_2.shape

In [None]:
# Plot xgboost's built-in feature importance for the "depth" model.
xgb.plot_importance(xgb_model_2)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the "depth" model on its held-out split.
y_pred = xgb_model_2.predict(X_test_2)

print(
    f"MAE  : {mean_absolute_error(y_test_2, y_pred)}",
    f"RMSE : {np.sqrt(mean_squared_error(y_test_2, y_pred))}",
    f"R²   : {r2_score(y_test_2, y_pred)}",
    sep="\n",
)

## Tests de bruitage

In [None]:
def _rmse(y_true, y_hat):
    """Return the root-mean-squared error between two equally sized 1-D arrays."""
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    if y_true.shape != y_hat.shape:
        # Guard against silent broadcasting of mismatched inputs.
        raise ValueError(
            f"y_true and y_hat must have the same size, got {y_true.size} and {y_hat.size}"
        )
    return np.sqrt(np.mean((y_true - y_hat) ** 2))


def noise_importance(
        model,
        X_test,
        y_test,
        noise_levels,
        n_repeats=10,
        random_state=42
):
    """Measure how much Gaussian noise on each feature degrades the RMSE.

    For every feature ``j`` and every noise level, zero-mean Gaussian noise
    with standard deviation ``level / 100`` is added to column ``j`` only,
    the model predicts on the perturbed data, and the RMSE is recorded. This
    is repeated ``n_repeats`` times and averaged.

    Parameters
    ----------
    model : object exposing ``predict(X)``; must already be fitted.
    X_test : 2-D array-like of shape (n_samples, n_features). It is converted
        to a float array; the caller's data is never modified.
    y_test : 1-D array-like of true targets, one per row of ``X_test``.
    noise_levels : sequence of non-negative numbers. Each is divided by 100
        and used as an *absolute* standard deviation.
        NOTE(review): this is only a percentage of the feature's spread if
        the features are standardised; confirm against the preprocessing.
    n_repeats : int, number of noise draws averaged per (feature, level).
    random_state : seed for ``np.random.default_rng``, for reproducibility.

    Returns
    -------
    np.ndarray of shape (n_features, len(noise_levels)); entry ``[j, k]`` is
    the relative RMSE change in percent, ``100 * (mean_noisy - clean) / clean``.
    If the clean RMSE is exactly 0 the ratio is undefined and the entries are
    ``inf``/``nan`` (NumPy emits a RuntimeWarning).

    Raises
    ------
    ValueError if ``X_test`` is not 2-D or ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    # Cast to float so the in-place noise addition also works for integer inputs.
    X_clean = np.asarray(X_test, dtype=float)
    if X_clean.ndim != 2:
        raise ValueError("X_test must be 2-D (n_samples, n_features)")
    y_true = np.asarray(y_test, dtype=float)

    rng = np.random.default_rng(random_state)
    n_samples, n_features = X_clean.shape

    # Baseline comes from the model and data passed in, not from a notebook
    # global. RMSE is computed with NumPy so no cross-cell import is needed.
    rmse_clean = _rmse(y_true, model.predict(X_clean))

    results = np.zeros((n_features, len(noise_levels)))

    for j in range(n_features):
        for k, level in enumerate(noise_levels):
            rmse_runs = []

            for _ in range(n_repeats):
                # Fresh copy each run so only column j is perturbed.
                X_noisy = X_clean.copy()
                X_noisy[:, j] += rng.normal(0, level / 100, size=n_samples)
                rmse_runs.append(_rmse(y_true, model.predict(X_noisy)))

            results[j, k] = 100 * (np.mean(rmse_runs) - rmse_clean) / rmse_clean

    return results

In [None]:
# Noise levels to sweep; noise_importance divides each by 100 to get the noise std.
noise_levels = np.array([1, 3, 5, 10, 15, 20])

# noise_importance indexes columns positionally, so pass NumPy arrays.
importance_matrix = noise_importance(
    model=xgb_model_2,
    X_test=X_test_2.to_numpy(),
    y_test=y_test_2.to_numpy() if hasattr(y_test_2, "to_numpy") else y_test_2,
    noise_levels=noise_levels,
    n_repeats=20,
)

In [None]:
import matplotlib.pyplot as plt

# Rank features by their mean RMSE degradation across all noise levels,
# largest first.
feature_names = X_test_2.columns.to_list()
global_importance = importance_matrix.mean(axis=1)
ranking = np.argsort(global_importance)[::-1]

top = 5
fig, ax = plt.subplots()

# Slicing avoids an IndexError when the data has fewer than `top` features.
for idx in ranking[:top]:
    ax.plot(noise_levels, importance_matrix[idx], label=feature_names[idx])

ax.set_xlabel("Niveau de bruit (%)")
ax.set_ylabel("Delta RMSE (%)")
ax.set_title("Impact du bruit – Top features")
ax.legend()
plt.show()