In [1]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score
# from xgboost import XGBRegressor

# # Assurez-vous que votre DataFrame df est bien chargé
# csv_file_path = '../data/silver.csv'
# df = pd.read_csv(csv_file_path)

# # Séparation des caractéristiques (features) et de la cible (target)
# X = df.drop(columns=['salary_in_usd'])
# y = df['salary_in_usd']

# # Séparation des données en ensembles d'entraînement (60%), de validation (20%) et de test (20%)
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# # Encodage des variables catégorielles
# X_train = pd.get_dummies(X_train)
# X_val = pd.get_dummies(X_val)
# X_test = pd.get_dummies(X_test)

# # Assurez-vous que les ensembles de validation et de test ont les mêmes colonnes que l'ensemble d'entraînement
# X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
# X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# # Création et entraînement du modèle XGBoost
# model = XGBRegressor()
# model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=False)

# # Prédiction sur les ensembles de validation et de test
# y_val_pred = model.predict(X_val)
# y_test_pred = model.predict(X_test)

# # Évaluation du modèle
# r2_val = r2_score(y_val, y_val_pred)
# r2_test = r2_score(y_test, y_test_pred)

# print(f"Validation R2: {r2_val}")
# print(f"Test R2: {r2_test}")


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import mlflow
import mlflow.sklearn

# Assurez-vous que votre DataFrame df est bien chargé
csv_file_path = '../data/silver.csv'
df = pd.read_csv(csv_file_path)

# Séparation des caractéristiques (features) et de la cible (target)
X = df.drop(columns=['salary_in_usd'])
y = df['salary_in_usd']

# Séparation des données en ensembles d'entraînement (60%), de validation (20%) et de test (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Encodage des variables catégorielles
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)
X_test = pd.get_dummies(X_test)

# Assurez-vous que les ensembles de validation et de test ont les mêmes colonnes que l'ensemble d'entraînement
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Initialisation de MLflow
mlflow_tracking_uri = "http://127.0.0.1:5000" 
experiment_name = "xgboost_salary_prediction"
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(experiment_name)

# Enregistrement des paramètres et métriques avec MLflow
with mlflow.start_run() as run:
    # Définir les paramètres de recherche
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }

    # Création du modèle XGBoost
    xgb_model = XGBRegressor()

    # Utiliser GridSearchCV pour trouver les meilleurs hyperparamètres
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='r2', cv=3, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Meilleurs paramètres trouvés par GridSearchCV
    best_params = grid_search.best_params_
    print(f"Best parameters: {best_params}")

    # Enregistrer les hyperparamètres dans MLflow
    mlflow.log_params(best_params)

    # Entraîner le modèle final avec les meilleurs paramètres
    final_model = XGBRegressor(**best_params)
    final_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Prédiction sur les ensembles de validation et de test
    y_val_pred = final_model.predict(X_val)
    y_test_pred = final_model.predict(X_test)

    # Évaluation du modèle
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Enregistrer les métriques dans MLflow
    mlflow.log_metric("r2_val", r2_val)
    mlflow.log_metric("r2_test", r2_test)

    print(f"Validation R2: {r2_val}")
    print(f"Test R2: {r2_test}")

    # Enregistrer le modèle dans MLflow
    mlflow.sklearn.log_model(final_model, "xgboost_model")

    # Validation croisée avec le modèle final
    cross_val_r2 = cross_val_score(final_model, X, y, cv=5, scoring='r2')
    print(f"Cross-validated R2: {cross_val_r2.mean()} ± {cross_val_r2.std()}")
    mlflow.log_metric("cross_val_r2_mean", cross_val_r2.mean())
    mlflow.log_metric("cross_val_r2_std", cross_val_r2.std())


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}
Validation R2: 0.2951529237390842
Test R2: 0.2505060369671258


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1055, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 1529, in __init__
    self._init(
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 1588, in _init
    it.reraise()
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 557, in _handle_exception
    return fn()
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/data.py", line 1280, in next
    input_data(**self.kwargs)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/core.py", line 624, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/data.py", line 1315, in _proxy_transform
    arr, feature_names, feature_types = _transform_pandas_df(
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/data.py", line 490, in _transform_pandas_df
    _invalid_dataframe_dtype(data)
  File "/home/utilisateur/Documents/Projets/data_science_job_salaries/venv/lib/python3.10/site-packages/xgboost/data.py", line 308, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:experience_level: object, employment_type: object, job_title: object, employee_residence: object, company_location: object, company_size: object
