In [1]:
# Dans votre notebook
import pandas as pd

# Load the engineered feature table (labelled and unlabelled rows together).
df = pd.read_parquet('../data/features_engineered.parquet')

# Rows that carry a TARGET value form the training set; rows whose TARGET is
# missing form the test set (described by the author as the Kaggle test rows).
has_target = df['TARGET'].notna()
train = df.loc[has_target].copy()
test = df.loc[~has_target].copy()

# The label and the client identifier are not model inputs.
non_feature_columns = ['TARGET', 'SK_ID_CURR']

# Training inputs / labels.
X_train = train.drop(columns=non_feature_columns)
y_train = train['TARGET']

# Inputs for the final predictions, plus the identifiers of the test rows.
X_test = test.drop(columns=non_feature_columns)
test_ids = test['SK_ID_CURR']

print(f"Train : {X_train.shape}")
print(f"Test : {X_test.shape}")

Train : (307507, 795)
Test : (48744, 795)


In [2]:
import mlflow
import os
from pathlib import Path

# Local MLflow configuration (file-based store, no Docker).
# Runs and artifacts are stored in an `mlruns` directory located one level
# above the current working directory.
tracking_uri = os.path.abspath(os.path.join(os.getcwd(), '..', 'mlruns'))

# Path.as_uri() produces a well-formed file URI on every platform
# (file:///C:/... on Windows, percent-encoded special characters), which plain
# string concatenation with "file://" does not. On POSIX paths without special
# characters the result is identical to the previous f-string.
mlflow.set_tracking_uri(Path(tracking_uri).as_uri())
mlflow.set_experiment("Projet MLOps - Credit Scoring - Random Forest")

print(f"Tracking URI: {mlflow.get_tracking_uri()}")
print(f"R√©pertoire de stockage: {tracking_uri}")
print(f"‚úÖ MLflow configur√© en local")

# Sanity check: listing the experiments proves the store is reachable.
# Best effort only: a failure is reported but does not stop the notebook.
try:
    experiments = mlflow.search_experiments()
    print(f"Nombre d'exp√©riences: {len(experiments)}")
except Exception as e:
    print(f"‚ö†Ô∏è Erreur: {e}")


Tracking URI: file:///home/zmxw1768/Documents/oc_mlops/mlruns
R√©pertoire de stockage: /home/zmxw1768/Documents/oc_mlops/mlruns
‚úÖ MLflow configur√© en local
Nombre d'exp√©riences: 2


In [3]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation. This is a single
# train_test_split (not a StratifiedKFold): stratify=y_train keeps the class
# proportions identical in both parts, random_state makes the split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

In [4]:
# V√©rifier et nettoyer les valeurs probl√©matiques sur TOUS les datasets
import numpy as np

print("üßπ Nettoyage des donn√©es...")


def _sanitize(frame):
    """Return a new frame in which +inf, -inf and NaN are all replaced by 0."""
    return frame.replace([np.inf, -np.inf], np.nan).fillna(0)


# Apply the same cleaning to every feature matrix: the full training matrix,
# the two parts of the hold-out split, and the test matrix.
X_train = _sanitize(X_train)
X_tr = _sanitize(X_tr)
X_val = _sanitize(X_val)
X_test = _sanitize(X_test)

print(f"‚úÖ Donn√©es nettoy√©es")
print(f"   X_train: {X_train.shape}")
print(f"   X_test: {X_test.shape}")


üßπ Nettoyage des donn√©es...
‚úÖ Donn√©es nettoy√©es
   X_train: (307507, 795)
   X_test: (48744, 795)


In [5]:
# Report the class balance of the training labels: count and percentage of
# each class, the negative:positive ratio, and the total number of samples.
negatives = (y_train == 0)
positives = (y_train == 1)
separator = "-" * 50

print("üìä Distribution des classes:")
print(separator)
print(f"Classe 0 (Pas de d√©faut): {negatives.sum():,} ({negatives.mean()*100:.2f}%)")
print(f"Classe 1 (D√©faut):        {positives.sum():,} ({positives.mean()*100:.2f}%)")
print(f"Ratio d√©s√©quilibre: 1:{negatives.sum() / positives.sum():.1f}")
print(separator)
print(f"Total: {len(y_train):,} √©chantillons")


üìä Distribution des classes:
--------------------------------------------------
Classe 0 (Pas de d√©faut): 282,682 (91.93%)
Classe 1 (D√©faut):        24,825 (8.07%)
Ratio d√©s√©quilibre: 1:11.4
--------------------------------------------------
Total: 307,507 √©chantillons


In [6]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_auc_score, recall_score, f1_score, 
                             make_scorer, confusion_matrix)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Close any MLflow run that is still active before new runs are started.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Custom business cost: by default a false negative costs 10x a false positive.
def business_cost_scorer(y_true, y_pred, fn_cost=10, fp_cost=1):
    """
    Negated business cost of a set of binary predictions.

    The positive class is the label 1; every other label counts as negative.
    The cost is ``fp_cost * FP + fn_cost * FN`` and its negative is returned,
    so that a higher score means a cheaper model (scorers are maximised).

    False positives and false negatives are counted directly instead of
    unpacking a confusion matrix: a confusion matrix collapses to a single
    cell when only one class is present in ``y_true`` and ``y_pred``, which
    made the previous 4-way unpacking raise ``ValueError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    fn_cost : int or float, default 10
        Cost of one false negative.
    fp_cost : int or float, default 1
        Cost of one false positive.

    Returns
    -------
    int or float
        ``-(fp_cost * FP + fn_cost * FN)``; 0 when there is no error.
    """
    actual_positive = np.asarray(y_true) == 1
    predicted_positive = np.asarray(y_pred) == 1

    fp = int(np.sum(~actual_positive & predicted_positive))
    fn = int(np.sum(actual_positive & ~predicted_positive))

    cost = fp * fp_cost + fn * fn_cost
    return -cost  # negated: the cost is to be minimised, the score maximised

# Cross-validation setup: 3 stratified, shuffled folds with a fixed seed
# (the author kept it at 3 folds to limit CPU load).
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Candidate estimators. Both use class_weight='balanced' to compensate for the
# class imbalance.
# Logistic regression is wrapped in a pipeline so the features are
# standardised before the classifier sees them.
logistic_regression = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(
        solver='lbfgs',
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        verbose=1,
    )),
])

# Random forest: 50 trees of depth <= 10, trained on all available cores.
random_forest = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Insertion order matters: models are trained and reported in this order.
models = {
    "Logistic Regression": logistic_regression,
    "Random Forest": random_forest,
}

# Scorers passed to cross_validate (response_method is the replacement for
# needs_proba in recent scikit-learn). Insertion order drives the report order.
scoring = dict(
    roc_auc=make_scorer(roc_auc_score, response_method='predict_proba'),
    recall_minority=make_scorer(recall_score, pos_label=1, zero_division=0),
    f1=make_scorer(f1_score, pos_label=1, zero_division=0),
    business_cost=make_scorer(business_cost_scorer),
)

# Announce the cross-validation setup: number of folds, metrics, imbalance handling.
print(f"üîÑ Validation crois√©e avec {n_splits} folds")
print(f"üìä M√©triques : ROC-AUC, Recall (classe 1), F1-Score, Co√ªt m√©tier (FN=10x FP)")
print(f"‚öôÔ∏è  Configuration : class_weight='balanced' pour g√©rer le d√©s√©quilibre\n")

# Evaluate every model. `results` maps model name -> metric name ->
# {'mean', 'std', 'scores'} computed from the per-fold test scores.
results = {}
for model_name, model in models.items():
    print(f"ü§ñ Entra√Ænement : {model_name}")
    
    # One MLflow run per model; everything logged below is attached to it.
    with mlflow.start_run(run_name=f"{model_name} - CV {n_splits} folds - Balanced"):
        # Tags and metadata
        mlflow.set_tags({
            "project": "Home Credit Default Risk",
            "phase": "baseline",
            "model_type": "classification",
            "data_version": "v1.0",
            "environment": "development"
        })
        
        # Free-text description stored in the "mlflow.note.content" tag.
        # NOTE(review): the note mentions StandardScaler for every model, but
        # only the "Logistic Regression" pipeline defined in this notebook
        # contains a scaler; confirm the wording is intended for Random Forest.
        mlflow.set_tag("mlflow.note.content", 
            f"""Entra√Ænement {model_name} avec validation crois√©e stratifi√©e.
            
            Objectif: D√©tecter les d√©fauts de paiement avec gestion du d√©s√©quilibre de classes.
            Strat√©gie: class_weight='balanced' + StandardScaler.
            M√©trique prioritaire: Co√ªt m√©tier (FN=10x FP).
            
            Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
            """)
        
        # Cross-validation on the full training matrix with all scorers at once
        cv_results = cross_validate(
            model, 
            X_train, 
            y_train, 
            cv=skf, 
            scoring=scoring,
            n_jobs=1,
            return_train_score=False,
            error_score='raise'  # raise errors instead of hiding them (debugging)
        )
        
        # Log the run-level parameters
        mlflow.log_param("model", model_name)
        mlflow.log_param("cv_strategy", "StratifiedKFold")
        mlflow.log_param("n_splits", n_splits)
        mlflow.log_param("n_samples", len(X_train))
        mlflow.log_param("n_features", X_train.shape[1])
        mlflow.log_param("class_weight", "balanced")
        mlflow.log_param("business_cost_ratio", "FN=10x FP")
        
        # Log every estimator parameter, stringified and prefixed with "model_"
        for param_name, param_value in model.get_params().items():
            mlflow.log_param(f"model_{param_name}", str(param_value))
        
        # Store and log the results of each metric
        results[model_name] = {}
        
        for metric_name in scoring.keys():
            # cross_validate exposes each scorer's fold scores under 'test_<name>'
            scores = cv_results[f'test_{metric_name}']
            mean_score = np.nanmean(scores)  # nan-aware mean: NaN fold scores are ignored
            std_score = np.nanstd(scores)
            
            # Keep the results for the summary printed after the loop
            results[model_name][metric_name] = {
                'mean': mean_score,
                'std': std_score,
                'scores': scores
            }
            
            # Log the aggregate scores to MLflow
            mlflow.log_metric(f"{metric_name}_mean", mean_score)
            mlflow.log_metric(f"{metric_name}_std", std_score)
            
            # Log the score of each fold (folds are numbered from 1)
            for i, score in enumerate(scores, 1):
                mlflow.log_metric(f"{metric_name}_fold_{i}", score)
            
            # Formatted display, e.g. 'roc_auc' -> 'Roc Auc'
            metric_display = metric_name.replace('_', ' ').title()
            print(f"   {metric_display:20s}: {mean_score:.4f} (¬±{std_score:.4f})")
        
        # Refit the model on all the training data so it can be saved
        print("   üì¶ Sauvegarde du mod√®le...")
        model.fit(X_train, y_train)
        
        # Quick diagnostic: number of positive predictions on the first 1000 training rows
        y_pred_train = model.predict(X_train[:1000])
        print(f"   üîç Pr√©dictions sur 1000 √©chantillons: {np.sum(y_pred_train)} positifs")
        
        # Build the model signature and the input example.
        # NOTE(review): when predict_proba exists, the signature output is
        # inferred from class-1 probabilities on the first 100 rows, while the
        # input schema is inferred from the whole X_train. Confirm that a
        # probability output is what consumers of the logged model expect.
        if hasattr(model, 'predict_proba'):
            y_pred_proba = model.predict_proba(X_train[:100])[:, 1]
            signature = mlflow.models.signature.infer_signature(X_train, y_pred_proba)
        else:
            # Fallback: infer the output schema from predict() on the full training matrix
            signature = mlflow.models.signature.infer_signature(X_train, model.predict(X_train))
        
        input_example = X_train.head(3)
        
        # Log the fitted model as the "model" artifact of the current run
        mlflow.sklearn.log_model(
            model,
            "model",
            signature=signature,
            input_example=input_example
        )
        
        print()

# Performance summary: one table per model (mean and std of every metric),
# followed by the name of the best model for each metric.
print("\n" + "="*80)
print("üìà R√âSUM√â DES PERFORMANCES")
print("="*80)
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    print("-" * 80)
    print(f"{'M√©trique':<25s} {'Mean':>12s} {'Std':>12s}")
    print("-" * 80)
    for metric_name, result in metrics.items():
        metric_display = metric_name.replace('_', ' ').title()
        print(f"{metric_display:<25s} {result['mean']:>12.4f} {result['std']:>12.4f}")

print("\n" + "="*80)


def _best_model(metric):
    """Return the name of the model with the highest mean score for `metric`.

    A NaN mean is ranked last (treated as -inf). Previously only the AUC
    ranking guarded against NaN, and its -1 fallback would have outranked the
    negative business-cost scores; all four rankings now use the same rule.
    Every metric here is "higher is better" (the business cost is negated).
    """
    def _rank(name):
        mean = results[name][metric]['mean']
        return -np.inf if np.isnan(mean) else mean

    return max(results, key=_rank)


print(f"‚úÖ Meilleur AUC-ROC: {_best_model('roc_auc')}")
print(f"‚úÖ Meilleur Recall: {_best_model('recall_minority')}")
print(f"‚úÖ Meilleur F1-Score: {_best_model('f1')}")
print(f"‚úÖ Meilleur Co√ªt M√©tier: {_best_model('business_cost')}")
print("="*80)


üîÑ Validation crois√©e avec 3 folds
üìä M√©triques : ROC-AUC, Recall (classe 1), F1-Score, Co√ªt m√©tier (FN=10x FP)
‚öôÔ∏è  Configuration : class_weight='balanced' pour g√©rer le d√©s√©quilibre

ü§ñ Entra√Ænement : Logistic Regression
   Roc Auc             : 0.7670 (¬±0.0021)
   Recall Minority     : 0.6983 (¬±0.0062)
   F1                  : 0.2753 (¬±0.0014)
   Business Cost       : -52892.3333 (¬±404.6334)
   üì¶ Sauvegarde du mod√®le...
   üîç Pr√©dictions sur 1000 √©chantillons: 340.0 positifs


  "dataframe_split": {
    "columns": [
      "CODE_GENDER",
      "FLAG_OWN_CAR",
      "FLAG_OWN_REALTY",
      "CNT_CHILDREN",
      "AMT_INCOME_TOTAL",
      "AMT_CREDIT",
      "AMT_ANNUITY",
      "AMT_GOODS_PRICE",
      "REGION_POPULATION_RELATIVE",
      "DAYS_BIRTH",
      "DAYS_EMPLOYED",
      "DAYS_REGISTRATION",
      "DAYS_ID_PUBLISH",
      "OWN_CAR_AGE",
      "FLAG_MOBIL",
      "FLAG_EMP_PHONE",
      "FLAG_WORK_PHONE",
      "FLAG_CONT_MOBILE",
      "FLAG_PHONE",
      "FLAG_EMAIL",
      "CNT_FAM_MEMBERS",
      "REGION_RATING_CLIENT",
      "REGION_RATING_CLIENT_W_CITY",
      "HOUR_APPR_PROCESS_START",
      "REG_REGION_NOT_LIVE_REGION",
      "REG_REGION_NOT_WORK_REGION",
      "LIVE_REGION_NOT_WORK_REGION",
      "REG_CITY_NOT_LIVE_CITY",
      "REG_CITY_NOT_WORK_CITY",
      "LIVE_CITY_NOT_WORK_CITY",
      "EXT_SOURCE_1",
      "EXT_SOURCE_2",
      "EXT_SOURCE_3",
      "APARTMENTS_AVG",
      "BASEMENTAREA_AVG",
      "YEARS_BEGINEXPLUATATION_AVG",
      "


ü§ñ Entra√Ænement : Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0

   Roc Auc             : 0.7349 (¬±0.0018)
   Recall Minority     : 0.5214 (¬±0.0030)
   F1                  : 0.2729 (¬±0.0026)
   Business Cost       : -58635.6667 (¬±339.0028)
   üì¶ Sauvegarde du mod√®le...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   17.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.0s finished


   üîç Pr√©dictions sur 1000 √©chantillons: 241.0 positifs


  "dataframe_split": {
    "columns": [
      "CODE_GENDER",
      "FLAG_OWN_CAR",
      "FLAG_OWN_REALTY",
      "CNT_CHILDREN",
      "AMT_INCOME_TOTAL",
      "AMT_CREDIT",
      "AMT_ANNUITY",
      "AMT_GOODS_PRICE",
      "REGION_POPULATION_RELATIVE",
      "DAYS_BIRTH",
      "DAYS_EMPLOYED",
      "DAYS_REGISTRATION",
      "DAYS_ID_PUBLISH",
      "OWN_CAR_AGE",
      "FLAG_MOBIL",
      "FLAG_EMP_PHONE",
      "FLAG_WORK_PHONE",
      "FLAG_CONT_MOBILE",
      "FLAG_PHONE",
      "FLAG_EMAIL",
      "CNT_FAM_MEMBERS",
      "REGION_RATING_CLIENT",
      "REGION_RATING_CLIENT_W_CITY",
      "HOUR_APPR_PROCESS_START",
      "REG_REGION_NOT_LIVE_REGION",
      "REG_REGION_NOT_WORK_REGION",
      "LIVE_REGION_NOT_WORK_REGION",
      "REG_CITY_NOT_LIVE_CITY",
      "REG_CITY_NOT_WORK_CITY",
      "LIVE_CITY_NOT_WORK_CITY",
      "EXT_SOURCE_1",
      "EXT_SOURCE_2",
      "EXT_SOURCE_3",
      "APARTMENTS_AVG",
      "BASEMENTAREA_AVG",
      "YEARS_BEGINEXPLUATATION_AVG",
      "



üìà R√âSUM√â DES PERFORMANCES

Logistic Regression:
--------------------------------------------------------------------------------
M√©trique                          Mean          Std
--------------------------------------------------------------------------------
Roc Auc                         0.7670       0.0021
Recall Minority                 0.6983       0.0062
F1                              0.2753       0.0014
Business Cost              -52892.3333     404.6334

Random Forest:
--------------------------------------------------------------------------------
M√©trique                          Mean          Std
--------------------------------------------------------------------------------
Roc Auc                         0.7349       0.0018
Recall Minority                 0.5214       0.0030
F1                              0.2729       0.0026
Business Cost              -58635.6667     339.0028

‚úÖ Meilleur AUC-ROC: Logistic Regression
‚úÖ Meilleur Recall: Logistic Regression

## 🏷️ Tags et Organisation MLflow

### Tags recommandés par catégorie :

**Métadonnées projet :**
- `author` : Auteur de l'expérience
- `project` : Nom du projet
- `version` : Version du code/données
- `environment` : dev / staging / production

**Contexte expérience :**
- `phase` : exploration / baseline / optimization / production
- `model_type` : classification / regression / clustering
- `approach` : feature_engineering / hyperparameter_tuning / ensemble

**Tags techniques :**
- `gpu_used` : True/False
- `framework` : sklearn / xgboost / pytorch
- `data_sampling` : full / stratified / undersampling

**Tags métier :**
- `business_priority` : cost_reduction / accuracy / speed
- `deployment_ready` : True/False
- `validation_status` : pending / approved / rejected

### Tag spécial : Description

Le tag **`mlflow.note.content`** apparaît dans l'interface UI comme une note éditable.

### Recherche et filtrage

Dans l'UI MLflow, filtrez par tags :
```
tags.phase = "baseline" AND metrics.roc_auc_mean > 0.75
```

## Interface MLflow

Pour visualiser vos expériences MLflow, lancez l'interface web depuis votre environnement uv :

**Option 1 - Depuis le terminal VSCode:**
```bash
cd /home/zmxw1768/Documents/oc_mlops
uv run mlflow ui
```

**Option 2 - Via l'environnement virtuel:**
```bash
source .venv/bin/activate
mlflow ui
```

Ensuite, ouvrez votre navigateur sur: **http://localhost:5000**

> 💡 Les données sont stockées localement dans le dossier `./mlruns` (paramètres, métriques et artifacts)

## 🔧 Modifier les tags d'un run existant

Si vous voulez ajouter/modifier des tags sur un run déjà terminé :

In [None]:
# Example: add tags to runs that already exist.
import mlflow

# Method 1: tag a single run identified by its run ID
run_id = "4b919d82f33d4f8c9b0066efbd98e3c7"  # replace with your own run_id
client = mlflow.tracking.MlflowClient()

client.set_tag(run_id, "deployment_ready", "True")
client.set_tag(run_id, "validated_by", "Team Lead")
client.set_tag(run_id, "mlflow.note.content", "‚úÖ Mod√®le valid√© pour production")

# Method 2: search for runs and tag every match
experiment_name = "Default"  # or your own experiment name
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    # get_experiment_by_name returns None for an unknown name; report it
    # instead of failing on `experiment.experiment_id` with AttributeError.
    print(f"Experiment MLflow introuvable : {experiment_name!r}")
else:
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string="metrics.roc_auc_mean > 0.75"
    )

    # search_runs returns one row per matching run; tag each of them.
    for idx, run in runs.iterrows():
        client.set_tag(run.run_id, "high_performance", "True")
        print(f"‚úÖ Tagged run {run.run_id[:8]}...")

In [None]:
# Experiment-level tags: these are set on the experiment itself, not on
# individual runs.
experiment_id = "148740929161326232"  # your experiment_id

client = mlflow.tracking.MlflowClient()

# Tags are written in this order, one set_experiment_tag call per entry.
experiment_tags = {
    "project": "Home Credit Default Risk",
    "dataset": "application_train.csv",
    "business_goal": "Minimize default cost",
}
for tag_key, tag_value in experiment_tags.items():
    client.set_experiment_tag(experiment_id, tag_key, tag_value)