In [2]:
# Dans votre notebook
import pandas as pd

# Load the engineered feature table
df = pd.read_parquet('../data/features_engineered.parquet')

# Rows that carry a TARGET value form the training set; rows whose TARGET
# is missing form the Kaggle test set.
has_target = df['TARGET'].notna()
train = df[has_target].copy()
test = df[~has_target].copy()

# Columns excluded from the feature matrices: the label and the id column
non_feature_cols = ['TARGET', 'SK_ID_CURR']

# Training features and labels
X_train = train.drop(columns=non_feature_cols)
y_train = train['TARGET']

# Test features, plus the ids kept aside for the final predictions
X_test = test.drop(columns=non_feature_cols)
test_ids = test['SK_ID_CURR']

print(f"Train : {X_train.shape}")
print(f"Test : {X_test.shape}")

Train : (307507, 795)
Test : (48744, 795)


In [3]:
import mlflow
import os
from pathlib import Path

# Local MLflow setup (no Docker): runs and artifacts are stored in the
# `mlruns` directory located one level above the current working directory.
tracking_uri = os.path.abspath(os.path.join(os.getcwd(), '..', 'mlruns'))

# Path.as_uri() produces a well-formed file URI (percent-encoded characters,
# correct handling of Windows drive letters) instead of gluing the raw path
# after "file://". For a plain POSIX path the result is unchanged.
mlflow.set_tracking_uri(Path(tracking_uri).as_uri())
mlflow.set_experiment("Projet MLOps - Credit Scoring - Random Forest")

print(f"Tracking URI: {mlflow.get_tracking_uri()}")
print(f"R√©pertoire de stockage: {tracking_uri}")
print(f"‚úÖ MLflow configur√© en local")

# Sanity check: listing the experiments proves the store is reachable.
# Best-effort on purpose: a failure is reported but does not stop the notebook.
try:
    experiments = mlflow.search_experiments()
    print(f"Nombre d'exp√©riences: {len(experiments)}")
except Exception as e:
    print(f"‚ö†Ô∏è Erreur: {e}")


Tracking URI: file:///home/zmxw1768/Documents/oc_mlops/mlruns
R√©pertoire de stockage: /home/zmxw1768/Documents/oc_mlops/mlruns
‚úÖ MLflow configur√© en local
Nombre d'exp√©riences: 2


In [4]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split (this is train_test_split, not a K-fold):
# `stratify=y_train` preserves the class proportions in both parts and the
# fixed seed makes the split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [5]:
# Apply the same sanitisation to every feature matrix
import numpy as np


def _zero_fill_non_finite(frame):
    """Return a new frame where +inf, -inf and NaN entries are replaced by 0."""
    without_inf = frame.replace([np.inf, -np.inf], np.nan)
    return without_inf.fillna(0)


print("üßπ Nettoyage des donn√©es...")

# Full training matrix (used by the cross-validation)
X_train = _zero_fill_non_finite(X_train)

# Hold-out split (used by the simple train/validation experiments)
X_tr = _zero_fill_non_finite(X_tr)
X_val = _zero_fill_non_finite(X_val)

# Test matrix (used for the final predictions)
X_test = _zero_fill_non_finite(X_test)

print(f"‚úÖ Donn√©es nettoy√©es")
print(f"   X_train: {X_train.shape}")
print(f"   X_test: {X_test.shape}")


üßπ Nettoyage des donn√©es...
‚úÖ Donn√©es nettoy√©es
   X_train: (307507, 795)
   X_test: (48744, 795)


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_auc_score, recall_score, f1_score, 
                             make_scorer, confusion_matrix)
import numpy as np

# End the currently active MLflow run, if there is one, before opening new runs
if mlflow.active_run() is not None:
    mlflow.end_run()

# D√©finir le co√ªt m√©tier personnalis√© (FN co√ªte 10x plus que FP)
def business_cost_scorer(y_true, y_pred):
    """
    Business cost of a set of binary predictions, negated so it can be maximised.

    A false negative (true label 1 predicted as something else) costs 10 units
    and a false positive (true label other than 1 predicted as 1) costs 1 unit.

    The two error counts are computed directly from the label arrays instead
    of unpacking a confusion matrix into four values: that unpacking fails
    when only one class is present, because the matrix is then not 2x2.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 marks the positive class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    int
        ``-(FP * 1 + FN * 10)``. The best possible value is 0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    actual_positive = np.asarray(y_true) == 1
    predicted_positive = np.asarray(y_pred) == 1
    if actual_positive.shape != predicted_positive.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{actual_positive.shape} and {predicted_positive.shape}"
        )

    fp = int(np.sum(~actual_positive & predicted_positive))
    fn = int(np.sum(actual_positive & ~predicted_positive))

    cost = fp * 1 + fn * 10  # a false negative is 10x as expensive as a false positive
    return -cost  # negated: scorers are maximised, the cost must be minimised

# Cross-validation setup
n_splits = 3  # kept at 3 folds to limit CPU load
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Candidate models (n_jobs kept low to limit CPU load)
# NOTE(review): the features reach LogisticRegression unscaled; if the fit is
# slow or does not converge, consider scaling them first — TODO confirm.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, n_jobs=1),
    "Random Forest": RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=2, max_depth=10)
}

# Metrics computed on every fold
scoring = {
    # Built-in scorer name instead of make_scorer(roc_auc_score, needs_proba=True):
    # `needs_proba` is deprecated in scikit-learn 1.4 and removed in 1.6.
    # AUC is rank-based, so the value is the same.
    'roc_auc': 'roc_auc',
    'recall_minority': make_scorer(recall_score, pos_label=1),  # recall of class 1
    'f1': make_scorer(f1_score, pos_label=1),
    'business_cost': make_scorer(business_cost_scorer)  # negated cost, higher is better
}

print(f"üîÑ Validation crois√©e avec {n_splits} folds")
print(f"üìä M√©triques : ROC-AUC, Recall (classe 1), F1-Score, Co√ªt m√©tier (FN=10x FP)")
print(f"‚öôÔ∏è  Configuration optimis√©e pour √©viter surcharge CPU\n")

# Evaluate every candidate model, one MLflow run per model.
# `results` maps model name -> metric name -> {'mean', 'std', 'scores'}.
results = {}
for model_name, model in models.items():
    print(f"ü§ñ Entra√Ænement : {model_name}")

    with mlflow.start_run(run_name=f"{model_name} - CV {n_splits} folds"):
        # Cross-validation on all metrics at once, run sequentially (n_jobs=1)
        cv_results = cross_validate(
            model,
            X_train,
            y_train,
            cv=skf,
            scoring=scoring,
            n_jobs=1,
            return_train_score=False
        )

        # Run-level parameters and the estimator's own hyper-parameters
        # (prefixed with "model_"), logged in a single call.
        run_params = {
            "model": model_name,
            "cv_strategy": "StratifiedKFold",
            "n_splits": n_splits,
            "n_samples": len(X_train),
            "n_features": X_train.shape[1],
            "business_cost_ratio": "FN=10x FP",
        }
        for param_name, param_value in model.get_params().items():
            run_params[f"model_{param_name}"] = param_value
        mlflow.log_params(run_params)

        # Store and log mean, standard deviation and per-fold value of each metric
        results[model_name] = {}

        for metric_name in scoring:
            scores = cv_results[f'test_{metric_name}']
            mean_score = scores.mean()
            std_score = scores.std()

            results[model_name][metric_name] = {
                'mean': mean_score,
                'std': std_score,
                'scores': scores
            }

            mlflow.log_metric(f"{metric_name}_mean", mean_score)
            mlflow.log_metric(f"{metric_name}_std", std_score)

            # Folds are numbered from 1 in the metric names
            for i, score in enumerate(scores, 1):
                mlflow.log_metric(f"{metric_name}_fold_{i}", score)

            metric_display = metric_name.replace('_', ' ').title()
            print(f"   {metric_display:20s}: {mean_score:.4f} (¬±{std_score:.4f})")

        # cross_validate fits clones, so `model` itself is still unfitted:
        # refit it on the whole training set before saving it.
        print("   üì¶ Sauvegarde du mod√®le...")
        model.fit(X_train, y_train)

        # Infer the signature from one small sample so that the input and
        # output describe the same rows, and so that no prediction over the
        # full training set is needed just to read column types.
        signature_sample = X_train.head(100)
        if hasattr(model, 'predict_proba'):
            sample_output = model.predict_proba(signature_sample)[:, 1]
        else:
            sample_output = model.predict(signature_sample)
        signature = mlflow.models.signature.infer_signature(signature_sample, sample_output)

        input_example = X_train.head(3)

        # Save the fitted model as a run artifact
        mlflow.sklearn.log_model(
            model,
            "model",
            signature=signature,
            input_example=input_example
        )

        print()

banner = "=" * 80
rule = "-" * 80


def _best_model(metric_key):
    """Return the name of the model with the highest mean CV score for `metric_key`."""
    return max(results, key=lambda name: results[name][metric_key]['mean'])


# Per-model table: one row per metric with its mean and standard deviation
print("\n" + banner)
print("üìà R√âSUM√â DES PERFORMANCES")
print(banner)
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    print(rule)
    print(f"{'M√©trique':<25s} {'Mean':>12s} {'Std':>12s}")
    print(rule)
    for metric_name, result in metrics.items():
        metric_display = metric_name.replace('_', ' ').title()
        print(f"{metric_display:<25s} {result['mean']:>12.4f} {result['std']:>12.4f}")

# Winner per metric. Every score is "higher is better": the business cost is
# stored negated, so its maximum is the cheapest model.
print("\n" + banner)
print(f"‚úÖ Meilleur AUC-ROC: {_best_model('roc_auc')}")
print(f"‚úÖ Meilleur Recall: {_best_model('recall_minority')}")
print(f"‚úÖ Meilleur F1-Score: {_best_model('f1')}")
print(f"‚úÖ Meilleur Co√ªt M√©tier: {_best_model('business_cost')}")
print(banner)


üîÑ Validation crois√©e avec 3 folds
üìä M√©triques : ROC-AUC, Recall (classe 1), F1-Score, Co√ªt m√©tier (FN=10x FP)
‚öôÔ∏è  Configuration optimis√©e pour √©viter surcharge CPU

ü§ñ Entra√Ænement : Logistic Regression




In [5]:
from mlflow.models.signature import infer_signature

# Close every MLflow run that is still open, looping in case runs were nested.
# Only the public API is used: the previous fallback overwrote the private
# `mlflow.tracking.fluent._active_run_stack` attribute, which is not a
# supported way to reset MLflow's state.
try:
    while mlflow.active_run() is not None:
        mlflow.end_run()
except Exception as e:
    print(f"Avertissement lors de la fermeture de la run: {e}")

# Hyper-parameters are defined once so that the values given to the model and
# the values logged to MLflow cannot drift apart.
rf_params = {"n_estimators": 100, "random_state": 42}

# The context manager ends the run automatically when the block exits
with mlflow.start_run(run_name="Random Forest Baseline"):
    # Fit on the training part of the hold-out split
    rf = RandomForestClassifier(**rf_params)
    rf.fit(X_tr, y_tr)

    # Probability of class 1 on the validation part
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_pred_proba)
    print(f"Validation ROC AUC: {auc:.4f}")

    # Parameters
    mlflow.log_params({
        "model": "RandomForest",
        **rf_params,
        "train_size": len(X_tr),
        "val_size": len(X_val),
    })

    # Metrics
    mlflow.log_metric("roc_auc", auc)

    # Signature inferred from X_val and the predictions made on X_val, so the
    # input and the output describe the same rows.
    signature = infer_signature(X_val, y_pred_proba)
    input_example = X_tr.head(3)

    # Save the fitted model as a run artifact, with signature and input example
    mlflow.sklearn.log_model(
        rf,
        "model",
        signature=signature,
        input_example=input_example
    )

    print(f"‚úÖ Mod√®le sauvegard√© dans ./mlruns avec signature!")
    print(f"üìä Taille d'entr√©e: {X_tr.shape}")

Validation ROC AUC: 0.7127


  "dataframe_split": {
    "columns": [
      "CODE_GENDER",
      "FLAG_OWN_CAR",
      "FLAG_OWN_REALTY",
      "CNT_CHILDREN",
      "AMT_INCOME_TOTAL",
      "AMT_CREDIT",
      "AMT_ANNUITY",
      "AMT_GOODS_PRICE",
      "REGION_POPULATION_RELATIVE",
      "DAYS_BIRTH",
      "DAYS_EMPLOYED",
      "DAYS_REGISTRATION",
      "DAYS_ID_PUBLISH",
      "OWN_CAR_AGE",
      "FLAG_MOBIL",
      "FLAG_EMP_PHONE",
      "FLAG_WORK_PHONE",
      "FLAG_CONT_MOBILE",
      "FLAG_PHONE",
      "FLAG_EMAIL",
      "CNT_FAM_MEMBERS",
      "REGION_RATING_CLIENT",
      "REGION_RATING_CLIENT_W_CITY",
      "HOUR_APPR_PROCESS_START",
      "REG_REGION_NOT_LIVE_REGION",
      "REG_REGION_NOT_WORK_REGION",
      "LIVE_REGION_NOT_WORK_REGION",
      "REG_CITY_NOT_LIVE_CITY",
      "REG_CITY_NOT_WORK_CITY",
      "LIVE_CITY_NOT_WORK_CITY",
      "EXT_SOURCE_1",
      "EXT_SOURCE_2",
      "EXT_SOURCE_3",
      "APARTMENTS_AVG",
      "BASEMENTAREA_AVG",
      "YEARS_BEGINEXPLUATATION_AVG",
      "

‚úÖ Mod√®le sauvegard√© dans ./mlruns avec signature!
üìä Taille d'entr√©e: (246005, 795)


## Interface MLflow

Pour visualiser vos exp√©riences MLflow, lancez l'interface web depuis votre environnement uv:

**Option 1 - Depuis le terminal VSCode:**
```bash
cd /home/zmxw1768/Documents/oc_mlops
uv run mlflow ui
```

**Option 2 - Via l'environnement virtuel:**
```bash
source .venv/bin/activate
mlflow ui
```

Ensuite, ouvrez votre navigateur sur: **http://localhost:5000**

> 💡 Les données sont stockées localement dans le dossier `./mlruns` (paramètres, métriques et artifacts)