In [1]:
# CELL 1 — Imports & configuration

import json
import numpy as np
import pandas as pd

from app.features_optiweb import apply_eda  # même fonction que dans tes autres notebooks

# Destination file for the per-feature metadata dictionary.
FEATURE_META_PATH = "feature_meta.json"

# Row budget used when building the statistics.
# None => full dataset; otherwise e.g. 200_000 for a faster run.
NROWS = None

# Fixed seed for reproducibility (only matters if sampling is involved).
SEED = 42
np.random.seed(SEED)

print("Config OK. NROWS =", NROWS)


Config OK. NROWS = None


In [2]:
# CELL 2 — Load the data through apply_eda

# apply_eda is called with the configured row limit; its four return values
# are unpacked into the train features/target and the test features/ids.
X_train, y_train, X_test, test_ids = apply_eda(nrows=NROWS)

print("X_train shape:", X_train.shape)

# Count of each target value, printed as a plain {value: count} mapping.
target_counts = pd.Series(y_train).value_counts()
print("y_train distribution:", dict(target_counts))


X_train shape: (307507, 765)
y_train distribution: {0: 282682, 1: 24825}


In [4]:
# CELL 3 — Baseline statistics per feature (numeric columns only)

percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]

# Short labels for the quantile levels above, in the same order.
quantile_labels = dict(zip(percentiles, ["p1", "p5", "p50", "p95", "p99"]))

# Columns retained in the final table, in output order.
cols_keep = ["min", "max", "mean", "std", *quantile_labels.values()]

# Restrict the frame to numeric (int/float) columns.
num_cols = X_train.select_dtypes(include=[np.number]).columns
X_num = X_train[num_cols]

print("Nombre de colonnes numériques:", len(num_cols))

# Standard descriptive stats, transposed so each feature is a row
# (count, mean, std, min, 25%, 50%, 75%, max).
desc = X_num.describe().T

# Requested quantiles, transposed the same way; columns are the float levels.
q = X_num.quantile(percentiles).T

# Join both tables on the feature index, relabel the quantile columns,
# then keep only the columns of interest.
meta = (
    desc.join(q, rsuffix="_q")
    .rename(columns=quantile_labels)
    .loc[:, cols_keep]
)

print("Meta head:")
meta.head()


Nombre de colonnes numériques: 619
Meta head:


Unnamed: 0,min,max,mean,std,p1,p5,p50,p95,p99
CODE_GENDER,0.0,1.0,0.658352,0.474263,0.0,0.0,1.0,1.0,1.0
FLAG_OWN_CAR,0.0,1.0,0.340106,0.473745,0.0,0.0,0.0,1.0,1.0
FLAG_OWN_REALTY,0.0,1.0,0.306331,0.46097,0.0,0.0,0.0,1.0,1.0
CNT_CHILDREN,0.0,19.0,0.417047,0.722119,0.0,0.0,0.0,2.0,3.0
AMT_INCOME_TOTAL,25650.0,117000000.0,168797.685779,237124.62732,45000.0,67500.0,147150.0,337500.0,472500.0


In [5]:
# CELL 4 — Build the feature_meta dictionary
#
# For every numeric column, collect its summary statistics from `meta` and
# derive the values a UI slider needs: a lower bound, an upper bound and a
# default position.

feature_meta = {}

for col in X_num.columns:  # iterate over the numeric columns only
    s = meta.loc[col]

    # Coarse dtype label: "int" for integer columns, "float" for everything else.
    dtype = "int" if pd.api.types.is_integer_dtype(X_num[col].dtype) else "float"

    # Cast to built-in floats so the values are JSON-serialisable.
    f_min = float(s["min"])
    f_max = float(s["max"])
    p1   = float(s["p1"])
    p5   = float(s["p5"])
    p50  = float(s["p50"])
    p95  = float(s["p95"])
    p99  = float(s["p99"])
    mean = float(s["mean"])
    # A NaN standard deviation is stored as 0.0.
    std  = float(s["std"]) if not np.isnan(s["std"]) else 0.0

    # Slider range: by default clip to [p1, p99] so extreme outliers do not
    # stretch the slider.
    slider_min, slider_max = p1, p99
    if not slider_min < slider_max:
        # p1 == p99 happens for (near-)constant features, e.g. a rare binary
        # flag where both percentiles are 0 although max is 1. A zero-width
        # slider would make the other observed values unreachable, so fall
        # back to the full observed range instead.
        slider_min, slider_max = f_min, f_max

    # The median always lies inside both [p1, p99] and [min, max].
    default = p50

    feature_meta[col] = {
        "dtype": dtype,
        "min": f_min,
        "max": f_max,
        "p1": p1,
        "p5": p5,
        "p50": p50,
        "p95": p95,
        "p99": p99,
        "mean": mean,
        "std": std,
        "slider_min": slider_min,
        "slider_max": slider_max,
        "default": default,
    }

print("Nombre de features dans feature_meta:", len(feature_meta))
if feature_meta:
    # Show the first entry as a quick visual check (skipped when there are
    # no numeric columns, where indexing would otherwise fail).
    example_key = next(iter(feature_meta))
    print("\nExemple pour", example_key, ":\n", feature_meta[example_key])


Nombre de features dans feature_meta: 619

Exemple pour CODE_GENDER :
 {'dtype': 'int', 'min': 0.0, 'max': 1.0, 'p1': 0.0, 'p5': 0.0, 'p50': 1.0, 'p95': 1.0, 'p99': 1.0, 'mean': 0.658352492788782, 'std': 0.4742628168801094, 'slider_min': 0.0, 'slider_max': 1.0, 'default': 1.0}


In [6]:
# CELL 5 — Save feature_meta.json

# Serialise first, then write the text in one go; the file is closed by the
# context manager even if the write fails.
serialized_meta = json.dumps(feature_meta, indent=2)
with open(FEATURE_META_PATH, "w") as out_file:
    out_file.write(serialized_meta)

print(f"✅ feature_meta.json sauvegardé à : {FEATURE_META_PATH}")


✅ feature_meta.json sauvegardé à : feature_meta.json


In [7]:
# CELL 6 — (Optional) Sanity check on a handful of features

# Features whose metadata entry is printed for manual inspection.
CHECK_FEATURES = [
    "PAYMENT_RATE",
    "EXT_SOURCE_3",
    "EXT_SOURCE_1",
    "DAYS_BIRTH",
    "AMT_CREDIT",
]

for feature_name in CHECK_FEATURES:
    # Warn and move on when the feature has no entry in feature_meta.
    if feature_name not in feature_meta:
        print("\n⚠️", feature_name, "n'est pas dans X_train (ou meta).")
        continue

    print("\n----", feature_name, "----")
    print(feature_meta[feature_name])



---- PAYMENT_RATE ----
{'dtype': 'float', 'min': 0.0, 'max': 0.12442957609891891, 'p1': 0.026458825916051865, 'p5': 0.028950000181794167, 'p50': 0.05000000074505806, 'p95': 0.10500312596559525, 'p99': 0.11480945348739624, 'mean': 0.053692951798439026, 'std': 0.02248341403901577, 'slider_min': 0.026458825916051865, 'slider_max': 0.11480945348739624, 'default': 0.05000000074505806}

---- EXT_SOURCE_3 ----
{'dtype': 'float', 'min': 0.0, 'max': 0.8960095643997192, 'p1': 0.0, 'p5': 0.0, 'p50': 0.4596904516220093, 'p95': 0.7776594161987305, 'p99': 0.8327850103378296, 'mean': 0.4095761477947235, 'std': 0.26817792654037476, 'slider_min': 0.0, 'slider_max': 0.8327850103378296, 'default': 0.4596904516220093}

---- EXT_SOURCE_1 ----
{'dtype': 'float', 'min': 0.0, 'max': 0.9626927971839905, 'p1': 0.0, 'p5': 0.0, 'p50': 0.0, 'p95': 0.7742510259151459, 'p99': 0.8661079609394075, 'mean': 0.21902303397655487, 'std': 0.28537365794181824, 'slider_min': 0.0, 'slider_max': 0.8661079609394075, 'default': 