# Nettoyage — Source 1 (Manga Sanctuary Volumes)

Objectif (C3) : produire une base **joinable** propre à partir de `manga_sanctuary_volumes*.jsonl`.

Sorties :
- `ms_series_clean`
- `ms_volumes_clean`

Étapes :
1) lecture robuste + `__line__` + rejets
2) normalisation types (id, numéro, url)
3) nettoyage textes (`series_synopsis`, `volume_synopsis`)
4) dédup (series_id, volume_url)
5) suppression des colonnes suffixées `.1` lorsqu'elles sont des doublons (ex. : `volume_url.1`)



## Nettoyage de `manga_sanctuary_volumes*.jsonl`

### Setup et Chemins

In [42]:
import json, re, ast
from pathlib import Path
import pandas as pd


## Dépendances Parquet (export systématique Parquet + CSV)

Ce notebook exporte **toujours** en **CSV** et tente d'exporter en **Parquet**.
Pour garantir Parquet, on installe `pyarrow` si nécessaire.



In [43]:
import importlib
import sys

def ensure_pyarrow():
    """Make sure ``pyarrow`` can be imported, installing it with pip if needed.

    Side effects: may run ``pip install pyarrow`` in a subprocess of the
    current interpreter, and prints a warning when that fails.

    Returns:
        bool: True when ``pyarrow`` is importable (already present or freshly
        installed), False when the installation or the import failed.
    """
    import importlib

    try:
        import pyarrow  # noqa: F401
        return True
    except Exception:
        # Missing or broken install: fall through and try to (re)install.
        pass

    # Install through pip (needs an environment with access to packages)
    try:
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow"])
        # The import system caches directory contents; without this call the
        # package installed a moment ago may not be visible to this process.
        importlib.invalidate_caches()
        import pyarrow  # noqa: F401
        return True
    except Exception as e:
        print("⚠️ Impossible d'installer pyarrow automatiquement :", repr(e))
        print("➡️ Le CSV sera bien produit, mais Parquet peut échouer tant que pyarrow/fastparquet n'est pas installé.")
        return False

# Resolve Parquet support once and show the result.
# NOTE(review): no later cell visible in this file reads PARQUET_READY — confirm
# whether the Parquet export is meant to be gated on it.
PARQUET_READY = ensure_pyarrow()
print("PARQUET_READY =", PARQUET_READY)



PARQUET_READY = True


In [44]:
from pathlib import Path

# The project root defaults to the current working directory.
PROJECT_ROOT = Path.cwd()
# If the notebook was started from a sub-folder (no local data/ but the parent
# has one), move up one level.
if not (PROJECT_ROOT / "data").exists() and (PROJECT_ROOT.parent / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

def find_raw_jsonl(project_root: Path) -> Path:
    """Locate the raw volumes JSONL file.

    The known file names are tried, in order, under ``project_root / "data"``,
    then ``/mnt/data`` (sandbox-like environments), then ``/data``. If none is
    found, the first (alphabetically) ``*volumes*.jsonl`` in
    ``project_root / "data"`` is used.

    Only regular files are accepted, so a directory that happens to carry one
    of the expected names is skipped instead of being returned.

    Args:
        project_root: Folder expected to contain the ``data`` directory.

    Returns:
        Path of the first matching file.

    Raises:
        FileNotFoundError: When no candidate file exists.
    """
    names = (
        "manga_sanctuary_volumes (1).jsonl",
        "manga_sanctuary_volumes.jsonl",
        "manga_sanctuary_volumes(1).jsonl",
    )
    data_dir = project_root / "data"

    for directory in (data_dir, Path("/mnt/data"), Path("/data")):
        for name in names:
            candidate = directory / name
            if candidate.is_file():
                return candidate

    # Fallback: first JSONL in data/ whose name contains 'volumes'
    if data_dir.is_dir():
        for candidate in sorted(data_dir.glob("*volumes*.jsonl")):
            if candidate.is_file():
                return candidate

    raise FileNotFoundError(
        "Impossible de trouver manga_sanctuary_volumes*.jsonl (cherché dans data/, /mnt/data, /data)."
    )

# Resolve the input file once and show which one was picked.
RAW_PATH = find_raw_jsonl(PROJECT_ROOT)
print("RAW_PATH =", RAW_PATH)

# All outputs of this notebook go to a staging folder under the project root
# (created if it does not exist yet).
OUT_DIR  = PROJECT_ROOT / "out_ms_staging"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Parquet outputs
CLEAN_VOLUMES_PATH = OUT_DIR / "ms_volumes_clean.parquet"
CLEAN_SERIES_PATH  = OUT_DIR / "ms_series_clean.parquet"

# CSV outputs (always written)
VOLUMES_CSV_PATH = OUT_DIR / "ms_volumes_clean.csv"
SERIES_CSV_PATH  = OUT_DIR / "ms_series_clean.csv"

# Audit files
REJECTED_PATH = OUT_DIR / "ms_volumes_rejected.jsonl"
STATS_PATH    = OUT_DIR / "ms_volumes_stats.json"



RAW_PATH = /home/maxime/python/certification/preparation_bdd/data/manga_sanctuary_volumes (1).jsonl


### Lecture et rejet des lignes invalides

In [45]:
valid_rows = []
rejected = []

# Read the raw file line by line (1-based line numbers). Each line must hold
# one JSON object; anything else is logged in `rejected` with its line number.
with RAW_PATH.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            rejected.append({"line": i, "reason": "empty_line"})
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            rejected.append({"line": i, "reason": "invalid_json"})
            continue
        if not isinstance(obj, dict):
            # Valid JSON but not an object (bare list, number, string...):
            # it cannot carry "__line__" and is not a record. The reason stays
            # "invalid_json" so existing counters on that reason include it.
            rejected.append({"line": i, "reason": "invalid_json", "detail": "not_an_object"})
            continue
        obj["__line__"] = i  # audit traceability
        valid_rows.append(obj)

print("valid_rows:", len(valid_rows))
print("rejected_rows:", len(rejected))


valid_rows: 89188
rejected_rows: 0


### DataFrame + aperçu avant nettoyage

In [47]:
# Build the raw DataFrame from the parsed records, then show its shape and
# the first rows (last expression = notebook display).
df_raw = pd.json_normalize(valid_rows)
print("shape:", df_raw.shape)
df_raw.head(3)




shape: (89188, 39)


Unnamed: 0,series_id,series_url,series_title,series_type,series_category,series_year,series_other_titles,series_dessinateur,series_scenariste,series_genres,...,volume_country,volume_status,volume_tomes_published,volume_tomes_total,volume_members_rating,volume_members_votes,volume_experts_rating,volume_experts_votes,volume_synopsis,__line__
0,78152,https://www.manga-sanctuary.com/bdd/manga/7815...,Touken ranbu -ONLINE- Anthology ~ Honmaru Ranm...,Manga,2019,2019,[刀剣乱舞-ONLINE- アンソロジー ~本丸爛漫日和~],Kyôko MAKI,Niko WAKUHARA,[],...,pays édition,Complète,1.0,1.0,,0.0,,0.0,刀剣男士達を華麗に描く公式アンソロジー! 本丸でのほのぼのエピソードから、合戦場でのかっこい...,1
1,12139,https://www.manga-sanctuary.com/bdd/manhwa/121...,Blast,Manhwa,Sonyun,2007,[],Kangho PARK,Ha na LEE,[],...,,,,,,0.0,,0.0,,2
2,12139,https://www.manga-sanctuary.com/bdd/manhwa/121...,Blast,Manhwa,Sonyun,2007,[],Kangho PARK,Ha na LEE,[],...,,,,,,0.0,,0.0,,3


### Règles “champs obligatoires” (et rejets)

In [48]:
REQUIRED = ["series_id", "series_url", "series_title", "volume_url", "volume_title"]

# A row is invalid when any required field is NA or a blank/whitespace-only
# string. Blanks are checked on every required column (not only volume_title),
# which matches the per-row reasons computed below and prevents blank join
# keys from being accepted here and turned into nulls by later text cleaning.
missing_by_col = pd.DataFrame(
    {
        c: df_raw[c].isna() | df_raw[c].astype(str).str.strip().eq("")
        for c in REQUIRED
    }
)
missing_mask = missing_by_col.any(axis=1)

df_bad = df_raw[missing_mask].copy()
df_ok  = df_raw[~missing_mask].copy()

# Detailed log: reasons + source line for every rejected row
for _, row in df_bad.iterrows():
    reasons = []
    for c in REQUIRED:
        v = row.get(c)
        if pd.isna(v) or (isinstance(v, str) and v.strip() == ""):
            reasons.append(f"missing_{c}")
    rejected.append({
        "line": int(row["__line__"]) if pd.notna(row.get("__line__")) else None,
        "reason": "missing_required_fields",
        "reasons": reasons
    })

print("kept_after_required:", df_ok.shape[0])
print("rejected_after_required:", df_bad.shape[0])



kept_after_required: 89129
rejected_after_required: 59


### homogénéisation

In [49]:
def to_int(s):
    """Convert a Series to the nullable ``Int64`` dtype.

    Values that cannot be parsed as numbers become NA. Values that are
    numeric but cannot be an integer (fractional such as ``"2.5"``, or
    infinite) are also set to NA instead of making the cast raise.
    """
    num = pd.to_numeric(s, errors="coerce")
    try:
        return num.astype("Int64")
    except (TypeError, ValueError):
        # The cast refuses non-whole floats: keep whole values only.
        as_float = num.astype("float64")
        return as_float.where(as_float % 1 == 0).astype("Int64")

def to_float(s):
    """Convert *s* to the nullable ``Float64`` dtype; unparseable values are coerced to missing."""
    numeric = pd.to_numeric(s, errors="coerce")
    return numeric.astype("Float64")

df = df_ok.copy()

# series_id is a required field, so it is converted unconditionally.
df["series_id"] = to_int(df["series_id"])

# Optional numeric columns: column name -> converter. A column absent from the
# source is skipped; passing the None returned by df.get() to the converters
# would fail on the dtype cast.
NUMERIC_CASTS = {
    "series_year": to_int,
    "series_popularity_rank": to_int,
    "series_members_rating": to_float,
    "series_experts_rating": to_float,
    "series_members_votes": to_int,
    "series_experts_votes": to_int,
    "volume_number": to_int,
    "volume_pages": to_int,
}
for col, cast in NUMERIC_CASTS.items():
    if col in df.columns:
        df[col] = cast(df[col])

# Business rule: 0 pages => null
if "volume_pages" in df.columns:
    df.loc[df["volume_pages"] == 0, "volume_pages"] = pd.NA

df.dtypes.head(38)



series_id                    Int64
series_url                  object
series_title                object
series_type                 object
series_category             object
series_year                  Int64
series_other_titles         object
series_dessinateur          object
series_scenariste           object
series_genres               object
series_tags                 object
series_mag_prepub           object
series_statuses             object
series_popularity_rank       Int64
series_members_rating      Float64
series_members_votes         Int64
series_experts_rating      Float64
series_experts_votes         Int64
series_synopsis             object
series_related_works        object
volume_url                  object
volume_title                object
volume_number                Int64
volume_publication_date     object
volume_dessinateur          object
volume_scenariste           object
volume_editeur              object
volume_format               object
volume_pages        

### Nettoyage texte

In [50]:
def clean_text(x):
    """Return a tidy string for *x*, or None when it carries no text.

    - None, ``pd.NA`` and float NaN give None;
    - any other value is converted with ``str()``;
    - non-breaking spaces become regular spaces, runs of whitespace collapse
      to a single space, and the ends are trimmed;
    - the placeholders "na", "n/a", "none" (any case) and "" give None.
    """
    # pd.NA must be caught here: str(pd.NA) would otherwise yield "<NA>".
    if x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x)):
        return None
    s = str(x).replace("\u00a0", " ").strip()
    s = re.sub(r"\s+", " ", s)
    # placeholders -> null
    if s.lower() in {"na", "n/a", "none", ""}:
        return None
    return s

# Columns passed through clean_text (only those present in df are touched).
TEXT_COLS = [
    # titles / urls (join keys)
    "series_title", "series_url",
    "volume_title", "volume_url",

    # synopses (important for RAG)
    "series_synopsis", "volume_synopsis",

    # other text fields
    "series_mag_prepub",
    "volume_country", "volume_format",
]

for c in TEXT_COLS:
    if c in df.columns:
        df[c] = df[c].map(clean_text)

# Preview of the main cleaned text columns (notebook display).
df[[c for c in ["series_title","volume_title","series_url","volume_url","series_synopsis","volume_synopsis"] if c in df.columns]].head(5)



Unnamed: 0,series_title,volume_title,series_url,volume_url,series_synopsis,volume_synopsis
0,Touken ranbu -ONLINE- Anthology ~ Honmaru Ranm...,Touken ranbu -ONLINE- Anthology ~ Honmaru Ranm...,https://www.manga-sanctuary.com/bdd/manga/7815...,https://www.manga-sanctuary.com/manga-touken-r...,,刀剣男士達を華麗に描く公式アンソロジー! 本丸でのほのぼのエピソードから、合戦場でのかっこい...
1,Blast,Blast 5 (Haksan),https://www.manga-sanctuary.com/bdd/manhwa/121...,https://www.manga-sanctuary.com/manhwa-blast-v...,Kael (alias Ryu Shiho) était une menace sous s...,
2,Blast,Blast 4 (Haksan),https://www.manga-sanctuary.com/bdd/manhwa/121...,https://www.manga-sanctuary.com/manhwa-blast-v...,Kael (alias Ryu Shiho) était une menace sous s...,
3,Blast,Blast 3 (Haksan),https://www.manga-sanctuary.com/bdd/manhwa/121...,https://www.manga-sanctuary.com/manhwa-blast-v...,Kael (alias Ryu Shiho) était une menace sous s...,
4,Blast,Blast 2 (Haksan),https://www.manga-sanctuary.com/bdd/manhwa/121...,https://www.manga-sanctuary.com/manhwa-blast-v...,Kael (alias Ryu Shiho) était une menace sous s...,


### nettoyage series_category

In [51]:
import pandas as pd
import re
import unicodedata

# 1) Mapping de normalisation (variantes -> canonique)
# 1) Normalisation mapping (variant -> canonical label).
#    Keys are looked up with the output of _norm_key, so they must be written
#    lower-case and without accents.
MAP_CAT = {
    "sonyun": "Shonen",
    "shounen": "Shonen",
    "sunjung": "Shojo",
    "chungnyun": "Seinen",
}

# 2) Allowed categories (quality control)
ALLOWED = {
    "Shonen", "Seinen", "Shojo", "Josei", "Kodomo",
    "Yaoi", "Yuri", "Shonen-Aï", "Shojo-Aï",
    "Ecchi-Hentai", "Parodie"
}

def _norm_key(x: str) -> str:
    """Normalise pour comparaison : trim + minuscules + sans accents + espaces simples."""
    x = str(x).strip()
    x = unicodedata.normalize("NFKD", x)
    x = "".join(ch for ch in x if not unicodedata.combining(ch))
    x = re.sub(r"\s+", " ", x).strip().lower()
    return x

def extract_year(x):
    """Return the year when the category value is in fact a year.

    Args:
        x: Raw category value (any type; converted with ``str()``).

    Returns:
        The integer year when the trimmed value consists only of decimal
        digits and lies in 1900..2100; ``pd.NA`` otherwise (including for
        None and float NaN).
    """
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return pd.NA
    s = str(x).strip()
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscripts ("¹") that int() cannot parse (ValueError).
    if s.isdecimal():
        n = int(s)
        if 1900 <= n <= 2100:
            return n
    return pd.NA

# Accent- and case-insensitive lookup: normalised key -> canonical label.
# Built once from the allowed labels; the known variants in MAP_CAT win.
_CANON_BY_KEY = {**{_norm_key(label): label for label in ALLOWED}, **MAP_CAT}


def clean_category(x):
    """Clean a raw series category.

    - None, NaN and blank values give None;
    - a value that is really a year (see extract_year) gives None;
    - known variants and case/accent variants of an allowed label are mapped
      to the canonical label (e.g. "sonyun" -> "Shonen",
      "shonen-ai" -> "Shonen-Aï");
    - any other value is returned trimmed and otherwise unchanged, so it can
      be flagged later by comparing against ALLOWED.
    """
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    s = str(x).strip()
    if s == "":
        return None

    # A year is not a category.
    if pd.notna(extract_year(s)):
        return None

    # Whole-label lookup only: unknown values are never partially rewritten.
    return _CANON_BY_KEY.get(_norm_key(s), s)

# Colonnes résultantes
# Resulting columns: the year found in place of a category (if any), and the
# cleaned category.
df["series_category_year_guess"] = df["series_category"].map(extract_year).astype("Int64")
df["series_category_clean"] = df["series_category"].map(clean_category)

# Quality flag (useful for C3): True when the cleaned value is in ALLOWED.
# Null categories are therefore counted as not allowed.
df["series_category_is_allowed"] = df["series_category_clean"].isin(ALLOWED)

# KPI / evidence
print("non-null clean%:", round((df["series_category_clean"].notna().mean()) * 100, 2))
print("year_guess non-null%:", round((df["series_category_year_guess"].notna().mean()) * 100, 2))
print("allowed%:", round((df["series_category_is_allowed"].mean()) * 100, 2))

# Most frequent cleaned categories (notebook display).
df["series_category_clean"].value_counts().head(15)


non-null clean%: 97.43
year_guess non-null%: 2.48
allowed%: 97.43


series_category_clean
Shonen          32981
Seinen          29643
Shojo           13843
Yaoi             4485
Josei            3413
Ecchi-Hentai     1240
Kodomo            412
Shonen-Aï         339
Yuri              314
Parodie           106
Shojo-Aï           62
Name: count, dtype: int64

### Normalisation des listes

In [52]:
def clean_list(v):
    """Turn a value into a list of clean, de-duplicated strings.

    - None / NaN -> []
    - str -> treated as a one-item list
    - list -> each item cleaned with clean_text, empty results dropped
    - any other type -> []
    - duplicates are removed case-insensitively; the first spelling is kept
    """
    if v is None or (isinstance(v, float) and pd.isna(v)):
        return []
    if isinstance(v, str):
        items = [v]
    elif isinstance(v, list):
        items = v
    else:
        return []

    result = []
    seen_keys = set()
    for raw in items:
        text = clean_text(raw)
        if not text:
            continue
        key = text.lower()
        if key in seen_keys:
            continue
        seen_keys.add(key)
        result.append(text)
    return result

LIST_COLS = ["series_other_titles", "series_genres", "series_tags"]
for c in LIST_COLS:
    if c in df.columns:
        df[c] = df[c].map(clean_list)

# KPI (evidence): share of rows whose list is not empty.
# The values are also kept in `kpi_lists`, the name the stats export looks up;
# without it the exported stats always contain "kpi_lists": null.
kpi_lists = {}
for c in ["series_genres", "series_tags"]:
    if c in df.columns:
        kpi_lists[c] = {"non_empty_rate": float(round((df[c].map(len) > 0).mean(), 6))}
        print(c, "non-empty rate =", kpi_lists[c]["non_empty_rate"])



series_genres non-empty rate = 0.0
series_tags non-empty rate = 0.0


### Déduplication volumes

In [53]:
# Keep one row per volume_url (the first occurrence) and report the row count
# before and after.
before = len(df)
df = df.drop_duplicates(subset=["volume_url"], keep="first").copy()
after = len(df)
print("before:", before, "after:", after)



before: 89129 after: 89129


### Split “Series” vs “Volumes”

In [54]:
def unique_cols(cols):
    """Return the items of *cols* without repeats, in first-seen order."""
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(cols))

# Column sets: every "series_*" column for the series table; every "volume_*"
# column plus series_id (used as the link to the series) for the volumes table.
SERIES_COLS = unique_cols([c for c in df.columns if c.startswith("series_")])
VOLUME_COLS = unique_cols([c for c in df.columns if c.startswith("volume_")] + ["series_id"])

# One row per series_id / per volume_url (first occurrence kept).
ms_series = df[SERIES_COLS].drop_duplicates(subset=["series_id"]).copy()
ms_volumes = df[VOLUME_COLS].drop_duplicates(subset=["volume_url"]).copy()

print("ms_series:", ms_series.shape)
print("ms_volumes:", ms_volumes.shape)
print("series columns unique?", ms_series.columns.is_unique)
print("volumes columns unique?", ms_volumes.columns.is_unique)


ms_series: (13208, 23)
ms_volumes: (89129, 19)
series columns unique? True
volumes columns unique? True


### Nettoyage des colonnes doublonnées (ex: `volume_url` / `volume_url.1`)

Certains exports CSV peuvent introduire des colonnes suffixées `.1` lorsque des noms sont dupliqués.
On supprime `X.1` si elle est identique à `X`, sinon on la renomme en `X_alt`.


In [55]:
def drop_duplicate_dot1_cols(df):
    """Resolve columns whose name ends in '.1' against their base column.

    For each column ``X.1``:
    - ``X`` exists and holds equal data -> ``X.1`` is dropped;
    - ``X`` exists with different data -> ``X.1`` is renamed ``X_alt``
      (``X_alt2``, ``X_alt3``... when that name is already taken);
    - ``X`` does not exist -> ``X.1`` is renamed ``X``.

    Returns:
        (df, info) where info is {"dropped": [...], "renamed": {old: new}}.
        The input frame is returned as-is when nothing had to change.
    """
    to_drop = []
    to_rename = {}

    for name in list(df.columns):
        if not (isinstance(name, str) and name.endswith('.1')):
            continue
        stem = name[:-2]

        # No base column: the suffixed column simply takes the base name.
        if stem not in df.columns:
            to_rename[name] = stem
            continue

        # Any failure while comparing is treated as "different".
        try:
            identical = df[stem].equals(df[name])
        except Exception:
            identical = False

        if identical:
            to_drop.append(name)
            continue

        # Find a free "<stem>_alt" name, numbering from 2 on collision.
        candidate = f"{stem}_alt"
        n = 1
        while candidate in df.columns or candidate in to_rename.values():
            n += 1
            candidate = f"{stem}_alt{n}"
        to_rename[name] = candidate

    if to_rename:
        df = df.rename(columns=to_rename)
    if to_drop:
        df = df.drop(columns=to_drop)

    return df, {"dropped": to_drop, "renamed": to_rename}

# Apply the '.1' cleanup to both tables and keep the report of what changed.
ms_series, _dup_series = drop_duplicate_dot1_cols(ms_series)
ms_volumes, _dup_volumes = drop_duplicate_dot1_cols(ms_volumes)

print("Duplicate-columns cleanup (series):", _dup_series)
print("Duplicate-columns cleanup (volumes):", _dup_volumes)

# Stop here if any column name is still duplicated.
assert ms_series.columns.is_unique, "Colonnes dupliquées dans ms_series"
assert ms_volumes.columns.is_unique, "Colonnes dupliquées dans ms_volumes"

# (optional) targeted check
print("volume_url.1 present?", "volume_url.1" in ms_volumes.columns)


Duplicate-columns cleanup (series): {'dropped': [], 'renamed': {}}
Duplicate-columns cleanup (volumes): {'dropped': [], 'renamed': {}}
volume_url.1 present? False


### Contrôles qualité (KPI)

In [56]:
# Quality-control indicators computed on the two final tables. Values are
# converted to plain Python types (bool/float/int). The optional-column rates
# are None when the column is absent, and the average is None when there is
# no series.
qc = {
    "volume_url_unique": bool(ms_volumes["volume_url"].is_unique),
    "series_id_null_rate": float(ms_volumes["series_id"].isna().mean()),
    "volume_number_null_rate": float(ms_volumes["volume_number"].isna().mean()) if "volume_number" in ms_volumes else None,
    "volume_pages_null_rate": float(ms_volumes["volume_pages"].isna().mean()) if "volume_pages" in ms_volumes else None,
    "series_count": int(ms_series.shape[0]),
    "volumes_count": int(ms_volumes.shape[0]),
    "avg_volumes_per_series": float(round(ms_volumes.shape[0] / ms_series.shape[0], 4)) if ms_series.shape[0] else None,
}
qc


{'volume_url_unique': True,
 'series_id_null_rate': 0.0,
 'volume_number_null_rate': 0.05765800132392375,
 'volume_pages_null_rate': 0.5108438330958498,
 'series_count': 13208,
 'volumes_count': 89129,
 'avg_volumes_per_series': 6.7481}

## Préparation des exports Postgres JSONB + écriture des rejets

- Les colonnes de type **liste / dict** doivent être sérialisées en **JSON valide** pour pouvoir être castées en `jsonb` dans Postgres.
- On conserve les DataFrames `ms_series` / `ms_volumes` tels quels pour les étapes suivantes, et on prépare des copies `*_pg` dédiées à l’export CSV.
- On écrit aussi le fichier `ms_volumes_rejected.jsonl` (audit C3).



In [57]:
import json as pyjson

def jsonb_ready_copy(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose list/dict columns hold valid JSON text.

    A column is treated as list/dict when at least one of its first 20
    non-null values is a list or a dict. In such a column every list/dict is
    serialised as JSON and every other value (None, NaN, scalars) is written
    as "[]". Other columns, and the input frame, are left unchanged.
    """
    def _dump(value):
        payload = value if isinstance(value, (list, dict)) else []
        return pyjson.dumps(payload, ensure_ascii=False)

    out = df.copy()
    for col in out.columns:
        # Quick detection on a small sample of non-null values.
        probe = out[col].dropna().head(20)
        if probe.empty:
            continue
        if not any(isinstance(v, (list, dict)) for v in probe):
            continue
        out[col] = out[col].map(_dump)
    return out

# évite les colonnes redondantes *_json si la colonne de base est déjà une list
# Avoid redundant *_json columns when the base column already holds lists.
for base in ["series_other_titles", "series_statuses", "series_related_works"]:
    jcol = base + "_json"
    # Explicit guard for the empty table instead of swallowing every error.
    if base in ms_series.columns and jcol in ms_series.columns and len(ms_series) > 0:
        # The first row decides whether the base column is list-typed.
        if isinstance(ms_series[base].iloc[0], list):
            ms_series = ms_series.drop(columns=[jcol])

ms_series_pg  = jsonb_ready_copy(ms_series)
ms_volumes_pg = jsonb_ready_copy(ms_volumes)

# Rejects audit (C3)
# 'rejected' already holds the invalid-JSON rejects + the required-field rejects
with REJECTED_PATH.open("w", encoding="utf-8") as f:
    for r in rejected:
        f.write(pyjson.dumps(r, ensure_ascii=False) + "\n")

print("rejected_written_to:", REJECTED_PATH, "count:", len(rejected))



rejected_written_to: /home/maxime/python/certification/preparation_bdd/out_ms_staging/ms_volumes_rejected.jsonl count: 59


In [58]:
import pandas as pd

# Volume columns converted to nullable numeric dtypes (only those present).
# Unparseable values become missing.
# NOTE(review): the `ms_volumes_pg` copy created in the previous cell predates
# these casts, so it does not carry these dtypes.
INT_COLS = ["volume_number","volume_pages","volume_members_votes","volume_experts_votes",
            "volume_tomes_published","volume_tomes_total"]
FLOAT_COLS = ["volume_members_rating","volume_experts_rating"]

for c in INT_COLS:
    if c in ms_volumes.columns:
        ms_volumes[c] = pd.to_numeric(ms_volumes[c], errors="coerce").astype("Int64")

for c in FLOAT_COLS:
    if c in ms_volumes.columns:
        ms_volumes[c] = pd.to_numeric(ms_volumes[c], errors="coerce").astype("Float64")


### Exports + stats + rejets

In [59]:
import json as pyjson

# garde-fous (évite l'erreur pyarrow "Duplicate column names")
# Guard rails (avoids the pyarrow "Duplicate column names" error)
assert ms_series.columns.is_unique, "Colonnes dupliquées dans ms_series"
assert ms_volumes.columns.is_unique, "Colonnes dupliquées dans ms_volumes"

# Rebuild the JSONB-ready copies right before writing. ms_volumes is re-typed
# (nullable Int64/Float64) after the first copies are taken, so exporting the
# earlier copies would make the CSV disagree with the Parquet files.
ms_series_pg  = jsonb_ready_copy(ms_series)
ms_volumes_pg = jsonb_ready_copy(ms_volumes)

# 1) CSV (always) — JSONB-ready version
ms_series_pg.to_csv(SERIES_CSV_PATH, index=False)
ms_volumes_pg.to_csv(VOLUMES_CSV_PATH, index=False)

# 2) Parquet (always attempted) — keeps native types when possible.
#    Failures are recorded in the stats file instead of stopping the export.
parquet_errors = {}
try:
    ms_series.to_parquet(CLEAN_SERIES_PATH, index=False)
except Exception as e:
    parquet_errors["ms_series"] = repr(e)

try:
    ms_volumes.to_parquet(CLEAN_VOLUMES_PATH, index=False)
except Exception as e:
    parquet_errors["ms_volumes"] = repr(e)

# C3 stats (audit + evidence)
stats_obj = {
    "raw_valid_rows": int(len(valid_rows)),
    "raw_rejected_rows": int(sum(1 for r in rejected if r.get("reason") in {"empty_line", "invalid_json"})),
    "rejected_total": int(len(rejected)),
    "kept_after_required_fields": int(df_ok.shape[0]) if "df_ok" in globals() else None,
    "series_after_dedup": int(ms_series.shape[0]),
    "volumes_after_dedup": int(ms_volumes.shape[0]),
    "qc": qc if "qc" in globals() else None,
    "kpi_lists": kpi_lists if "kpi_lists" in globals() else None,
    "exports": {
        "csv": {"series": str(SERIES_CSV_PATH), "volumes": str(VOLUMES_CSV_PATH)},
        "parquet": {"series": str(CLEAN_SERIES_PATH), "volumes": str(CLEAN_VOLUMES_PATH)},
        "parquet_errors": parquet_errors,
        "rejected_jsonl": str(REJECTED_PATH),
    },
}

STATS_PATH.write_text(pyjson.dumps(stats_obj, ensure_ascii=False, indent=2), encoding="utf-8")

stats_obj["exports"]



{'csv': {'series': '/home/maxime/python/certification/preparation_bdd/out_ms_staging/ms_series_clean.csv',
  'volumes': '/home/maxime/python/certification/preparation_bdd/out_ms_staging/ms_volumes_clean.csv'},
 'parquet': {'series': '/home/maxime/python/certification/preparation_bdd/out_ms_staging/ms_series_clean.parquet',
  'volumes': '/home/maxime/python/certification/preparation_bdd/out_ms_staging/ms_volumes_clean.parquet'},
 'parquet_errors': {},
 'rejected_jsonl': '/home/maxime/python/certification/preparation_bdd/out_ms_staging/ms_volumes_rejected.jsonl'}