# C3 — Agrégation & exports RAG-friendly (version modifiée)

Modifs :
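- Suppression de `volume_members_rating` (et de la variante mal orthographiée `volume_membrers_rating`) : colonne inutile pour le conseiller. Toute colonne dont le nom contient `members_rating` ou `membrers_rating` est supprimée.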
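- Compteurs `review_count`, `with_body_count` et `with_date_count` de `ms_volumes_enriched` (ainsi que les compteurs `series_*` de `ms_series_enriched`) castés en **Int64** et remplis à **0** pour les volumes/séries sans review.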

Entrées (`out_ms_staging/`) :
- `ms_series_clean.csv`
- `ms_volumes_clean.csv`
- `ms_reviews_clean.csv`
- (optionnel) `ms_reviews_rag_ready.csv`



In [12]:
import json
import re
from pathlib import Path
import pandas as pd



## 0) Parquet (CSV + Parquet systématiques)



In [13]:
import sys

def ensure_pyarrow_or_fail():
    """Make sure ``pyarrow`` is importable, installing it with pip if needed.

    Returns:
        True once ``pyarrow`` has been imported successfully.

    Raises:
        RuntimeError: if ``pyarrow`` still cannot be imported after the pip
            install attempt. The underlying error is attached as the
            exception's ``__cause__``.
    """
    try:
        import pyarrow  # noqa: F401
        return True
    except Exception:
        # Deliberately broad: any import failure falls through to the
        # install attempt below instead of aborting here.
        pass
    try:
        import importlib
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow"])
        # The import system caches directory listings; refresh them so a
        # package installed while the interpreter is running gets noticed.
        importlib.invalidate_caches()
        import pyarrow  # noqa: F401
        return True
    except Exception as e:
        # Chain the original error so the full traceback stays available.
        raise RuntimeError(
            "Parquet requis mais pyarrow n'est pas disponible. "
            "Installe-le manuellement: pip install pyarrow. "
            f"Détail: {repr(e)}"
        ) from e

# Check for pyarrow up front: the export step of this notebook writes Parquet
# files, so a missing dependency should stop the run here rather than there.
PARQUET_READY = ensure_pyarrow_or_fail()
print("PARQUET_READY =", PARQUET_READY)



PARQUET_READY = True


## 1) Chemins



In [14]:
# Staging inputs that must all be present for a directory to count as the project root.
REQUIRED_STAGING_FILES = [
    "ms_series_clean.csv",
    "ms_volumes_clean.csv",
    "ms_reviews_clean.csv",
]


def find_project_root(start: Path) -> Path:
    """Return the closest directory (``start`` or an ancestor) that holds the staging inputs.

    A directory qualifies when its ``out_ms_staging`` sub-folder contains every
    file listed in ``REQUIRED_STAGING_FILES``. When no directory qualifies, the
    resolved ``start`` path is returned unchanged.
    """
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        staging_dir = candidate / "out_ms_staging"
        absent = [name for name in REQUIRED_STAGING_FILES if not (staging_dir / name).exists()]
        if not absent:
            return candidate
    return origin

# Resolve the project root by searching upwards from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())

# Default staging folder; when it (or any required file in it) is missing,
# fall back to /mnt/data if that directory exists.
STAGING = PROJECT_ROOT / "out_ms_staging"
if not STAGING.exists() or not all((STAGING / f).exists() for f in REQUIRED_STAGING_FILES):
    if Path("/mnt/data").exists():
        STAGING = Path("/mnt/data")

print("PROJECT_ROOT =", PROJECT_ROOT)
print("STAGING =", STAGING)

# Input files read from the staging folder.
SERIES_CSV  = STAGING / "ms_series_clean.csv"
VOLUMES_CSV = STAGING / "ms_volumes_clean.csv"
REVIEWS_CSV = STAGING / "ms_reviews_clean.csv"
RAGREV_CSV  = STAGING / "ms_reviews_rag_ready.csv"  # optional

# Only the three mandatory inputs are checked; the optional RAG file is not.
missing = [p for p in [SERIES_CSV, VOLUMES_CSV, REVIEWS_CSV] if not p.exists()]
if missing:
    raise FileNotFoundError(f"Fichiers manquants dans {STAGING}: {missing}")

# Outputs always go under PROJECT_ROOT, even when STAGING fell back to /mnt/data.
OUT_DIR = PROJECT_ROOT / "out_ms_final"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Enriched tables and metrics, CSV flavour.
SERIES_ENR_CSV   = OUT_DIR / "ms_series_enriched.csv"
VOLUMES_ENR_CSV  = OUT_DIR / "ms_volumes_enriched.csv"
SERIES_MET_CSV   = OUT_DIR / "ms_series_metrics.csv"
VOLUMES_MET_CSV  = OUT_DIR / "ms_volume_metrics.csv"

# Same tables, Parquet flavour.
SERIES_ENR_PARQ  = OUT_DIR / "ms_series_enriched.parquet"
VOLUMES_ENR_PARQ = OUT_DIR / "ms_volumes_enriched.parquet"
SERIES_MET_PARQ  = OUT_DIR / "ms_series_metrics.parquet"
VOLUMES_MET_PARQ = OUT_DIR / "ms_volume_metrics.parquet"

# Audit output (reviews pointing at an unknown volume) and run statistics.
ORPHAN_REVIEWS_CSV = OUT_DIR / "ms_reviews_orphan_volume_url.csv"
STATS_JSON         = OUT_DIR / "ms_c3_stats.json"

# JSON Lines documents for the RAG index.
RAG_REVIEWS_JSONL  = OUT_DIR / "rag_reviews.jsonl"
RAG_VPROF_JSONL    = OUT_DIR / "rag_volume_profiles.jsonl"

print("OUT_DIR =", OUT_DIR)



PROJECT_ROOT = /home/maxime/python/certification/preparation_bdd
STAGING = /home/maxime/python/certification/preparation_bdd/out_ms_staging
OUT_DIR = /home/maxime/python/certification/preparation_bdd/out_ms_final


## 2) Chargement (nullable dtypes)



In [15]:
def read_csv_nullable(path: Path) -> pd.DataFrame:
    """Read the CSV at ``path`` with pandas' ``numpy_nullable`` dtype backend."""
    frame = pd.read_csv(path, dtype_backend="numpy_nullable")
    return frame

# Load the three staging tables with the shared nullable-dtype reader.
ms_series = read_csv_nullable(SERIES_CSV)
ms_volumes = read_csv_nullable(VOLUMES_CSV)
ms_reviews = read_csv_nullable(REVIEWS_CSV)

for _label, _frame in (
    ("ms_series :", ms_series),
    ("ms_volumes:", ms_volumes),
    ("ms_reviews:", ms_reviews),
):
    print(_label, _frame.shape)

# Key columns must be fully populated ...
for _frame, _column in (
    (ms_series, "series_id"),
    (ms_volumes, "volume_url"),
    (ms_reviews, "volume_url"),
):
    assert not _frame[_column].isna().any()

# ... and each table's identifier must not repeat.
for _frame, _column in (
    (ms_series, "series_id"),
    (ms_volumes, "volume_url"),
    (ms_reviews, "review_url"),
):
    assert _frame[_column].is_unique



ms_series : (13208, 23)
ms_volumes: (89129, 19)
ms_reviews: (6749, 19)


## 3) Audit orphelins (volume_url absent des volumes)



In [16]:
# Reviews whose volume_url has no matching row in the volumes table.
_known_volume_urls = ms_volumes["volume_url"]
_is_orphan = ~ms_reviews["volume_url"].isin(_known_volume_urls)
orphans = ms_reviews.loc[_is_orphan].copy()
print("orphan_reviews_rows =", len(orphans))
# The audit file is only written when at least one orphan exists.
if len(orphans) > 0:
    orphans.to_csv(ORPHAN_REVIEWS_CSV, index=False)



orphan_reviews_rows = 0


## 4) KPI par volume & par série



In [17]:
def _review_kpis(grouped, prefix=""):
    """Aggregate review KPIs for each group of ``grouped``.

    ``grouped`` is a groupby over the reviews table; ``prefix`` is prepended to
    every KPI column name. The result holds, per group: review count, score
    mean/median/min/max, counts and percentages of reviews with a body and
    with a date, and the first/last review date (datetime and ISO string).
    """
    named_aggs = {
        f"{prefix}review_count": ("review_url", "count"),
        f"{prefix}score_mean": ("review_score", "mean"),
        f"{prefix}score_median": ("review_score", "median"),
        f"{prefix}score_min": ("review_score", "min"),
        f"{prefix}score_max": ("review_score", "max"),
        f"{prefix}with_body_count": ("has_body", "sum"),
        f"{prefix}with_date_count": ("has_date", "sum"),
        f"{prefix}first_review_date": ("review_date_dt", "min"),
        f"{prefix}last_review_date": ("review_date_dt", "max"),
    }
    table = grouped.agg(**named_aggs).reset_index()

    total = table[f"{prefix}review_count"]
    for what in ("body", "date"):
        share = table[f"{prefix}with_{what}_count"] / total * 100
        table[f"{prefix}with_{what}_pct"] = share.round(2)
    for edge in ("first", "last"):
        dates = table[f"{prefix}{edge}_review_date"]
        table[f"{prefix}{edge}_review_date_iso"] = dates.dt.strftime("%Y-%m-%d")
    return table


# Normalise the review columns the KPIs are computed from.
ms_reviews["review_score"] = pd.to_numeric(ms_reviews.get("review_score"), errors="coerce").astype("Float64")
ms_reviews["review_date_dt"] = pd.to_datetime(ms_reviews.get("review_date_iso"), errors="coerce")
ms_reviews["has_date"] = ms_reviews["review_date_dt"].notna()
_review_body = ms_reviews.get("review_body")
# A body counts only when it is present and not blank.
ms_reviews["has_body"] = _review_body.notna() & (_review_body.astype(str).str.strip() != "")

# KPIs per volume.
g = ms_reviews.groupby("volume_url", dropna=False)
volume_metrics = _review_kpis(g)

# KPIs per series (same measures, "series_" prefixed).
gs = ms_reviews.groupby("series_id", dropna=False)
series_metrics = _review_kpis(gs, prefix="series_")

volume_metrics.head(3), series_metrics.head(3)



(                                          volume_url  review_count  \
 0  https://www.manga-sanctuary.com/manga-008-appr...             1   
 1  https://www.manga-sanctuary.com/manga-008-appr...             1   
 2  https://www.manga-sanctuary.com/manga-008-appr...             1   
 
    score_mean  score_median  score_min  score_max  with_body_count  \
 0         8.0           8.0        8.0        8.0                1   
 1         7.0           7.0        7.0        7.0                1   
 2         6.0           6.0        6.0        6.0                1   
 
    with_date_count first_review_date last_review_date  with_body_pct  \
 0                1        2021-06-13       2021-06-13          100.0   
 1                0               NaT              NaT          100.0   
 2                1        2025-06-22       2025-06-22          100.0   
 
    with_date_pct first_review_date_iso last_review_date_iso  
 0          100.0            2021-06-13           2021-06-13  
 1      

## 5) Enrichissement + suppression



In [18]:
# volumes enrichis
def _fill_counters(frame, columns):
    """Cast each listed column present in ``frame`` to Int64, with missing values set to 0."""
    for name in columns:
        if name in frame.columns:
            as_number = pd.to_numeric(frame[name], errors="coerce")
            frame[name] = as_number.fillna(0).astype("Int64")


# Enriched volumes: volume rows plus their review KPIs (datetime columns left out).
_volume_kpis = volume_metrics.drop(columns=["first_review_date", "last_review_date"])
ms_volumes_enriched = ms_volumes.merge(_volume_kpis, on="volume_url", how="left")

# Drop the members-rating column, whatever spelling or variation it uses.
_RATING_EXACT = {"volume_members_rating", "volume_membrers_rating"}
_RATING_FRAGMENTS = ("members_rating", "membrers_rating")


def _is_rating_column(column):
    """Tell whether a column name refers to the members rating."""
    lowered = str(column).strip().lower()
    return lowered in _RATING_EXACT or any(piece in lowered for piece in _RATING_FRAGMENTS)


drop_cols = sorted({c for c in ms_volumes_enriched.columns if _is_rating_column(c)})
if drop_cols:
    ms_volumes_enriched = ms_volumes_enriched.drop(columns=drop_cols)
print("Dropped volume columns:", drop_cols)

# Volume counters -> Int64, 0 for volumes without reviews.
_fill_counters(ms_volumes_enriched, ["review_count", "with_body_count", "with_date_count"])

print(ms_volumes_enriched[["review_count", "with_body_count", "with_date_count"]].dtypes)
ms_volumes_enriched[["volume_url", "series_id", "volume_number", "review_count"]].head(5)

# Enriched series: series rows plus their volume count and review KPIs.
vol_count = ms_volumes.groupby("series_id").size().reset_index(name="series_volume_count")
_series_kpis = series_metrics.drop(columns=["series_first_review_date", "series_last_review_date"])
ms_series_enriched = ms_series.merge(vol_count, on="series_id", how="left")
ms_series_enriched = ms_series_enriched.merge(_series_kpis, on="series_id", how="left")

# Series counters -> Int64, 0 for series without reviews.
_fill_counters(
    ms_series_enriched,
    ["series_review_count", "series_with_body_count", "series_with_date_count"],
)

print(ms_series_enriched[["series_review_count", "series_with_body_count", "series_with_date_count"]].dtypes)
ms_series_enriched[["series_id", "series_review_count", "series_volume_count"]].head(5)


Dropped volume columns: ['volume_members_rating']
review_count       Int64
with_body_count    Int64
with_date_count    Int64
dtype: object
series_review_count       Int64
series_with_body_count    Int64
series_with_date_count    Int64
dtype: object


Unnamed: 0,series_id,series_review_count,series_volume_count
0,78152,0,1
1,12139,0,5
2,15537,0,1
3,57120,0,1
4,10066,0,1


### Gestion des types float64 / Int64

In [19]:
import pandas as pd

# Year columns -> nullable Int64 (values that are not numeric become NA).
for year_col in ("series_year", "series_category_year_guess"):
    if year_col not in ms_series_enriched.columns:
        continue
    as_number = pd.to_numeric(ms_series_enriched[year_col], errors="coerce")
    ms_series_enriched[year_col] = as_number.astype("Int64")

# Review counter -> nullable Int64, with missing counts replaced by 0.
if "series_review_count" in ms_series_enriched.columns:
    review_totals = pd.to_numeric(ms_series_enriched["series_review_count"], errors="coerce")
    ms_series_enriched["series_review_count"] = review_totals.fillna(0).astype("Int64")


## 6) Exports (CSV + Parquet)



In [20]:
def export_both(df: pd.DataFrame, csv_path: Path, parq_path: Path):
    """Write ``df`` to ``csv_path`` as CSV, then to ``parq_path`` as Parquet, both without the index."""
    for write, target in ((df.to_csv, csv_path), (df.to_parquet, parq_path)):
        write(target, index=False)


# (table, CSV target, Parquet target), exported in this order.
_exports = (
    (ms_volumes_enriched, VOLUMES_ENR_CSV, VOLUMES_ENR_PARQ),
    (ms_series_enriched, SERIES_ENR_CSV, SERIES_ENR_PARQ),
    (volume_metrics, VOLUMES_MET_CSV, VOLUMES_MET_PARQ),
    (series_metrics, SERIES_MET_CSV, SERIES_MET_PARQ),
)
for _table, _csv_target, _parq_target in _exports:
    export_both(_table, _csv_target, _parq_target)

print("Exported:", *(_csv_target.name for _, _csv_target, _ in _exports))



Exported: ms_volumes_enriched.csv ms_series_enriched.csv ms_volume_metrics.csv ms_series_metrics.csv


## 7) RAG docs (reviews + volume profiles) — chunking optionnel



In [21]:
# Chunking switch and default sizes (in characters) for the RAG review export.
USE_CHUNKING = False
CHUNK_CHARS = 1200
OVERLAP_CHARS = 200

def chunk_text(s: str, chunk_chars=1200, overlap=200):
    """Split ``s`` into consecutive chunks of at most ``chunk_chars`` characters.

    Args:
        s: text to split; ``None`` or an empty value is treated as "".
            Surrounding whitespace is stripped first.
        chunk_chars: maximum length of a chunk.
        overlap: number of characters at the end of a chunk that are repeated
            at the start of the next one. Must be smaller than ``chunk_chars``.

    Returns:
        A list of chunks: empty for blank input, a single element when the
        stripped text fits in one chunk.

    Raises:
        ValueError: if the text has to be split and ``chunk_chars`` is not
            positive or ``overlap`` is not smaller than ``chunk_chars``. With
            such values the window would never move forward.
    """
    s = (s or "").strip()
    if not s:
        return []
    if len(s) <= chunk_chars:
        return [s]
    # Guard against a window that cannot advance (previously an endless loop).
    if chunk_chars <= 0 or overlap >= chunk_chars:
        raise ValueError(
            f"chunk_chars must be > 0 and overlap < chunk_chars "
            f"(got chunk_chars={chunk_chars}, overlap={overlap})"
        )
    out, start = [], 0
    while start < len(s):
        end = min(len(s), start + chunk_chars)
        out.append(s[start:end])
        if end == len(s):
            break
        # Step back by `overlap` so neighbouring chunks share some context.
        start = max(0, end - overlap)
    return out

def _none_if_na(value, cast):
    """Return None for a missing value, otherwise ``cast(value)`` (JSON-friendly)."""
    return None if pd.isna(value) else cast(value)


# Source of RAG-ready reviews: the dedicated staging file when it exists,
# otherwise the reviews flagged rag_ready in the main reviews table.
if RAGREV_CSV.exists():
    rag_df = read_csv_nullable(RAGREV_CSV)
else:
    rag_df = ms_reviews.loc[ms_reviews.get("rag_ready") == True].copy()  # noqa: E712 (element-wise comparison)

# Guarantee every column used below exists, filled with NA when absent.
need_cols = ["review_url","series_id","volume_url","volume_number","review_score","review_date_iso","review_author","review_type","rag_text","rag_len"]
for c in need_cols:
    if c not in rag_df.columns:
        rag_df[c] = pd.NA

# rag_reviews.jsonl
rag_docs = []
for _, r in rag_df.iterrows():
    rid = str(r["review_url"])
    text = r["rag_text"]
    # pd.isna also catches None and float NaN, which an identity test against
    # pd.NA lets through and which would be exported as the text "None"/"nan".
    if pd.isna(text) or str(text).strip() == "":
        continue
    chunks = chunk_text(str(text), CHUNK_CHARS, OVERLAP_CHARS) if USE_CHUNKING else [str(text)]
    for ci, chunk in enumerate(chunks):
        # Chunked documents get a "#c<index>" suffix so ids stay unique.
        doc_id = f"{rid}#c{ci}" if USE_CHUNKING else rid
        md_ = {
            "review_url": rid,
            "series_id": _none_if_na(r["series_id"], int),
            "volume_url": _none_if_na(r["volume_url"], str),
            "volume_number": _none_if_na(r["volume_number"], int),
            "review_score": _none_if_na(r["review_score"], float),
            "review_date_iso": _none_if_na(r["review_date_iso"], str),
            "review_author": _none_if_na(r["review_author"], str),
            "review_type": _none_if_na(r["review_type"], str),
            "chunk_index": ci if USE_CHUNKING else None,
        }
        rag_docs.append({"id": doc_id, "text": chunk, "metadata": md_})

# One JSON document per line; ensure_ascii=False keeps accented text readable.
with RAG_REVIEWS_JSONL.open("w", encoding="utf-8") as f:
    for d in rag_docs:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

print("rag_reviews_docs_written =", len(rag_docs))

# rag_volume_profiles.jsonl
# Keep only reviews that really have text. Casting a missing value with
# astype(str) yields a literal "<NA>"/"nan" string, which could otherwise be
# picked up and quoted as a review snippet in a volume profile.
_rag_text = rag_df["rag_text"]
_has_text = _rag_text.notna() & (_rag_text.astype(str).str.strip() != "")
reviews_for_snippets = rag_df.loc[_has_text].copy()
reviews_for_snippets["review_score"] = pd.to_numeric(reviews_for_snippets["review_score"], errors="coerce").astype("Float64")
reviews_for_snippets["rag_text"] = reviews_for_snippets["rag_text"].astype(str)

def safe_str(v) -> str:
    """Return ``str(v)``, or an empty string when ``v`` is None or a missing value.

    Values for which the missing-value check itself fails (for example
    containers, whose element-wise result has no single truth value) are
    converted with ``str`` as-is.
    """
    if v is None:
        return ""
    try:
        is_missing = bool(pd.isna(v))
    except Exception:
        is_missing = False
    return "" if is_missing else str(v)

def build_volume_profile(row):
    """Compose the plain-text profile of one volume.

    ``row`` is one row of the enriched volumes table. The profile is made of,
    in order: a header (title and/or tome number), the volume synopsis (or the
    series synopsis when the volume has none), a line with the mean/median
    score and the review count, and up to three review excerpts. Sections are
    separated by a blank line.

    Reads the module-level ``ms_series_enriched`` (series synopsis lookup) and
    ``reviews_for_snippets`` (review excerpts).
    """

    def _synopsis(raw):
        # Trimmed text; the literal strings "nan"/"none" count as empty.
        cleaned = safe_str(raw).strip()
        return "" if cleaned.lower() in {"nan", "none"} else cleaned

    title = safe_str(row.get("volume_title")).strip()
    number = row.get("volume_number")
    series_id = row.get("series_id")
    url = safe_str(row.get("volume_url")).strip()

    volume_synopsis = _synopsis(row.get("volume_synopsis"))

    # Series synopsis, taken from the first series row with a matching id.
    series_synopsis = ""
    if pd.notna(series_id):
        matches = ms_series_enriched.loc[ms_series_enriched["series_id"] == series_id]
        if len(matches):
            series_synopsis = _synopsis(matches.iloc[0].get("series_synopsis"))

    # Review statistics, rendered as text.
    count = row.get("review_count")
    mean = row.get("score_mean")
    median = row.get("score_median")
    count_txt = "0" if pd.isna(count) else f"{int(count)}"
    mean_txt = "NA" if pd.isna(mean) else f"{float(mean):.2f}"
    median_txt = "NA" if pd.isna(median) else f"{float(median):.2f}"

    # Excerpts: the three best-scored reviews, longest text first on ties,
    # whitespace collapsed and cut to 420 characters.
    excerpts = []
    candidates = reviews_for_snippets.loc[reviews_for_snippets["volume_url"] == url]
    if len(candidates):
        ranked = (
            candidates.assign(snippet_len=candidates["rag_text"].str.len())
            .sort_values(by=["review_score", "snippet_len"], ascending=[False, False])
            .head(3)
        )
        excerpts = [
            re.sub(r"\s+", " ", body.strip().replace("\n", " "))[:420]
            for body in ranked["rag_text"].tolist()
        ]

    has_number = pd.notna(number)
    if title and has_number:
        header = f"{title} (Tome {int(number)})"
    elif title:
        header = title
    elif has_number:
        header = f"Volume {int(number)}"
    else:
        header = "Volume"

    sections = [header]
    if volume_synopsis:
        sections.append(f"Synopsis (volume) : {volume_synopsis}")
    elif series_synopsis:
        sections.append(f"Synopsis (série) : {series_synopsis}")
    sections.append(f"Notes (reviews) : moyenne {mean_txt} / médiane {median_txt} — {count_txt} avis")

    if excerpts:
        sections.append("Extraits d'avis :")
        sections.extend(f"- {excerpt}" for excerpt in excerpts)

    return "\n\n".join(sections).strip()

def _cast_or_none(value, cast):
    """Return None for a missing value, otherwise ``cast(value)``."""
    return None if pd.isna(value) else cast(value)


def _volume_doc(row, vol_url):
    """Build the RAG document (id, text, metadata) for one enriched volume row."""
    review_count = row.get("review_count")
    return {
        "id": vol_url,
        "text": build_volume_profile(row),
        "metadata": {
            "volume_url": vol_url,
            "series_id": _cast_or_none(row.get("series_id"), int),
            "volume_number": _cast_or_none(row.get("volume_number"), int),
            "review_count": 0 if pd.isna(review_count) else int(review_count),
            "score_mean": _cast_or_none(row.get("score_mean"), float),
            "score_median": _cast_or_none(row.get("score_median"), float),
        },
    }


# One profile document per volume; rows without a volume_url are skipped.
vdocs = []
for _, row in ms_volumes_enriched.iterrows():
    vol_url = safe_str(row.get("volume_url")).strip()
    if vol_url:
        vdocs.append(_volume_doc(row, vol_url))

# JSON Lines output: one document per line.
with RAG_VPROF_JSONL.open("w", encoding="utf-8") as f:
    f.writelines(json.dumps(doc, ensure_ascii=False) + "\n" for doc in vdocs)

print("rag_volume_profiles_written =", len(vdocs))



rag_reviews_docs_written = 3187
rag_volume_profiles_written = 89129


## 8) Stats 



In [22]:
# Row counts of the inputs and of the RAG-ready review subset.
_row_counts = {
    "rows_key_placeholder": None,
}
_row_counts = {
    "series": len(ms_series),
    "volumes": len(ms_volumes),
    "reviews": len(ms_reviews),
    "reviews_rag_ready": len(rag_df),
}

# The orphan CSV path is reported only when the file was actually written.
_orphan_count = len(orphans)
_orphan_info = {
    "orphan_reviews_rows": _orphan_count,
    "orphan_reviews_csv": str(ORPHAN_REVIEWS_CSV) if _orphan_count else None,
}

# Paths of the exported files, as strings so they are JSON serialisable.
_export_paths = {
    key: str(path)
    for key, path in (
        ("ms_series_enriched_csv", SERIES_ENR_CSV),
        ("ms_volumes_enriched_csv", VOLUMES_ENR_CSV),
        ("ms_series_metrics_csv", SERIES_MET_CSV),
        ("ms_volume_metrics_csv", VOLUMES_MET_CSV),
        ("rag_reviews_jsonl", RAG_REVIEWS_JSONL),
        ("rag_volume_profiles_jsonl", RAG_VPROF_JSONL),
    )
}

stats = {"rows": _row_counts, "orphans": _orphan_info, "exports": _export_paths}
_stats_json = json.dumps(stats, ensure_ascii=False, indent=2)
STATS_JSON.write_text(_stats_json, encoding="utf-8")
stats



{'rows': {'series': 13208,
  'volumes': 89129,
  'reviews': 6749,
  'reviews_rag_ready': 3187},
 'orphans': {'orphan_reviews_rows': 0, 'orphan_reviews_csv': None},
 'exports': {'ms_series_enriched_csv': '/home/maxime/python/certification/preparation_bdd/out_ms_final/ms_series_enriched.csv',
  'ms_volumes_enriched_csv': '/home/maxime/python/certification/preparation_bdd/out_ms_final/ms_volumes_enriched.csv',
  'ms_series_metrics_csv': '/home/maxime/python/certification/preparation_bdd/out_ms_final/ms_series_metrics.csv',
  'ms_volume_metrics_csv': '/home/maxime/python/certification/preparation_bdd/out_ms_final/ms_volume_metrics.csv',
  'rag_reviews_jsonl': '/home/maxime/python/certification/preparation_bdd/out_ms_final/rag_reviews.jsonl',
  'rag_volume_profiles_jsonl': '/home/maxime/python/certification/preparation_bdd/out_ms_final/rag_volume_profiles.jsonl'}}