# Kitsu — Top Rated : Nettoyage C3 + exports prêts PostgreSQL (JSONB) + préparation agrégation MS

Sorties systématiques :
- `kitsu_top_rated_clean.csv`
- `kitsu_top_rated_clean.parquet`

Points clés :
- lecture robuste + contrôles + KPI
- champs texte *RAG-friendly* (synopsis nettoyé)
- colonnes JSON (tags, auteurs, candidats titres) prêtes **JSONB**
- colonnes de matching (titres normalisés) pour la future jointure MS↔Kitsu



In [1]:
from pathlib import Path
import json
import re
import unicodedata
import pandas as pd



## 0) Parquet obligatoire (pyarrow)



In [2]:
import sys

AUTO_INSTALL_PYARROW = False

def ensure_pyarrow_or_fail(auto_install: bool = False) -> bool:
    try:
        import pyarrow  # noqa: F401
        return True
    except Exception as e:
        if not auto_install:
            raise RuntimeError(
                "Parquet requis mais `pyarrow` n'est pas disponible dans l'environnement du notebook. "
                "Installe-le dans le même env que Jupyter (ex: pip install pyarrow). "
                f"Détail: {e!r}"
            ) from e
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow"])
    import pyarrow  # noqa: F401
    return True

PARQUET_READY = ensure_pyarrow_or_fail(auto_install=AUTO_INSTALL_PYARROW)
print("PARQUET_READY =", PARQUET_READY)



PARQUET_READY = True


## 1) Chemins



In [None]:
def find_project_root(start: Path) -> Path:
    start = start.resolve()
    for root in [start, *start.parents]:
        if (root / "data" / "top_rated.json").exists():
            return root
    return start

PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data"

# Fichier attendu dans ce projet
IN_JSON = DATA_DIR / "top_rated.json"
if not IN_JSON.exists():
    raise FileNotFoundError(f"Fichier introuvable: {IN_JSON} (cwd={Path.cwd()})")

# ex: 1000 pour tester, None = tout
LIMIT_ROWS = None

OUT_DIR = PROJECT_ROOT / "exports" / "kitsu"

OUT_CSV  = OUT_DIR / "kitsu_top_rated_clean.csv"
OUT_PARQ = OUT_DIR / "kitsu_top_rated_clean.parquet"
KPI_JSON = OUT_DIR / "kitsu_top_rated_kpi.json"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("IN_JSON:", IN_JSON)
print("OUT_DIR:", OUT_DIR)



PROJECT_ROOT: /home/maxime/python/certification/preparation_bdd
IN_JSON: /home/maxime/python/certification/preparation_bdd/data/top_rated.json
OUT_DIR: /home/maxime/python/certification/preparation_bdd/exports/kitsu


## 2) Lecture robuste + contrôles structure



In [4]:
obj = json.loads(IN_JSON.read_text(encoding="utf-8"))
assert isinstance(obj, dict), "JSON racine doit être un objet"
assert "meta" in obj and "data" in obj, "Format inattendu: meta/data manquants"

meta = obj["meta"]
data = obj["data"]
assert isinstance(data, list), "data doit être une liste"

if LIMIT_ROWS is not None:
    data = data[: int(LIMIT_ROWS)]

print("rows:", len(data))
print("meta keys:", list(meta.keys()))
print("sample keys:", list(data[0].keys()))



rows: 61531
meta keys: ['category', 'source', 'endpoint', 'fetched_at', 'limit', 'offset']
sample keys: ['id', 'slug', 'titles', 'status', 'synopsis', 'authors', 'ratings', 'popularity', 'tags']


## 3) Flatten (json_normalize)



In [5]:
df = pd.json_normalize(data)
print("df shape:", df.shape)
df.head(2)



df shape: (61531, 13)


Unnamed: 0,id,slug,status,synopsis,authors,titles.canonical,titles.en,titles.ja,ratings.average,ratings.rank,popularity.rank,tags.categories,tags.genres
0,60854,the-greatest-estate-developer,finished,When civil engineering student Su-Ho Kim falls...,[],The Greatest Estate Developer,The Greatest Estate Developer,,85.92,1.0,347.0,"[Adventure, Comedy, Supernatural, Isekai]",[]
1,38,one-piece,current,"Gol D. Roger was known as the Pirate King, the...",[],One Piece,One Piece,ONE PIECE,85.05,2.0,3.0,"[Comedy, Super Power, Fantasy, Action, Friends...","[Comedy, Sports, Super Power, Fantasy, Action,..."


## 4) Normalisation types + champs plats



In [6]:
df["kitsu_id"] = pd.to_numeric(df.get("id"), errors="coerce").astype("Int64")
df = df.drop(columns=[c for c in ["id"] if c in df.columns])

df["slug"] = df.get("slug").astype("string")

df["rating_average"] = pd.to_numeric(df.get("ratings.average"), errors="coerce").astype("Float64")
df["rating_rank"] = pd.to_numeric(df.get("ratings.rank"), errors="coerce").astype("Int64")
df["popularity_rank"] = pd.to_numeric(df.get("popularity.rank"), errors="coerce").astype("Int64")

# Harmonisation échelle (Kitsu est souvent 0..100)
df["rating_average_10"] = (df["rating_average"] / 10).round(2)

df["title_canonical"] = df.get("titles.canonical").astype("string")
df["title_en"] = df.get("titles.en").astype("string")
df["title_ja"] = df.get("titles.ja").astype("string")

df["status"] = df.get("status").astype("string").str.strip().str.lower()

# colonnes utiles (si existantes) : garde en TEXT
for col in ["series_type", "series_category", "subtype"]:
    if col in df.columns:
        df[col] = df[col].astype("string")

df[["kitsu_id","slug","title_canonical","rating_average","rating_average_10","rating_rank","popularity_rank"]].head(5)



Unnamed: 0,kitsu_id,slug,title_canonical,rating_average,rating_average_10,rating_rank,popularity_rank
0,60854,the-greatest-estate-developer,The Greatest Estate Developer,85.92,8.59,1,347
1,38,one-piece,One Piece,85.05,8.5,2,3
2,57766,kimetsu-no-yaiba-rengoku-kyoujurou-gaiden,Kimetsu no Yaiba: Rengoku Kyoujurou Gaiden,84.91,8.49,3,45
3,55546,kimetsu-no-yaiba-tomioka-giyuu-gaiden,Kimetsu no Yaiba: Tomioka Giyuu Gaiden,84.9,8.49,4,72
4,8,berserk,Berserk,84.84,8.48,5,17


## 5) Synopsis — nettoyage RAG-friendly (+ longueur)



In [7]:
def clean_synopsis_plus(s: str) -> str:
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    s = str(s).strip()
    if not s:
        return None
    s = re.sub(r"\r\n?", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    # retire les mentions source dans parenthèses
    s = re.sub(r"\(Source:\s*[^)]+\)", "", s, flags=re.I).strip()
    # retire les lignes "Source: ..."
    s = re.sub(r"(?im)^\s*source:\s*.*$", "", s).strip()
    return s or None

df["synopsis_raw"] = df.get("synopsis")
df["synopsis_clean"] = df["synopsis_raw"].map(clean_synopsis_plus)
df["synopsis_len"] = df["synopsis_clean"].fillna("").astype(str).str.len().astype("Int64")

df[["synopsis_len","synopsis_clean"]].head(3)



Unnamed: 0,synopsis_len,synopsis_clean
0,487,When civil engineering student Su-Ho Kim falls...
1,1189,"Gol D. Roger was known as the Pirate King, the..."
2,431,"Since ancient times, rumors have abounded of m..."


## 6) Colonnes JSON (JSONB-ready) + tags texte



In [8]:
def ensure_list(x):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    return x if isinstance(x, list) else []

# list[dict{name, role}] (selon ce que renvoie ton runner)
if "authors" in df.columns:
    df["authors_json"] = df["authors"].map(ensure_list)
else:
    df["authors_json"] = [[] for _ in range(len(df))]

for src_col, dst_col in [("tags.categories", "categories_json"), ("tags.genres", "genres_json")]:
    if src_col in df.columns:
        df[dst_col] = df[src_col].map(ensure_list)
    else:
        df[dst_col] = [[] for _ in range(len(df))]

def normalize_tag_list(L):
    out = []
    for x in (L or []):
        s = str(x).strip()
        if s:
            out.append(s)
    # uniques, tri pour stabilité
    return sorted(set(out))

df["tags_all_json"] = (df["categories_json"] + df["genres_json"]).map(normalize_tag_list)

# champs texte pratiques (pour debug / SQL LIKE)
df["categories_text"] = df["categories_json"].map(lambda L: " | ".join(normalize_tag_list(L)))
df["genres_text"] = df["genres_json"].map(lambda L: " | ".join(normalize_tag_list(L)))
df["tags_all_text"] = df["tags_all_json"].map(lambda L: " | ".join(normalize_tag_list(L)))

# couverture auteurs
df["authors_count"] = df["authors_json"].map(lambda L: len(L) if isinstance(L, list) else 0).astype("Int64")

df[["authors_count","tags_all_text"]].head(5)



Unnamed: 0,authors_count,tags_all_text
0,0,Adventure | Comedy | Isekai | Supernatural
1,0,Action | Adventure | Comedy | Drama | Fantasy ...
2,0,
3,0,Action | Adventure | Demon | Historical | Shou...
4,0,Action | Adventure | Alternative Past | Contem...


## 7) Colonnes de matching (titres normalisés + candidats)



In [9]:
def norm_title(s: str) -> str:
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    s = str(s).strip().lower()
    if not s:
        return None
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.replace("&", "and")
    s = re.sub(r"[^a-z0-9]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s or None

df["title_norm_canonical"] = df["title_canonical"].map(norm_title)
df["title_norm_en"] = df["title_en"].map(norm_title)
df["title_norm_ja"] = df["title_ja"].map(norm_title)

def title_candidates(row):
    cand = []
    for k in ["title_canonical","title_en","title_ja"]:
        v = row.get(k)
        if isinstance(v, str) and v.strip() and v not in cand:
            cand.append(v.strip())
    return cand

df["title_candidates_json"] = df.apply(title_candidates, axis=1)
df["title_candidates_count"] = df["title_candidates_json"].map(len).astype("Int64")

df[["title_canonical","title_en","title_norm_canonical","title_candidates_count"]].head(5)



Unnamed: 0,title_canonical,title_en,title_norm_canonical,title_candidates_count
0,The Greatest Estate Developer,The Greatest Estate Developer,the greatest estate developer,1
1,One Piece,One Piece,one piece,2
2,Kimetsu no Yaiba: Rengoku Kyoujurou Gaiden,Demon Slayer: Rengoku Kyoujurou Side Story,kimetsu no yaiba rengoku kyoujurou gaiden,3
3,Kimetsu no Yaiba: Tomioka Giyuu Gaiden,Kimetsu no Yaiba: Tomioka Giyuu Gaiden,kimetsu no yaiba tomioka giyuu gaiden,2
4,Berserk,Berserk,berserk,2


## 8) Dédup + KPI (preuves C3)



In [10]:
df = df[df["kitsu_id"].notna()].copy()
df = df.drop_duplicates(subset=["kitsu_id"])

kpi = {
    "rows": int(len(df)),
    "synopsis_non_empty_%": float((df["synopsis_clean"].notna()).mean() * 100),
    "tags_non_empty_%": float((df["tags_all_json"].map(len) > 0).mean() * 100),
    "genres_non_empty_%": float((df["genres_json"].map(len) > 0).mean() * 100),
    "categories_non_empty_%": float((df["categories_json"].map(len) > 0).mean() * 100),
    "authors_non_empty_%": float((df["authors_json"].map(len) > 0).mean() * 100),
    "title_candidates_avg": float(df["title_candidates_count"].mean()),
}
kpi



{'rows': 43085,
 'synopsis_non_empty_%': 79.68202390623188,
 'tags_non_empty_%': 97.74631542300104,
 'genres_non_empty_%': 67.24846234188232,
 'categories_non_empty_%': 84.10815829174886,
 'authors_non_empty_%': 0.0,
 'title_candidates_avg': 2.1215736335151445}

## 9) Schéma final (kitsu_top_rated_clean)



In [11]:
df_clean = df[[
    "kitsu_id","slug",
    "title_canonical","title_en","title_ja",
    "title_norm_canonical","title_norm_en","title_norm_ja",
    "title_candidates_json","title_candidates_count",
    "status",
    "synopsis_clean","synopsis_raw","synopsis_len",
    "rating_average","rating_average_10","rating_rank","popularity_rank",
    "authors_json","authors_count",
    "categories_json","genres_json","tags_all_json",
    "categories_text","genres_text","tags_all_text",
]].copy()

# traçabilité
df_clean["kitsu_fetched_at"] = meta.get("fetched_at")
df_clean["kitsu_endpoint"] = meta.get("endpoint")
df_clean["kitsu_source"] = meta.get("source")
df_clean["kitsu_category"] = meta.get("category")

df_clean.head(2)



Unnamed: 0,kitsu_id,slug,title_canonical,title_en,title_ja,title_norm_canonical,title_norm_en,title_norm_ja,title_candidates_json,title_candidates_count,...,categories_json,genres_json,tags_all_json,categories_text,genres_text,tags_all_text,kitsu_fetched_at,kitsu_endpoint,kitsu_source,kitsu_category
0,60854,the-greatest-estate-developer,The Greatest Estate Developer,The Greatest Estate Developer,,the greatest estate developer,the greatest estate developer,na,[The Greatest Estate Developer],1,...,"[Adventure, Comedy, Supernatural, Isekai]",[],"[Adventure, Comedy, Isekai, Supernatural]",Adventure | Comedy | Isekai | Supernatural,,Adventure | Comedy | Isekai | Supernatural,2025-12-13T21:16:15+00:00,manga?sort=ratingRank,kitsu,top_rated
1,38,one-piece,One Piece,One Piece,ONE PIECE,one piece,one piece,one piece,"[One Piece, ONE PIECE]",2,...,"[Comedy, Super Power, Fantasy, Action, Friends...","[Comedy, Sports, Super Power, Fantasy, Action,...","[Action, Adventure, Comedy, Drama, Fantasy, Fr...",Action | Adventure | Comedy | Drama | Fantasy ...,Action | Adventure | Comedy | Fantasy | Friend...,Action | Adventure | Comedy | Drama | Fantasy ...,2025-12-13T21:16:15+00:00,manga?sort=ratingRank,kitsu,top_rated


### normalisation pour facilité le match 

In [12]:
import json

def parse_list_json(s):
    return json.loads(s) if isinstance(s, str) else (s if isinstance(s, list) else [])

def norm_list(L):
    return [norm_title(x) for x in L if norm_title(x)]

df_clean["title_norm_candidates_json"] = df_clean["title_candidates_json"].map(
    lambda s: json.dumps(norm_list(parse_list_json(s)), ensure_ascii=False)
)


### creation de clé primaire pour facilité la jointure

In [13]:
df_clean["title_norm_primary"] = df_clean["title_norm_canonical"].fillna(df_clean["title_norm_en"])


## 10) Export CSV + Parquet (JSONB-ready)



In [14]:
import json as _json

def export_csv_parquet_jsonb(df: pd.DataFrame, csv_path: Path, parq_path: Path, json_cols: list[str]):
    # Parquet: conserve listes/dicts
    df.to_parquet(parq_path, index=False)

    # CSV: sérialise les colonnes json en string JSON
    df_csv = df.copy()
    for c in json_cols:
        if c in df_csv.columns:
            df_csv[c] = df_csv[c].apply(
                lambda x: _json.dumps(x, ensure_ascii=False) if isinstance(x, (list, dict)) else "[]"
            )
    df_csv.to_csv(csv_path, index=False)

json_cols = [
    "authors_json",
    "categories_json",
    "genres_json",
    "tags_all_json",
    "title_candidates_json",
]

export_csv_parquet_jsonb(df_clean, OUT_CSV, OUT_PARQ, json_cols=json_cols)
KPI_JSON.write_text(_json.dumps(kpi, ensure_ascii=False, indent=2), encoding="utf-8")

print("Exported:", OUT_CSV)
print("Exported:", OUT_PARQ)
print("KPI:", KPI_JSON)



Exported: /home/maxime/python/certification/preparation_bdd/exports/kitsu/kitsu_top_rated_clean.csv
Exported: /home/maxime/python/certification/preparation_bdd/exports/kitsu/kitsu_top_rated_clean.parquet
KPI: /home/maxime/python/certification/preparation_bdd/exports/kitsu/kitsu_top_rated_kpi.json
