### Imports + chemins

In [24]:
from pathlib import Path
import json
import re
import pandas as pd

def find_project_root(start: Path) -> Path:
    """Return the nearest directory, from ``start`` upward, holding ``data/top_rated.json``.

    ``start`` is resolved first, then ``start`` itself and each of its parents
    are checked in order. When no directory contains the marker file, the
    resolved ``start`` is returned unchanged.
    """
    origin = start.resolve()
    marker = Path("data") / "top_rated.json"
    candidates = (origin, *origin.parents)
    return next(
        (folder for folder in candidates if (folder / marker).exists()),
        origin,
    )

# Locate the repository root by searching upward from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())

# Expected input for this repo: `data/top_rated.json`; fail early when it is missing.
IN_JSON = PROJECT_ROOT.joinpath("data", "top_rated.json")
if not IN_JSON.exists():
    raise FileNotFoundError(f"Fichier introuvable: {IN_JSON} (cwd={Path.cwd()})")

# Optional row cap for faster iteration (None = process everything).
LIMIT_ROWS = 1000  # e.g. 5000

# Export directory, created on demand (parents included).
OUT_DIR = PROJECT_ROOT.joinpath("exports", "kitsu")
OUT_DIR.mkdir(parents=True, exist_ok=True)


### Charger + contrôler la structure

In [25]:
# Parse the export and check that both expected top-level sections are present.
obj = json.loads(IN_JSON.read_text(encoding="utf-8"))
assert all(section in obj for section in ("meta", "data")), "Format inattendu (meta/data manquants)"

meta, data = obj["meta"], obj["data"]
# Apply the optional row cap from the configuration cell.
data = data if LIMIT_ROWS is None else data[: int(LIMIT_ROWS)]

# Quick structural overview: row count, metadata keys, keys of the first record.
print("rows:", len(data))
print("meta keys:", meta.keys())
print("sample keys:", data[0].keys())


rows: 1000
meta keys: dict_keys(['category', 'source', 'endpoint', 'fetched_at', 'limit', 'offset'])
sample keys: dict_keys(['id', 'slug', 'titles', 'status', 'synopsis', 'authors', 'ratings', 'popularity', 'tags'])


### Normaliser (flatten) vers un DataFrame

In [26]:
# Flatten the records into a DataFrame; the later cells read nested fields
# through dotted column names such as "ratings.average".
df = pd.json_normalize(data)

# nested columns expected (based on the sample)
# titles.canonical / titles.en / titles.ja
# ratings.average / ratings.rank
# popularity.rank
# tags.categories / tags.genres


### Cast types + champs “plats”

In [27]:
# Promote the raw "id" to a nullable integer key. The guard makes the cell
# safe to re-run: once "id" has been dropped, the existing "kitsu_id" is kept
# instead of failing with a KeyError on the missing source column.
if "id" in df.columns:
    df["kitsu_id"] = pd.to_numeric(df["id"], errors="coerce").astype("Int64")
    df = df.drop(columns=["id"])
elif "kitsu_id" not in df.columns:
    raise KeyError("id")

# Numeric fields: unparseable values become missing; ranks use the nullable Int64 dtype.
df["rating_average"] = pd.to_numeric(df.get("ratings.average"), errors="coerce")
df["rating_rank"] = pd.to_numeric(df.get("ratings.rank"), errors="coerce").astype("Int64")
df["popularity_rank"] = pd.to_numeric(df.get("popularity.rank"), errors="coerce").astype("Int64")

# Flat aliases for the nested title fields.
df["title_canonical"] = df.get("titles.canonical")
df["title_en"] = df.get("titles.en")
df["title_ja"] = df.get("titles.ja")

# Normalize status to trimmed lower case. `.where(notna)` keeps missing values
# missing; a plain `astype(str)` would store them as the literal string "nan".
df["status"] = df["status"].astype(str).str.strip().str.lower().where(df["status"].notna())


### Nettoyage synopsis “RAG-friendly”

In [28]:
def clean_synopsis(s: str) -> "str | None":
    """Clean a synopsis for retrieval use.

    Steps, in order:
      1. normalize CRLF / CR line endings to LF;
      2. remove inline "(Source: ...)" mentions (case-insensitive);
      3. drop trailing spaces/tabs at line ends;
      4. collapse runs of 3+ newlines to a single blank line and trim.

    Blank-line collapsing runs after the source removal on purpose: removing a
    "(Source: ...)" paragraph can itself create a run of empty lines.

    Args:
        s: Raw synopsis; ``None`` and float NaN are treated as missing.
           Any other value is converted with ``str()``.

    Returns:
        The cleaned text, or ``None`` when the input is missing or nothing is
        left after cleaning.
    """
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    text = re.sub(r"\r\n?", "\n", str(s))
    # Optional: source mentions are usually noise for RAG.
    text = re.sub(r"\(Source:\s*[^)]+\)", "", text, flags=re.IGNORECASE)
    # Whitespace-only line tails would otherwise hide consecutive blank lines.
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text or None

# Keep the untouched synopsis next to its cleaned version.
df["synopsis_raw"] = df.get("synopsis")
df["synopsis_clean"] = df["synopsis_raw"].map(clean_synopsis)


### Colonnes JSON : auteurs, catégories, genres et tags

In [29]:


def ensure_list(x):
    """Return ``x`` itself when it is a list; return ``[]`` for anything else.

    "Anything else" includes ``None``, NaN, scalars, strings, dicts and tuples.
    """
    if isinstance(x, list):
        return x
    return []

# Guarantee list-typed cells for every nested collection column.
df["authors_json"] = df.get("authors").map(ensure_list)                 # list[dict{name,role}]
df["categories_json"] = df.get("tags.categories").map(ensure_list)      # list[str]
df["genres_json"] = df.get("tags.genres").map(ensure_list)              # list[str]

# (optional) merged tag list for matching / search: categories + genres,
# stringified, trimmed, blanks removed, de-duplicated and sorted.
df["tags_all_json"] = (df["categories_json"] + df["genres_json"]).map(
    lambda tags: sorted({label for label in (str(tag).strip() for tag in tags) if label})
)


In [30]:
import json as _json

def ensure_list(x):
    """Pass lists through untouched; map every other value (None, NaN, scalars, dicts...) to ``[]``."""
    return x if isinstance(x, list) else []

# NOTE(review): these statements repeat the previous cell; compared with it,
# this cell only adds the `import json as _json` line.
df["authors_json"] = df.get("authors").map(ensure_list)                 # list[dict{name,role}]
df["categories_json"] = df.get("tags.categories").map(ensure_list)      # list[str]
df["genres_json"] = df.get("tags.genres").map(ensure_list)              # list[str]

# (optional) merged tag list for matching / search: categories + genres,
# stringified, trimmed, blanks removed, de-duplicated and sorted.
df["tags_all_json"] = (df["categories_json"] + df["genres_json"]).map(
    lambda tags: sorted({label for label in (str(tag).strip() for tag in tags) if label})
)


### Dédup + contrôles qualité

In [31]:
# Keep rows with a usable id, then one row per kitsu_id (first occurrence wins).
df = df.dropna(subset=["kitsu_id"]).drop_duplicates(subset=["kitsu_id"])

# Fill rates in percent: share of rows with a synopsis, with at least one
# author, and with at least one tag.
# NOTE(review): the recorded output reports 0% non-empty authors; confirm
# whether the source really has no authors or whether "authors" is not a list.
kpi = {
    "rows": len(df),
    "synopsis_non_empty_%": df["synopsis_clean"].notna().mean() * 100,
    "authors_non_empty_%": df["authors_json"].map(len).gt(0).mean() * 100,
    "tags_non_empty_%": df["tags_all_json"].map(len).gt(0).mean() * 100,
}
kpi


{'rows': 1000,
 'synopsis_non_empty_%': np.float64(98.5),
 'authors_non_empty_%': np.float64(0.0),
 'tags_non_empty_%': np.float64(99.0)}

### Sélection colonnes finales

In [32]:
# Final column set, in export order; copied so later edits do not touch `df`.
df_clean = df.loc[:, [
    "kitsu_id", "slug",
    "title_canonical", "title_en", "title_ja",
    "status",
    "synopsis_clean", "synopsis_raw",
    "rating_average", "rating_rank", "popularity_rank",
    "authors_json",
    "categories_json", "genres_json", "tags_all_json",
]].copy()

# Attach the fetch metadata to every row (useful for traceability).
df_clean = df_clean.assign(
    kitsu_fetched_at=meta.get("fetched_at"),
    kitsu_endpoint=meta.get("endpoint"),
    kitsu_source=meta.get("source"),
    kitsu_category=meta.get("category"),
)


### Normalisation des clés de matching avec Manga Sanctuary

In [33]:
import unicodedata
import re
import json

def norm_title(s: str) -> str:
    """Build an ASCII matching key from a title.

    The title is trimmed, lower-cased, NFKD-decomposed and stripped of
    combining marks (so accents are folded), "&" becomes "and", and every run
    of characters outside ``a-z0-9`` becomes a single space.

    Returns ``None`` for missing input (``None`` / float NaN) and when nothing
    remains after normalization. The latter happens for a title written
    entirely in a non-Latin script, because those characters are treated as
    separators.
    """
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    decomposed = unicodedata.normalize("NFKD", str(s).strip().lower())
    folded = "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))
    spaced = re.sub(r"[^a-z0-9]+", " ", folded.replace("&", "and"))
    # Only letters, digits and spaces are left: split/join trims and squeezes.
    return " ".join(spaced.split()) or None

# One normalized matching key per title variant.
df_clean["title_norm_canonical"] = df_clean["title_canonical"].map(norm_title)
df_clean["title_norm_en"]        = df_clean["title_en"].map(norm_title)
df_clean["title_norm_ja"]        = df_clean["title_ja"].map(norm_title)

def title_candidates(row):
    """Return the distinct, non-empty titles of a row, in priority order.

    Reads ``title_canonical``, ``title_en`` and ``title_ja`` through
    ``row.get`` (so a DataFrame row or a plain dict both work). Missing values
    (``None`` or float NaN), empty strings and whitespace-only strings are
    skipped. NaN needs the explicit check because it is truthy and would
    otherwise be kept as a candidate. Titles are returned as stored (not
    trimmed), first occurrence first.
    """
    out = []
    for key in ("title_canonical", "title_en", "title_ja"):
        value = row.get(key)
        if value is None or (isinstance(value, float) and pd.isna(value)):
            continue
        if not value or not str(value).strip():
            continue
        # unique values, original order preserved
        if value not in out:
            out.append(value)
    return out

# Row-wise: store the list of candidate titles built by title_candidates.
df_clean["title_candidates_json"] = df_clean.apply(title_candidates, axis=1)


### Nettoyage du synopsis (version étendue)

In [34]:
def clean_synopsis_plus(s: str) -> "str | None":
    """Clean a synopsis, removing inline and whole-line source mentions.

    Steps, in order:
      1. normalize CRLF / CR line endings to LF;
      2. remove inline "(Source: ...)" mentions (case-insensitive);
      3. remove whole lines that start with "Source:" (case-insensitive);
      4. drop trailing spaces/tabs at line ends;
      5. collapse runs of 3+ newlines to a single blank line and trim.

    Blank-line collapsing runs last because the removals can create runs of
    empty lines. The "Source:" line pattern only uses same-line whitespace,
    so a bare "Source:" line never swallows the line that follows it.

    Args:
        s: Raw synopsis; ``None`` and float NaN are treated as missing.
           Any other value is converted with ``str()``.

    Returns:
        The cleaned text, or ``None`` when the input is missing or nothing is
        left after cleaning.
    """
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    text = re.sub(r"\r\n?", "\n", str(s))
    text = re.sub(r"\(Source:\s*[^)]+\)", "", text, flags=re.I)
    text = re.sub(r"(?im)^[ \t]*source:.*$", "", text)  # "Source: ..." line
    # Whitespace-only line tails would otherwise hide consecutive blank lines.
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text or None

# Recompute synopsis_clean from the raw text with the extended cleaner (this
# overwrites the column selected earlier), then store its length in
# characters (0 when the synopsis is missing).
df_clean["synopsis_clean"] = df_clean["synopsis_raw"].map(clean_synopsis_plus)
df_clean["synopsis_len"] = df_clean["synopsis_clean"].fillna("").str.len()


### gestion des auteurs

In [35]:
# Number of author entries per row: a string is decoded as JSON first, a list
# is counted directly, anything else counts as 0.
df_clean["authors_count"] = df_clean["authors_json"].apply(lambda s: len(json.loads(s)) if isinstance(s,str) else (len(s) if isinstance(s,list) else 0))


### Préparation pour Postgres JSONB : versions texte des listes

In [36]:
def json_list_to_text(s):
    """Render a list (or a JSON string encoding a list) as " | "-separated text.

    Each element is converted with ``str()`` and trimmed; elements that are
    empty after trimming are dropped. Returns ``""`` when the value is not a
    list, when a string is not valid JSON, or when any other error occurs.
    """
    try:
        items = json.loads(s) if isinstance(s, str) else s
        if isinstance(items, list):
            trimmed = (str(item).strip() for item in items)
            return " | ".join(part for part in trimmed if part)
        return ""
    except Exception:
        return ""

# " | "-separated text versions of the list columns.
df_clean["tags_all_text"] = df_clean["tags_all_json"].map(json_list_to_text)
df_clean["genres_text"]   = df_clean["genres_json"].map(json_list_to_text)
df_clean["categories_text"] = df_clean["categories_json"].map(json_list_to_text)


### normalisation des notes sur 10

In [37]:
# Divide the average rating by 10 and round to 2 decimals.
# NOTE(review): presumably the source average is on a 0-100 scale; confirm.
df_clean["rating_average_10"] = (pd.to_numeric(df_clean["rating_average"], errors="coerce") / 10).round(2)


### Export systématique CSV + Parquet (JSONB-ready)

In [38]:
def export_both_jsonb_ready(df: pd.DataFrame, csv_path: Path, parq_path: Path, json_cols: list[str]):
    """Write ``df`` to Parquet and to CSV, with JSON columns serialized for the CSV.

    Parquet is best-effort: any failure (typically no `pyarrow`/`fastparquet`
    engine installed) is reported on stdout and the CSV export still runs.

    For the CSV, each column named in ``json_cols`` that exists in ``df`` is
    converted to JSON text: lists and dicts are dumped with
    ``ensure_ascii=False``; any other value is written as ``"[]"``. The
    conversion happens on a copy, so the caller's frame is left untouched.

    Args:
        df: Frame to export.
        csv_path: Destination of the CSV file.
        parq_path: Destination of the Parquet file.
        json_cols: Names of list/dict columns to serialize; unknown names are ignored.
    """
    # Parquet: keeps lists/dicts as-is (needs `pyarrow` or `fastparquet`).
    try:
        df.to_parquet(parq_path, index=False)
    except Exception as e:
        print(f"Parquet export skipped: {e!r} (installe pyarrow pour l'activer)")

    def _to_json_text(value):
        # Uses the top-level `json` import rather than the `_json` alias, which
        # exists only when one specific earlier cell has been executed.
        if isinstance(value, (list, dict)):
            return json.dumps(value, ensure_ascii=False)
        return "[]"

    # CSV: serialize JSONB columns as JSON strings.
    df_csv = df.copy()
    for c in json_cols:
        if c in df_csv.columns:
            df_csv[c] = df_csv[c].apply(_to_json_text)
    df_csv.to_csv(csv_path, index=False)

OUT_CSV = OUT_DIR / "kitsu_top_rated_clean.csv"
OUT_PARQ = OUT_DIR / "kitsu_top_rated_clean.parquet"

# Every list-valued column must be listed in json_cols; a column left out is
# written to the CSV as a Python list repr (single quotes), which is not valid
# JSON. title_candidates_json is therefore included with the others.
export_both_jsonb_ready(
    df_clean,
    OUT_CSV,
    OUT_PARQ,
    json_cols=[
        "authors_json", "categories_json", "genres_json", "tags_all_json",
        "title_candidates_json",
    ],
)

print("Exported:", OUT_CSV, OUT_PARQ)


Exported: /home/maxime/python/certification/preparation_bdd/exports/kitsu/kitsu_top_rated_clean.csv /home/maxime/python/certification/preparation_bdd/exports/kitsu/kitsu_top_rated_clean.parquet
