# Nettoyage & normalisation des exports hebdomadaires Kitsu → PostgreSQL (objectif RAG/LLM)

Fichiers attendus (par défaut dans `Preparation_weekly/data/`, ou adapter `DATA_DIR`) :

- `most_popular.json` : popularité globale (site Kitsu)
- `top_publishing.json` : classement/rank (note/rating rank)
- `trending_weekly.json` : **tendance semaine** (classement basé sur l’ordre du JSON)

Objectifs :
1. Nettoyer / uniformiser les champs (titres, synopsis, tags, types)
2. Convertir la **note vers /10** (`rating_average_10`)
3. Construire :
   - `kitsu_series_core` (référentiel : 1 ligne par `kitsu_id`)
   - `kitsu_weekly_snapshot` (historique : liste + position + date)
4. Exporter en CSV compatibles PostgreSQL (`\copy`) + bonus documents RAG

> Hypothèse : `id` = `kitsu_id` (identique à Kitsu) ✅


In [2]:
# (Optionnel) Installer les dépendances si besoin
# !pip install -q pandas python-dateutil unidecode

import json
import re
import html
from pathlib import Path

import pandas as pd
from dateutil import parser as dtparser
from unidecode import unidecode


## 1) Charger les 3 exports (meta + data)


In [3]:
# 🔧 Paramètres / chemins (alignés avec `notebook_kitsu_weekly_most_popular_to_postgres`)
# Repository root: the nearest directory, starting at the current working
# directory and walking up through its parents, that contains a
# `pyproject.toml`. Falls back to the current working directory.
_cwd = Path.cwd()
REPO_ROOT = _cwd
for _candidate in (_cwd, *_cwd.parents):
    if (_candidate / "pyproject.toml").exists():
        REPO_ROOT = _candidate
        break

WEEKLY_DIR = REPO_ROOT / "Preparation_weekly"
DATA_DIR = WEEKLY_DIR / "data"
OUT_DIR = WEEKLY_DIR / "export"

# Create the export directory up front; a pre-existing directory is fine.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One JSON export per list, keyed by list name.
FILES = {
    list_key: DATA_DIR / f"{list_key}.json"
    for list_key in ("most_popular", "top_publishing", "trending_weekly")
}

# Fail fast, naming every export that is absent from DATA_DIR.
missing = [list_key for list_key, json_path in FILES.items() if not json_path.exists()]
if missing:
    raise FileNotFoundError(f"Fichiers manquants: {missing}. Chemin DATA_DIR = {DATA_DIR.resolve()}")

def load_export(path: Path) -> dict:
    """Read the UTF-8 JSON file at `path` and return the parsed content."""
    raw = path.read_text(encoding="utf-8")
    parsed = json.loads(raw)
    return parsed

# Parse every export once, keyed by list name.
exports = {list_key: load_export(json_path) for list_key, json_path in FILES.items()}

# Cell output: per export, (number of data rows, endpoint, fetch timestamp).
{
    list_key: (
        len(payload.get("data", [])),
        payload.get("meta", {}).get("endpoint"),
        payload.get("meta", {}).get("fetched_at"),
    )
    for list_key, payload in exports.items()
}


{'most_popular': (100,
  'manga?sort=popularityRank',
  '2025-12-13T20:32:51+00:00'),
 'top_publishing': (100,
  'manga?filter[status]=current&sort=popularityRank',
  '2025-12-13T20:31:33+00:00'),
 'trending_weekly': (10, 'trending/manga', '2025-12-13T20:29:08+00:00')}

## 2) Nettoyage & normalisation (fonctions)


In [4]:
# Matches a run of whitespace; used below to collapse such runs into one space.
_ws_re = re.compile(r"\s+")
# Matches a run of characters other than lowercase ASCII letters and digits;
# used by norm_title after the title has been lowercased.
_non_alnum_re = re.compile(r"[^a-z0-9]+")

def clean_text(s):
    """Clean a text value: unescape HTML entities, replace NUL characters, collapse whitespace.

    The input is converted with `str()` first. Returns None when the input is
    None or when nothing is left after cleaning.
    """
    if s is None:
        return None
    text = html.unescape(str(s)).replace("\u0000", " ")
    text = _ws_re.sub(" ", text).strip()
    return text if text else None

def clean_synopsis(s):
    """Clean a synopsis: apply clean_text, then remove "(Source: ...)" mentions.

    Returns None when the input is empty or nothing is left after cleaning.
    """
    cleaned = clean_text(s)
    if not cleaned:
        return None
    # "(Source: ...)" mentions are mostly noise for embeddings; the match is
    # case-insensitive.
    without_source = re.sub(r"\(Source:[^)]+\)", "", cleaned, flags=re.IGNORECASE).strip()
    # Removing a mention can leave doubled spaces behind, so collapse again.
    collapsed = _ws_re.sub(" ", without_source).strip()
    return collapsed or None

def norm_title(s):
    """Normalize a title into a key for cross-source matching.

    Steps: clean_text, transliterate with unidecode, lowercase, replace every
    run of characters outside [a-z0-9] with a space, collapse whitespace.
    Returns None when nothing is left.
    """
    cleaned = clean_text(s)
    if not cleaned:
        return None
    lowered = unidecode(cleaned).lower()
    spaced = _non_alnum_re.sub(" ", lowered)
    key = _ws_re.sub(" ", spaced).strip()
    return key if key else None

def to_int(x):
    """Convert `x` with `int()`; return None for None, "" or any value that fails to convert."""
    try:
        is_blank = x is None or x == ""
        converted = None if is_blank else int(x)
    except Exception:
        # Best-effort conversion: an unconvertible value is treated as missing.
        converted = None
    return converted

def to_float(x):
    """Convert `x` with `float()`; return None for None, "" or any value that fails to convert."""
    try:
        is_blank = x is None or x == ""
        converted = None if is_blank else float(x)
    except Exception:
        # Best-effort conversion: an unconvertible value is treated as missing.
        converted = None
    return converted

def rating_to_10(avg):
    """Convert a rating to a /10 scale, rounded to 2 decimals.

    A value above 10 is assumed to be on a /100 scale and is divided by 10;
    any other value is kept as is. Returns None when `avg` cannot be read as
    a float.
    """
    value = to_float(avg)
    if value is None:
        return None
    on_ten = value / 10.0 if value > 10 else value
    return round(on_ten, 2)

def parse_ts(s):
    """Parse `s` with `dtparser.isoparse`; return None for a falsy value or on any failure."""
    try:
        if not s:
            return None
        return dtparser.isoparse(s)
    except Exception:
        # Best-effort parsing: an unreadable timestamp is treated as missing.
        return None

def completeness_score(row: dict) -> int:
    """Count how many descriptive fields of `row` are filled.

    Used to arbitrate between several rows for the same kitsu_id: the row with
    the highest score is considered the most complete.

    A scalar field counts when it is not None, "", [], {}, "null" or a float
    NaN. NaN must be excluded explicitly: rows that went through a DataFrame
    carry NaN in place of missing numeric values, and NaN would otherwise
    count as a filled field. A list field ("categories", "genres",
    "authors_raw") counts when it is truthy and not NaN.

    Args:
        row: Mapping of field name to value; absent keys count as missing.

    Returns:
        The number of filled fields, between 0 and 12.
    """
    import math  # local import: the file's import block does not bring in math

    scalar_keys = (
        "slug", "status", "title_canonical", "title_en", "title_ja", "synopsis_clean",
        "rating_average_10", "rating_rank", "popularity_rank",
    )
    list_keys = ("categories", "genres", "authors_raw")
    empty_markers = (None, "", [], {}, "null")

    def _is_nan(value):
        return isinstance(value, float) and math.isnan(value)

    score = 0
    for key in scalar_keys:
        value = row.get(key)
        if not _is_nan(value) and value not in empty_markers:
            score += 1
    for key in list_keys:
        value = row.get(key)
        if value and not _is_nan(value):
            score += 1
    return score


## 3) Flatten des 3 fichiers vers un schéma commun

Règles de ranking dans `kitsu_weekly_snapshot` :

- `position` = ordre d’apparition (1..N) **pour toutes les listes**
- `list_rank` :
  - `most_popular` → `popularity.rank`
  - `top_publishing` → `ratings.rank`
  - `trending_weekly` → `NULL` (car la liste n’est pas basée sur un rank Kitsu)
- `trend_rank` :
  - **uniquement** pour `trending_weekly` → `position`


In [5]:
def flatten_one(export: dict, fallback_list_name: str):
    """Flatten one export (meta + data) into a list of row dicts on the common schema.

    Args:
        export: Parsed export. Its "meta" and "data" entries are read; a
            missing or empty entry is treated as empty.
        fallback_list_name: List name used when meta has no truthy "category".

    Returns:
        One dict per entry of "data", in input order, with a 1-based
        "position".
    """
    meta = export.get("meta") or {}
    list_name = meta.get("category") or fallback_list_name
    fetched_at = meta.get("fetched_at")
    endpoint = meta.get("endpoint")
    is_trending = list_name == "trending_weekly"

    flattened = []
    for position, item in enumerate(export.get("data") or [], start=1):
        titles = item.get("titles") or {}
        ratings = item.get("ratings") or {}
        popularity = item.get("popularity") or {}
        tags = item.get("tags") or {}
        authors = item.get("authors") or []

        # Rank specific to the list; the trending list has none because its
        # ranking is the position itself.
        raw_rank = (
            popularity.get("rank") if list_name == "most_popular"
            else ratings.get("rank") if list_name == "top_publishing"
            else None
        )
        raw_average = ratings.get("average")

        # Keys are inserted in the order expected for the DataFrame columns.
        record = {
            "list_name": list_name,
            "fetched_at": fetched_at,
            "fetched_at_ts": parse_ts(fetched_at),
            "endpoint": endpoint,
            "position": position,
            "list_rank": to_int(raw_rank),
            "trend_rank": position if is_trending else None,
            "kitsu_id": to_int(item.get("id")),
            "slug": clean_text(item.get("slug")),
            "status": clean_text(item.get("status")),
        }
        for variant in ("canonical", "en", "ja"):
            record[f"title_{variant}"] = clean_text(titles.get(variant))
        record["synopsis_clean"] = clean_synopsis(item.get("synopsis"))
        record["rating_average_raw"] = to_float(raw_average)
        record["rating_average_10"] = rating_to_10(raw_average)
        record["rating_rank"] = to_int(ratings.get("rank"))
        record["popularity_rank"] = to_int(popularity.get("rank"))
        record["categories"] = tags.get("categories") or []
        record["genres"] = tags.get("genres") or []
        record["authors_raw"] = authors
        flattened.append(record)
    return flattened

# Flatten the three exports into one list of rows; the dict key is passed as
# the fallback list name.
rows = [
    flat_row
    for list_key, payload in exports.items()
    for flat_row in flatten_one(payload, list_key)
]

df_all = pd.DataFrame(rows)
df_all.head(3)


Unnamed: 0,list_name,fetched_at,fetched_at_ts,endpoint,position,list_rank,trend_rank,kitsu_id,slug,status,...,title_en,title_ja,synopsis_clean,rating_average_raw,rating_average_10,rating_rank,popularity_rank,categories,genres,authors_raw
0,most_popular,2025-12-13T20:32:51+00:00,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,1,1.0,,26004,boku-no-hero-academia,finished,...,My Hero Academia,ÂÉï„ÅÆ„Éí„Éº„É≠„Éº„Ç¢„Ç´„Éá„Éü„Ç¢,What would the world be like if 80 percent of ...,84.67,8.47,8,1,"[Comedy, Super Power, School Life, Action, Sup...","[Comedy, Super Power, School, Action]","[{'name': 'Kouhei Horikoshi', 'role': 'Sc√©nari..."
1,most_popular,2025-12-13T20:32:51+00:00,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,2,2.0,,7176,tokyo-ghoul-m,finished,...,Tokyo Ghoul,Êù±‰∫¨Âñ∞Á®Æ„Éà„Éº„Ç≠„Éß„Éº„Ç∞„Éº„É´,Shy Ken Kaneki is thrilled to go on a date wit...,84.16,8.42,22,2,"[Horror, Drama, Action, Psychological, Mystery...","[Mystery, Supernatural, Psychological, Thrille...","[{'name': 'Sui Ishida', 'role': 'Sc√©nario & De..."
2,most_popular,2025-12-13T20:32:51+00:00,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,3,3.0,,38,one-piece,current,...,One Piece,ONE PIECE,"Gol D. Roger was known as the Pirate King, the...",85.05,8.5,2,3,"[Comedy, Super Power, Fantasy, Action, Friends...","[Comedy, Sports, Super Power, Fantasy, Action,...","[{'name': 'Eiichiro Oda', 'role': 'Sc√©nario & ..."


## 4) Contrôles qualité (par liste)


In [6]:
def quality_report(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-list quality report.

    Args:
        df: Frame with the columns "list_name", "kitsu_id", "synopsis_clean",
            "authors_raw", "genres" and "title_ja". "authors_raw" and "genres"
            must hold sized values (`len` is applied to each).

    Returns:
        One row per list name, sorted by "list_name", with row counts,
        duplicate counts and percentages of missing fields. A frame with no
        groups yields an empty report that still has every column.
    """
    report_columns = [
        "list_name", "rows", "distinct_kitsu_id", "duplicate_rows_in_list",
        "pct_missing_synopsis", "pct_no_authors", "pct_no_genres", "pct_missing_ja",
    ]

    def _pct(mask) -> float:
        # Share of True values in a boolean Series, as a percentage.
        return float(mask.mean() * 100)

    rep = []
    for list_name, g in df.groupby("list_name"):
        rep.append({
            "list_name": list_name,
            "rows": int(len(g)),
            "distinct_kitsu_id": int(g["kitsu_id"].nunique()),
            "duplicate_rows_in_list": int(g["kitsu_id"].duplicated().sum()),
            "pct_missing_synopsis": _pct(g["synopsis_clean"].isna()),
            "pct_no_authors": _pct(g["authors_raw"].apply(len) == 0),
            "pct_no_genres": _pct(g["genres"].apply(len) == 0),
            "pct_missing_ja": _pct(g["title_ja"].isna()),
        })
    # Explicit columns keep "list_name" present when there is no group, so the
    # sort below cannot fail with a KeyError.
    return pd.DataFrame(rep, columns=report_columns).sort_values("list_name")

# Cell output: quality report on the raw flattened rows, before any de-duplication.
quality_report(df_all)


Unnamed: 0,list_name,rows,distinct_kitsu_id,duplicate_rows_in_list,pct_missing_synopsis,pct_no_authors,pct_no_genres,pct_missing_ja
0,most_popular,100,100,0,0.0,13.0,12.0,3.0
1,top_publishing,100,80,20,0.0,31.0,20.0,11.0
2,trending_weekly,10,10,0,0.0,90.0,90.0,90.0


## 5) Dédoublonnage dans chaque liste

Règle : **on garde la première occurrence** d’un `kitsu_id` dans la liste (donc la meilleure `position`).


In [7]:
# Within each list, keep only the first occurrence of a kitsu_id once the rows
# are ordered by position, then stack the lists back together.
df_list_clean = pd.concat(
    [
        grp.sort_values(["position"]).drop_duplicates("kitsu_id", keep="first").copy()
        for _, grp in df_all.groupby("list_name")
    ],
    ignore_index=True,
)
quality_report(df_list_clean)


Unnamed: 0,list_name,rows,distinct_kitsu_id,duplicate_rows_in_list,pct_missing_synopsis,pct_no_authors,pct_no_genres,pct_missing_ja
0,most_popular,100,100,0,0.0,13.0,12.0,3.0
1,top_publishing,80,80,0,0.0,32.5,21.25,12.5
2,trending_weekly,10,10,0,0.0,90.0,90.0,90.0


## 6) Construire `kitsu_weekly_snapshot` (historique)

- 1 ligne par (list_name, fetched_at_ts, kitsu_id)
- `position` = ordre d’apparition (classement de la liste)
- `trend_rank` = `position` uniquement pour `trending_weekly`


In [8]:
# Snapshot columns: list identity, fetch time and the ranking fields.
snapshot_columns = [
    "list_name", "fetched_at_ts", "endpoint", "kitsu_id",
    "position", "list_rank", "trend_rank",
]
df_snapshot = df_list_clean.loc[:, snapshot_columns].copy()

# Sanity check: every trending_weekly row must carry a trend_rank.
trend = df_snapshot.loc[df_snapshot["list_name"] == "trending_weekly"]
assert not trend["trend_rank"].isna().any(), "trend_rank doit √™tre rempli pour trending_weekly"

df_snapshot.head(10)


Unnamed: 0,list_name,fetched_at_ts,endpoint,kitsu_id,position,list_rank,trend_rank
0,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,26004,1,1.0,
1,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,7176,2,2.0,
2,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,38,3,3.0,
3,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,14916,4,4.0,
4,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,24147,5,5.0,
5,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,57,6,6.0,
6,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,37280,7,7.0,
7,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,19682,8,8.0,
8,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,16554,9,9.0,
9,most_popular,2025-12-13 20:32:51+00:00,manga?sort=popularityRank,26401,10,10.0,


## 7) Construire `kitsu_series_core` (référentiel)

Si une même œuvre apparaît dans plusieurs listes, on conserve la ligne la plus “complète”.  
Clé : `kitsu_id`.


In [9]:
# One candidate record per row of the de-duplicated lists.
candidates = df_list_clean.to_dict(orient="records")

# For each kitsu_id keep the record with the highest completeness score; on a
# tie the first record seen wins.
best_by_id = {}
_best_score_by_id = {}
for r in candidates:
    kid = r["kitsu_id"]
    # After the DataFrame round trip a missing id is NaN rather than None, so
    # test with pd.isna (which covers both) to skip id-less rows.
    if pd.isna(kid):
        continue
    score = completeness_score(r)
    # Scores are cached so that each record is scored only once.
    if kid not in best_by_id or score > _best_score_by_id[kid]:
        best_by_id[kid] = r
        _best_score_by_id[kid] = score

df_core = pd.DataFrame(best_by_id.values())

# Normalized titles (for matching): one `title_norm_*` column per title variant.
for _variant in ("canonical", "en", "ja"):
    df_core[f"title_norm_{_variant}"] = df_core[f"title_{_variant}"].apply(norm_title)

# Unified tags (RAG)
def norm_tag(t):
    """Clean a tag and map known spelling variants to one canonical label.

    Returns None when the tag is empty after cleaning; otherwise the canonical
    label when the transliterated, lowercased tag is a known alias, else the
    cleaned tag itself.
    """
    cleaned = clean_text(t)
    if not cleaned:
        return None
    aliases = {
        "sci-fi": "Science Fiction",
        "sci fi": "Science Fiction",
    }
    lookup_key = unidecode(cleaned).lower().strip()
    if lookup_key in aliases:
        return aliases[lookup_key]
    return cleaned

def _normalized_tags(values):
    """Return the sorted, de-duplicated normalized tags of `values`, dropping empty ones."""
    return sorted({tag for tag in map(norm_tag, values) if tag})


df_core["categories_norm"] = df_core["categories"].apply(_normalized_tags)
df_core["genres_norm"] = df_core["genres"].apply(_normalized_tags)
# Union of both normalized tag lists, sorted.
df_core["tags_all_norm"] = df_core.apply(
    lambda row: sorted({*row["categories_norm"], *row["genres_norm"]}),
    axis=1,
)

df_core = df_core.sort_values("kitsu_id").reset_index(drop=True)

# Cell output: preview of the rating and popularity fields.
_preview_columns = [
    "kitsu_id", "title_canonical", "rating_average_raw",
    "rating_average_10", "rating_rank", "popularity_rank",
]
df_core[_preview_columns].head(10)


Unnamed: 0,kitsu_id,title_canonical,rating_average_raw,rating_average_10,rating_rank,popularity_rank
0,4,Monster,83.24,8.32,55,61
1,8,Berserk,84.84,8.48,5,17
2,12,20th Century Boys,82.67,8.27,103,55
3,23,Hajime no Ippo,82.65,8.27,106,199
4,35,NARUTO,82.36,8.24,211,27
5,37,BLEACH,77.8,7.78,1846,39
6,38,One Piece,85.05,8.5,2,3
7,57,Death Note,84.41,8.44,14,6
8,64,D.Gray-man,82.46,8.25,155,110
9,66,Fullmetal Alchemist,83.96,8.4,31,19


## 8) Table auteurs (optionnelle)

Même sans reviews, les auteurs (quand présents) enrichissent le document RAG.


In [10]:
# One row per (kitsu_id, author) pair found in the de-duplicated lists.
# Entries of authors_raw that are not dicts are skipped.
rows = []
for kid, authors in zip(df_list_clean["kitsu_id"], df_list_clean["authors_raw"]):
    for a in (authors or []):
        if isinstance(a, dict):
            rows.append({
                "kitsu_id": kid,
                "author_name": clean_text(a.get("name")),
                "author_role": clean_text(a.get("role")),
            })

# Explicit columns keep the frame well-formed when no author was found at all;
# without them dropna(subset=...) raises a KeyError on the column-less frame.
df_authors = (
    pd.DataFrame(rows, columns=["kitsu_id", "author_name", "author_role"])
    .dropna(subset=["kitsu_id", "author_name"])
    .drop_duplicates()
)
df_authors.head(10)


Unnamed: 0,kitsu_id,author_name,author_role
0,26004,Kouhei Horikoshi,Sc√©nario & Dessin
1,7176,Sui Ishida,Sc√©nario & Dessin
2,38,Eiichiro Oda,Sc√©nario & Dessin
3,14916,Hajime Isayama,Sc√©nario & Dessin
4,24147,Yusuke Murata,Dessin
5,24147,ONE,Sc√©nario
6,57,Takeshi Obata,Dessin
7,57,Tsugumi Ohba,Sc√©nario
8,37280,Koyoharu Gotouge,Sc√©nario & Dessin
9,19682,Yuusei Matsui,Sc√©nario & Dessin


## 9) Exports CSV (compatibles PostgreSQL)

On génère :
- `Preparation_weekly/export/kitsu_series_core.csv`
- `Preparation_weekly/export/kitsu_weekly_snapshot.csv`
- `Preparation_weekly/export/kitsu_series_authors.csv` (optionnel)
- `Preparation_weekly/export/kitsu_rag_documents.csv` (bonus embeddings)


In [11]:
import json as _json

OUT_DIR.mkdir(parents=True, exist_ok=True)

core_path = OUT_DIR / "kitsu_series_core.csv"
snap_path = OUT_DIR / "kitsu_weekly_snapshot.csv"
authors_path = OUT_DIR / "kitsu_series_authors.csv"
rag_path = OUT_DIR / "kitsu_rag_documents.csv"


def _with_nullable_ints(frame, columns):
    """Return a copy of `frame` with `columns` cast to pandas' nullable Int64 dtype.

    An integer column holding missing values is stored as float64 and would be
    written as "1.0", which PostgreSQL rejects for INTEGER/BIGINT columns.
    With Int64 the values are written as "1" and missing values as empty fields.
    """
    return frame.astype({col: "Int64" for col in columns})


# JSON serialization of the tag lists, for JSONB columns.
df_core_out = df_core.copy()
df_core_out["categories_json"] = df_core_out["categories_norm"].apply(lambda x: _json.dumps(x, ensure_ascii=False))
df_core_out["genres_json"] = df_core_out["genres_norm"].apply(lambda x: _json.dumps(x, ensure_ascii=False))
df_core_out["tags_all_json"] = df_core_out["tags_all_norm"].apply(lambda x: _json.dumps(x, ensure_ascii=False))

df_core_out = df_core_out[[
    "kitsu_id","slug","status",
    "title_canonical","title_en","title_ja",
    "title_norm_canonical","title_norm_en","title_norm_ja",
    "synopsis_clean",
    "rating_average_raw","rating_average_10","rating_rank","popularity_rank",
    "categories_json","genres_json","tags_all_json"
]].copy()
df_core_out = _with_nullable_ints(df_core_out, ["kitsu_id", "rating_rank", "popularity_rank"])

df_core_out.to_csv(core_path, index=False)
# The in-memory frames are left untouched; only the written copies are cast.
_with_nullable_ints(
    df_snapshot, ["kitsu_id", "position", "list_rank", "trend_rank"]
).to_csv(snap_path, index=False)
_with_nullable_ints(df_authors, ["kitsu_id"]).to_csv(authors_path, index=False)

print("‚úÖ Exports √©crits:")
print(core_path.resolve())
print(snap_path.resolve())
print(authors_path.resolve())


‚úÖ Exports √©crits:
/home/maxime/python/certification/preparation_bdd/Preparation_weekly/export/kitsu_series_core.csv
/home/maxime/python/certification/preparation_bdd/Preparation_weekly/export/kitsu_weekly_snapshot.csv
/home/maxime/python/certification/preparation_bdd/Preparation_weekly/export/kitsu_series_authors.csv


## 10) Bonus RAG : `kitsu_rag_documents.csv`

Document texte minimal (sans reviews) :
- titres
- synopsis_clean
- tags_all
- auteurs (si dispo)


In [12]:
def build_doc(row):
    """Build the plain-text RAG document for one series row.

    The document has up to four newline-separated sections, in this order and
    each only when it has content: distinct titles, synopsis, tags, and authors
    (looked up in the module-level `df_authors` by kitsu_id).

    Args:
        row: Mapping-like row exposing `.get` (a dict or a DataFrame row).

    Returns:
        The document text; an empty string when no section has content.
    """
    sections = []

    # Titles: drop empty ones, then de-duplicate while keeping the order.
    candidates = (row.get("title_canonical"), row.get("title_en"), row.get("title_ja"))
    distinct_titles = list(dict.fromkeys(t for t in candidates if t))
    if distinct_titles:
        sections.append("Titres: " + " | ".join(distinct_titles))

    synopsis = row.get("synopsis_clean")
    if synopsis:
        sections.append("Synopsis: " + synopsis)

    tag_list = row.get("tags_all_norm") or []
    if tag_list:
        sections.append("Tags: " + ", ".join(tag_list))

    series_id = row.get("kitsu_id")
    if series_id is not None and not df_authors.empty:
        credits = df_authors[df_authors["kitsu_id"] == series_id]
        if not credits.empty:
            # The role is shown in parentheses only when there is one.
            labels = [
                f"{credit['author_name']} ({credit['author_role']})"
                if credit.get("author_role")
                else credit["author_name"]
                for _, credit in credits.iterrows()
            ]
            sections.append("Auteurs: " + "; ".join(labels))

    return "\n".join(sections)

# Columns needed to build and identify each RAG document.
_rag_columns = [
    "kitsu_id", "slug", "title_canonical", "title_en", "title_ja",
    "synopsis_clean", "tags_all_norm",
]
df_rag = df_core[_rag_columns].copy()
df_rag["doc_text"] = df_rag.apply(build_doc, axis=1)

# Export copy: the tag list is replaced by its JSON serialization, appended
# as the last column.
df_rag_out = df_rag.drop(columns=["tags_all_norm"])
df_rag_out["tags_all_json"] = df_rag["tags_all_norm"].apply(
    lambda tag_list: _json.dumps(tag_list, ensure_ascii=False)
)

df_rag_out.to_csv(rag_path, index=False)
print("‚úÖ Export RAG:", rag_path.resolve())

df_rag[["kitsu_id", "doc_text"]].head(2)


‚úÖ Export RAG: /home/maxime/python/certification/preparation_bdd/Preparation_weekly/export/kitsu_rag_documents.csv


Unnamed: 0,kitsu_id,doc_text
0,4,Titres: Monster | MONSTER\nSynopsis: Dr. Kenzo...
1,8,Titres: Berserk | „Éô„É´„Çª„É´„ÇØ\nSynopsis: His name is...


## 11) (Optionnel) DDL PostgreSQL (exemple)

```sql
-- Reference table: one row per series, keyed by kitsu_id.
-- Columns are listed in the same order as kitsu_series_core.csv.
CREATE TABLE IF NOT EXISTS manga.kitsu_series_core (
  kitsu_id BIGINT PRIMARY KEY,
  slug TEXT,
  status TEXT,
  title_canonical TEXT,
  title_en TEXT,
  title_ja TEXT,
  title_norm_canonical TEXT,
  title_norm_en TEXT,
  title_norm_ja TEXT,
  synopsis_clean TEXT,
  rating_average_raw DOUBLE PRECISION,
  rating_average_10 DOUBLE PRECISION,
  rating_rank INTEGER,
  popularity_rank INTEGER,
  categories_json JSONB,
  genres_json JSONB,
  tags_all_json JSONB
);

-- History table: one row per (list_name, fetched_at_ts, kitsu_id).
-- NOTE(review): kitsu_weekly_snapshot.csv is written with the columns in the
-- order list_name, fetched_at_ts, endpoint, kitsu_id, position, list_rank,
-- trend_rank, which differs from the order below. Pass an explicit column list
-- in that order to \copy when loading it.
CREATE TABLE IF NOT EXISTS manga.kitsu_weekly_snapshot (
  list_name TEXT NOT NULL,
  fetched_at_ts TIMESTAMPTZ NOT NULL,
  kitsu_id BIGINT NOT NULL REFERENCES manga.kitsu_series_core(kitsu_id),
  position INTEGER NOT NULL,      -- order of appearance (ranking within the list)
  list_rank INTEGER,             -- explicit rank (most_popular/top_publishing)
  trend_rank INTEGER,            -- = position, only for trending_weekly
  endpoint TEXT,
  PRIMARY KEY (list_name, fetched_at_ts, kitsu_id)
);
```
