In [None]:
import requests
import json

In [None]:
import pandas as pd, numpy as np, re, os, json
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from tqdm import tqdm
import time
import pickle
import joblib
from pathlib import Path

In [None]:
# Directory that holds the cached API download and every exported artifact.
DEPLOY_DIR = Path("deployment_cb")
# exist_ok=True keeps a re-run of this cell from failing once the directory exists.
DEPLOY_DIR.mkdir(exist_ok=True)

In [None]:
# GraphQL document that fetch_media POSTs to URL. It requests one page of media
# of the given $type, sorted by descending popularity, together with the
# metadata fields that build_dataframe later flattens into columns.
ANILIST_QUERY = """
query ($page: Int, $perPage: Int, $type: MediaType) {
  Page(page: $page, perPage: $perPage) {
    media(type: $type, sort: POPULARITY_DESC) {
      id
      type
      format
      status
      title { romaji english native }
      description(asHtml: false)
      genres
      tags { name rank isGeneralSpoiler isMediaSpoiler }
      averageScore
      meanScore
      popularity
      favourites
      source
      startDate { year month day}
      endDate { year month day }
      season
      seasonYear
      countryOfOrigin
      episodes
      duration
      chapters
      volumes
      studios(isMain: true) { nodes { name siteUrl} }
       relations {
        edges {
          relationType
          node { id type title { romaji english } }
        }
      }
      coverImage { large color medium }
      bannerImage
      trailer { id site thumbnail }
    }
  }
}
"""

In [None]:
# Media types fetch_media iterates over; each one is sent as the query's $type variable.
MEDIA_TYPES = ["ANIME", "MANGA"]
# Endpoint that fetch_media POSTs the GraphQL query to.
URL = "https://graphql.anilist.co"

In [None]:
import random
# NOTE(review): same value as the URL assignment in the previous cell; the two
# definitions have to be kept in sync, or one of them removed.
URL = "https://graphql.anilist.co"

def _retry_delay(response, default=2.0, cap=60.0):
    """Return how many seconds to wait before retrying a failed request.

    Uses the response's ``Retry-After`` header when it holds a number of
    seconds (never less than ``default``, never more than ``cap``); any other
    value, including an HTTP-date, falls back to ``default``.
    """
    try:
        delay = float(response.headers.get("Retry-After", default))
    except (TypeError, ValueError):
        return default
    return min(max(delay, default), cap)


def fetch_media(start_page=41, end_page=81, per_page=50, use_cache=True):
    """Download media pages for every type in MEDIA_TYPES, backed by a JSON cache.

    Args:
        start_page: First page requested (inclusive).
        end_page: Page at which to stop (exclusive, as in ``range``).
        per_page: Number of items requested per page.
        use_cache: When True and the cache file for this exact
            (start_page, end_page, per_page) combination exists in DEPLOY_DIR,
            its contents are returned without any network access.

    Returns:
        list[dict]: Raw media dicts, each tagged with ``__fetched_type``. Only
        entries that have a title and at least one of description, genres,
        tags or studio nodes are kept.

    Each page is attempted up to three times. Pages that still fail are
    reported and skipped; the result is written to the cache either way, so a
    warning is printed when the cached file is known to be incomplete.
    """
    cache_file = DEPLOY_DIR / f"all_media_{start_page}to{end_page}_{per_page}.json"
    if use_cache and cache_file.exists():
        print(f"Using cached media JSON: {cache_file.name}")
        # Context manager so the file handle is closed deterministically.
        with open(cache_file, "r", encoding="utf-8") as f:
            return json.load(f)

    all_items = []
    failed_pages = []
    for media_type in MEDIA_TYPES:
        print(f"\n=== Fetching {media_type} from page {start_page} to {end_page} ===")
        for page in range(start_page, end_page):
            success = False
            for attempt in range(3):  # retry up to 3 times per page
                try:
                    variables = {"page": page, "perPage": per_page, "type": media_type}
                    r = requests.post(URL, json={"query": ANILIST_QUERY, "variables": variables}, timeout=15)
                    if r.status_code != 200:
                        print(f"HTTP {r.status_code} for {media_type} page {page}")
                        # Honour a numeric Retry-After (e.g. on rate limiting)
                        # instead of always retrying after a fixed 2 seconds.
                        time.sleep(_retry_delay(r))
                        continue

                    data = r.json()
                    page_block = (data.get("data") or {}).get("Page")
                    if page_block is None:
                        # A response without page data is a failed attempt;
                        # show whatever error detail the server sent.
                        print(f"No page data for {media_type} page {page}: {data.get('errors')}")
                        time.sleep(2)
                        continue
                    media_list = page_block.get("media") or []

                    # Retry if page incomplete
                    if len(media_list) < per_page and attempt < 2:
                        print(f"Page {page} ({media_type}) incomplete ({len(media_list)} items), retrying...")
                        time.sleep(2)
                        continue

                    # Collect into a page-local list first: if anything below
                    # raises, the retry starts clean instead of re-appending
                    # items that were already added to all_items.
                    page_items = []
                    for m in media_list:
                        if not m:
                            continue
                        m["__fetched_type"] = media_type
                        has_studios = bool((m.get("studios") or {}).get("nodes"))
                        has_metadata = bool(
                            m.get("description") or m.get("genres") or m.get("tags") or has_studios
                        )
                        if m.get("title") and has_metadata:
                            page_items.append(m)
                    all_items.extend(page_items)

                    print(f"Page {page} ({media_type}) fetched: {len(media_list)} items")
                    success = True
                    break  # stop retrying if success

                except Exception as e:
                    # Deliberately broad: one bad page must not abort the whole crawl.
                    print(f"Error fetching {media_type} page {page}: {e}")
                    time.sleep(2)

            if not success:
                print(f"Failed to fetch {media_type} page {page} after 3 retries.")
                failed_pages.append((media_type, page))

            time.sleep(1 + random.random())  # polite delay (1–2s)

    if failed_pages:
        print(f"Warning: {len(failed_pages)} page(s) are missing from the cached result: {failed_pages}")

    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(all_items, f, indent=2, ensure_ascii=False)
    print(f"\nFinished: fetched & cached {len(all_items)} items → {cache_file.name}")
    return all_items


In [None]:
def clean_text(text):
    """Normalise free text to lower-case words separated by single spaces.

    Falsy input yields "". Otherwise three substitutions run in order, each
    replacing its match with a space: markup tags, every character that is not
    an ASCII letter/digit, a character in U+00C0..U+017F or whitespace, and
    finally runs of whitespace. The result is stripped and lower-cased.
    """
    if not text:
        return ""
    patterns = (
        r"<[^>]+>",
        r"[^0-9A-Za-z\u00C0-\u017F\s]",
        r"\s+",
    )
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip().lower()
def safe_join(items):
    """Join the truthy entries of ``items`` with single spaces, converting each to ``str``."""
    truthy = filter(None, items)
    return " ".join(map(str, truthy))

In [None]:
# Column layout of the frame returned by build_dataframe. It is only used to
# give an empty result the same schema as a populated one, so it must list the
# keys of the row dict below in the same order.
_MEDIA_COLUMNS = (
    "id", "fetched_type", "title_romaji", "title_english", "title_native",
    "display_title", "description", "genres", "tags", "studio", "studio_links",
    "averageScore", "meanScore", "popularity", "favourites", "source",
    "start_year", "start_month", "start_day", "end_year", "end_month", "end_day",
    "season", "country", "episodes", "duration", "chapters", "volumes",
    "relations", "format", "status", "coverImage", "bannerImage",
    "trailer_thumbnail",
)


def build_dataframe(raw_list):
    """Flatten raw media dicts into one row per unique media id.

    Args:
        raw_list: Iterable of media dicts as returned by fetch_media. Falsy
            entries, entries without a title and entries with no description,
            genres, tags or studio names are skipped.

    Returns:
        pandas.DataFrame with the columns listed in ``_MEDIA_COLUMNS``,
        de-duplicated on ``id`` (first occurrence wins) and re-indexed from 0.
        When nothing survives the filters an empty frame with those same
        columns is returned instead of raising.

    Notes:
        Text columns go through clean_text; missing numeric fields become 0,
        except the date parts other than ``start_year``, which stay None.
        ``display_title`` is the first of romaji / english / native that is
        non-blank after stripping.
    """
    rows = []
    for m in raw_list or []:
        if not m or not m.get("title"):
            continue  # skip invalid items

        title_obj = m.get("title") or {}

        relations_text = []
        for e in (m.get("relations") or {}).get("edges") or []:
            # A null edge or node simply contributes no relation text.
            node = (e or {}).get("node") or {}
            t = node.get("title") or {}
            tname = t.get("romaji") or t.get("english") or ""
            if tname:
                relations_text.append(f"{e.get('relationType','')} {tname}")

        tags = [t.get("name") for t in (m.get("tags") or []) if t]
        studios_nodes = [s for s in ((m.get("studios") or {}).get("nodes") or []) if s]
        studios = [s.get("name") for s in studios_nodes if s.get("name")]
        studio_links = [s.get("siteUrl") for s in studios_nodes if s.get("siteUrl")]
        start = m.get("startDate") or {}
        end = m.get("endDate") or {}

        # Skip entries without any useful metadata
        if not any([m.get("description"), m.get("genres"), tags, studios]):
            continue

        # A whitespace-only title must not shadow a usable one further down the list.
        title_candidates = (title_obj.get("romaji"), title_obj.get("english"), title_obj.get("native"))
        display_title = next((c.strip() for c in title_candidates if c and c.strip()), "")

        row = {
            "id": m.get("id"),
            "fetched_type": m.get("__fetched_type", m.get("type")),
            "title_romaji": title_obj.get("romaji") or "",
            "title_english": title_obj.get("english") or "",
            "title_native": title_obj.get("native") or "",
            "display_title": display_title,
            "description": clean_text(m.get("description") or ""),
            "genres": clean_text(safe_join(m.get("genres") or [])),
            "tags": clean_text(safe_join(tags)),
            "studio": clean_text(safe_join(studios)),
            "studio_links": safe_join(studio_links),
            "averageScore": m.get("averageScore") or 0,
            "meanScore": m.get("meanScore") or 0,
            "popularity": m.get("popularity") or 0,
            "favourites": m.get("favourites") or 0,
            "source": (m.get("source") or "").lower(),
            "start_year": start.get("year") or m.get("seasonYear") or 0,
            "start_month": start.get("month"),
            "start_day": start.get("day"),
            "end_year": end.get("year"),
            "end_month": end.get("month"),
            "end_day": end.get("day"),
            "season": (m.get("season") or "").lower(),
            "country": (m.get("countryOfOrigin") or "").lower(),
            "episodes": m.get("episodes") or 0,
            "duration": m.get("duration") or 0,
            "chapters": m.get("chapters") or 0,
            "volumes": m.get("volumes") or 0,
            "relations": clean_text(" ".join(relations_text)),
            "format": (m.get("format") or "").lower(),
            "status": (m.get("status") or "").lower(),
            "coverImage": (m.get("coverImage") or {}).get("large") or "",
            "bannerImage": m.get("bannerImage") or "",
            "trailer_thumbnail": (m.get("trailer") or {}).get("thumbnail") or ""
        }
        rows.append(row)

    if not rows:
        # No usable rows: still hand back the expected columns so downstream
        # column lookups behave the same as for a populated frame.
        return pd.DataFrame(columns=list(_MEDIA_COLUMNS))

    return pd.DataFrame(rows).drop_duplicates(subset=["id"]).reset_index(drop=True)


In [None]:
def build_weighted_text(df):
    """Add a ``combined_text`` column in which each text field is repeated by weight.

    A field's weight is turned into a repetition count of
    ``max(1, round(weight * 10))``; the repeated fields are then joined with
    single spaces in the order of the weight table. The column is written onto
    ``df`` itself, and that same object is returned.
    """
    field_weights = {"description":0.43,"genres":0.20,"tags":0.15,"studio":0.05,"source":0.05,"relations":0.12}
    # The counts depend only on the weights, so derive them once up front.
    repeat_counts = {
        field: max(1, int(round(weight * 10)))
        for field, weight in field_weights.items()
    }

    def weighted_row(row):
        chunks = [
            ((row.get(field, "") + " ") * count).strip()
            for field, count in repeat_counts.items()
        ]
        return " ".join(chunks)

    df["combined_text"] = df.apply(weighted_row, axis=1)
    return df

In [None]:
def build_categorical_sim(df):
    """Pairwise cosine similarity of the one-hot encoded categorical columns.

    ``format``, ``season`` and ``country`` are cast to strings (missing values
    become ""), one-hot encoded, and each row is scaled to unit length before
    the row-by-row dot product is taken. Rows with zero norm are left as zeros.
    """
    categorical = df[["format", "season", "country"]].fillna("").astype(str)
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    one_hot = encoder.fit_transform(categorical)

    row_norms = np.linalg.norm(one_hot, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 instead of 0 so they stay all-zero.
    unit_rows = one_hot / np.where(row_norms == 0, 1.0, row_norms)
    return unit_rows @ unit_rows.T


In [None]:
# Numeric columns that build_numeric_sim scales and compares; columns missing
# from the frame are ignored there.
NUMERIC_FEATURES = [
    "meanScore",
    "averageScore",
    "popularity",
    "favourites",
    "duration",
    "episodes",
    "chapters",
    "volumes",
]

In [None]:
def build_numeric_sim(df):
    """Pairwise cosine similarity of the min-max scaled numeric features.

    Args:
        df: Frame holding some or all of the NUMERIC_FEATURES columns; missing
            values are treated as 0.

    Returns:
        (n, n) numpy array with values in [0, 1]; all zeros when none of the
        numeric columns is present.

    Rows are scaled to unit length before the dot product, using the same row
    normalisation as build_categorical_sim. A plain dot product of min-max
    rows can grow up to the number of features, which would give this matrix
    more influence in the weighted fusion than its weight implies.
    """
    present = [c for c in NUMERIC_FEATURES if c in df.columns]
    if not present:
        return np.zeros((len(df), len(df)))
    mat = MinMaxScaler().fit_transform(df[present].fillna(0).astype(float))
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows stay zero instead of dividing by 0
    mat_norm = mat / norms
    return np.dot(mat_norm, mat_norm.T)

In [None]:
def build_tfidf_sim(df):
    """Lexical similarity between the rows' ``combined_text``.

    Fits a TF-IDF vectoriser capped at 5000 features on the column (missing
    values become "") and multiplies the document-term matrix by its own
    transpose.

    Returns:
        tuple: (dense n x n similarity array, fitted TfidfVectorizer).
    """
    documents = df["combined_text"].fillna("").tolist()
    tfidf = TfidfVectorizer(max_features=5000)
    doc_term = tfidf.fit_transform(documents)
    # On the sparse matrix returned by the vectoriser, `*` is a matrix product.
    similarity = (doc_term * doc_term.T).toarray()
    return similarity, tfidf

In [None]:
def build_semantic_sim(df):
    """Cosine similarity between sentence embeddings of ``combined_text``.

    Every row's text is encoded with the all-mpnet-base-v2 sentence-transformer,
    each embedding is divided by its L2 norm, and the matrix of pairwise dot
    products is returned.
    """
    encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    texts = df["combined_text"].tolist()
    vectors = encoder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    unit_vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    return unit_vectors @ unit_vectors.T

In [None]:
# Share of each similarity matrix in the fused score (the values sum to 1.0);
# fuse_similarities combines the matrices with this same 0.6 / 0.2 / 0.15 / 0.05 split.
FUSION_WEIGHTS = dict(
    semantic=0.6,
    lexical=0.2,
    numeric=0.15,
    categorical=0.05,
)

In [None]:
def fuse_similarities(sem_sim, lex_sim, num_sim, cat_sim, weights=None):
    """Blend the four similarity matrices and min-max scale the result to [0, 1].

    Args:
        sem_sim, lex_sim, num_sim, cat_sim: Arrays of identical shape.
        weights: Optional mapping with the keys "semantic", "lexical",
            "numeric" and "categorical". Defaults to the module-level
            FUSION_WEIGHTS, so the weighting is configured in one place.

    Returns:
        numpy array of the same shape. An empty input is returned as an empty
        blend, and a blend whose entries are all equal is returned as zeros
        rather than the NaNs a 0/0 division would give.
    """
    w = FUSION_WEIGHTS if weights is None else weights
    fused = (
        w["semantic"] * sem_sim
        + w["lexical"] * lex_sim
        + w["numeric"] * num_sim
        + w["categorical"] * cat_sim
    )
    if fused.size == 0:
        return fused

    lowest = fused.min()
    span = fused.max() - lowest
    if span == 0:
        # Every pair is equally similar, so there is no range to rescale.
        return np.zeros_like(fused, dtype=float)
    return (fused - lowest) / span

In [None]:
def apply_recency_weight(df, fused_sim, recency_weight=0.1):
    """Blend a pairwise recency bonus into the fused similarity matrix.

    Args:
        df: Frame with a ``start_year`` column and optionally ``end_year``,
            in the same row order as ``fused_sim``.
        fused_sim: (n, n) similarity array.
        recency_weight: Share of the recency term in the result; 0.0-0.2 is enough.

    Returns:
        (n, n) numpy array min-max scaled to [0, 1]. ``fused_sim`` is returned
        unchanged when ``start_year`` is absent or holds no usable year. A
        blend whose entries are all equal is returned as zeros instead of NaN.

    Notes:
        build_dataframe stores an unknown start year as 0, so non-positive and
        non-numeric years are treated as missing and replaced by the earliest
        known year. Left in place, a single 0 would stretch the year range to
        roughly 0-2000 and squash every real year into the top of the scale.
        A missing ``end_year`` is taken to be the current year.
    """
    if "start_year" not in df.columns:
        return fused_sim

    current_year = pd.Timestamp.now().year

    start_year = pd.to_numeric(df["start_year"], errors="coerce").astype(float)
    start_year = start_year.where(start_year > 0)  # 0 is the "unknown" placeholder
    if start_year.notna().sum() == 0:
        return fused_sim
    start_year = start_year.fillna(start_year.min())

    if "end_year" in df.columns:
        end_year = pd.to_numeric(df["end_year"], errors="coerce").astype(float).fillna(current_year)
    else:
        # Built on df.index so the subtraction below aligns row by row.
        end_year = pd.Series(float(current_year), index=df.index)

    # newer start_year gets higher score
    earliest = start_year.min()
    recency_score = (start_year - earliest) / (start_year.max() - earliest + 1e-9)
    ongoing_boost = ((end_year - start_year) / (current_year - earliest + 1e-9)) * 0.5
    recency_score = recency_score + ongoing_boost

    # normalize 0–1
    recency_score = (recency_score - recency_score.min()) / (recency_score.max() - recency_score.min() + 1e-9)

    # pairwise weighting
    scores = recency_score.to_numpy()
    recency_boost = np.outer(scores, scores)
    adjusted = (1 - recency_weight) * fused_sim + recency_weight * recency_boost

    # normalize final matrix 0–1
    lowest = adjusted.min()
    span = adjusted.max() - lowest
    if span == 0:
        return np.zeros_like(adjusted, dtype=float)
    return (adjusted - lowest) / span


In [None]:
# End-to-end run: load (or download) the media, flatten it into a frame with
# the weighted text column, build the four similarity matrices, fuse them and
# blend in the recency adjustment.
raw_data = fetch_media(start_page=41, end_page=81, per_page=50, use_cache=True)
df = build_weighted_text(build_dataframe(raw_data))

cat_sim = build_categorical_sim(df)
num_sim = build_numeric_sim(df)
lex_sim, tfidf = build_tfidf_sim(df)
sem_sim = build_semantic_sim(df)

fused = apply_recency_weight(
    df,
    fuse_similarities(sem_sim, lex_sim, num_sim, cat_sim),
    recency_weight=0.1,
)

# Sanity check: every matrix should be n x n for the n rows of df.
print("Shapes:")
print("\n".join(
    f"{label} {matrix.shape}"
    for label, matrix in (
        ("Semantic:", sem_sim),
        ("Lexical:", lex_sim),
        ("Numeric:", num_sim),
        ("Categorical:", cat_sim),
        ("Fused matrix shape:", fused),
    )
))



In [None]:
import joblib

# File names of the artifacts written by this cell. The manifest is built from
# this same mapping so it always names files that were actually saved.
# NOTE(review): the manifest used to list anime_cb_data.pkl, fused_sim.npy and
# tfidf_vectorizer.joblib, none of which this cell writes; confirm that no
# consumer of manifest.json relies on those names.
artifact_files = {
    "anime_data": "anime_cb_extended_data.pkl",
    "fused_sim": "fused_sim_extended.npy",
    "tfidf_vectorizer": "tfidf_vectorizer_extended.joblib",
}

with open(DEPLOY_DIR / artifact_files["anime_data"], "wb") as f:
    pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)
np.save(DEPLOY_DIR / artifact_files["fused_sim"], fused)
joblib.dump(tfidf, DEPLOY_DIR / artifact_files["tfidf_vectorizer"])

manifest_extended = dict(artifact_files)
with open(DEPLOY_DIR / "manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest_extended, f, indent=2)
print("CB artifacts saved in", DEPLOY_DIR)