In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/anime-recommendation-database-2020/watching_status.csv
/kaggle/input/anime-recommendation-database-2020/rating_complete.csv
/kaggle/input/anime-recommendation-database-2020/animelist.csv
/kaggle/input/anime-recommendation-database-2020/anime.csv
/kaggle/input/anime-recommendation-database-2020/anime_with_synopsis.csv
/kaggle/input/anime-recommendation-database-2020/html folder/instructions.txt
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/reviews_1.html
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/staff.html
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/reviews_2.html
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/pictures.html
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/stats.html
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/details.html
/kaggle/input/anime-recommendation-database-2020/html folder/html/5/reviews_3.html
/kaggle/input/anime

In [5]:
!pip install requests rapidfuzz sentence-transformers pandas

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-non

In [3]:
# Step 2: Define the AniList GraphQL Query and API URL

API_URL = "https://graphql.anilist.co"

ANILIST_QUERY = """
query ($page: Int, $perPage: Int, $sort: [MediaSort]) {
  Page(page: $page, perPage: $perPage) {
    pageInfo {
      total
      currentPage
      lastPage
      hasNextPage
      perPage
    }
    media(sort: $sort, type: ANIME) {
      id
      idMal
      title { romaji english native }
      description(asHtml: false)
      genres
      tags { name rank }
      meanScore
      popularity
      episodes
      season
      seasonYear
      format
      status
      studios(isMain: true) { nodes { name } }
      coverImage { extraLarge large medium }
      siteUrl
      synonyms
    }
  }
}
"""


In [4]:
import requests
import json

# Step 3: Function to execute one GraphQL request
def fetch_page(page=1, per_page=5, sort=["POPULARITY_DESC"]):
    response = requests.post(
        API_URL,
        json={
            "query": ANILIST_QUERY,
            "variables": {
                "page": page,
                "perPage": per_page,
                "sort": sort
            }
        }
    )
    # Check if the request worked
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print("Error:", response.status_code, response.text)
        return None

# Let's test it by fetching 5 anime from page 1
data = fetch_page(page=1, per_page=5)
print(json.dumps(data, indent=2))


{
  "data": {
    "Page": {
      "pageInfo": {
        "total": 21679,
        "currentPage": 1,
        "lastPage": 4336,
        "hasNextPage": true,
        "perPage": 5
      },
      "media": [
        {
          "id": 16498,
          "idMal": 16498,
          "title": {
            "romaji": "Shingeki no Kyojin",
            "english": "Attack on Titan",
            "native": "\u9032\u6483\u306e\u5de8\u4eba"
          },
          "description": "Several hundred years ago, humans were nearly exterminated by titans. Titans are typically several stories tall, seem to have no intelligence, devour human beings and, worst of all, seem to do it for the pleasure rather than as a food source. A small percentage of humanity survived by walling themselves in a city protected by extremely high walls, even taller than the biggest of titans.<br><br>\r\nFlash forward to the present and the city has not seen a titan in over 100 years. Teenage boy Eren and his foster sister Mikasa witness som

In [6]:
import re

# Step 4: Function to clean descriptions and flatten each anime entry into a simple Python dict

def clean_description(text: str) -> str:
    if text is None:
        return ""
    # Remove HTML tags (safety)
    text = re.sub(r"<[^>]+>", "", text)
    # Replace newlines and extra spaces
    text = text.replace("\n", " ").strip()
    text = re.sub(r"\s{2,}", " ", text)
    return text

def normalize_media_entry(m: dict) -> dict:
    studios = []
    try:
        studios = [n["name"] for n in m.get("studios", {}).get("nodes", [])]
    except:
        studios = []

    tags = [t.get("name") for t in m.get("tags") or []]

    return {
        "anilist_id": m.get("id"),
        "idMal": m.get("idMal"),
        "title_romaji": (m.get("title") or {}).get("romaji"),
        "title_english": (m.get("title") or {}).get("english"),
        "title_native": (m.get("title") or {}).get("native"),
        "description": clean_description(m.get("description")),
        "genres": m.get("genres") or [],
        "tags": tags,
        "meanScore": m.get("meanScore"),
        "popularity": m.get("popularity"),
        "episodes": m.get("episodes"),
        "season": m.get("season"),
        "seasonYear": m.get("seasonYear"),
        "format": m.get("format"),
        "status": m.get("status"),
        "studios": studios,
        "cover_large": (m.get("coverImage") or {}).get("large"),
        "siteUrl": m.get("siteUrl"),
        "synonyms": m.get("synonyms") or []
    }

# Let's apply it to the first anime entry from the last response:
sample_entry = data["data"]["Page"]["media"][0]
normalized = normalize_media_entry(sample_entry)
print(json.dumps(normalized, indent=2))


NameError: name 'data' is not defined

# Next step# 

In [7]:
# Step: Fetch a professional AniList dataset (Kaggle-ready)
import requests
import time
import json
import re
import pandas as pd
from typing import List, Dict

API_URL = "https://graphql.anilist.co"

ANILIST_QUERY = """
query ($page: Int, $perPage: Int, $sort: [MediaSort]) {
  Page(page: $page, perPage: $perPage) {
    pageInfo { total currentPage lastPage hasNextPage perPage }
    media(sort: $sort, type: ANIME) {
      id
      idMal
      title { romaji english native }
      synonyms
      description(asHtml: false)
      genres
      tags { name rank }
      meanScore
      averageScore
      popularity
      episodes
      season
      seasonYear
      format
      status
      studios(isMain: true) { nodes { name } }
      coverImage { large extraLarge medium }
      siteUrl
    }
  }
}
"""


def normalize_media_entry(m: dict) -> dict:
    """Flatten an AniList media object to a dict suitable for DataFrame rows."""
    # safe extraction helpers
    def safe_title(t): 
        if not t: return {"romaji": None, "english": None, "native": None}
        return {"romaji": t.get("romaji"), "english": t.get("english"), "native": t.get("native")}
    studios = []
    try:
        studios = [n["name"] for n in m.get("studios", {}).get("nodes", [])]
    except Exception:
        studios = []
    tags = [t.get("name") for t in m.get("tags") or []]
    title = safe_title(m.get("title"))
    return {
        "anilist_id": m.get("id"),
        "idMal": m.get("idMal"),
        "title_romaji": title.get("romaji"),
        "title_english": title.get("english"),
        "title_native": title.get("native"),
        "synonyms": m.get("synonyms") or [],
        "description": clean_description(m.get("description")),
        "genres": m.get("genres") or [],
        "tags": tags,
        "meanScore": m.get("meanScore"),
        "averageScore": m.get("averageScore"),
        "popularity": m.get("popularity"),
        "episodes": m.get("episodes"),
        "season": m.get("season"),
        "seasonYear": m.get("seasonYear"),
        "format": m.get("format"),
        "status": m.get("status"),
        "studios": studios,
        "cover_large": (m.get("coverImage") or {}).get("large"),
        "cover_xl": (m.get("coverImage") or {}).get("extraLarge"),
        "siteUrl": m.get("siteUrl")
    }

def fetch_anime_data_multi(pages:int=20, per_page:int=50, delay:float=0.6, sort:List[str]=None) -> pd.DataFrame:
    """Fetch pages x per_page anime entries and return a pandas DataFrame.
    Default pages=20 => 1000 anime (20 * 50).
    """
    sort = sort or ["POPULARITY_DESC"]
    collected = []
    for page in range(1, pages+1):
        variables = {"page": page, "perPage": per_page, "sort": sort}
        resp = requests.post(API_URL, json={"query": ANILIST_QUERY, "variables": variables}, timeout=30)
        if resp.status_code != 200:
            print(f"Warning: API returned {resp.status_code} at page {page}. Response text:\n{resp.text[:300]}")
            # polite backoff and retry once
            time.sleep(2.0)
            resp = requests.post(API_URL, json={"query": ANILIST_QUERY, "variables": variables}, timeout=30)
            if resp.status_code != 200:
                print(f"Failed again at page {page}, skipping page.")
                time.sleep(delay)
                continue
        payload = resp.json().get("data", {}).get("Page", {}).get("media", [])
        if not payload:
            print(f"No media returned for page {page}; stopping early.")
            break
        for m in payload:
            try:
                collected.append(normalize_media_entry(m))
            except Exception as e:
                print("Normalize error:", e)
        print(f"Fetched page {page} → total collected: {len(collected)}")
        time.sleep(delay)
    df = pd.DataFrame(collected)
    # Save to working dir
    out_json = f"/kaggle/working/anilist_top_{len(df)}.json"
    out_csv = f"/kaggle/working/anilist_top_{len(df)}.csv"
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(collected, f, ensure_ascii=False, indent=2)
    df.to_csv(out_csv, index=False)
    print(f"Saved JSON ({out_json}) and CSV ({out_csv})")
    return df

# Run the fetch (default 20 pages -> ~1000 entries)
anime_df = fetch_anime_data_multi(pages=20, per_page=50, delay=0.6)
print("Data shape:", anime_df.shape)
# show a sample row (first non-null)
sample_idx = anime_df['title_romaji'].first_valid_index()
if sample_idx is None:
    sample_idx = 0
print("Sample row:")
display(anime_df.iloc[[sample_idx]].T)


Fetched page 1 → total collected: 50
Fetched page 2 → total collected: 100
Fetched page 3 → total collected: 150
Fetched page 4 → total collected: 200
Fetched page 5 → total collected: 250
Fetched page 6 → total collected: 300
Fetched page 7 → total collected: 350
Fetched page 8 → total collected: 400
Fetched page 9 → total collected: 450
Fetched page 10 → total collected: 500
Fetched page 11 → total collected: 550
Fetched page 12 → total collected: 600
Fetched page 13 → total collected: 650
Fetched page 14 → total collected: 700
Fetched page 15 → total collected: 750
Fetched page 16 → total collected: 800
Fetched page 17 → total collected: 850
Fetched page 18 → total collected: 900
Fetched page 19 → total collected: 950
Fetched page 20 → total collected: 1000
Saved JSON (/kaggle/working/anilist_top_1000.json) and CSV (/kaggle/working/anilist_top_1000.csv)
Data shape: (1000, 21)
Sample row:


Unnamed: 0,0
anilist_id,16498
idMal,16498.0
title_romaji,Shingeki no Kyojin
title_english,Attack on Titan
title_native,進撃の巨人
synonyms,"[SnK, AoT, Ataque a los Titanes, Ataque dos Ti..."
description,"Several hundred years ago, humans were nearly ..."
genres,"[Action, Drama, Fantasy, Mystery]"
tags,"[Kaiju, Revenge, Tragedy, Military, Post-Apoca..."
meanScore,85.0


In [8]:
print("Duplicates by anilist_id:", anime_df['anilist_id'].duplicated().sum())
print("Missing idMal:", anime_df['idMal'].isna().sum())

Duplicates by anilist_id: 0
Missing idMal: 1


In [9]:
import pandas as pd
anime_kaggle = pd.read_csv('/kaggle/input/anime-recommendation-database-2020/anime.csv')
ratings_kaggle = pd.read_csv('/kaggle/input/anime-recommendation-database-2020/rating_complete.csv')


In [10]:
anime_kaggle.rename(columns={'MAL_ID': 'idMal'}, inplace=True)

In [11]:
# Merge API and Kaggle datasets using idMal
anime_merged = pd.merge(
    anime_df,
    anime_kaggle,
    on='idMal',
    how='inner',
    suffixes=('_api', '_kaggle')
)

# Basic diagnostics
print("Shape after merge:", anime_merged.shape)
print("\nPreview first 5 rows:")
display(anime_merged.head())


Shape after merge: (814, 55)

Preview first 5 rows:


Unnamed: 0,anilist_id,idMal,title_romaji,title_english,title_native,synonyms,description,genres,tags,meanScore,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,16498,16498.0,Shingeki no Kyojin,Attack on Titan,進撃の巨人,"[SnK, AoT, Ataque a los Titanes, Ataque dos Ti...","Several hundred years ago, humans were nearly ...","[Action, Drama, Fantasy, Mystery]","[Kaiju, Revenge, Tragedy, Military, Post-Apoca...",85.0,...,470882.0,514879.0,459113.0,220228.0,70768.0,31141.0,11805.0,4637.0,2707.0,4939.0
1,101922,38000.0,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,鬼滅の刃,"[KnY, Kimetsu no Yaiba: Kyoudai no Kizuna, Dem...","It is the Taisho Period in Japan. Tanjiro, a k...","[Action, Adventure, Drama, Fantasy, Supernatural]","[Demons, Shounen, Swordplay, Male Protagonist,...",82.0,...,243770.0,300149.0,242534.0,95902.0,27255.0,11430.0,3902.0,1484.0,862.0,3000.0
2,1535,1535.0,DEATH NOTE,Death Note,DEATH NOTE,"[デスノート, 死亡笔记, מחברת המוות, Notatnik śmierci, C...",Light Yagami is a genius high school student w...,"[Mystery, Psychological, Supernatural, Thriller]","[Crime, Detective, Anti-Hero, Male Protagonist...",84.0,...,557406.0,535252.0,415890.0,201522.0,68577.0,28048.0,10462.0,3692.0,2256.0,3586.0
3,113415,40748.0,Jujutsu Kaisen,JUJUTSU KAISEN,呪術廻戦,"[JJK, Sorcery Fight, 咒术回战, 주술회전, มหาเวทย์ผนึกม...","A boy fights... for ""the right death."" Hardshi...","[Action, Drama, Supernatural]","[Urban Fantasy, Shounen, Youkai, Super Power, ...",84.0,...,53357.0,81070.0,74305.0,24187.0,5181.0,2116.0,633.0,263.0,178.0,689.0
4,21459,31964.0,Boku no Hero Academia,My Hero Academia,僕のヒーローアカデミア,"[BNHA, MHA, 나의 히어로 아카데미아 1기, 나히아 1기, אקדמיית ה...",What would the world be like if 80 percent of ...,"[Action, Adventure, Comedy]","[Super Power, Superhero, Shounen, Primarily Te...",76.0,...,192539.0,318675.0,414913.0,253871.0,77961.0,29893.0,9015.0,3664.0,1807.0,3129.0


In [12]:
print("\nMissing values (Top 10 columns with most missing):")
print(anime_merged.isnull().sum().sort_values(ascending=False).head(10))



Missing values (Top 10 columns with most missing):
title_english    13
seasonYear        4
season            4
episodes          2
Dropped           0
Studios           0
Source            0
Duration          0
Rating            0
Ranked            0
dtype: int64


In [16]:
anime_merged.head()

Unnamed: 0,anilist_id,idMal,title_romaji,title_english,title_native,synonyms,description,genres,tags,meanScore,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,16498,16498.0,Shingeki no Kyojin,Attack on Titan,進撃の巨人,"[SnK, AoT, Ataque a los Titanes, Ataque dos Ti...","Several hundred years ago, humans were nearly ...","[Action, Drama, Fantasy, Mystery]","[Kaiju, Revenge, Tragedy, Military, Post-Apoca...",85.0,...,470882.0,514879.0,459113.0,220228.0,70768.0,31141.0,11805.0,4637.0,2707.0,4939.0
1,101922,38000.0,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,鬼滅の刃,"[KnY, Kimetsu no Yaiba: Kyoudai no Kizuna, Dem...","It is the Taisho Period in Japan. Tanjiro, a k...","[Action, Adventure, Drama, Fantasy, Supernatural]","[Demons, Shounen, Swordplay, Male Protagonist,...",82.0,...,243770.0,300149.0,242534.0,95902.0,27255.0,11430.0,3902.0,1484.0,862.0,3000.0
2,1535,1535.0,DEATH NOTE,Death Note,DEATH NOTE,"[デスノート, 死亡笔记, מחברת המוות, Notatnik śmierci, C...",Light Yagami is a genius high school student w...,"[Mystery, Psychological, Supernatural, Thriller]","[Crime, Detective, Anti-Hero, Male Protagonist...",84.0,...,557406.0,535252.0,415890.0,201522.0,68577.0,28048.0,10462.0,3692.0,2256.0,3586.0
3,113415,40748.0,Jujutsu Kaisen,JUJUTSU KAISEN,呪術廻戦,"[JJK, Sorcery Fight, 咒术回战, 주술회전, มหาเวทย์ผนึกม...","A boy fights... for ""the right death."" Hardshi...","[Action, Drama, Supernatural]","[Urban Fantasy, Shounen, Youkai, Super Power, ...",84.0,...,53357.0,81070.0,74305.0,24187.0,5181.0,2116.0,633.0,263.0,178.0,689.0
4,21459,31964.0,Boku no Hero Academia,My Hero Academia,僕のヒーローアカデミア,"[BNHA, MHA, 나의 히어로 아카데미아 1기, 나히아 1기, אקדמיית ה...",What would the world be like if 80 percent of ...,"[Action, Adventure, Comedy]","[Super Power, Superhero, Shounen, Primarily Te...",76.0,...,192539.0,318675.0,414913.0,253871.0,77961.0,29893.0,9015.0,3664.0,1807.0,3129.0


In [13]:
# 1. Identification columns
id_cols = [
    'anilist_id', 'idMal',
    'title_romaji', 'title_english', 'title_native', 'synonyms'
]

anime_id_df = anime_merged[id_cols].copy()

# 2. Content metadata
content_cols = [
    'description', 'genres', 'tags', 'studios', 'format', 'status',
    'episodes', 'season', 'seasonYear', 'Source', 'Duration', 'Rating'
]

anime_content_df = anime_merged[content_cols].copy()

# 3. Popularity/engagement metrics
popularity_cols = [
    'meanScore', 'averageScore', 'popularity',  # API
    'Score', 'Ranked', 'Popularity', 'Members', 'Favorites', 'Watching', 
    'Completed', 'On-Hold', 'Dropped', 'Plan to Watch'  # Kaggle
]

anime_popularity_df = anime_merged[popularity_cols].copy()

# 4. Combine cleaned features for final dataset
anime_final_df = pd.concat([anime_id_df, anime_content_df, anime_popularity_df], axis=1)

print("✅ Identification features:", anime_id_df.shape)
print("✅ Content features:", anime_content_df.shape)
print("✅ Popularity features:", anime_popularity_df.shape)
print("✅ Final dataset shape:", anime_final_df.shape)
print("\nPreview final dataset:")
display(anime_final_df.head())


✅ Identification features: (814, 6)
✅ Content features: (814, 12)
✅ Popularity features: (814, 13)
✅ Final dataset shape: (814, 31)

Preview final dataset:


Unnamed: 0,anilist_id,idMal,title_romaji,title_english,title_native,synonyms,description,genres,tags,studios,...,Score,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch
0,16498,16498.0,Shingeki no Kyojin,Attack on Titan,進撃の巨人,"[SnK, AoT, Ataque a los Titanes, Ataque dos Ti...","Several hundred years ago, humans were nearly ...","[Action, Drama, Fantasy, Mystery]","[Kaiju, Revenge, Tragedy, Military, Post-Apoca...",[WIT STUDIO],...,8.48,115.0,2,2531397,129844,140753,2182587,37345,44635,126077
1,101922,38000.0,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,鬼滅の刃,"[KnY, Kimetsu no Yaiba: Kyoudai no Kizuna, Dem...","It is the Taisho Period in Japan. Tanjiro, a k...","[Action, Adventure, Drama, Fantasy, Supernatural]","[Demons, Shounen, Swordplay, Male Protagonist,...",[ufotable],...,8.62,62.0,24,1423778,51725,154413,1079964,24614,19506,145281
2,1535,1535.0,DEATH NOTE,Death Note,DEATH NOTE,"[デスノート, 死亡笔记, מחברת המוות, Notatnik śmierci, C...",Light Yagami is a genius high school student w...,"[Mystery, Psychological, Supernatural, Thriller]","[Crime, Detective, Anti-Hero, Male Protagonist...",[MADHOUSE],...,8.63,60.0,1,2589552,145201,122401,2146116,75054,80834,165147
3,113415,40748.0,Jujutsu Kaisen,JUJUTSU KAISEN,呪術廻戦,"[JJK, Sorcery Fight, 咒术回战, 주술회전, มหาเวทย์ผนึกม...","A boy fights... for ""the right death."" Hardshi...","[Action, Drama, Supernatural]","[Urban Fantasy, Shounen, Youkai, Super Power, ...",[MAPPA],...,8.54,88.0,130,707976,20816,533016,299,13891,5177,155593
4,21459,31964.0,Boku no Hero Academia,My Hero Academia,僕のヒーローアカデミア,"[BNHA, MHA, 나의 히어로 아카데미아 1기, 나히아 1기, אקדמיית ה...",What would the world be like if 80 percent of ...,"[Action, Adventure, Comedy]","[Super Power, Superhero, Shounen, Primarily Te...",[bones],...,8.11,400.0,6,1909814,50005,90902,1655900,18092,19212,125708


In [14]:
import ast

def clean_list_column(series):
    return series.fillna('[]').apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

anime_final_df['genres'] = clean_list_column(anime_final_df['genres'])
anime_final_df['tags'] = clean_list_column(anime_final_df['tags'])

print("Genres example:", anime_final_df['genres'].head(3).tolist())
print("Tags example:", anime_final_df['tags'].head(3).tolist())


Genres example: [['Action', 'Drama', 'Fantasy', 'Mystery'], ['Action', 'Adventure', 'Drama', 'Fantasy', 'Supernatural'], ['Mystery', 'Psychological', 'Supernatural', 'Thriller']]
Tags example: [['Kaiju', 'Revenge', 'Tragedy', 'Military', 'Post-Apocalyptic', 'Primarily Teen Cast', 'Super Power', 'Male Protagonist', 'Henshin', 'Gore', 'Swordplay', 'Memory Manipulation', 'Steampunk', 'Dystopian', 'Vore', 'Ensemble Cast', 'Orphan', 'Espionage', 'Cannibalism', 'Kuudere', 'Shounen', 'Coming of Age', 'Suicide', 'Survival', 'Medieval', 'Time Skip', 'Amnesia', 'Rural', 'CGI', 'Primarily Male Cast', 'Adoption'], ['Demons', 'Shounen', 'Swordplay', 'Male Protagonist', 'Vampire', 'Tragedy', 'Primarily Teen Cast', 'Travel', 'Monster Girl', 'Revenge', 'Orphan', 'CGI', 'Gore', 'Primarily Male Cast', 'Mythology', 'Rural', 'Slapstick', 'Historical', 'Super Power', 'Rotoscoping', 'Body Horror', 'Curses', 'Chibi', 'Time Skip', 'Snowscape', 'Animals', 'Food'], ['Crime', 'Detective', 'Anti-Hero', 'Male Prot

In [15]:

anime_final_df['description'] = anime_final_df['description'].apply(clean_description)

print("✅ Description sample after cleaning:\n")
print(anime_final_df['description'].head(3).tolist())


✅ Description sample after cleaning:

['Several hundred years ago, humans were nearly exterminated by titans. Titans are typically several stories tall, seem to have no intelligence, devour human beings and, worst of all, seem to do it for the pleasure rather than as a food source. A small percentage of humanity survived by walling themselves in a city protected by extremely high walls, even taller than the biggest of titans. Flash forward to the present and the city has not seen a titan in over 100 years. Teenage boy Eren and his foster sister Mikasa witness something horrific as the city walls are destroyed by a colossal titan that appears out of thin air. As the smaller titans flood the city, the two kids watch in horror as their mother is eaten alive. Eren vows that he will murder every single titan and take revenge for all of mankind. (Source: MangaHelpers)', 'It is the Taisho Period in Japan. Tanjiro, a kindhearted boy who sells charcoal for a living, finds his family slaughtered

In [16]:
# Prefer AniList ID if available, otherwise fallback to MAL ID
anime_final_df['unique_id'] = anime_final_df['anilist_id'].fillna(anime_final_df['idMal'])

# Check duplicates
duplicate_count = anime_final_df.duplicated(subset=['unique_id']).sum()
print(f"Number of duplicates based on unique IDs: {duplicate_count}")

# Drop duplicates
anime_final_df = anime_final_df.drop_duplicates(subset=['unique_id'], keep='first')

Number of duplicates based on unique IDs: 0


            ┌────────────────────────────────────────────┐
            │              USER VECTOR (CF)              │
            │         Learned via matrix factorization   │
            └────────────────────────────────────────────┘
                               │
                               ▼
                     ┌────────────────┐
                     │   CONCATENATE  │◄── Anime Content Embeddings
                     └────────────────┘         • Description embedding
                               │               • Genres / Tags vectors
                               ▼               • Studios / Popularity
                     ┌──────────────────────────────┐
                     │       Neural Network (MLP)   │
                     │  Learns complex interactions │
                     └──────────────────────────────┘
                               │
                               ▼
                   🔚 Output: Predicted Rating / Ranking Score


# Next step

In [17]:
# Show basic stats
print("Shape:", ratings_kaggle.shape)
print("Missing values:", ratings_kaggle.isna().sum())
print("Sample rows:")
ratings_kaggle.head()
print("Rating distribution:")
ratings_kaggle['rating'].value_counts().head(10)


Shape: (57633278, 3)
Missing values: user_id     0
anime_id    0
rating      0
dtype: int64
Sample rows:
Rating distribution:


rating
8     14642156
7     13325549
9      9773857
6      6849293
10     6716048
5      3436250
4      1455102
3       696048
2       405556
1       333419
Name: count, dtype: int64

In [18]:
valid_anime_ids = anime_final_df['idMal'].dropna().astype(int).unique()
ratings_filtered = ratings_kaggle[ratings_kaggle['anime_id'].isin(valid_anime_ids)]

print("Original ratings:", ratings_kaggle.shape)
print("Filtered ratings:", ratings_filtered.shape)
print("Number of unique anime after filtering:", ratings_filtered['anime_id'].nunique())
print("Number of unique users after filtering:", ratings_filtered['user_id'].nunique())


Original ratings: (57633278, 3)
Filtered ratings: (28047257, 3)
Number of unique anime after filtering: 726
Number of unique users after filtering: 306507


In [19]:
# Step 1: Filter users with fewer than 
min_ratings_per_user = 100
user_counts = ratings_filtered['user_id'].value_counts()
active_users = user_counts[user_counts >= min_ratings_per_user].index
ratings_filtered = ratings_filtered[ratings_filtered['user_id'].isin(active_users)]

# Step 2: Filter anime with fewer than
min_anime_ratings = 50
anime_counts = ratings_filtered['anime_id'].value_counts()
popular_anime = anime_counts[anime_counts >= min_anime_ratings].index
ratings_filtered = ratings_filtered[ratings_filtered['anime_id'].isin(popular_anime)]

print("After filtering:")
print("Ratings:", ratings_filtered.shape)
print("Unique anime:", ratings_filtered['anime_id'].nunique())
print("Unique users:", ratings_filtered['user_id'].nunique())


After filtering:
Ratings: (20159917, 3)
Unique anime: 710
Unique users: 107478


In [20]:
# 1. Install Surprise if not already installed
!pip install scikit-surprise



In [21]:

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

# 2. Prepare the data for Surprise
# Surprise requires the columns to be in the order: user, item, rating
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_filtered[['user_id', 'anime_id', 'rating']], reader)

# 3. Split into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)



In [22]:
# count ratings in trainset
trainset_size = sum(1 for _ in trainset.all_ratings())

print(f"✅ Data successfully prepared for Surprise")
print(f"Trainset size: {trainset_size} ratings")
print(f"Testset size: {len(testset)} ratings")


✅ Data successfully prepared for Surprise
Trainset size: 16127933 ratings
Testset size: 4031984 ratings


In [23]:
# Step 1: Initialize the SVD model (default parameters)
svd_model = SVD()

# Step 2: Train the model
svd_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c92ba4d8f90>

In [24]:
# Step 3: Predict on testset
predictions = svd_model.test(testset)

# Step 4: Evaluate performance
rmse = accuracy.rmse(predictions)
print(f"✅ Baseline SVD RMSE: {rmse}")


RMSE: 1.0386
✅ Baseline SVD RMSE: 1.0386402087370183


# Next step

In [25]:
anime_final_df.columns


Index(['anilist_id', 'idMal', 'title_romaji', 'title_english', 'title_native',
       'synonyms', 'description', 'genres', 'tags', 'studios', 'format',
       'status', 'episodes', 'season', 'seasonYear', 'Source', 'Duration',
       'Rating', 'meanScore', 'averageScore', 'popularity', 'Score', 'Ranked',
       'Popularity', 'Members', 'Favorites', 'Watching', 'Completed',
       'On-Hold', 'Dropped', 'Plan to Watch', 'unique_id'],
      dtype='object')

In [26]:
def to_text(x):
    if isinstance(x, list):
        return ", ".join(x)
    return str(x) if pd.notnull(x) else ""

anime_final_df['content_text'] = (
    "Description: " + anime_final_df['description'].fillna('') + ". " +
    "Genres: " + anime_final_df['genres'].apply(to_text) + ". " +
    "Tags: " + anime_final_df['tags'].apply(to_text) + ". " +
    "Studios: " + anime_final_df['studios'].apply(to_text)
)

anime_final_df[['content_text']].head()


Unnamed: 0,content_text
0,"Description: Several hundred years ago, humans..."
1,Description: It is the Taisho Period in Japan....
2,Description: Light Yagami is a genius high sch...
3,"Description: A boy fights... for ""the right de..."
4,Description: What would the world be like if 8...


In [27]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("✅ Model loaded successfully!")


2025-10-28 04:13:03.155417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761624783.425076      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761624783.502261      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully!


In [28]:
import numpy as np
from tqdm import tqdm

# Convert content_text to list
content_list = anime_final_df['content_text'].tolist()

# Generate embeddings in batches
embeddings = model.encode(content_list, batch_size=32, show_progress_bar=True)

# Convert to numpy array
embeddings = np.array(embeddings)

# Add to dataframe
anime_final_df['content_embedding'] = list(embeddings)

print(f"✅ Generated embeddings with shape: {embeddings.shape}")


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

✅ Generated embeddings with shape: (814, 384)


In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

embeddings_matrix = embeddings
print("📌 Embeddings matrix shape:", embeddings_matrix.shape)

# Step 1: Normalize the embeddings to unit vectors (important for true cosine similarity behavior)
from sklearn.preprocessing import normalize
embeddings_normalized = normalize(embeddings_matrix)

# Step 2: Compute cosine similarity between all pairs
similarity_matrix = cosine_similarity(embeddings_normalized)

print("✅ Similarity matrix computed")
print("Similarity matrix shape:", similarity_matrix.shape)

#Build title -> 

A = anime_final_df['synonyms'].tolist()
B = anime_final_df['title_romaji'].tolist()
C= anime_final_df['title_english'].tolist()

titles =[sublist + [item_b, item_c] for sublist, item_b, item_c in zip(A, B, C)]

# 2. Get the "leftover" items from A
# We need to find the shortest list that was zipped *with* A
shortest_of_B_and_C = min(len(B), len(C))
leftovers = A[shortest_of_B_and_C:]  

# 3. Add the leftovers to your new list
titles.extend(leftovers)

# The Fix
title_to_index = {}
for i, item in enumerate(titles):
    if isinstance(item, list):
        # If the item is a list of synonyms, loop through it
        for title in item:
            # ADD THIS CHECK: Only process the title if it's a string
            if isinstance(title, str):
                title_to_index[title.lower()] = i
    elif isinstance(item, str):
        # If the item is just a string, add it directly
        title_to_index[item.lower()] = i
    # This will safely skip any items that are not lists or strings

📌 Embeddings matrix shape: (814, 384)
✅ Similarity matrix computed
Similarity matrix shape: (814, 814)


In [31]:
def recommend_similar_content(query_title, top_k=10, show_scores=True):
    
    # This part is still correct, as title_to_index maps all strings to an index
    qi = title_to_index.get(query_title.lower())
    
    if qi is None:
        # --- FIX 1: ADAPTED FUZZY SEARCH ---
        # We must loop through the inner lists to find a partial match
        matches = []
        query_lower = query_title.lower()
        
        # enumerate(titles) gives (i, title_list)
        for i, title_list in enumerate(titles):
            # We must check if title_list is actually a list
            if isinstance(title_list, list):
                for title in title_list:
                    # Check if the title is a string and contains the query
                    if isinstance(title, str) and query_lower in title.lower():
                        matches.append((i, title_list))
                        break  # Found a match, move to the next anime
            
            # Fallback in case some entries in 'titles' are just strings
            elif isinstance(title_list, str):
                if query_lower in title_list.lower():
                    matches.append((i, title_list))

        if not matches:
            raise ValueError(f"Title '{query_title}' not found (no exact or contains match).")
        qi = matches[0][0]  # pick first match
    
    # This logic remains the same
    sim_scores = similarity_matrix[qi]
    sim_scores[qi] = -1.0 
    top_idx = np.argpartition(-sim_scores, range(top_k))[:top_k]
    top_idx = top_idx[np.argsort(-sim_scores[top_idx])]
    
    results = []
    for idx in top_idx:
        
        # --- FIX 2: GET A CLEAN DISPLAY TITLE ---
        # titles[idx] might be a list. We need to pick one string.
        title_entry = titles[idx]
        display_title = "Unknown Title" # Default fallback
        
        if isinstance(title_entry, list):
            # Get the first valid string from the list
            # next(..., None) safely returns None if no string is found
            display_title = next((t for t in title_entry if isinstance(t, str)), display_title)
        elif isinstance(title_entry, str):
            # If it's already a string, just use it
            display_title = title_entry
            
        results.append({
            "index": int(idx),
            "title": display_title, # This is now guaranteed to be a string
            "similarity": float(sim_scores[idx])
        })
        
    if show_scores:
        return results
    else:
        # This now correctly returns a list of strings
        return [r["title"] for r in results]

In [39]:
from rapidfuzz import process, fuzz
import numpy as np

# --- Setup ---

all_title_strings = list(title_to_index.keys())

def recommend_similar_content_fuzzy(query_title, top_k=10, show_scores=True):
    
    # 1. Try exact match first (this is correct)
    qi = title_to_index.get(query_title.lower())
    
    if qi is None:
        # --- FIX 1: FUZZY SEARCH ---
        # Search against the flat list of all known title strings
        match, score, _ = process.extractOne(
            query_title,
            all_title_strings,  # <-- Use the flat list of strings
            scorer=fuzz.WRatio
        )
        
        # Now, use the matched string (which is a key) to get the index
        qi = title_to_index[match]

    # --- This part is correct ---
    sim_scores = similarity_matrix[qi]
    sim_scores[qi] = -1.0  # exclude itself
    
    # Top-k most similar indices
    top_idx = np.argpartition(-sim_scores, range(top_k))[:top_k]
    top_idx = top_idx[np.argsort(-sim_scores[top_idx])]
    
    results = []
    for idx in top_idx:
        
        # --- FIX 2: GET A CLEAN DISPLAY TITLE ---
        # (Same fix as your previous function)
        title_entry = titles[idx]
        display_title = "Unknown Title" # Default fallback
        
        if isinstance(title_entry, list):
            # Get the first valid string from the list
            display_title = next((t for t in title_entry if isinstance(t, str)), display_title)
        elif isinstance(title_entry, str):
            # If it's already a string, just use it
            display_title = title_entry
            
        results.append({
            "index": int(idx),
            "title": display_title, # This is now guaranteed to be a string
            "similarity": float(sim_scores[idx])
        })
        
    if show_scores:
        return results
    else:
        # This now correctly returns a list of strings
        return [r["title"] for r in results]

In [41]:
 recommend_similar_content_fuzzy("attack on", top_k=8)

[{'index': 8, 'title': 'SnK 2', 'similarity': 0.6390928626060486},
 {'index': 354, 'title': 'SnK', 'similarity': 0.5166860222816467},
 {'index': 723, 'title': 'テクノライズ', 'similarity': 0.5073861479759216},
 {'index': 231, 'title': 'Kiznaiver', 'similarity': 0.49683260917663574},
 {'index': 15, 'title': 'SnK 3', 'similarity': 0.49437129497528076},
 {'index': 44,
  'title': "Kiseiju - L'ospite indesiderato",
  'similarity': 0.4911375939846039},
 {'index': 239, 'title': 'แบล็ค บุลเลท ', 'similarity': 0.4804025888442993},
 {'index': 226, 'title': 'DBS', 'similarity': 0.48010456562042236}]

In [43]:
# Example usage :
print(recommend_similar_content("tokyo revengers", top_k=8))


[{'index': 22, 'title': 'Bokumachi', 'similarity': 0.65992671251297}, {'index': 168, 'title': 'リライフ', 'similarity': 0.6587320566177368}, {'index': 267, 'title': 'Higehiro', 'similarity': 0.6297817826271057}, {'index': 227, 'title': 'Summertime Render', 'similarity': 0.6267410516738892}, {'index': 696, 'title': 'Descending Stories: Showa Genroku Rakugo Shinju', 'similarity': 0.6240444183349609}, {'index': 98, 'title': 'Chu-2 Byo demo Koi ga Shitai!', 'similarity': 0.6182321310043335}, {'index': 444, 'title': '精灵幻想记', 'similarity': 0.6172804236412048}, {'index': 558, 'title': '白兔糖', 'similarity': 0.6167734861373901}]


In [51]:
# Example: blending collaborative + content

def hybrid_recommend(user_id, query_title, top_k=10, alpha=0.6):
    # 1️⃣ Fuzzy match title to index
    results = recommend_similar_content(query_title, top_k=top_k, show_scores=True)
    
    # 2️⃣ Extract candidate anime indices
    candidate_indices = [r['index'] for r in results]
    
    # 3️⃣ Compute normalized content scores for candidates
    sim_scores = np.array([r['similarity'] for r in results])
    sim_scores_norm = (sim_scores - sim_scores.min()) / (sim_scores.max() - sim_scores.min() + 1e-8)
    
    # 4️⃣ Get collaborative predictions from SVD
    collab_scores = np.array([svd_model.predict(user_id, anime_final_df.iloc[idx]['anilist_id']).est
                              for idx in candidate_indices])
    
    # 5️⃣ Combine scores
    final_scores = alpha * collab_scores + (1 - alpha) * sim_scores_norm
    
    # 6️⃣ Sort by final score
    top_idx = np.argsort(-final_scores)[:top_k]
    
    recommendations = []
    for idx in top_idx:
        rec_idx = candidate_indices[idx]
        recommendations.append({
            "title": titles[rec_idx],
            "hybrid_score": float(final_scores[idx]),
            "collab_score": float(collab_scores[idx]),
            "content_score": float(sim_scores_norm[idx])
        })
    
    return recommendations



In [52]:
# Example usage:
hybrid_recommend(user_id=164, query_title="death not", top_k=8, alpha=0.7)


[{'title': ['DARKER THAN BLACK -Black Contractor-',
   'Bí Mật Bóng Tối',
   'Brama piekieł',
   'DARKER THAN BLACK: Kuro no Keiyakusha',
   'Darker than Black'],
  'hybrid_score': 5.819657403337024,
  'collab_score': 8.11512229266428,
  'content_score': 0.4635726615734273},
 {'title': ['Bungou Stray Dogs Movie',
   'Bungou Stray Dogs: DEAD APPLE',
   'Bungo Stray Dogs: DEAD APPLE'],
  'hybrid_score': 5.724140029769011,
  'collab_score': 7.790930741093728,
  'content_score': 0.9016283700113377},
 {'title': ['アキラ', 'AKIRA', 'Akira'],
  'hybrid_score': 5.689354027541429,
  'collab_score': 7.699077284347072,
  'content_score': 0.9999997616615998},
 {'title': ['BNHA 4',
   'MHA 4',
   '我的英雄学院 4',
   '我的英雄学院第四季',
   'มายฮีโร่ อคาเดเมีย ภาค 4',
   'أكاديميتي للأبطال',
   'Моя геройская академия 4',
   'Boku no Hero Academia 4',
   'My Hero Academia Season 4'],
  'hybrid_score': 5.6346238533328075,
  'collab_score': 7.790930741093728,
  'content_score': 0.6032411152239945},
 {'title': ['Fate/

In [53]:
import pickle

# Save the trained SVD model
with open("svd_model.pkl", "wb") as f:
    pickle.dump(svd_model, f)

print("✅ SVD model saved to svd_model.pkl")


✅ SVD model saved to svd_model.pkl


In [54]:
# embeddings variable or anime_final_df['content_embedding'].tolist()
np.save("anime_embeddings.npy", embeddings_matrix)
print("✅ Content embeddings saved to anime_embeddings.npy")


✅ Content embeddings saved to anime_embeddings.npy
