In [None]:
import pandas as pd

In [None]:
# Load the dataset
# NOTE(review): this absolute path is specific to one environment, and later
# cells read anime.csv from a different location — confirm which copy is intended.
file_path = '/mnt/data/anime.csv'
anime_df = pd.read_csv(file_path)

In [None]:
# Display dataset info, then the first few rows.
# DataFrame.info() prints its summary and returns None, so it is called as a
# plain statement instead of being packed into a tuple (which would show a
# stray `None` and degrade the frame to its plain-text repr).
anime_df.info()
# Last expression of the cell: rendered as the cell's output.
anime_df.head()

In [None]:
"""
anime_recommender.py
Drop this file in the same environment where you have anime_features.csv
"""

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches
from typing import Union, Optional

In [None]:
# --------- CONFIG: update if your path differs ----------
# Location of the precomputed feature table (CSV). It is used as the default
# `features_path` argument of AnimeRecommender.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
FEATURES_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\11 recommendation system\anime_features.csv"
# -------------------------------------------------------

In [None]:
class AnimeRecommender:
    """Content-based anime recommender using cosine similarity of precomputed features.

    The feature table is read from a CSV that must contain an ``anime_id``
    column and a ``name`` column; every other column is treated as a numeric
    feature.
    """

    def __init__(self, features_path: str = FEATURES_PATH, compute_matrix: bool = True):
        """
        Load the feature table and optionally precompute all pairwise similarities.

        Parameters:
            - features_path: path of the CSV holding anime_id, name and feature columns.
            - compute_matrix: if True, build the full row-by-row cosine similarity
                              matrix up front. It holds one value per pair of rows, so
                              its memory use grows quadratically with the table size;
                              pass False to compute a single row on demand instead.
        """
        # Load precomputed features
        self.df = pd.read_csv(features_path)
        # Identify feature columns (everything except anime_id & name)
        self.id_col = 'anime_id'
        self.name_col = 'name'
        self.feature_cols = [c for c in self.df.columns if c not in {self.id_col, self.name_col}]
        # Feature matrix
        self.features = self.df[self.feature_cols].values.astype(float)
        # Precompute cosine similarity matrix if requested
        self.sim_matrix = None
        if compute_matrix:
            self.sim_matrix = cosine_similarity(self.features)
            # Every row is treated as perfectly similar to itself: force an exact
            # 1.0 on the diagonal regardless of floating-point rounding.
            np.fill_diagonal(self.sim_matrix, 1.0)

    @staticmethod
    def _first_true(mask: pd.Series) -> Optional[int]:
        """Return the 0-based position of the first True entry of a boolean mask, or None."""
        hits = np.flatnonzero(mask.to_numpy())
        return int(hits[0]) if hits.size else None

    def _get_index_for_title(self, title: Union[int, str]) -> Optional[int]:
        """
        Return the row position for a given anime name or anime_id.

        Lookup order:
            1. anime_id, when `title` is an integer or a string of digits;
            2. exact name match (case-insensitive);
            3. literal substring match (case-insensitive);
            4. closest fuzzy match (case-insensitive, difflib cutoff 0.6).

        The result is a 0-based row position, so it can index both the
        dataframe (via .iloc) and the feature / similarity arrays.
        Returns None when nothing matches.
        """
        if isinstance(title, (int, np.integer)) or (isinstance(title, str) and title.isdigit()):
            # try anime_id
            try:
                anime_id = int(title)
            except ValueError:
                # str.isdigit() accepts some characters that int() rejects
                anime_id = None
            if anime_id is not None:
                pos = self._first_true(self.df[self.id_col] == anime_id)
                if pos is not None:
                    return pos

        if not isinstance(title, str):
            return None

        query = title.strip().lower()
        if not query:
            # An empty query would substring-match every row; treat it as "not found".
            return None

        # Missing names become NaN here; the comparisons below treat them as non-matches.
        lowered = self.df[self.name_col].str.lower()

        # exact case-insensitive match
        pos = self._first_true(lowered == query)
        if pos is not None:
            return pos

        # partial substring match; regex=False so characters such as "(" or "+"
        # in a title are matched literally, na=False so missing names never match
        pos = self._first_true(lowered.str.contains(query, regex=False, na=False))
        if pos is not None:
            return pos

        # fuzzy match fallback (case-insensitive, consistent with the steps above)
        choices = lowered.dropna().tolist()
        close = get_close_matches(query, choices, n=1, cutoff=0.6)
        if close:
            return self._first_true(lowered == close[0])
        return None

    def recommend_anime(self,
                        target: Union[str, int],
                        top_n: int = 10,
                        threshold: Optional[float] = None,
                        include_target: bool = False) -> pd.DataFrame:
        """
        Recommend similar anime for a given target (title string or anime_id).
        Parameters:
            - target: anime title (str) or anime_id (int or numeric string)
            - top_n: number of recommendations to return when no threshold is given
                     (ignored when threshold is provided)
            - threshold: if provided, returns all anime with similarity >= threshold,
                         sorted by similarity. If None, returns the top_n
                         highest-similarity anime.
            - include_target: whether to include the target anime itself in results (default False)
        Returns:
            pandas DataFrame with columns: anime_id, name, similarity
            (sorted by descending similarity, index reset).
        Raises:
            ValueError: if the target cannot be resolved to any row.
        """
        pos = self._get_index_for_title(target)
        if pos is None:
            raise ValueError(f"Target '{target}' not found (no close matches). Check spelling or use anime_id.")

        # compute similarity row if matrix isn't precomputed
        if self.sim_matrix is not None:
            sims = self.sim_matrix[pos]
        else:
            sims = cosine_similarity(self.features[pos:pos + 1], self.features).flatten()
            # Same convention as the precomputed matrix: self-similarity is exactly 1.0.
            sims[pos] = 1.0

        results = pd.DataFrame({
            self.id_col: self.df[self.id_col],
            self.name_col: self.df[self.name_col],
            'similarity': sims
        })

        # Optionally drop target (every row sharing the target's anime_id)
        if not include_target:
            target_id = self.df[self.id_col].iloc[pos]
            results = results[results[self.id_col] != target_id]

        ranked = results.sort_values('similarity', ascending=False)

        # Apply threshold or top_n
        if threshold is not None:
            ranked = ranked[ranked['similarity'] >= float(threshold)]
        else:
            ranked = ranked.head(top_n)
        return ranked.reset_index(drop=True)

In [None]:
# ---------------- Example usage ----------------
# Demo: build a recommender from FEATURES_PATH and print recommendations.
if __name__ == "__main__":
    rec = AnimeRecommender(FEATURES_PATH)
    # Example 1: by exact title
    print("Top 8 similar to 'Fullmetal Alchemist: Brotherhood':")
    print(rec.recommend_anime("Fullmetal Alchemist: Brotherhood", top_n=8))

    # Example 2: fuzzy / partial title
    # NOTE(review): this second example appears to be cut off — no call follows this comment.

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

In [None]:
# Load the raw anime dataset from your local path
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\11 recommendation system\Recommendation System\anime.csv"
anime = pd.read_csv(file_path)

In [None]:
# --- Step 1 Cleaning ---
# errors='coerce' already converts non-numeric entries such as "Unknown" to NaN.
# A separate .replace("Unknown", None) is unnecessary and, on older pandas
# releases, forward-fills the previous row's value instead of inserting a
# missing value.
anime['episodes'] = pd.to_numeric(anime['episodes'], errors='coerce')
# Missing categorical values get an explicit "Unknown" label.
anime['genre'] = anime['genre'].fillna("Unknown")
anime['type'] = anime['type'].fillna("Unknown")
# Missing ratings are imputed with the mean of the known ratings.
anime['rating'] = anime['rating'].fillna(anime['rating'].mean())

In [None]:
# --- Step 2 Feature Extraction ---
def split_genres(genres):
    """Split a comma-separated genre string into a list of trimmed genre names.

    Parameters:
        genres: the raw genre cell, e.g. "Action, Comedy".

    Returns:
        A list of non-empty genre names with surrounding whitespace removed.
        An empty list is returned for the placeholder "Unknown", for an empty
        string, and for any non-string value (None, NaN, ...), so the function
        is safe to apply to a column that still contains missing values.
    """
    # NaN is a truthy float, so a plain `not genres` check does not catch it.
    if not isinstance(genres, str) or not genres or genres == "Unknown":
        return []
    return [g.strip() for g in genres.split(',') if g.strip()]

In [None]:
# Parse every genre string into a list of genre names (one list per row).
anime['genre_list'] = anime['genre'].map(split_genres)

In [None]:
# Multi-hot encode genres: one indicator column per distinct genre in genre_list
mlb = MultiLabelBinarizer(sparse_output=False)  # dense array rather than a sparse matrix
genre_matrix = mlb.fit_transform(anime['genre_list'])
# Prefix the learned class names so genre columns are recognisable in the feature table
genre_cols = [f"genre__{g}" for g in mlb.classes_]
# Reuse anime's index so the rows line up when the frames are concatenated
genre_df = pd.DataFrame(genre_matrix, columns=genre_cols, index=anime.index)

In [None]:
# Normalize rating and members
# MinMaxScaler rescales each column independently (to [0, 1] with its default settings)
scaler = MinMaxScaler()
# Work on a copy so the *_norm columns are not added to `anime` itself
num_df = anime[['rating', 'members']].copy()
num_df[['rating_norm', 'members_norm']] = scaler.fit_transform(num_df[['rating', 'members']])

In [None]:
# Combine all features
# Column order: identifiers (anime_id, name), genre indicator columns, then the
# two normalized numeric columns. axis=1 concatenates column-wise on the shared index.
# Note: the cleaned 'episodes' and 'type' columns are not included here.
features_df = pd.concat([anime[['anime_id', 'name']], genre_df, num_df[['rating_norm', 'members_norm']]], axis=1)

In [None]:
# --- Save processed file to your folder ---
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
out_path = r"D:\DATA SCIENCE\ASSIGNMENTS\11 recommendation system\anime_features.csv"
# index=False: do not write the row index as an extra CSV column
features_df.to_csv(out_path, index=False)

In [None]:
# Report completion, the shape of the combined table and where it was written.
# Note: the reported shape counts the anime_id and name columns as well as the features.
print("✅ Feature extraction complete!")
print(f"Shape of feature matrix: {features_df.shape}")
print(f"File saved to: {out_path}")

In [None]:
import pandas as pd

In [None]:
# Load the raw anime dataset
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\11 recommendation system\Recommendation System\anime.csv"
anime_df = pd.read_csv(file_path)

In [None]:
# Convert "episodes" to numeric and handle missing values.
# errors='coerce' already converts non-numeric entries such as "Unknown" to NaN.
# A separate .replace("Unknown", None) is unnecessary and, on older pandas
# releases, forward-fills the previous row's value instead of inserting a
# missing value.
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
# Missing categorical values get an explicit "Unknown" label.
anime_df['genre'] = anime_df['genre'].fillna("Unknown")
anime_df['type'] = anime_df['type'].fillna("Unknown")
# Missing ratings are imputed with the mean of the known ratings.
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())