<a href="https://colab.research.google.com/github/Pravallika-02-datascience/Recommendation-System-Project/blob/main/Recommendation_system_Project_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ---------------------------------------------------------------
# Anime Recommendation System • Cosine‑Similarity Approach
# ---------------------------------------------------------------
# One‑time installs (uncomment if needed):
# !pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# -------------------------------------------------
# 1. LOAD & CLEAN DATA
# -------------------------------------------------
df = pd.read_csv("anime.csv")        # path inside the unzipped folder

# Convert episode counts to numbers.  errors="coerce" maps every
# non-numeric entry (including the literal 'Unknown') to NaN, so no
# separate replace step is needed.
df["episodes"] = pd.to_numeric(df["episodes"], errors="coerce")

# Drop rows lacking any column that feeds the feature matrix.  'members'
# is included because it is one of the scaled numeric features further
# down: StandardScaler passes NaN through, and cosine_similarity rejects
# input containing NaN.
# reset_index gives a clean 0..n-1 index, so index labels double as row
# positions in the feature and similarity matrices.
df = df.dropna(
    subset=["genre", "type", "rating", "episodes", "members"]
).reset_index(drop=True)

print("Cleaned dataset size:", df.shape)

# -------------------------------------------------
# 2. FEATURE EXTRACTION
# -------------------------------------------------
# (a)  CountVectorizer turns comma-separated genres into binary bag-of-words.
#      token_pattern=None states explicitly that the custom tokenizer is the
#      only tokenisation rule; leaving the default pattern in place makes
#      scikit-learn warn that it "will not be used".
genre_vect = CountVectorizer(
    tokenizer=lambda x: x.split(", "),
    token_pattern=None,
    binary=True,
)

# (b)  One-hot encode the broadcast 'type' (TV, OVA, ...)
# NOTE(review): drop="first" encodes the first category as all zeros, so two
# titles sharing that type gain nothing from the match in the cosine score,
# unlike every other type.  Confirm this asymmetry is intended.
type_enc   = OneHotEncoder(drop="first")

# (c)  Numeric attributes  (episodes, rating, members) - z-score scaled
num_cols   = ["episodes", "rating", "members"]
num_scale  = StandardScaler()

# ColumnTransformer glues everything together.  'genre' is passed as a bare
# string so the vectorizer receives a 1-D column of text; the other
# transformers receive 2-D column selections.
preprocess = ColumnTransformer(
    transformers=[
        ("genre", genre_vect, "genre"),
        ("type",  type_enc,   ["type"]),
        ("num",   num_scale,  num_cols)
    ],
    remainder="drop"
)

# Fit-transform to get the final feature matrix (one row per anime in df)
feature_matrix = preprocess.fit_transform(df)

print("Feature matrix shape:", feature_matrix.shape)

# -------------------------------------------------
# 3. COSINE-SIMILARITY MATRIX
# -------------------------------------------------
# Pairwise cosine similarity: sim_matrix[i, j] compares anime i with anime j
# (the diagonal is each title against itself).  The result is n x n, so its
# memory footprint grows quadratically with the number of titles.
sim_matrix = cosine_similarity(feature_matrix)

# -------------------------------------------------
# 4. RECOMMENDATION FUNCTION
# -------------------------------------------------
def recommend(anime_title, top_n=10, min_sim=0.2):
    """
    Look up `anime_title` and list the titles most similar to it.

    Candidates are ranked by descending cosine similarity.  The queried
    anime itself and every candidate scoring below `min_sim` are left out,
    and at most `top_n` rows survive.  The result is a fresh DataFrame with
    the columns name, genre, rating and similarity.

    Raises ValueError when `anime_title` does not occur in df["name"].
    """
    title_mask = df["name"] == anime_title
    if not title_mask.any():
        raise ValueError("Anime title not found!")

    # First matching row; its index label is also its row in sim_matrix
    query_pos = df.index[title_mask][0]
    scores = sim_matrix[query_pos]

    # Rank every (position, score) pair, best score first
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Discard the query itself and weak matches, then cap the list length
    kept = [
        (pos, score)
        for pos, score in ranked
        if pos != query_pos and score >= min_sim
    ]
    kept = kept[:top_n]

    positions = [pos for pos, _ in kept]
    recs = df.iloc[positions][["name", "genre", "rating"]].copy()
    recs["similarity"] = [score for _, score in kept]
    return recs.reset_index(drop=True)

# Smoke test: preview the recommendations for one well-known title
print("\nSample recommendations for 'Naruto':")
print(recommend("Naruto").head(5))

# -------------------------------------------------
# 5. SIMPLE TRAIN/TEST EVALUATION
# -------------------------------------------------
# Approach: set aside 20 % of the titles as a test set, stratified by
# broadcast type.  A test title counts as a hit when at least one
# training title of the same type shows up among its top-k
# recommendations; Precision/Recall/F1 at k are computed from those hits.

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["type"],
)

def relevant_set(row, pool):
    """List the index labels of `pool` rows whose 'type' equals `row["type"]`."""
    wanted_type = row["type"]
    same_type_mask = pool["type"] == wanted_type
    matching_labels = pool.index[same_type_mask]
    return matching_labels.tolist()

# Number of top-ranked recommendations inspected per test title
k = 10
y_true, y_pred = [], []

# The relevant training titles depend only on the broadcast type, so they
# are computed once per type instead of re-scanning train_df for every row.
relevant_by_type = {}

for idx_test, test_row in test_df.iterrows():
    # test_df keeps df's index labels, and df was re-indexed 0..n-1 after
    # cleaning, so the label is also this title's row in sim_matrix.
    # Looking the row up by name instead would scan the whole frame each
    # time and resolve duplicated titles to their first occurrence.
    anime_type = test_row["type"]
    if anime_type not in relevant_by_type:
        relevant_by_type[anime_type] = set(relevant_set(test_row, train_df))
    actual_relevant = relevant_by_type[anime_type]

    # Rank all titles by descending similarity.  The k best matches other
    # than the title itself always lie within the first k + 1 entries, so
    # only that prefix needs to be filtered.
    ranked = np.argsort(sim_matrix[idx_test])[::-1][:k + 1]
    rec_indices = [i for i in ranked.tolist() if i != idx_test][:k]
    predicted_relevant = set(rec_indices)

    # For Precision/Recall calculation: label 1 if any overlap, else 0
    y_true.append(1)
    y_pred.append(1 if predicted_relevant & actual_relevant else 0)

# NOTE: every y_true label is 1, so false positives cannot occur.  Precision
# is therefore 1.0 whenever at least one test title scores a hit, and recall
# is simply the fraction of test titles with a hit (a hit rate at k).
precision = precision_score(y_true, y_pred)
recall    = recall_score(y_true, y_pred)
f1        = f1_score(y_true, y_pred)

print(f"\nEvaluation @k={k}")
print(f" Precision: {precision:.3f}")
print(f" Recall   : {recall:.3f}")
print(f" F1‑score : {f1:.3f}")

# -------------------------------------------------
# 6. USAGE EXAMPLE
# -------------------------------------------------
# The bare string below is an expression statement, not executable code:
# nothing is imported or called here.  It only illustrates how `recommend`
# might be used from another module.
# NOTE(review): the module name `anime_recommender` assumes this script is
# saved under that filename - confirm before copying the example.
"""
>>> from anime_recommender import recommend
>>> recommend("Fullmetal Alchemist: Brotherhood", top_n=5)
"""

# End of script


Cleaned dataset size: (11830, 7)




Feature matrix shape: (11830, 51)

Sample recommendations for 'Naruto':
                     name                                              genre  \
0              Fairy Tail  Action, Adventure, Comedy, Fantasy, Magic, Sho...   
1                  Bleach  Action, Comedy, Shounen, Super Power, Supernat...   
2  Hunter x Hunter (2011)            Action, Adventure, Shounen, Super Power   
3              D.Gray-man                 Action, Adventure, Comedy, Shounen   
4              Soul Eater  Action, Adventure, Comedy, Fantasy, Shounen, S...   

   rating  similarity  
0    8.22    0.980142  
1    7.95    0.964667  
2    9.13    0.961851  
3    8.20    0.957116  
4    8.08    0.944704  

Evaluation @k=10
 Precision: 1.000
 Recall   : 0.997
 F1‑score : 0.998


'\n>>> from anime_recommender import recommend\n>>> recommend("Fullmetal Alchemist: Brotherhood", top_n=5)\n'