# ***ASSIGNMENT - 16***

## Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

## **Data Preprocessing:**

In [None]:
# ------------------------------------------------------------
# Load the Dataset
# ------------------------------------------------------------

data = pd.read_csv("anime.csv")

print("\nFirst 5 rows of the dataset:")
print(data.head())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly instead of being wrapped in print().
data.info()

print("\nSummary Statistics:")
print(data.describe(include="all"))

# ------------------------------------------------------------
# Handle Missing Values
# ------------------------------------------------------------

print("\nMissing Values (Before Handling):")
print(data.isnull().sum())

# Convert episodes to numeric (handles 'Unknown'): values that cannot be
# parsed become NaN and are imputed together with the other missing values.
data["episodes"] = pd.to_numeric(data["episodes"], errors="coerce")

# Numerical columns: impute with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data[col]; the in-place call operates on an
# intermediate object and stops updating `data` under pandas copy-on-write.
numerical_cols = ["episodes", "rating", "members"]
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

# Categorical columns: impute with the most frequent value (first mode).
categorical_cols = ["name", "genre", "type"]
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

print("\nMissing Values (After Handling):")
print(data.isnull().sum())


First 5 rows of the dataset:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          GintamaÂ°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (tot

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


## **Feature Extraction:**

In [None]:
# ------------------------------------------------------------
# Convert Categorical Features to Numerical
# ------------------------------------------------------------

def _split_genres(genre_text):
    """Split a comma-separated genre string into one token per genre label."""
    return [label.strip() for label in genre_text.split(",") if label.strip()]


# TF-IDF vectorization for genre.
# The genre column holds comma-separated labels (e.g. "Sci-Fi, Thriller").
# The default word tokenizer would break a label such as "Sci-Fi" into the
# fragments "sci" and "fi", so each whole label is used as a single token.
# English stop-word removal is dropped because tokens are now complete genre
# labels rather than individual words; token_pattern=None marks the default
# pattern as intentionally unused.
tfidf = TfidfVectorizer(tokenizer=_split_genres, token_pattern=None)
genre_tfidf = tfidf.fit_transform(data["genre"])

# ------------------------------------------------------------
# Normalize Numerical Features
# ------------------------------------------------------------

# Min-max scaling maps each numeric column onto [0, 1].
scaler = MinMaxScaler()

numerical_features = data[["rating", "episodes", "members"]]
numerical_scaled = scaler.fit_transform(numerical_features)

# ------------------------------------------------------------
# Combine All Features
# ------------------------------------------------------------

# One row per anime: TF-IDF genre columns followed by the scaled numeric
# columns. The sparse TF-IDF matrix is densified so it can be stacked with
# the dense scaler output.
feature_matrix = np.hstack((
    genre_tfidf.toarray(),
    numerical_scaled))

## **Recommendation System:**

In [None]:

# ------------------------------------------------------------
# Compute Cosine Similarity Matrix
# ------------------------------------------------------------
# Pairwise cosine similarity between every pair of rows in feature_matrix;
# entry [i, j] is the similarity between row i and row j.
# NOTE(review): the result is a dense square matrix with one row and column
# per anime. For the 12294 rows reported by data.info() that is about
# 1.2 GB if the dtype is float64 -- confirm this fits in memory.

cosine_sim = cosine_similarity(feature_matrix)

# ------------------------------------------------------------
# Recommendation Function
# ------------------------------------------------------------

def recommend_anime(anime_title, top_n=5, similarity_threshold=0.3):
    """
    Recommend anime based on cosine similarity.

    Reads the module-level ``data`` DataFrame and the ``cosine_sim`` matrix,
    whose row/column order must match the row order of ``data``.

    Parameters:
    anime_title (str): Target anime name (exact match against data["name"];
        if the name occurs more than once, the first occurrence is used)
    top_n (int): Number of recommendations
    similarity_threshold (float): Minimum similarity score

    Returns:
    DataFrame of recommended anime (columns name, genre, type, rating,
    episodes) ordered from most to least similar, or the string
    "Anime title not found in the dataset." when the title is absent.
    """

    # Work with row positions throughout: cosine_sim is indexed by position,
    # so using index labels would break whenever `data` does not carry a
    # default 0..n-1 index (e.g. after filtering or re-indexing).
    matches = np.flatnonzero(data["name"].to_numpy() == anime_title)
    if matches.size == 0:
        return "Anime title not found in the dataset."

    target_pos = int(matches[0])
    scores = np.asarray(cosine_sim[target_pos])

    # Stable descending sort: equally similar titles keep dataset order.
    ranked = np.argsort(-scores, kind="stable")

    # Filter by threshold and remove the anime itself
    keep = (ranked != target_pos) & (scores[ranked] >= similarity_threshold)
    selected = ranked[keep][:top_n]

    return data.iloc[selected][["name", "genre", "type", "rating", "episodes"]]

# ------------------------------------------------------------
# Experimenting with Different Threshold Values
# ------------------------------------------------------------

# Query the first title in the dataset once per similarity cut-off and
# print the resulting recommendations under a matching heading.
for threshold in (0.4, 0.6):
    print(f"\nRecommendations with threshold = {threshold}:")
    print(recommend_anime(data["name"].iloc[0], similarity_threshold=threshold))


Recommendations with threshold = 0.4:
                                       name  \
5805            Wind: A Breath of Heart OVA   
6394           Wind: A Breath of Heart (TV)   
1111  Aura: Maryuuin Kouga Saigo no Tatakai   
878           Shakugan no Shana II (Second)   
1201         Angel Beats!: Another Epilogue   

                                                  genre     type  rating  \
5805               Drama, Romance, School, Supernatural      OVA    6.35   
6394               Drama, Romance, School, Supernatural       TV    6.14   
1111       Comedy, Drama, Romance, School, Supernatural    Movie    7.67   
878   Action, Drama, Fantasy, Romance, School, Super...       TV    7.79   
1201                        Drama, School, Supernatural  Special    7.63   

      episodes  
5805       3.0  
6394      13.0  
1111       1.0  
878       24.0  
1201       1.0  

Recommendations with threshold = 0.6:
                                       name  \
5805            Wind: A Breath of

In [None]:
"""
Performance Analysis:

Strengths:
- Content-based recommendation using cosine similarity.
- No dependency on user interaction data.
- Handles cold-start problem for new anime.

Limitations:
- No personalization based on user taste.
- Genre similarity dominates recommendations.
- Does not learn from user feedback.

Possible Improvements:
- Incorporate user-rating matrix for collaborative filtering.
- Use hybrid recommendation (content + collaborative).
- Assign weights to different features.
- Apply dimensionality reduction (PCA).
"""

'\nPerformance Analysis:\n\nStrengths:\n- Content-based recommendation using cosine similarity.\n- No dependency on user interaction data.\n- Handles cold-start problem for new anime.\n\nLimitations:\n- No personalization based on user taste.\n- Genre similarity dominates recommendations.\n- Does not learn from user feedback.\n\nPossible Improvements:\n- Incorporate user-rating matrix for collaborative filtering.\n- Use hybrid recommendation (content + collaborative).\n- Assign weights to different features.\n- Apply dimensionality reduction (PCA).\n'

## **Interview Questions:**

### 1. Difference between User-based and Item-based Collaborative Filtering
User-Based Collaborative Filtering:
- Finds users with similar preferences.
- Recommends items liked by similar users.
- Scalability issues with large user bases.

Item-Based Collaborative Filtering:
- Finds similarity between items.
- Recommends items similar to those the user liked.
- More stable and scalable.

### 2. What is Collaborative Filtering and how does it work?
Collaborative Filtering:
- Recommendation approach based on user behavior.
- Uses ratings, clicks, or interactions.
- Assumes similar users have similar preferences.

How It Works:
- Build a user-item interaction matrix.
- Compute similarity between users or items.
- Generate recommendations based on similarity.