In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv(r"C:\Users\hp\OneDrive\anime.csv") 
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [14]:
data_cleaned = df.dropna()

In [15]:
data_cleaned.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler





# Convert 'episodes' to numeric, handling 'Unknown' values
data_cleaned['episodes'] = pd.to_numeric(data_cleaned['episodes'], errors='coerce').fillna(0).astype(int)

# Process 'genre' as multi-label data
data_cleaned['genre'] = data_cleaned['genre'].apply(lambda x: x.split(', '))
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data_cleaned['genre'])

# Encode 'type' as a one-hot vector
type_encoded = pd.get_dummies(data_cleaned['type'])

# Combine features
features = np.concatenate([genre_encoded, type_encoded, 
                           data_cleaned[['episodes', 'rating', 'members']].values], axis=1)

# Normalize the combined features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(features_scaled)

# Recommendation function based on cosine similarity
def recommend_anime(anime_name, data_cleaned, similarity_matrix, top_n=5):
    anime_index = data_cleaned[data_cleaned['name'] == anime_name].index[0]
    similarity_scores = list(enumerate(similarity_matrix[anime_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    recommended_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    
    return df.iloc[recommended_indices][['name', 'genre', 'rating']]

# Example usage
print(recommend_anime('Fullmetal Alchemist: Brotherhood', data_cleaned, similarity_matrix, top_n=3))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['episodes'] = pd.to_numeric(data_cleaned['episodes'], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['genre'] = data_cleaned['genre'].apply(lambda x: x.split(', '))


                    name                                              genre  \
200  Fullmetal Alchemist  Action, Adventure, Comedy, Drama, Fantasy, Mag...   
288           Fairy Tail  Action, Adventure, Comedy, Fantasy, Magic, Sho...   
775       Akame ga Kill!                         Action, Adventure, Fantasy   

     rating  
200    8.33  
288    8.22  
775    7.84  


In [17]:
def recommend_anime(anime_name, data_cleaned, similarity_matrix, threshold=0.5, top_n=10):
    anime_index = data_cleaned[data_cleaned['name'] == anime_name].index[0]
    similarity_scores = list(enumerate(similarity_matrix[anime_index]))
    
    # Filter based on threshold
    filtered_scores = [(i, score) for i, score in similarity_scores if i != anime_index and score >= threshold]
    filtered_scores = sorted(filtered_scores, key=lambda x: x[1], reverse=True)[:top_n]
    
    recommended_indices = [i[0] for i in filtered_scores]
    return df.iloc[recommended_indices][['name', 'genre', 'rating']]


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(test_df, train_df, similarity_matrix, threshold=0.5):
    y_true = []
    y_pred = []
    
    for _, row in test_df.iterrows():
        target_genre = set(row['genre'])
        
        # Get recommendations for this anime in the train set
        recommendations = recommend_anime(row['name'], train_df, similarity_matrix, threshold=threshold)
        
        # Define true labels (1 if genre overlap, 0 otherwise)
        true_labels = [(1 if set(rec_genre).intersection(target_genre) else 0) for rec_genre in recommendations['genre']]
        
        # Predicted labels (1 for all recommendations)
        predicted_labels = [1] * len(true_labels)
        
        y_true.extend(true_labels)
        y_pred.extend(predicted_labels)
    
    # Calculate metrics
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")


# Interview Questions

In [None]:
#1. Can you explain the difference between user-based and item-based collaborative filtering?

# 1. user based : - when two user are having similar preferences other user with same history pattern will be recommended the same type of content

# 2. item based : - when a user has interacted with an item or product in historty will be recommended again in the future



In [None]:
#2. What is collaborative filtering, and how does it work?

# Collaborative filtering is a popular technique used in recommendation systems to suggest items to users based on their past interactions.
# It relies on patterns of user behavior rather than explicit features of the items,
# making it useful when item metadata is unavailable or incomplete.
# there are two types of collaborative filetering 
# 1. user based : - when two user are having similar preferences other user with same history pattern will be recommended the same type of content

# 2. item based : - when a user has interacted with an item or product in historty will be recommended again in the future