In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Load the dataset
file_path = '/mnt/data/anime.csv'  # Update with the correct file path if needed
df = pd.read_csv('/content/anime.csv')

# Display basic information about the dataset
print("Dataset Info:")
df.info()

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Fill missing values
if 'genre' in df.columns:
    df['genre'].fillna('Unknown', inplace=True)
if 'rating' in df.columns:
    df['rating'].fillna(df['rating'].mean(), inplace=True)

# Display the first few rows to understand the structure
print("\nDataset Preview:")
print(df.head())

# Feature Extraction
# Selecting features for similarity computation (e.g., genres, user ratings)
features = ['genre', 'rating']
df = df[features]

# Convert categorical features into numerical representations
if 'genre' in df.columns:
    label_encoder = LabelEncoder()
    df['genre'] = label_encoder.fit_transform(df['genre'].astype(str))

# Normalize numerical features
if 'rating' in df.columns:
    scaler = MinMaxScaler()
    df['rating'] = scaler.fit_transform(df[['rating']])

# Split dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Compute cosine similarity
def compute_similarity(df):
    return cosine_similarity(df)

# Recommend anime based on cosine similarity
def recommend_anime(target_index, df, similarity_matrix, threshold=0.5, top_n=5):
    similarity_scores = similarity_matrix[target_index]
    similar_anime_indices = np.where(similarity_scores > threshold)[0]
    similar_anime_indices = similar_anime_indices[similar_anime_indices != target_index]  # Exclude the target itself
    sorted_indices = similar_anime_indices[np.argsort(-similarity_scores[similar_anime_indices])]  # Sort by similarity score
    return sorted_indices[:top_n]  # Return indices instead of values

# Compute similarity matrix
similarity_matrix = compute_similarity(train_df)

# Evaluate the recommendation system
def evaluate_recommendations(test_df, similarity_matrix, threshold=0.5):
    y_true = []
    y_pred = []

    for i in range(len(test_df)):
        recommended_indices = recommend_anime(i, train_df, similarity_matrix, threshold)

        if i in recommended_indices:
            y_pred.append(1)
        else:
            y_pred.append(0)

        y_true.append(1)  # Assume all test instances are relevant

    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)

    return precision, recall, f1

precision, recall, f1 = evaluate_recommendations(test_df, similarity_matrix)
print("\nEvaluation Metrics:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")


# Areas for Improvement
#Enhancing Feature Selection
  #Incorporate synopsis, user reviews, and number of ratings to provide richer similarity comparisons.
  #Use TF-IDF for text-based features like genres and descriptions.

#Hybrid Recommendation Approach
  #Use collaborative filtering (based on user behavior) along with content-based filtering to improve accuracy.
  #Consider matrix factorization techniques (e.g., SVD) to find hidden patterns in user preferences.
#
#Hyperparameter Tuning
  #Experiment with different cosine similarity thresholds to optimize recommendation diversity and relevance.
  #Tune normalization techniques for features like ratings to prevent bias.

#More Robust Evaluation Metrics
  #Instead of assuming all test instances are relevant, use real user feedback for Mean Average Precision (MAP) and Mean Reciprocal Rank (MRR).
  #Conduct A/B testing with real users to measure actual engagement.


#Interview Questions:
  #1. Can you explain the difference between user-based and item-based collaborative filtering?
  #Collaborative filtering (CF) is a recommendation technique that suggests items based on user behavior and preferences.

#Feature :	 User-Based Collaborative Filtering	/ Item-Based Collaborative Filtering
#Concept:	Finds users similar to the target user and recommends what similar users liked.	/ Finds items similar to what the user has interacted with and recommends those items.
#Similarity Calculation :	Compares users based on their interaction history (e.g., ratings, clicks)./	Compares items based on how users have interacted with them.
#Common Algorithm :	Pearson correlation, Cosine similarity between user vectors./	Cosine similarity, Jaccard similarity between item vectors.
#Scalability :	Less scalable, as new users require comparing with all others. /	More scalable, as item similarity can be precomputed.
#Cold Start Issue : Struggles with new users who have no interaction history. /	Struggles with new items that have no interactions.

#Example:

#User-Based CF: "Users who liked Naruto also liked One Piece."
#Item-Based CF: "Since you liked Naruto, you might like Bleach because many users rated them similarly."

#2. What is collaborative filtering, and how does it work?
   #Collaborative Filtering (CF) is a recommendation system technique that makes predictions based on the preferences of similar users or items.

#How It Works:
 #Data Collection – Collects user-item interaction data (e.g., ratings, views, purchases).
 #Similarity Computation – Uses similarity metrics (e.g., cosine similarity, Pearson correlation) to find similar users or items.
 #Recommendation Generation – Predicts missing ratings based on the behavior of similar users/items.
 #Ranking & Filtering – Sorts recommendations by relevance and presents them to the user.

 #Types of Collaborative Filtering:
#User-Based CF – Finds users with similar tastes and recommends items they liked.
#Item-Based CF – Finds similar items and recommends them based on past interactions.
#Matrix Factorization (e.g., SVD, ALS) – Uses mathematical techniques to reduce dimensionality and uncover hidden patterns.


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB

Missing Values:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Dataset Preview:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['genre'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(), inplace=True)



Evaluation Metrics:
Precision: 1.00
Recall: 0.00
F1-score: 0.00


In [4]:
df=df[features]

In [6]:
df[features]

Unnamed: 0,genre,rating
0,2686,0.924370
1,161,0.911164
2,534,0.909964
3,3240,0.900360
4,534,0.899160
...,...,...
12289,2903,0.297719
12290,2903,0.313325
12291,2903,0.385354
12292,2903,0.397359
