In [8]:
import pandas as pd
df = pd.read_csv("anime.csv")
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [1]:
# Data Preprocessing:
# Load the dataset into a suitable data structure (e.g., pandas DataFrame).
# Handle missing values, if any.
# Explore the dataset to understand its structure and attributes.

import pandas as pd

# Load the dataset
df = pd.read_csv("anime.csv")

# Handle missing values (example: fill with mean for numerical columns)
numerical_cols = df.select_dtypes(include=['number']).columns
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())


# Explore the dataset
print(df.head())  # Display first 5 rows
print(df.info())  # Display data types and non-null counts
print(df.describe())  # Descriptive statistics for numerical columns
print(df.isnull().sum()) # Check the number of missing values in each column

# Example: Explore unique values of a categorical column
# print(df['genre'].unique())




   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

In [12]:
# prompt: Feature Extraction:
# Decide on the features that will be used for computing similarity (e.g., genres, user ratings).
# Convert categorical features into numerical representations if necessary.
# Normalize numerical features if required

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assuming df is already loaded and preprocessed as in the previous code

# Feature Selection and Engineering
features = ['genre', 'rating', 'members']  # Example features

# Convert 'genre' (categorical) to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['genre'], prefix='genre')

# Normalize numerical features (rating and members) using MinMaxScaler
scaler = MinMaxScaler()
df[['rating_scaled', 'members_scaled']] = scaler.fit_transform(df[['rating', 'members']])

# Create a feature matrix
feature_matrix = df[list(df.filter(regex='genre')) + ['rating_scaled', 'members_scaled']]
print(feature_matrix.head())


   genre_Action  genre_Action, Adventure  \
0         False                    False   
1         False                    False   
2         False                    False   
3         False                    False   
4         False                    False   

   genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen  \
0                                              False        
1                                              False        
2                                              False        
3                                              False        
4                                              False        

   genre_Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports  \
0                                              False               
1                                              False               
2                                              False               
3                                              False               
4                           

In [3]:
#  Recommendation System:
# Design a function to recommend anime based on cosine similarity.
# Given a target anime, recommend a list of similar anime based on cosine similarity scores.
# Experiment with different threshold values for similarity scores to adjust the recommendation list size.

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def recommend_anime(target_anime, threshold=0.5):
    # Preprocess the data
    df['genre'] = df['genre'].fillna('')

    # Create a count vectorizer
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['genre'])

    # Calculate cosine similarity
    cosine_sim = cosine_similarity(count_matrix)

    # Get the index of the target anime
    try:
        target_index = df[df['name'] == target_anime].index[0]
    except IndexError:
        print(f"Anime '{target_anime}' not found in the dataset.")
        return []

    # Get similarity scores for the target anime
    similarity_scores = list(enumerate(cosine_sim[target_index]))

    # Sort the anime based on similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Recommend anime above the threshold
    recommendations = []
    for anime_index, score in similarity_scores:
        if score > threshold and anime_index != target_index:
            recommendations.append(
                (df['name'][anime_index], score)
            )
    return recommendations

# Example usage
recommendations = recommend_anime("Naruto", threshold=0.6)  # Adjust the threshold
if recommendations:
    print("Recommended Anime:")
    for anime, score in recommendations:
        print(f"- {anime} (Similarity Score: {score:.2f})")
else:
    print("No recommendations found for the specified anime and threshold.")


Recommended Anime:
- Boruto: Naruto the Movie (Similarity Score: 1.00)
- Naruto: Shippuuden (Similarity Score: 1.00)
- Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi (Similarity Score: 1.00)
- Naruto x UT (Similarity Score: 1.00)
- Naruto: Shippuuden Movie 4 - The Lost Tower (Similarity Score: 1.00)
- Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono (Similarity Score: 1.00)
- Naruto Shippuuden: Sunny Side Battle (Similarity Score: 1.00)
- Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!! (Similarity Score: 1.00)
- Kyutai Panic Adventure! (Similarity Score: 0.93)
- Dragon Ball Z (Similarity Score: 0.88)
- Dragon Ball Kai (2014) (Similarity Score: 0.88)
- Dragon Ball Kai (Similarity Score: 0.88)
- Medaka Box Abnormal (Similarity Score: 0.88)
- Dragon Ball Z Movie 15: Fukkatsu no F (Similarity Score: 0.88)
- Dragon Ball Super (Similarity Score: 0.88)
- Medaka Box (Similarity Score: 0.88)
- Tenjou Tenge (Similarity Score: 0.88)
- Dragon Ball Z: Summer 

In [7]:
#  Evaluation:
# Split the dataset into training and testing sets.
# Evaluate the recommendation system using appropriate metrics such as precision, recall, and F1-score.
# Analyze the performance of the recommendation system and identify areas of improvement.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

import pandas as pd
df = pd.read_csv("anime.csv")

# Feature Engineering (Example:  I'll likely need more sophisticated features)
# For this example, we'll use 'genre' as a simple feature.  In a real-world scenario,
# you would use more relevant features like user ratings, watch history, etc.
df['genre_list'] = df['genre'].str.split(', ')

# Create user-item interaction matrix (replace with your actual interaction data)
# This is a placeholder example;  adapt to your data structure.
#  For example: user_anime_ratings
user_anime_ratings = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 3],
    'anime_id': [1, 2, 1, 3, 2, 3],
    'rating': [5, 4, 3, 5, 2, 4]  # Example ratings
})

# Split data
train_data, test_data = train_test_split(user_anime_ratings, test_size=0.2, random_state=42)

# Placeholder for a recommendation model (REPLACE THIS WITH YOUR ACTUAL MODEL)
def simple_recommendation_model(user_id, train_data):
    # Example: Recommend the top 2 most frequently rated anime
    recommendations = train_data.groupby('anime_id')['rating'].count().sort_values(ascending=False).head(2).index.tolist()
    return recommendations

# Make predictions on the test set
predictions = []
actual = []
for user_id, anime_id, rating in test_data.values:
  recommendations = simple_recommendation_model(user_id, train_data)
  predictions.append(anime_id in recommendations)  # Check if the actual anime was recommended
  actual.append(1)  # Treat as relevant because it was rated


# Calculate evaluation metrics
precision = precision_score(actual, predictions)
recall = recall_score(actual, predictions)
f1 = f1_score(actual, predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# Analysis and Areas of Improvement:
# 1. Feature Engineering:  Explore user demographics, watch history, reviews, and other features.
# 2. Model Selection: Try different recommendation algorithms (collaborative filtering, content-based filtering, hybrid).
# 3. Hyperparameter Tuning: Optimize parameters for your chosen model.
# 4. Data Quality:  Clean and preprocess data, handle missing values appropriately.
# 5. Cold-Start Problem:  Address recommendations for new users or anime with limited data.
# 6. Evaluation Metrics: Consider more comprehensive evaluation metrics like NDCG or MAP.


Precision: 1.0
Recall: 0.5
F1-score: 0.6666666666666666


In [3]:
#Interview Questions:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?

def explain_collaborative_filtering():
    """Explains collaborative filtering and its types."""
    print("Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.")
    print("It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.")
    print("\nThere are two main types:")
    print("\n1. User-based Collaborative Filtering:")
    print("   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).")
    print("   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.")
    print("   - Pros: Relatively simple to implement.")
    print("   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items because many similar users have interacted with them).  Also suffers from the 'cold start' problem for new users with limited interaction history.")

    print("\n2. Item-based Collaborative Filtering:")
    print("   - Finds items similar to items a target user has liked. It identifies items that are frequently rated similarly by users.")
    print("   - Predicts ratings for new items based on how similar items have been rated.  If a user liked item A, and item B is similar to A (based on how other users rated them), the system predicts the user will also like item B.")
    print("   - Pros: More scalable compared to user-based filtering (item similarity is relatively static). Less susceptible to changes in user behavior (user profiles can change, but item similarity is more stable).")
    print("   - Cons: Might not capture diverse user preferences as well as user-based (doesn't consider individual user nuances as much). Also suffers from the 'cold start' problem for new items with limited interaction history.")

def answer_interview_questions():
    explain_collaborative_filtering()
    

answer_interview_questions()


Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.
It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.

There are two main types:

1. User-based Collaborative Filtering:
   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).
   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.
   - Pros: Relatively simple to implement.
   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items 

In [5]:
#  2. What is collaborative filtering, and how does it work?

def explain_collaborative_filtering():
    """Explains collaborative filtering and its types."""
    print("Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.")
    print("It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.")
    print("\nThere are two main types:")
    print("\n1. User-based Collaborative Filtering:")
    print("   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).")
    print("   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.")
    print("   - Pros: Relatively simple to implement.")
    print("   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items because many similar users have interacted with them).  Also suffers from the 'cold start' problem for new users with limited interaction history.")

    print("\n2. Item-based Collaborative Filtering:")
    print("   - Finds items similar to items a target user has liked. It identifies items that are frequently rated similarly by users.")
    print("   - Predicts ratings for new items based on how similar items have been rated.  If a user liked item A, and item B is similar to A (based on how other users rated them), the system predicts the user will also like item B.")
    print("   - Pros: More scalable compared to user-based filtering (item similarity is relatively static). Less susceptible to changes in user behavior (user profiles can change, but item similarity is more stable).")
    print("   - Cons: Might not capture diverse user preferences as well as user-based (doesn't consider individual user nuances as much). Also suffers from the 'cold start' problem for new items with limited interaction history.")


In [7]:
# p2. What is collaborative filtering, and how does it work?

def explain_collaborative_filtering():
    """Explains collaborative filtering and its types."""
    print("Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.")
    print("It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.")
    print("\nThere are two main types:")
    print("\n1. User-based Collaborative Filtering:")
    print("   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).")
    print("   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.")
    print("   - Pros: Relatively simple to implement.")
    print("   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items because many similar users have interacted with them).  Also suffers from the 'cold start' problem for new users with limited interaction history.")

    print("\n2. Item-based Collaborative Filtering:")
    print("   - Finds items similar to items a target user has liked. It identifies items that are frequently rated similarly by users.")
    print("   - Predicts ratings for new items based on how similar items have been rated.  If a user liked item A, and item B is similar to A (based on how other users rated them), the system predicts the user will also like item B.")
    print("   - Pros: More scalable compared to user-based filtering (item similarity is relatively static). Less susceptible to changes in user behavior (user profiles can change, but item similarity is more stable).")
    print("   - Cons: Might not capture diverse user preferences as well as user-based (doesn't consider individual user nuances as much). Also suffers from the 'cold start' problem for new items with limited interaction history.")

def answer_interview_questions():
    explain_collaborative_filtering()
    

answer_interview_questions()


Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.
It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.

There are two main types:

1. User-based Collaborative Filtering:
   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).
   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.
   - Pros: Relatively simple to implement.
   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items 