In [2]:
import pandas as pd

In [4]:
df = pd.read_csv('anime.csv')

In [6]:
print(df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [10]:
print(df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [32]:
df['rating'].fillna(df['rating'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(), inplace=True)


In [16]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [18]:
df.dropna(subset=['anime_id', 'name', 'genre'], inplace=True)

In [20]:
print(df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [22]:
from sklearn.preprocessing import MultiLabelBinarizer
df['genre'] = df['genre'].apply(lambda x: x.split(', ') if pd.notnull(x) else [])
# One-hot encode the genres
mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(mlb.fit_transform(df['genre']), columns=mlb.classes_, index=df.index)

In [24]:
df['normalized_rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

In [26]:
features = pd.concat([genres_encoded, df['normalized_rating']], axis=1)
print(features.head())

   Action  Adventure  Cars  Comedy  Dementia  Demons  Drama  Ecchi  Fantasy  \
0       0          0     0       0         0       0      1      0        0   
1       1          1     0       0         0       0      1      0        1   
2       1          0     0       1         0       0      0      0        0   
3       0          0     0       0         0       0      0      0        0   
4       1          0     0       1         0       0      0      0        0   

   Game  ...  Slice of Life  Space  Sports  Super Power  Supernatural  \
0     0  ...              0      0       0            0             1   
1     0  ...              0      0       0            0             0   
2     0  ...              0      0       0            0             0   
3     0  ...              0      0       0            0             0   
4     0  ...              0      0       0            0             0   

   Thriller  Vampire  Yaoi  Yuri  normalized_rating  
0         0        0     0     0

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between all anime
cosine_sim = cosine_similarity(features)

# Display the similarity matrix (optional)
print(cosine_sim)

[[1.         0.29880771 0.13644987 ... 0.15085865 0.15492584 0.1737458 ]
 [0.29880771 1.         0.36135915 ... 0.11708593 0.12024259 0.13484933]
 [0.13644987 0.36135915 1.         ... 0.116948   0.12010094 0.13469047]
 ...
 [0.15085865 0.11708593 0.116948   ... 1.         0.99994581 0.99824985]
 [0.15492584 0.12024259 0.12010094 ... 0.99994581 1.         0.99881138]
 [0.1737458  0.13484933 0.13469047 ... 0.99824985 0.99881138 1.        ]]


In [30]:
#Build a Recommendation Function

# Create a Series to map anime titles to their indices
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend_anime(name, cosine_sim=cosine_sim, num_recommendations=5):
    # Get the index of the anime that matches the name
    idx = indices[name]
    
    # Get similarity scores for the target anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort anime by similarity scores in descending order, exclude the anime itself
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:num_recommendations + 1]
    
    # Get indices of the recommended anime
    anime_indices = [i[0] for i in sim_scores]
    
    # Return the top recommended anime names
    return df['name'].iloc[anime_indices]

# Test the recommendation function
print(recommend_anime('Naruto'))


615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object


In [34]:
from sklearn.model_selection import train_test_split

# Example: Split the dataset into training and testing sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [36]:
def evaluate_precision_recall(anime_name, top_n=5):
    # Retrieve the genre of the target anime
    target_genre = df[df['name'] == anime_name]['genre'].values[0]
    
    # Get the top N recommended anime for the given anime
    recommended_anime = recommend_anime(anime_name, num_recommendations=top_n)
    
    # Identify the relevant recommendations (anime that match the target genre)
    relevant_recommendations = recommended_anime[recommended_anime.isin(df[df['genre'].apply(lambda x: any(genre in x for genre in target_genre))]['name'])]
    
    # Precision = (relevant recommendations) / (Total recommendations)
    precision = len(relevant_recommendations) / len(recommended_anime)
    
    # Recall = (relevant recommendations) / (Total relevant anime in the dataset)
    total_relevant = df[df['genre'].apply(lambda x: any(genre in x for genre in target_genre))]
    recall = len(relevant_recommendations) / len(total_relevant)
    
    return precision, recall

In [38]:
precision, recall = evaluate_precision_recall('Naruto')
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 1.00
Recall: 0.00


In [None]:
def mean_average_precision(df, recommend_func, top_n=5):
    total_precision = 0
    count = 0
    
    for anime in df['name']:
        # Get the genre of the current anime
        target_genre = df[df['name'] == anime]['genre'].values[0]
    
        recommended_anime = recommend_func(anime, num_recommendations=top_n)
        
        # Check the relevance of recommendations based on genre matching
        relevant_recommendations = recommended_anime[recommended_anime.isin(df[df['genre'].apply(lambda x: any(genre in x for genre in target_genre))]['name'])]
        
        # Calculate precision at each recommendation rank
        precision_at_k = len(relevant_recommendations) / len(recommended_anime)
        total_precision += precision_at_k
        count += 1
    
    # Mean Average Precision (MAP)
    map_score = total_precision / count
    return map_score


map_score = mean_average_precision(df, recommend_anime)
print(f"Mean Average Precision (MAP): {map_score:.2f}")