In [4]:
#Data Processing
import pandas as pd

# Load the dataset
anime_data = pd.read_csv('anime.csv')
anime_data
# Display the first few rows and the structure of the dataset
print(anime_data.head())

# Handle missing values
anime_data.fillna({'genre': 'Unknown', 'average_rating': 0, 'num_users': 0}, inplace=True)




   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [76]:
anime_data.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [80]:
# Print the columns of the DataFrame to check their names
print(anime_data.columns)


Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


In [82]:
# Check the first few rows and the data type of the 'genre' column
print(anime_data['genre'].head())
print(anime_data['genre'].dtype)


0                 Drama, Romance, School, Supernatural
1    Action, Adventure, Drama, Fantasy, Magic, Mili...
2    Action, Comedy, Historical, Parody, Samurai, S...
3                                     Sci-Fi, Thriller
4    Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: object
object


In [84]:
from sklearn.preprocessing import MultiLabelBinarizer

# Check the data type of the 'genre' column
print(anime_data['genre'].dtype)  # This should show you it's a list

# Use MultiLabelBinarizer to convert genres to numerical format
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_data['genre'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

# Combine genre features with the original DataFrame
anime_data = pd.concat([anime_data, genre_df], axis=1)

# Drop the original 'genre' column if you no longer need it
anime_data = anime_data.drop(columns=['genre'])


object


In [6]:
#Feature Extarction
# Assume anime_data is your processed DataFrame
# Select the columns you want to use for the recommendation system
features = anime_data.drop(columns=['anime_id', 'name', 'type', 'episodes'])  # Adjust based on your DataFrame

# Normalize numerical features if not done yet
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_columns = ['rating', 'members']  # Adjust based on your DataFrame

# Normalize only if those columns exist
if all(col in features.columns for col in numerical_columns):
    features[numerical_columns] = scaler.fit_transform(features[numerical_columns])
else:
    print("Some numerical columns are missing:", [col for col in numerical_columns if col not in features.columns])


In [94]:
# Check for NaN values in the features DataFrame
print(features.isnull().sum())


rating     230
members      0
             0
,            0
-            0
A            0
C            0
D            0
E            0
F            0
G            0
H            0
J            0
K            0
L            0
M            0
P            0
R            0
S            0
T            0
U            0
V            0
Y            0
a            0
c            0
d            0
e            0
f            0
g            0
h            0
i            0
j            0
k            0
l            0
m            0
n            0
o            0
p            0
r            0
s            0
t            0
u            0
v            0
w            0
y            0
dtype: int64


In [98]:
# Drop rows with any NaN values
features = features.dropna()


In [102]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(features)
cosine_sim


array([[ 1.        ,  0.76371585,  0.78021971, ..., -0.08353616,
        -0.06920652,  0.0078349 ],
       [ 0.76371585,  1.        ,  0.58457252, ..., -0.08603419,
        -0.0817536 , -0.05697744],
       [ 0.78021971,  0.58457252,  1.        , ...,  0.07192618,
         0.08877003,  0.17658951],
       ...,
       [-0.08353616, -0.08603419,  0.07192618, ...,  1.        ,
         0.99958576,  0.98370625],
       [-0.06920652, -0.0817536 ,  0.08877003, ...,  0.99958576,
         1.        ,  0.98847286],
       [ 0.0078349 , -0.05697744,  0.17658951, ...,  0.98370625,
         0.98847286,  1.        ]])

In [108]:
def recommend_anime(name, num_recommendations=5):
    # Get the index of the anime that matches the title
    idx = anime_data[anime_data['name'] == name].index[0]
    
    # Get the pairwise similarity scores for all anime with the given anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the anime based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the most similar anime
    sim_scores = sim_scores[1:num_recommendations + 1]
    
    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    # Return the top most similar anime
    return anime_data['name'].iloc[anime_indices]

# Example usage
print(recommend_anime('Naruto', 5))

615    Naruto: Shippuuden
582                Bleach
159          Angel Beats!
281          Kill la Kill
445      Mirai Nikki (TV)
Name: name, dtype: object


In [139]:
#Evaluation

from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(ground_truth, num_recommendations=5):
    precision_list = []
    recall_list = []
    f1_list = []

    for name, true_anime in ground_truth.items():
        recommendations = recommend_anime(name, num_recommendations)
        
        # Convert recommendations and true_anime to sets for easier calculations
        recommended_set = set(recommendations)
        true_set = set(true_anime)

        # Calculate true positives, false positives, and false negatives
        true_positive = len(recommended_set.intersection(true_set))
        false_positive = len(recommended_set) - true_positive
        false_negative = len(true_set) - true_positive

        # Calculate precision, recall, and F1 score
        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    # Average scores
    avg_precision = sum(precision_list) / len(precision_list)
    avg_recall = sum(recall_list) / len(recall_list)
    avg_f1 = sum(f1_list) / len(f1_list)

    print(f'Average Precision: {avg_precision}')
    print(f'Average Recall: {avg_recall}')
    print(f'Average F1 Score: {avg_f1}')
    
evaluate_recommendations(ground_truth, num_recommendations=5)


Anime 'Attack on Titan' not found in the dataset.
Average Precision: 0.1
Average Recall: 0.16666666666666666
Average F1 Score: 0.125
