In [2]:
# Step 1: Data Preprocessing
import pandas as pd
import numpy as np

# Load the dataset
anime_df = pd.read_csv('anime.csv')

# Display the first few rows of the dataset
print(anime_df.head())

# Handle missing values
anime_df.isnull().sum()

# Fill missing values or drop rows/columns with missing values based on the analysis
anime_df.dropna(inplace=True)

# Explore the dataset
print(anime_df.info())
print(anime_df.describe())

# Step 2: Feature Extraction
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# Convert genres to numerical representations
anime_df['genre'] = anime_df['genre'].apply(lambda x: x.split(', ') if pd.notnull(x) else [])

mlb = MultiLabelBinarizer()
genre_df = pd.DataFrame(mlb.fit_transform(anime_df['genre']), columns=mlb.classes_, index=anime_df.index)

# Normalize numerical features (e.g., ratings)
scaler = MinMaxScaler()
anime_df[['rating']] = scaler.fit_transform(anime_df[['rating']])

# Combine genre and numerical features
features_df = pd.concat([anime_df[['rating']], genre_df], axis=1)

# Display the feature dataframe
print(features_df.head())

# Step 3: Recommendation System
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(features_df, features_df)

# Function to recommend anime
def recommend_anime(title, cosine_sim=cosine_sim, df=anime_df, features_df=features_df):
    # Check if the title exists in the DataFrame
    if title not in df['name'].values:
        return "Anime title not found in the dataset."
    
    indices = pd.Series(df.index, index=df['name']).drop_duplicates()
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    
    anime_indices = [i[0] for i in sim_scores]
    
    return df['name'].iloc[anime_indices]

# Example usage
print(recommend_anime('Naruto'))

# Step 4: Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Split the dataset
train, test = train_test_split(anime_df, test_size=0.2, random_state=42)

# Placeholder for evaluation function
def evaluate_recommendations(train, test, k=10):
    y_true = []
    y_pred = []
    
    for title in test['name']:
        if title not in train['name'].values:
            continue
        
        true_genres = set(test[test['name'] == title]['genre'].values[0])
        recommended = recommend_anime(title)
        
        if isinstance(recommended, str):
            continue  # Skip if recommendation returns an error message
        
        pred_genres = set(train[train['name'].isin(recommended)]['genre'].values[0])
        
        y_true.append(true_genres)
        y_pred.append(pred_genres)
    
    # Calculate precision, recall, and f1-score
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    
    return precision, recall, f1

# Evaluate the system
precision, recall, f1 = evaluate_recommendations(train, test)
print(f'Precision: {precision}, Recall: {recall}, F1-Score: {f1}')


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
--- 