# Data Pre-processing

### Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

### Loading dataset

In [2]:
# Load the dataset
anime_df = pd.read_csv('anime.csv')

### Handling missing values

In [3]:
# Check for missing values
print(anime_df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [4]:
# Handle missing values in 'genre' and 'type' columns by filling with 'Unknown'
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)

# Handle missing values in 'rating' column by filling with the mean rating
mean_rating = anime_df['rating'].mean()
anime_df['rating'].fillna(mean_rating, inplace=True)

In [5]:
print(anime_df.isnull().sum())

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


### Explore the dataset

In [6]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [7]:
# Display information about the dataset
print(anime_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [8]:
# Unique values in categorical columns
print(anime_df['type'].unique())

['Movie' 'TV' 'OVA' 'Special' 'Music' 'ONA' 'Unknown']


In [9]:
anime_df.type.value_counts()

type
TV         3787
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
Unknown      25
Name: count, dtype: int64

In [10]:
# Summary statistics of numerical columns
print(anime_df.describe())

           anime_id        rating       members
count  12294.000000  12294.000000  1.229400e+04
mean   14058.221653      6.473902  1.807134e+04
std    11455.294701      1.017096  5.482068e+04
min        1.000000      1.670000  5.000000e+00
25%     3484.250000      5.900000  2.250000e+02
50%    10260.500000      6.550000  1.550000e+03
75%    24794.500000      7.170000  9.437000e+03
max    34527.000000     10.000000  1.013917e+06


# Feature Extraction:

In [11]:
# Convert genres into numerical representations
anime_df['genre'] = anime_df['genre'].astype('category')
anime_df['genre_code'] = anime_df['genre'].cat.codes

# Normalize ratings
anime_df['rating_normalized'] = (anime_df['rating'] - anime_df['rating'].min()) / (anime_df['rating'].max() - anime_df['rating'].min())

# Features to be used for similarity computation
features = ['genre_code', 'rating_normalized']

In [12]:
features

['genre_code', 'rating_normalized']

# Recommendation System:

## Content-based Recommendation System

In [13]:
def content_based_recommendation(anime_name, anime_df):
    # Ensure 'genre' column is not null
    #anime_df['genre'] = anime_df['genre'].fillna('Unknown')

    # Convert 'genre' column to string if it's not already
    anime_df['genre'] = anime_df['genre'].astype(str)

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(anime_df['genre'])

    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    idx = anime_df[anime_df['name'] == anime_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    anime_indices = [i[0] for i in sim_scores]
    anime_recommendations = anime_df['name'].iloc[anime_indices].tolist()
    similarity_scores = [i[1] for i in sim_scores]

    # Create a DataFrame to display the recommendations in tabular form
    recommendations_df = pd.DataFrame({
        'Anime Name': anime_recommendations,
        'Similarity Score': similarity_scores
    })

    return recommendations_df

In [14]:
# Fullmetal Alchemist: Brotherhood     Kimi no Na wa.    Magi: The Kingdom of Magic
recommendations = content_based_recommendation("Fullmetal Alchemist: Brotherhood", anime_df)
print(recommendations)

                                      Anime Name  Similarity Score
0                            Fullmetal Alchemist          0.973258
1  Fullmetal Alchemist: The Sacred Star of Milos          0.973258
2      Fullmetal Alchemist: Brotherhood Specials          0.955605
3            Tales of Vesperia: The First Strike          0.874084
4                                 Tide-Line Blue          0.834075
5               Fullmetal Alchemist: Reflections          0.812005
6                     Magi: The Kingdom of Magic          0.784349
7                   Magi: The Labyrinth of Magic          0.784349
8                    Magi: Sinbad no Bouken (TV)          0.784349
9                         Magi: Sinbad no Bouken          0.784349


# Evaluation for Content-based Recommendation System
## Average Similarity Score and MAP

In [15]:
def evaluate1_content_based_model(anime_name, anime_df):
    # Get recommendations
    recommendations_df = content_based_recommendation(anime_name, anime_df)

    # Calculate average similarity score
    avg_similarity_score = recommendations_df['Similarity Score'].mean()

    # Calculate mean average precision (MAP)
    relevant_anime = anime_df[anime_df['name'] == anime_name]
    relevant_genre = relevant_anime['genre'].iloc[0]
    relevant_anime_set = set(relevant_genre.split(','))
    
    total_precision = 0.0
    relevant_count = 0
    for i, anime in enumerate(recommendations_df['Anime Name']):
        anime_genre = anime_df[anime_df['name'] == anime]['genre'].iloc[0]
        anime_genre_set = set(anime_genre.split(','))
        precision = len(relevant_anime_set.intersection(anime_genre_set)) / len(anime_genre_set)
        total_precision += precision
        if len(relevant_anime_set.intersection(anime_genre_set)) > 0:
            relevant_count += 1

    if relevant_count == 0:
        mean_average_precision = 0.0
    else:
        mean_average_precision = total_precision / relevant_count

    return avg_similarity_score, mean_average_precision

In [16]:
# Fullmetal Alchemist: Brotherhood     Kimi no Na wa.    Magi: The Kingdom of Magic
anime_name = "Fullmetal Alchemist: Brotherhood"
avg_similarity_score, mean_average_precision = evaluate1_content_based_model(anime_name, anime_df)
print(f"Average Similarity Score: {avg_similarity_score}")
print(f"Mean Average Precision (MAP): {mean_average_precision}")

Average Similarity Score: 0.8559681531666964
Mean Average Precision (MAP): 0.925


## Precision, recall and f1_score

In [17]:
def evaluate2_content_based_model(anime_name, anime_df, true_positive_threshold=0.5):
    # Get recommendations
    recommendations_df = content_based_recommendation(anime_name, anime_df)

    # True positive if similarity score >= threshold
    true_positives = recommendations_df['Similarity Score'] >= true_positive_threshold
    num_true_positives = true_positives.sum()

    # False positive if similarity score < threshold
    num_false_positives = len(recommendations_df) - num_true_positives

    # True positives in top 10 recommendations
    top_10_true_positives = recommendations_df.head(10)['Similarity Score'] >= true_positive_threshold
    num_top_10_true_positives = top_10_true_positives.sum()

    # Calculate precision
    precision = num_true_positives / (num_true_positives + num_false_positives) if num_true_positives + num_false_positives > 0 else 0

    # Calculate recall
    recall = num_true_positives / len(recommendations_df)

    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    # Calculate precision at 10 (P@10)
    precision_at_10 = num_top_10_true_positives / 10

    return precision, recall, f1_score, precision_at_10

In [18]:
# Fullmetal Alchemist: Brotherhood     Kimi no Na wa.    Magi: The Kingdom of Magic
anime_name = "Kimi no Na wa."
precision, recall, f1_score, precision_at_10 = evaluate2_content_based_model(anime_name, anime_df)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Precision at 10: {precision_at_10}")

Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Precision at 10: 1.0


## Collaborative Filtering Recommendation

In [19]:
def recommend_anime():
    # Defining functions here
    def recommend_similar_anime(target_anime_id, anime_df, features, threshold=0.5, top_n=10):
        target_anime = anime_df[anime_df['anime_id'] == target_anime_id]
        target_features = target_anime[features].values.reshape(1, -1)
        
        anime_df = anime_df[anime_df['anime_id'] != target_anime_id]
        anime_features = anime_df[features].values
        
        similarity_scores = cosine_similarity(target_features, anime_features).flatten()
        similar_anime_indices = similarity_scores.argsort()[::-1]
        
        recommended_anime = []
        for idx in similar_anime_indices:
            if similarity_scores[idx] >= threshold:
                anime_id = anime_df.iloc[idx]['anime_id']
                anime_name = anime_df.iloc[idx]['name']
                recommended_anime.append((anime_id, anime_name, similarity_scores[idx]))
        
        return recommended_anime[:top_n]
    
    # Prompting user to input the anime ID
    target_anime_id = int(input("Enter the Anime ID: "))
    
    # Get recommendations for the input anime ID
    recommendations = recommend_similar_anime(target_anime_id, anime_df, features)
    
    # Convert recommendations to a DataFrame
    recommendations_df = pd.DataFrame(recommendations, columns=['Anime ID', 'Anime Name', 'Similarity Score'])
    
    # Display input anime along with recommendations in table format
    input_anime = anime_df[anime_df['anime_id'] == target_anime_id][['anime_id', 'name']]
    input_anime.columns = ['Anime ID', 'Anime Name']
    
    print("\nInput Anime:")
    print(input_anime.to_string(index=False))
    print("\nRecommended Anime:")
    print(recommendations_df.to_string(index=False))

In [20]:
# Call the function to execute the recommendation process
recommend_anime()       # 290   200   32281    5114    28977    9253

Enter the Anime ID: 5114

Input Anime:
 Anime ID                       Anime Name
     5114 Fullmetal Alchemist: Brotherhood

Recommended Anime:
 Anime ID                                       Anime Name  Similarity Score
     2473 Ginga Ojousama Densetsu Yuna: Kanashimi no Siren               1.0
     3232                Bannou Bunka Neko-Musume Specials               1.0
     1412                                        Lupin III               1.0
     3036                                      Tobe! Isami               1.0
      751                                Bomberman Jetters               1.0
     1194                              Coyote Ragtime Show               1.0
      375                  Bannou Bunka Neko-Musume (1998)               1.0
     4504                       Kinkyuu Hasshin Saver Kids               1.0
    12113   Berserk: Ougon Jidai-hen II - Doldrey Kouryaku               1.0
     8853                             Chouriki Robo Galatt               1.0


# Evaluation

In [21]:
def evaluate_collaborative_filtering_model(target_anime_id, anime_df, features, threshold=0.5, top_n=10):
    # Define function to recommend similar anime
    def recommend_similar_anime(target_anime_id, anime_df, features, threshold=0.5, top_n=10):
        target_anime = anime_df[anime_df['anime_id'] == target_anime_id]
        target_features = target_anime[features].values.reshape(1, -1)
        
        anime_df = anime_df[anime_df['anime_id'] != target_anime_id]
        anime_features = anime_df[features].values
        
        similarity_scores = cosine_similarity(target_features, anime_features).flatten()
        similar_anime_indices = similarity_scores.argsort()[::-1]
        
        recommended_anime = []
        for idx in similar_anime_indices:
            if similarity_scores[idx] >= threshold:
                anime_id = anime_df.iloc[idx]['anime_id']
                anime_name = anime_df.iloc[idx]['name']
                recommended_anime.append((anime_id, anime_name, similarity_scores[idx]))
        
        return recommended_anime[:top_n]
    
    # Get recommendations for the input anime ID
    recommendations = recommend_similar_anime(target_anime_id, anime_df, features, threshold, top_n)
    
    # Convert recommendations to a DataFrame
    recommendations_df = pd.DataFrame(recommendations, columns=['Anime ID', 'Anime Name', 'Similarity Score'])
    
    # True positive if recommended anime is relevant (present in the top_n)
    true_positives = recommendations_df['Anime ID'] == target_anime_id
    num_true_positives = true_positives.sum()

    # Calculate precision
    precision = num_true_positives / top_n

    # Calculate recall
    relevant_anime = anime_df[anime_df['anime_id'] == target_anime_id]
    total_relevant_anime = len(anime_df[anime_df['anime_id'] != target_anime_id])
    recall = num_true_positives / total_relevant_anime

    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1_score

# Example usage
target_anime_id = 17# Example anime ID
precision, recall, f1_score = evaluate_collaborative_filtering_model(target_anime_id, anime_df, features)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

Precision: 0.0
Recall: 0.0
F1 Score: 0


# Conclusion
#### From the above evaluations we can say that Collaborative Filtering Recommendation for this dataset is not good since the data lacks users info.
#### From the above evaluations we can say that Content-based Recommendation System is performing good for this dataset.