In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv("anime_with_synopsis.csv");
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [3]:
df.shape

(16214, 5)

In [4]:
df.isnull().sum()

MAL_ID      0
Name        0
Score       0
Genres      0
Synopsis    8
dtype: int64

In [5]:
missing_synopsis = df[df['Synopsis'].isnull()]
print("Rows with missing synopsis:")
print(missing_synopsis)

Rows with missing synopsis:
       MAL_ID                                               Name    Score  \
11451   34755  Kuma no Gakkou: Patissier Jackie to Ohisama no...  Unknown   
11469   34794                                Yukai na Animal Bus  Unknown   
13686   38475                                   Yuru Camp△ Movie  Unknown   
15025   40714  Youkai Watch Jam: Youkai Gakuen Y - N to no So...     6.28   
15747   42717                   Kaeru no Pickles: Kimochi no Iro  Unknown   
16056   44848                                        Iii Icecrin  Unknown   
16110   45731                   Argonavis from BanG Dream! Movie  Unknown   
16120   46095                          Vivy: Fluorite Eye's Song  Unknown   

                                           Genres Synopsis  
11451                                Comedy, Kids      NaN  
11469                                Comedy, Kids      NaN  
13686                       Comedy, Slice of Life      NaN  
15025  Comedy, Demons, Kids, Super

In [6]:

df['Synopsis'].fillna('No synopsis available', inplace=True)
missing_synopsis = df[df['Synopsis'].isnull()]
print("Rows with missing synopsis:")
print(missing_synopsis)

Rows with missing synopsis:
Empty DataFrame
Columns: [MAL_ID, Name, Score, Genres, Synopsis]
Index: []


In [7]:
unknown_scores= df[df['Score']== "Unknown"]
print(unknown_scores)

       MAL_ID                                               Name    Score  \
1347     1547                                   Obake no Q-tarou  Unknown   
1439     1656                                    PostPet Momobin  Unknown   
1512     1739                         Shibawanko no Wa no Kokoro  Unknown   
1619     1863                            Silk Road Shounen Yuuto  Unknown   
1808     2073  Hengen Taima Yakou Karura Mau! Sendai Kokeshi ...  Unknown   
...       ...                                                ...      ...   
16209   48481                    Daomu Biji Zhi Qinling Shen Shu  Unknown   
16210   48483                                       Mieruko-chan  Unknown   
16211   48488                    Higurashi no Naku Koro ni Sotsu  Unknown   
16212   48491                        Yama no Susume: Next Summit  Unknown   
16213   48492                                      Scarlet Nexus  Unknown   

                                                  Genres  \
1347         Co

In [8]:

df['Score'].fillna('Unknown', inplace=True)
df['Score'] = pd.to_numeric(df['Score'], errors='coerce')
imputer = SimpleImputer(strategy='median')
df['Score'] = imputer.fit_transform(df[['Score']])
row_1347 = df.iloc[1347]  
print(row_1347)

MAL_ID                                                   1547
Name                                         Obake no Q-tarou
Score                                                    6.57
Genres            Comedy, School, Slice of Life, Supernatural
Synopsis    Q-taro, a monster, is living with the Ohara fa...
Name: 1347, dtype: object


In [9]:
# Step 1: Split genres string into individual genres
genres_split = df['Genres'].str.split(', ')

# Step 2: Flatten the list of genres
all_genres = [genre for genres_list in genres_split.dropna() for genre in genres_list]

# Step 3: Use value_counts to count occurrences of each genre
genre_counts = pd.Series(all_genres).value_counts()

# Display the count of different genres
print("Count of Different Genres:")
print(genre_counts)


Count of Different Genres:
Comedy           5975
Action           3846
Fantasy          3202
Adventure        2942
Kids             2665
Drama            2589
Sci-Fi           2551
Music            2241
Shounen          2003
Slice of Life    1912
Romance          1852
School           1577
Supernatural     1410
Historical       1129
Mecha            1094
Magic            1056
Seinen            830
Ecchi             767
Mystery           721
Sports            708
Shoujo            688
Parody            649
Super Power       627
Military          572
Dementia          510
Space             491
Demons            434
Horror            430
Martial Arts      416
Game              386
Harem             357
Psychological     340
Police            243
Samurai           202
Cars              133
Vampire           133
Thriller          130
Shounen Ai        100
Josei              96
Shoujo Ai          79
Unknown            63
Yaoi               31
dtype: int64


In [10]:
scaler = MinMaxScaler()
numeric_scores_normalized = scaler.fit_transform(df['Score'].values.reshape(-1, 1))

synopsis_vectorizer = TfidfVectorizer(stop_words='english')
synopsis_matrix = synopsis_vectorizer.fit_transform(df['Synopsis'].fillna(''))

# Create a TF-IDF vectorizer for genres
genres_vectorizer = TfidfVectorizer(stop_words='english')
genres_matrix = genres_vectorizer.fit_transform(df['Genres'])

# Combine the TF-IDF matrices for synopses and genres
combined_matrix = hstack([synopsis_matrix, genres_matrix,numeric_scores_normalized])


# Calculate cosine similarity for combined features
combined_similarity = cosine_similarity(combined_matrix)

# Display the cosine similarity matrix for combined features
combined_similarity_df = pd.DataFrame(combined_similarity, index=df['Name'], columns=df['Name'])
print("Cosine Similarity Matrix for Combined Features (Synopsis and Genres):")
print(combined_similarity_df.tail(10))

Cosine Similarity Matrix for Combined Features (Synopsis and Genres):
Name                             Cowboy Bebop  \
Name                                            
SK∞: Crazy Rock Jam                  0.269809   
Kyoukai Senki                        0.295811   
D_Cide Traumerei                     0.394066   
Tsuki to Laika to Nosferatu          0.442421   
Wan Jie Shen Zhu 3rd Season          0.323482   
Daomu Biji Zhi Qinling Shen Shu      0.288584   
Mieruko-chan                         0.266046   
Higurashi no Naku Koro ni Sotsu      0.229828   
Yama no Susume: Next Summit          0.331497   
Scarlet Nexus                        0.327231   

Name                             Cowboy Bebop: Tengoku no Tobira    Trigun  \
Name                                                                         
SK∞: Crazy Rock Jam                                     0.218705  0.262498   
Kyoukai Senki                                           0.288261  0.293865   
D_Cide Traumerei             

In [13]:
def recommend_anime(anime_name, similarity_matrix, df, num_recommendations=5):
    idx = df[df['Name'] == anime_name].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1]
    recommendations = [df['Name'][i[0]] for i in sim_scores]
    return recommendations

# Example usage
anime_name = 'One Punch Man'
recommendations = recommend_anime(anime_name, combined_similarity_df.values, df, num_recommendations=20)
print(f"Recommendations for {anime_name}: {recommendations}")


Recommendations for One Punch Man: ['One Punch Man: Road to Hero', 'One Punch Man 2nd Season', 'One Punch Man Specials', 'One Punch Man 2nd Season Specials', 'One Punch Man 2nd Season Commemorative Special', 'Towa no Quon 6: Towa no Quon', 'Towa no Quon 4: Guren no Shoushin', 'Towa no Quon 5: Souzetsu no Raifuku', 'Towa no Quon 1: Utakata no Kaben', 'Toaru Kagaku no Railgun S', 'Toaru Kagaku no Railgun T', 'Samurai Flamenco', 'Toaru Kagaku no Railgun', 'Tokyo ESP', 'K: Missing Kings', 'Toaru Kagaku no Railgun: Misaka-san wa Ima Chuumoku no Mato Desu kara', 'Towa no Quon 2: Konton no Ranbu', 'Kiddy GiRL-AND', 'Canaan', 'Witchblade']
