# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the anime table from "anime.csv" (path is relative to the working directory).
Anime_data = pd.read_csv("anime.csv")

In [3]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
Anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
# Column dtypes and non-null counts of the raw table.
Anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
# Number of missing values in each column.
Anime_data.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
# Handle missing or malformed values so every numeric feature is usable downstream.

# rating: replace missing values with the column mean.
Anime_data['rating'] = Anime_data['rating'].fillna(Anime_data['rating'].mean())

# episodes: the column is stored as text and uses the placeholder 'Unknown'.
# pd.to_numeric(errors='coerce') turns every non-numeric entry into NaN.
# This deliberately avoids Series.replace('Unknown', None): older pandas versions treat
# value=None as "no value given" and forward-fill from the previous row instead of
# producing a missing value, so the median imputation below would never be applied.
Anime_data['episodes'] = pd.to_numeric(Anime_data['episodes'], errors='coerce').astype(float)
Anime_data['episodes'] = Anime_data['episodes'].fillna(Anime_data['episodes'].median())

# genre: label missing genres explicitly as 'Unknown'.
Anime_data['genre'] = Anime_data['genre'].fillna('Unknown')


In [7]:
# Display the table again after the missing-value handling above.
Anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64.0,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1.0,4.15,211
12290,5543,Under World,Hentai,OVA,1.0,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4.0,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1.0,4.98,175


In [8]:
# Re-check column dtypes and non-null counts after the missing-value handling.
Anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  float64
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 672.5+ KB


In [9]:
# Summary statistics for the numeric columns.
Anime_data.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,12294.0,12294.0,12294.0,12294.0
mean,14058.221653,12.095412,6.473902,18071.34
std,11455.294701,46.244062,1.017096,54820.68
min,1.0,1.0,1.67,5.0
25%,3484.25,1.0,5.9,225.0
50%,10260.5,2.0,6.55,1550.0
75%,24794.5,12.0,7.17,9437.0
max,34527.0,1818.0,10.0,1013917.0


In [10]:
# Turn each genre string ("A, B, C") into a list of genre names. Values that are
# already lists (e.g. when this cell is re-run) are kept unchanged.
Anime_data['genre'] = Anime_data['genre'].map(
    lambda genres: genres if isinstance(genres, list) else genres.split(', ')
)

# One row per (anime, genre) pair; the anime's row label is repeated for each genre.
exploded = Anime_data.explode(column='genre')

# Indicator column per genre, then collapse back to one row per anime by summing
# the indicator rows that share the same row label.
genre_dummies = pd.get_dummies(exploded['genre'])
Anime_features = genre_dummies.groupby(level=0).sum()


In [11]:
# Inspect the genre indicator matrix (one row per anime, one column per genre).
Anime_features

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Scale the numeric columns to the [0, 1] range so they are on the same scale as the
# 0/1 genre and type indicators.
scaler = MinMaxScaler()
numeric_columns = ['episodes', 'rating', 'members']
numerical_features = scaler.fit_transform(Anime_data[numeric_columns])

# Assemble the feature matrix: genre indicators + type indicators + scaled numerics.
# The scaled values come back as a bare array, so the frame built from them is given
# Anime_data's index explicitly; otherwise pd.concat(axis=1) would align it on a fresh
# 0..n-1 index and silently introduce NaN rows whenever Anime_data is not 0..n-1 indexed.
# Rows whose 'type' is missing get all-zero type indicators (get_dummies skips NaN).
final_features = pd.concat(
    [
        Anime_features,
        pd.get_dummies(Anime_data['type']),
        pd.DataFrame(numerical_features, columns=numeric_columns, index=Anime_data.index),
    ],
    axis=1,
)
final_features

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Yuri,Movie,Music,ONA,OVA,Special,TV,episodes,rating,members
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0.000000,0.924370,0.197872
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0.034673,0.911164,0.782770
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.027518,0.909964,0.112689
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.012658,0.900360,0.664325
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.027518,0.899160,0.149186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.000000,0.297719,0.000203
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.000000,0.313325,0.000176
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.001651,0.385354,0.000211
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.000000,0.397359,0.000168


In [13]:
# Pairwise cosine similarity between the feature rows: entry [i, j] compares the
# i-th and j-th rows of final_features.
Anime_cos_sim = cosine_similarity(final_features)

# Lookup from anime title to its row label in Anime_data.
# Series.drop_duplicates() compares the *values* (the row labels, which are already
# unique), so it never removed repeated titles; a repeated title then made
# anime_index[name] return several rows instead of one. De-duplicate on the index
# (the titles) instead, keeping the first occurrence of each title.
anime_index = pd.Series(Anime_data.index, index=Anime_data['name'])
anime_index = anime_index[~anime_index.index.duplicated(keep='first')]
anime_index

name
Kimi no Na wa.                                            0
Fullmetal Alchemist: Brotherhood                          1
Gintama°                                                  2
Steins;Gate                                               3
Gintama&#039;                                             4
                                                      ...  
Toushindai My Lover: Minami tai Mecha-Minami          12289
Under World                                           12290
Violence Gekiga David no Hoshi                        12291
Violence Gekiga Shin David no Hoshi: Inma Densetsu    12292
Yasuji no Pornorama: Yacchimae!!                      12293
Length: 12294, dtype: int64

In [14]:
# Function for recommending anime similar to a given title
def recommend_anime(name, top_n=5, threshold = 0.5):
    """Return the titles most similar to ``name`` according to ``Anime_cos_sim``.

    Parameters
    ----------
    name : str
        Title to look up in ``anime_index``.
    top_n : int, default 5
        Maximum number of recommendations to return.
    threshold : float or None, default 0.5
        Minimum cosine similarity a candidate must reach to be recommended.
        Pass ``None`` to disable the cut-off. Because of this filter the result
        may contain fewer than ``top_n`` titles.

    Returns
    -------
    list of str, or str
        Recommended titles ordered from most to least similar, or the message
        ``"Anime not found."`` when ``name`` is not in ``anime_index``.
    """
    if name not in anime_index:
        return "Anime not found."

    index = anime_index[name]
    # A title that occurs more than once yields a Series of rows; use the first one.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    index = int(index)

    scores = np.asarray(Anime_cos_sim[index], dtype=float)
    # Stable sort on the negated scores: highest similarity first, ties in row order.
    ranked_positions = np.argsort(-scores, kind="stable")

    recommended = []
    for position in ranked_positions:
        if len(recommended) >= top_n:
            break
        # Exclude the query row explicitly. Skipping "the first sorted entry" is wrong
        # when another anime ties with the query at the top of the ranking.
        if position == index:
            continue
        # Scores are in descending order, so nothing after this point can qualify.
        if threshold is not None and scores[position] < threshold:
            break
        recommended.append(Anime_data.iloc[position]['name'])
    return recommended

In [15]:
# Example: top-5 recommendations for a known title.
recommendations = recommend_anime("Naruto", top_n=5)
if isinstance(recommendations, str):
    # recommend_anime returns a message string when the title is unknown; print it
    # as-is instead of enumerating its characters as if they were titles.
    print(recommendations)
else:
    print("Recommended Anime:")
    for i, anime_name in enumerate(recommendations, 1):
        print(f"{i}. {anime_name}")


Recommended Anime:
1. Naruto: Shippuuden
2. Katekyo Hitman Reborn!
3. Dragon Ball Z
4. Bleach
5. Dragon Ball Kai


Since user-based interaction data is not provided, a proxy evaluation is used instead: genre overlap between the query anime and its recommendations.

In [16]:
def _genre_set(genre_value):
    """Return one row's genres as a set, whether stored as a list or as an 'A, B' string."""
    if isinstance(genre_value, str):
        # Split the raw string; calling set() on it would yield single characters.
        return {part.strip() for part in genre_value.split(',') if part.strip()}
    if isinstance(genre_value, (list, tuple, set)):
        return set(genre_value)
    return set()


def evaluate_recommendations_by_genre_overlap(name, top_n=10, threshold=0.5):
    """Measure how many recommendations for ``name`` share at least one genre with it.

    Parameters
    ----------
    name : str
        Title of the query anime (matched against ``Anime_data['name']``).
    top_n : int, default 10
        Number of recommendations requested; also the denominator of the ratio.
    threshold : float, default 0.5
        Passed through to ``recommend_anime``.

    Returns
    -------
    dict or str
        ``{'query_anime', 'overlap_ratio', 'details'}`` where ``details`` is a list of
        ``(recommended_title, shares_a_genre)`` pairs, or a not-found message string
        when ``name`` is not in the dataset.
    """
    not_found_message = f"Anime '{name}' not found in the dataset."

    # Validate the query up front rather than relying on an IndexError later on.
    query_rows = Anime_data[Anime_data['name'] == name]
    if query_rows.empty:
        return not_found_message
    query_genres = _genre_set(query_rows['genre'].iloc[0])

    recommended_names = recommend_anime(name, top_n=top_n, threshold=threshold)
    if isinstance(recommended_names, str):
        # recommend_anime signals an unknown title with a message string.
        return not_found_message

    overlaps = []
    for rec_name in recommended_names:
        rec_rows = Anime_data[Anime_data['name'] == rec_name]
        if rec_rows.empty:
            continue
        shares_genre = bool(query_genres & _genre_set(rec_rows['genre'].iloc[0]))
        overlaps.append((rec_name, shares_genre))

    overlap_count = sum(1 for _, shares_genre in overlaps if shares_genre)
    # The ratio is relative to the number requested; guard against top_n == 0.
    overlap_ratio = overlap_count / top_n if top_n > 0 else 0.0

    return {
        'query_anime': name,
        'overlap_ratio': overlap_ratio,
        'details': overlaps
    }


In [17]:
# Run the genre-overlap check on the top-10 recommendations for 'Naruto'.
evaluate_recommendations_by_genre_overlap('Naruto', top_n=10)

{'query_anime': 'Naruto',
 'overlap_ratio': 1.0,
 'details': [('Naruto: Shippuuden', True),
  ('Katekyo Hitman Reborn!', True),
  ('Dragon Ball Z', True),
  ('Bleach', True),
  ('Dragon Ball Kai', True),
  ('Dragon Ball Super', True),
  ('Medaka Box', True),
  ('Tenjou Tenge', True),
  ('Medaka Box Abnormal', True),
  ('Dragon Ball Kai (2014)', True)]}

In [18]:
## In the result above, all 10 recommended anime share at least one genre with 'Naruto', so the overlap ratio is 1.0.
## Each entry in 'details' is True when that recommendation shares a genre with the query anime, and False otherwise.

In [19]:
def genres_from_string(genre_input):
    """Normalise a genre value into a set of lower-cased, whitespace-stripped names.

    A string is split on commas, a list is used element by element, and any other
    input produces an empty set.
    """
    if isinstance(genre_input, str):
        pieces = genre_input.split(',')
    elif isinstance(genre_input, list):
        pieces = genre_input
    else:
        return set()
    return {piece.strip().lower() for piece in pieces}

def evaluate_with_genre(recommendations, anime_data, k=10):
    """Average genre-overlap precision@k over several query anime.

    Parameters
    ----------
    recommendations : dict
        Maps a query title to its list of recommended titles.
    anime_data : pandas.DataFrame
        Table with 'name' and 'genre' columns used to look up genres.
    k : int, default 10
        Only the first ``k`` recommendations are scored; ``k`` is also the denominator.

    Returns
    -------
    float or int
        Mean over the scored queries of (recommendations sharing a genre) / k,
        or 0 when no query could be scored.
    """

    def _genres_for(title):
        # None when the title is absent from the table, otherwise its genre set.
        rows = anime_data[anime_data['name'] == title]
        return None if rows.empty else genres_from_string(rows.iloc[0]['genre'])

    per_query_precision = []
    for query_title, suggested in recommendations.items():
        wanted = _genres_for(query_title)
        if not wanted:
            # Unknown query title, or a query without any genres: nothing to score.
            continue

        hits = 0
        for candidate in suggested[:k]:
            found = _genres_for(candidate)
            if found is not None and wanted & found:
                hits += 1

        per_query_precision.append(hits / k)

    if not per_query_precision:
        return 0
    return sum(per_query_precision) / len(per_query_precision)


In [20]:
# Top-10 recommendations for a handful of query titles, keyed by the query title.
recommendation_dict = {
    anime_name: recommend_anime(anime_name, top_n=10)
    for anime_name in ['Naruto', 'Noragami Aragoto', 'Code Geass: Hangyaku no Lelouch']
}


In [21]:
# Average, over the query titles in recommendation_dict, of the share of top-10
# recommendations that have at least one genre in common with the query.
avg_precision = evaluate_with_genre(recommendation_dict, Anime_data, k=10)
print(f"Average Precision Genre Overlap: {avg_precision:.2f}")
# A value of 1.00 means every scored recommendation shares at least one genre with its query anime.
# NOTE(review): genre indicators are themselves part of the similarity features, so this is a
# coarse sanity check rather than independent evidence of recommendation quality.

Average Precision Genre Overlap: 1.00


In [22]:
# Print the query anime's genres, then the genres of each of its top-5 recommendations.
print("Query Anime Genres:", Anime_data.loc[anime_index['Bleach'], 'genre'])
print("Recommended Anime Genres:")
for rec in recommend_anime('Bleach', top_n=5):  # rec is the recommended title
    # Genres of the first row whose title matches the recommendation.
    genres = Anime_data.loc[Anime_data['name'] == rec, 'genre'].iloc[0]
    print(f"{rec}: {genres}")

Query Anime Genres: ['Action', 'Comedy', 'Shounen', 'Super Power', 'Supernatural']
Recommended Anime Genres:
Katekyo Hitman Reborn!: ['Action', 'Comedy', 'Shounen', 'Super Power']
Code:Breaker: ['Action', 'Comedy', 'School', 'Shounen', 'Super Power', 'Supernatural']
Yozakura Quartet: Hana no Uta: ['Action', 'Comedy', 'Magic', 'Shounen', 'Super Power', 'Supernatural']
Tokyo ESP: ['Action', 'Comedy', 'Sci-Fi', 'Shounen', 'Super Power', 'Supernatural']
Yozakura Quartet: ['Action', 'Comedy', 'Magic', 'Shounen', 'Super Power', 'Supernatural']
