In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Location of the anime catalogue CSV. The default is the Colab sample-data
# path; change it here (in one place) when running the notebook elsewhere.
DATA_PATH = '/content/sample_data/anime.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the loaded frame (the rendered output is truncated to the first and last rows).
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [None]:
# Column dtypes and non-null counts of the raw data, before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [None]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [None]:
# Columns 'genre', 'type' and 'rating' have missing values - these are imputed in the cells below.
# Column 'episodes' has dtype object but should be numeric, so it is converted to a numeric dtype.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# `df.genre`: the chained in-place form operates on an intermediate Series,
# raises a FutureWarning in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write.
df['genre'] = df['genre'].fillna('Unknown')

In [None]:
# Assign back rather than using chained fillna(inplace=True), which is not
# guaranteed to modify `df` (FutureWarning in pandas 2.x, no-op under
# Copy-on-Write). Bracket access also avoids relying on attribute-style
# column lookup for a column called `type`.
df['type'] = df['type'].fillna('Unknown')

In [None]:
# Impute missing ratings with the column mean. The result is assigned back
# because chained fillna(inplace=True) on `df.rating` is not guaranteed to
# modify `df` (FutureWarning in pandas 2.x, no-op under Copy-on-Write).
df['rating'] = df['rating'].fillna(df['rating'].mean())

In [None]:
# Convert the episode counts to numbers; errors='coerce' turns entries that
# cannot be parsed into NaN so they can be imputed afterwards.
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')

In [None]:
# Impute the episode counts that could not be parsed with the column median.
# The result is assigned back because chained fillna(inplace=True) on
# `df.episodes` is not guaranteed to modify `df` (FutureWarning in pandas 2.x,
# no-op under Copy-on-Write).
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

In [None]:
# Re-check dtypes and non-null counts after imputation and type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  float64
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 672.5+ KB


### Convert genre to numerical column (one hot encoding)

In [None]:
# Turn each comma-separated genre string into a list of genre labels.
# NOTE: not re-runnable on its own - a second run would call .split on lists.
df['genre'] = df['genre'].map(lambda genre_text: genre_text.split(', '))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per genre label, one row per anime (same index as df).
# `data` is evaluated before `columns`, so the binarizer is already fitted
# when `mlb.classes_` is read.
mlb = MultiLabelBinarizer()
genre_df = pd.DataFrame(
    data=mlb.fit_transform(df['genre']),
    index=df.index,
    columns=mlb.classes_,
)


In [None]:
# Preview the one-hot encoded genre matrix.
genre_df

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalize rating and episodes

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler used in the next cell to rescale 'rating' and 'episodes'.
scaler = MinMaxScaler()

In [None]:
# Rescale both numeric columns with the fitted min-max scaler.
# NOTE(review): this overwrites the raw values in `df`, so every later display
# of df['rating'] (e.g. the recommendation tables) shows the scaled value, not
# the original rating. Consider writing the scaled values to a separate frame.
df[['rating','episodes']] = scaler.fit_transform(df[['rating','episodes']])

### Combining features

In [None]:
# Feature matrix: genre indicator columns followed by the two scaled numeric
# columns, joined column-wise on the shared index.
features_df = pd.concat(
    objs=[genre_df, df[['rating', 'episodes']]],
    axis='columns',
)

In [None]:
# Preview the combined feature matrix.
features_df

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri,rating,episodes
0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0.924370,0.000000
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0.911164,0.034673
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.909964,0.027518
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0.900360,0.012658
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.899160,0.027518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.297719,0.000000
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.313325,0.000000
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.385354,0.001651
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.397359,0.000000


## Recommendation System

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(target_anime, features_df, anime_df, top_n=10):
    """Return the ``top_n`` titles most similar to ``target_anime``.

    Similarity is the cosine similarity between the target's row of
    ``features_df`` and every other row.

    Parameters
    ----------
    target_anime : str
        Exact value to look up in ``anime_df['name']``. If several rows match,
        the first one is used.
    features_df : pandas.DataFrame
        Numeric feature matrix. Must be row-aligned by position with
        ``anime_df`` (row ``i`` of both frames describes the same title).
    anime_df : pandas.DataFrame
        Catalogue containing at least the ``name`` and ``rating`` columns.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``rating`` and ``similarity``, most similar first.
        The queried title itself is never included.

    Raises
    ------
    IndexError
        If ``target_anime`` does not occur in ``anime_df['name']`` (same
        exception type as before, now with an explanatory message).
    """
    # Look the title up in the frame that was passed in (not a notebook
    # global) and work with row positions, so the function also behaves
    # correctly when the frames do not carry a default 0..n-1 index.
    is_target = (anime_df['name'] == target_anime).to_numpy()
    if not is_target.any():
        raise IndexError(f"Anime '{target_anime}' not found in anime_df.")
    target_pos = int(is_target.argmax())  # position of the first match

    # A one-row DataFrame keeps the query 2-D, as cosine_similarity expects.
    target_vector = features_df.iloc[[target_pos]]
    similarity_scores = cosine_similarity(target_vector, features_df)[0]

    # Rank every row, then drop the query explicitly: with duplicate feature
    # vectors the query is not guaranteed to be the first element of the
    # ranking, so slicing off rank 0 could remove the wrong row.
    ranked_positions = similarity_scores.argsort()[::-1]
    similar_positions = ranked_positions[ranked_positions != target_pos][:top_n]

    # .copy() so that adding the score column does not write into a slice of
    # the caller's frame (avoids SettingWithCopyWarning).
    similar_anime = anime_df.iloc[similar_positions][['name', 'rating']].copy()
    similar_anime['similarity'] = similarity_scores[similar_positions]
    return similar_anime


In [None]:
# Titles most similar to 'Death Note' over the full catalogue.
recommendation = recommend_anime(
    target_anime='Death Note',
    features_df=features_df,
    anime_df=df,
)
recommendation

Unnamed: 0,name,rating,similarity
778,Death Note Rewrite,0.740696,0.999107
981,Mousou Dairinin,0.728691,0.919262
144,Higurashi no Naku Koro ni Kai,0.809124,0.908189
334,Higurashi no Naku Koro ni,0.780312,0.823041
1383,Higurashi no Naku Koro ni Rei,0.707083,0.820085
833,Jigoku Shoujo Mitsuganae,0.737095,0.805163
2691,Yakushiji Ryouko no Kaiki Jikenbo,0.662665,0.803063
6323,Saint Luminous Jogakuin,0.540216,0.796973
10785,"Yakushiji Ryouko no Kaiki Jikenbo: Hamachou, V...",0.516206,0.79534
445,Mirai Nikki (TV),0.768307,0.757638


In [None]:
# Titles most similar to 'Under World' over the full catalogue.
recommendation = recommend_anime(
    target_anime='Under World',
    features_df=features_df,
    anime_df=df,
)
recommendation

Unnamed: 0,name,rating,similarity
12287,Tenshi no Habataki Jun,0.319328,0.999985
12288,The Satisfaction,0.32413,0.999952
12260,Hokenshitsu de Aimashou,0.32533,0.999941
12289,Toushindai My Lover: Minami tai Mecha-Minami,0.297719,0.999898
12284,Super Erotic Anime,0.333733,0.999829
12267,Lovely Series,0.292917,0.999825
12269,Milky Gal: Cats Ai,0.289316,0.999758
12245,Prima Donna Mai,0.343337,0.999633
12281,Sakura no Mori,0.343337,0.999633
12276,Original C-V-P Momoko,0.279712,0.999523


### Evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the catalogue for evaluation; the fixed seed keeps the
# split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Select the matching feature rows for each split by index label, so each
# feature frame has the same row order as its catalogue frame.
train_features_df, test_features_df = (
    features_df.loc[split.index] for split in (train_df, test_df)
)


In [None]:
def get_recommendations(anime_name, train_features_df, top_n=10):
    """Return the names of the ``top_n`` training titles most similar to ``anime_name``.

    NOTE: a later cell of this notebook defines ``get_recommendations`` again;
    once that cell has run, it shadows this definition.

    Parameters
    ----------
    anime_name : str
        Exact title to look up in the notebook-level ``train_df['name']``.
        If several rows match, the first one is used.
    train_features_df : pandas.DataFrame
        Feature matrix of the training titles, indexed with the same labels
        as ``train_df``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Names of the most similar training titles, most similar first,
        indexed by their ``train_df`` labels. The queried title is excluded.

    Raises
    ------
    IndexError
        If ``anime_name`` is not in the training set (same exception type as
        before, now with an explanatory message).
    """
    matches = train_df.index[(train_df['name'] == anime_name).to_numpy()]
    if len(matches) == 0:
        raise IndexError(f"Anime '{anime_name}' not found in the training set.")
    query_label = matches[0]

    # A one-row DataFrame keeps the query 2-D, as cosine_similarity expects.
    query_vector = train_features_df.loc[[query_label]]
    similarity_scores = cosine_similarity(query_vector, train_features_df)[0]

    # Translate ranked row positions into index labels and drop the query by
    # label: with duplicate feature vectors the query is not guaranteed to be
    # ranked first, so slicing off rank 0 could remove the wrong row.
    ranked_labels = train_features_df.index[similarity_scores.argsort()[::-1]]
    similar_labels = ranked_labels[ranked_labels != query_label][:top_n]

    # Label-based lookup does not depend on train_df and train_features_df
    # having their rows in the same order.
    return train_df.loc[similar_labels, 'name']


In [None]:
# Query the training split for titles similar to one known title.
example_anime_name = "Death Note"
recommendations = get_recommendations(
    anime_name=example_anime_name,
    train_features_df=train_features_df,
)

print(recommendations)


778                                     Death Note Rewrite
981                                        Mousou Dairinin
144                          Higurashi no Naku Koro ni Kai
334                              Higurashi no Naku Koro ni
2691                     Yakushiji Ryouko no Kaiki Jikenbo
10785    Yakushiji Ryouko no Kaiki Jikenbo: Hamachou, V...
445                                       Mirai Nikki (TV)
49                                 Boku dake ga Inai Machi
436                               Jigoku Shoujo Futakomori
471                        Kara no Kyoukai 4: Garan no Dou
Name: name, dtype: object


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations(anime_name, train_features_df, top_n=10, verbose=True):
    """Return the names of the ``top_n`` training titles most similar to ``anime_name``.

    Parameters
    ----------
    anime_name : str
        Exact title to look up in the notebook-level ``train_df['name']``.
        If several rows match, the first one is used.
    train_features_df : pandas.DataFrame
        Feature matrix of the training titles, indexed with the same labels
        as ``train_df``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    verbose : bool, default True
        Print the diagnostic messages (lookup result, raw scores, selected
        rows). Pass ``False`` when calling in a loop to avoid flooding the
        cell output.

    Returns
    -------
    pandas.Series
        Names of the most similar training titles, most similar first,
        indexed by their ``train_df`` labels. The queried title is excluded.
        An empty Series is returned when ``anime_name`` is not in the
        training set, so the return type is the same in both cases.
    """
    # Check if the anime name exists in the training set
    anime_indices = train_df.index[(train_df['name'] == anime_name).to_numpy()]

    if len(anime_indices) == 0:
        if verbose:
            print(f"Anime '{anime_name}' not found in the training set.")
        # Empty Series instead of a list: callers can iterate over it or take
        # len() exactly as they do with a non-empty result.
        return pd.Series([], dtype=object, name='name')

    # Index label of the (first) matching row in the training set
    index = anime_indices[0]
    if verbose:
        print(f"Found anime '{anime_name}' at index {index}")

    # A one-row DataFrame keeps the query 2-D, as cosine_similarity expects.
    similarity_scores = cosine_similarity(train_features_df.loc[[index]], train_features_df)[0]
    if verbose:
        print(f"Cosine similarity scores: {similarity_scores}")

    # Rank all rows, then drop the query by label: with duplicate feature
    # vectors the query is not guaranteed to be ranked first, so slicing off
    # rank 0 could remove the wrong row.
    ranked_positions = similarity_scores.argsort()[::-1]
    ranked_labels = train_features_df.index[ranked_positions]
    not_query = ranked_labels != index
    similar_indices = ranked_positions[not_query][:top_n]
    similar_labels = ranked_labels[not_query][:top_n]
    if verbose:
        print(f"Indices of similar anime: {similar_indices}")

    # Label-based lookup does not depend on train_df and train_features_df
    # having their rows in the same order.
    similar_anime = train_df.loc[similar_labels, 'name']
    if verbose:
        print(f"Similar anime: {similar_anime.tolist()}")

    return similar_anime


In [None]:
# Replace 'Known Anime Name' with an actual anime name from your dataset
# Title to query; it has to be present in the training split to get results.
anime_name = 'Death Note'
recommended_anime = get_recommendations(
    anime_name=anime_name,
    train_features_df=train_features_df,
)

# Show the recommended titles for the query.
print(f"Recommendations for '{anime_name}': {recommended_anime}")


Found anime 'Death Note' at index 40
Cosine similarity scores: [0.35059111 0.11784281 0.18425506 ... 0.08096653 0.12190515 0.25589565]
Indices of similar anime: [8018  486  445 1417  603 8944 2575 6030 6496 2932]
Similar anime: ['Death Note Rewrite', 'Mousou Dairinin', 'Higurashi no Naku Koro ni Kai', 'Higurashi no Naku Koro ni', 'Yakushiji Ryouko no Kaiki Jikenbo', 'Yakushiji Ryouko no Kaiki Jikenbo: Hamachou, Voice &amp; Fiction', 'Mirai Nikki (TV)', 'Boku dake ga Inai Machi', 'Jigoku Shoujo Futakomori', 'Kara no Kyoukai 4: Garan no Dou']
Recommendations for 'Death Note': 778                                     Death Note Rewrite
981                                        Mousou Dairinin
144                          Higurashi no Naku Koro ni Kai
334                              Higurashi no Naku Koro ni
2691                     Yakushiji Ryouko no Kaiki Jikenbo
10785    Yakushiji Ryouko no Kaiki Jikenbo: Hamachou, V...
445                                       Mirai Nikki (TV)
49    

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendation_system(test_df, train_features_df, top_n=10, test_features_df=None):
    """Score the recommender by genre overlap on held-out titles.

    For every row of ``test_df`` the ``top_n`` most similar *training* titles
    are retrieved by cosine similarity between feature vectors. The union of
    their genres is compared with the test title's own genres to obtain a
    per-title precision, recall and F1, which are then averaged over all test
    titles.

    The previous implementation looked each test title up by name in the
    training set. Because the split is disjoint, every lookup failed, every
    entry of ``y_true``/``y_pred`` was 0, and ``zero_division=1`` made the
    reported scores trivial. Querying with the test title's feature vector
    removes that dependency.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Held-out titles; the ``genre`` column must hold an iterable of genre
        labels per row.
    train_features_df : pandas.DataFrame
        Feature matrix of the training titles, indexed with the same labels
        as the notebook-level ``train_df``.
    top_n : int, default 10
        Number of training titles retrieved per test title.
    test_features_df : pandas.DataFrame, optional
        Feature rows for ``test_df``, in the same row order. Defaults to
        ``features_df.loc[test_df.index]`` taken from the notebook-level
        feature matrix.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)`` averaged over the test titles;
        ``(0.0, 0.0, 0.0)`` when ``test_df`` is empty.
    """
    if test_features_df is None:
        test_features_df = features_df.loc[test_df.index]

    # Convert once, outside the loop: training matrix plus the genre list of
    # each training row, in the same order as the matrix rows.
    train_matrix = train_features_df.to_numpy(dtype=float)
    train_genres = train_df.loc[train_features_df.index, 'genre'].tolist()
    test_matrix = test_features_df.to_numpy(dtype=float)

    precisions, recalls, f1_scores = [], [], []

    # Iterate rows positionally; this also avoids the per-title name lookup
    # (quadratic overall, and ambiguous for duplicate names).
    for query_vector, genres in zip(test_matrix, test_df['genre']):
        true_genres = set(genres)

        # Rank training titles by similarity to this test title.
        scores = cosine_similarity(query_vector.reshape(1, -1), train_matrix)[0]
        top_positions = scores.argsort()[::-1][:top_n]

        # Genres covered by the retrieved titles.
        recommended_genres = set()
        for position in top_positions:
            recommended_genres.update(train_genres[position])

        true_positive = len(true_genres & recommended_genres)
        precision = true_positive / len(recommended_genres) if recommended_genres else 0.0
        recall = true_positive / len(true_genres) if true_genres else 0.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    if not precisions:
        return 0.0, 0.0, 0.0

    n_titles = len(precisions)
    return sum(precisions) / n_titles, sum(recalls) / n_titles, sum(f1_scores) / n_titles

# Run the evaluation on the held-out split and report the three scores.
precision, recall, f1 = evaluate_recommendation_system(
    test_df=test_df,
    train_features_df=train_features_df,
)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}')


Anime 'Suzy&#039;s Zoo: Daisuki! Witzy - Happy Birthday' not found in the training set.
Anime 'Tactics' not found in the training set.
Anime 'Kamen no Maid Guy' not found in the training set.
Anime 'Take Your Way' not found in the training set.
Anime 'Rinkaku' not found in the training set.
Anime 'Suisei no Gargantia: Meguru Kouro, Haruka' not found in the training set.
Anime 'Digital Devil Story: Megami Tensei' not found in the training set.
Anime 'Chuunibyou demo Koi ga Shitai! Ren Lite' not found in the training set.
Anime 'Kobo-chan' not found in the training set.
Anime 'Sekaiichi Hatsukoi Movie: Yokozawa Takafumi no Baai' not found in the training set.
Anime 'Mardock Scramble: The Second Combustion' not found in the training set.
Anime 'Yukiguni no Oujisama' not found in the training set.
Anime 'Mugen Senshi Valis' not found in the training set.
Anime 'Cyborg 009: Call of Justice 1' not found in the training set.
Anime 'Demi-chan wa Kataritai' not found in the training set.
Anime 

## Interview questions

1. Can you explain the difference between user-based and item-based collaborative filtering?

Ans.

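a. User-based collaborative filtering: This method finds users who are similar to the target user based on their past behavior (like ratings or purchases), and then recommends items that those similar users liked but the target user has not interacted with yet.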

b. Item-based collaborative filtering: This method looks at the similarity between items rather than users. It finds items that are similar to the ones a user has liked or interacted with, and recommends those similar items to that user.




2. What is collaborative filtering, and how does it work?

Ans. Collaborative filtering is a technique used in recommendation systems to suggest items (like movies, products, or music) to users based on the preferences or behavior of other users.

Working:

1. Collect User Data: The system looks at data like user ratings, purchases, or clicks.
2. Find Similarities: The system tries to find patterns in this data. It checks how users’ preferences are similar (in user-based filtering) or how items are related (in item-based filtering).
3. Make Predictions: Based on these similarities, the system predicts what the user might like.
