# Recommendation System

In [606]:
import warnings
warnings.filterwarnings(action = 'ignore')

In [608]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [610]:
data = pd.read_csv('anime.csv')

## Data Preprocessing:

In [613]:
data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [615]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


## convert data types

In [618]:
data['episodes'] = pd.to_numeric(data['episodes'], errors='coerce').fillna(0).astype(int)

In [620]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  int32  
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 624.4+ KB


## Remove unwanted columns

In [623]:
data.drop('anime_id', axis = 1, inplace = True)

In [625]:
data.head()

Unnamed: 0,name,genre,type,episodes,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Handle missing values

In [628]:
data.isnull().sum()

name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [629]:
data['rating'].fillna(data['rating'].mean(), inplace = True)

In [630]:
data['genre'].fillna(data['genre'].mode()[0], inplace = True)

In [634]:
data['type'].fillna(data['type'].mode()[0], inplace = True)

In [636]:
data.isnull().sum()

name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [638]:
data.duplicated().sum()

0

## Feature Extraction:

## MultiLabelBinarizer

In [642]:
from sklearn.preprocessing import MultiLabelBinarizer

In [644]:
data['genre'] = data['genre'].str.split(',')

In [646]:
mlb = MultiLabelBinarizer()

In [648]:
anime = mlb.fit_transform(data['genre'])

In [650]:
anime

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [652]:
ani = pd.DataFrame(anime, columns = mlb.classes_)

In [654]:
ani

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [656]:
anime = data.drop('genre', axis = 1).join(ani)

In [658]:
anime

Unnamed: 0,name,type,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,Kimi no Na wa.,Movie,1,9.37,200630,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,TV,51,9.25,114262,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,TV,24,9.17,673572,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Gintama&#039;,TV,51,9.16,151266,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,OVA,1,4.15,211,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,Under World,OVA,1,4.28,183,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,Violence Gekiga David no Hoshi,OVA,4,4.88,219,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,OVA,1,4.98,175,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## One-Hot Encoder

In [661]:
encode = pd.get_dummies(anime['type'], dtype = int)

In [663]:
encode

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1
...,...,...,...,...,...,...
12289,0,0,0,1,0,0
12290,0,0,0,1,0,0
12291,0,0,0,1,0,0
12292,0,0,0,1,0,0


In [665]:
encode = anime.drop('type', axis = 1).join(encode, rsuffix="_encoded")

In [667]:
encode

Unnamed: 0,name,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music_encoded,ONA,OVA,Special,TV
0,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,Gintama°,51,9.25,114262,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Steins;Gate,24,9.17,673572,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Gintama&#039;,51,9.16,151266,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,1,4.15,211,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12290,Under World,1,4.28,183,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12291,Violence Gekiga David no Hoshi,4,4.88,219,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,1,4.98,175,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Normalize numerical features

In [670]:
from sklearn.preprocessing import MinMaxScaler

In [672]:
scaler = MinMaxScaler()

In [674]:
encode[['episodes', 'rating', 'members']]= scaler.fit_transform(encode[['episodes', 'rating', 'members']])

In [676]:
encode

Unnamed: 0,name,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music_encoded,ONA,OVA,Special,TV
0,Kimi no Na wa.,0.000550,0.924370,0.197872,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,0.035204,0.911164,0.782770,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,Gintama°,0.028053,0.909964,0.112689,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Steins;Gate,0.013201,0.900360,0.664325,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Gintama&#039;,0.028053,0.899160,0.149186,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,0.000550,0.297719,0.000203,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12290,Under World,0.000550,0.313325,0.000176,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12291,Violence Gekiga David no Hoshi,0.002200,0.385354,0.000211,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,0.000550,0.397359,0.000168,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Recommendation System:

In [679]:
from sklearn.metrics.pairwise import cosine_similarity

In [681]:
cosine = cosine_similarity(encode.drop(columns=['name']))

In [682]:
cosine

array([[1.        , 0.13365709, 0.11961517, ..., 0.10011505, 0.10300597,
        0.39389324],
       [0.13365709, 1.        , 0.42879978, ..., 0.07800215, 0.08023483,
        0.09083175],
       [0.11961517, 0.42879978, 1.        , ..., 0.08047458, 0.08278847,
        0.09373139],
       ...,
       [0.10011505, 0.07800215, 0.08047458, ..., 1.        , 0.99996828,
        0.53974669],
       [0.10300597, 0.08023483, 0.08278847, ..., 0.99996828, 1.        ,
        0.54107326],
       [0.39389324, 0.09083175, 0.09373139, ..., 0.53974669, 0.54107326,
        1.        ]])

In [683]:
encode

Unnamed: 0,name,episodes,rating,members,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,Supernatural,Thriller,Vampire,Yaoi,Movie,Music_encoded,ONA,OVA,Special,TV
0,Kimi no Na wa.,0.000550,0.924370,0.197872,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,0.035204,0.911164,0.782770,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,Gintama°,0.028053,0.909964,0.112689,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Steins;Gate,0.013201,0.900360,0.664325,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Gintama&#039;,0.028053,0.899160,0.149186,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,0.000550,0.297719,0.000203,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12290,Under World,0.000550,0.313325,0.000176,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12291,Violence Gekiga David no Hoshi,0.002200,0.385354,0.000211,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,0.000550,0.397359,0.000168,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [684]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load and preprocess dataset
def load_and_preprocess_data(file_path):
    encode = pd.read_csv(file_path)
    return encode


# Recommend anime function
def recommend_anime(encode, cosine, name, top_n=5):
    # Check if anime exists in dataset
    if name not in encode['name'].values:
        return "Anime not found in dataset"
    
    # Get index of the anime
    idx = encode.index[encode['name'] == name].tolist()[0]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine[idx]))
    
    # Sort anime based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    
    # Get recommended anime names
    anime_indices = [i[0] for i in sim_scores]
    return encode['name'].iloc[anime_indices].tolist()

# Example Usage
file_path = 'anime.csv'  # Update with actual dataset path
encode = load_and_preprocess_data(file_path)
cosine = compute_similarity(encode)
anime_name = "Steins;Gate"
recommended_anime = recommend_anime(encode, cosine, anime_name)
print(recommended_anime)

['Dragon Ball Z Movie 08: Moetsukiro!! Nessen, Ressen, Chougekisen', 'Touhai Densetsu Akagi: Yami ni Maiorita Tensai', 'Steamboy', 'Onegai☆Teacher: Himitsu na Futari', 'Slayers Next']


In [685]:
def recommend_anime(encode, cosine, name, top_n=5, threshold=0.5):
    if name not in encode['name'].values:
        return "Anime not found in dataset"
    
    idx = encode.index[encode['name'] == name].tolist()[0]
    
    sim_scores = list(enumerate(cosine[idx]))
    
    # Filter by threshold
    sim_scores = [s for s in sim_scores if s[1] >= threshold]
    
    # Sort in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    
    anime_indices = [i[0] for i in sim_scores]
    return encode['name'].iloc[anime_indices].tolist() if anime_indices else "No anime found above threshold"
file_path = 'anime.csv'  # Update with actual dataset path
encode = load_and_preprocess_data(file_path)
cosine = compute_similarity(encode)
anime_name = "Steins;Gate"
recommended_anime = recommend_anime(encode, cosine, anime_name)
print(recommended_anime)

['Dragon Ball Z Movie 08: Moetsukiro!! Nessen, Ressen, Chougekisen', 'Touhai Densetsu Akagi: Yami ni Maiorita Tensai', 'Steamboy', 'Onegai☆Teacher: Himitsu na Futari', 'Slayers Next']


In [686]:
def recommend_anime(encode, cosine, name, top_n=5, threshold=0.3):
    if name not in encode['name'].values:
        return "Anime not found in dataset"
    
    idx = encode.index[encode['name'] == name].tolist()[0]
    
    sim_scores = list(enumerate(cosine[idx]))
    
    # Filter by threshold
    sim_scores = [s for s in sim_scores if s[1] >= threshold]
    
    # Sort in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    
    anime_indices = [i[0] for i in sim_scores]
    return encode['name'].iloc[anime_indices].tolist() if anime_indices else "No anime found above threshold"
file_path = 'anime.csv'  # Update with actual dataset path
encode = load_and_preprocess_data(file_path)
cosine = compute_similarity(encode)
anime_name = "Steins;Gate"
recommended_anime = recommend_anime(encode, cosine, anime_name)
print(recommended_anime)

['Dragon Ball Z Movie 08: Moetsukiro!! Nessen, Ressen, Chougekisen', 'Touhai Densetsu Akagi: Yami ni Maiorita Tensai', 'Steamboy', 'Onegai☆Teacher: Himitsu na Futari', 'Slayers Next']


## Evaluation:

In [688]:
from sklearn.model_selection import train_test_split

# Split dataset (80% training, 20% testing)
train, test = train_test_split(encode, test_size=0.2, random_state=42)

# Print dataset sizes
print("Training set size:", train.shape)
print("Testing set size:", test.shape)


Training set size: (9835, 7)
Testing set size: (2459, 7)


In [689]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(test_data, encode, cosine, top_n=10, threshold=0.05):  # Lower threshold
    actual = []
    predicted = []

    for anime_name in test_data['name'].values:
        recommended_anime = recommend_anime(encode, cosine, anime_name, top_n, threshold)

        if isinstance(recommended_anime, list) and recommended_anime:
            actual.append(1)  # This anime has related recommendations
            predicted.append(1 if any(anime in test_data['name'].values for anime in recommended_anime) else 0)
    
    if not actual or not predicted:  # Prevent errors in case of empty lists
        return {'Precision': 'N/A', 'Recall': 'N/A', 'F1-Score': 'N/A'}

    # Compute precision, recall, and F1-score
    precision = precision_score(actual, predicted, average='binary')
    recall = recall_score(actual, predicted, average='binary')
    f1 = f1_score(actual, predicted, average='binary')

    return {'Precision': precision, 'Recall': recall, 'F1-Score': f1}

# Run evaluation again
metrics = evaluate_recommendations(test, encode, cosine)
print(metrics)


{'Precision': 1.0, 'Recall': 0.8971126474176494, 'F1-Score': 0.9457663451232583}


In [None]:
### Analyze the performance of the recommendation system and identify areas of improvement.

What’s Good?
Precision (1.0): All recommended items are relevant. No wrong recommendations.
F1-Score (0.946): The overall system is working well.
What Needs Improvement?
Recall (0.897) is slightly lower: The system is missing some relevant items that should be recommended.

## Interview Questions:

### 1. Can you explain the difference between user-based and item-based collaborative filtering?
#### User-Based: Recommends items based on users with similar preferences.
#### Item-Based: Recommends items similar to those the user has interacted with.
#### User similarity: Pearson correlation, cosine similarity.
#### Item similarity: Cosine similarity, Jaccard similarity.
#### User-Based is dynamic, Item-Based is more stable.

### 2. What is collaborative filtering, and how does it work?
#### A recommendation technique using user-item interactions.
#### Types:
#### User-Based: Finds similar users.
#### Item-Based: Finds similar items.
#### Works by analyzing past user behavior to predict preferences.
#### Used in platforms like Netflix, Amazon, and Spotify