## Assignment of Recommedaton Systems :

### Task 1: Data Preprocessing

In [77]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Step 1: Load the Dataset
anime_df = pd.read_csv('anime.csv')
print("Dataset Loaded.")
print(anime_df.head())


Dataset Loaded.
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama째   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [78]:
# Step 2: Handle Missing Values
print("\nMissing Values:")
print(anime_df.isnull().sum())





Missing Values:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [79]:
anime_df.dropna(inplace=True)
print("\nMissing Values After Dropping:")
print(anime_df.isnull().sum())


Missing Values After Dropping:
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [80]:
# Step 3: Explore the Dataset
print("\nDataset Shape:", anime_df.shape)



Dataset Shape: (12017, 7)


In [81]:
print("Column Names:", anime_df.columns)


Column Names: Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


In [82]:
print("\nData Types:\n", anime_df.dtypes)


Data Types:
 anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object


In [83]:
print("\nSummary Statistics:\n", anime_df.describe())



Summary Statistics:
            anime_id        rating       members
count  12017.000000  12017.000000  1.201700e+04
mean   13638.001165      6.478264  1.834888e+04
std    11231.076675      1.023857  5.537250e+04
min        1.000000      1.670000  1.200000e+01
25%     3391.000000      5.890000  2.250000e+02
50%     9959.000000      6.570000  1.552000e+03
75%    23729.000000      7.180000  9.588000e+03
max    34519.000000     10.000000  1.013917e+06


In [84]:
# Unique values for categorical columns
print("\nUnique Values for 'type':", anime_df['type'].unique())



Unique Values for 'type': ['Movie' 'TV' 'OVA' 'Special' 'Music' 'ONA']


In [85]:
print("Unique Values for 'genre':", anime_df['genre'].unique())

Unique Values for 'genre': ['Drama, Romance, School, Supernatural'
 'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen'
 'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen' ...
 'Action, Comedy, Hentai, Romance, Supernatural' 'Hentai, Sports'
 'Hentai, Slice of Life']


In [86]:
# Additional exploration
print("\nGenre Distribution:")
print(anime_df['genre'].value_counts().head(10))



Genre Distribution:
genre
Hentai                   816
Comedy                   521
Music                    297
Kids                     197
Comedy, Slice of Life    174
Dementia                 137
Fantasy, Kids            128
Comedy, Kids             112
Fantasy                  110
Drama, Kids              105
Name: count, dtype: int64


In [87]:
print("\nRating Distribution:")
print(anime_df['rating'].value_counts().head(10))


Rating Distribution:
rating
6.00    141
7.00     98
6.50     90
6.25     84
5.00     76
6.75     72
6.67     68
6.38     67
6.80     67
5.67     66
Name: count, dtype: int64


## Task 2:Feature Extraction:


In [89]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load preprocessed dataset
anime_df = pd.read_csv('anime.csv')



In [90]:
# Handle missing values in 'genre' column
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
print(anime_df)


       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama째   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [91]:
# Convert non-numerical values in 'rating' and 'episodes' to NaN
anime_df['rating'] = pd.to_numeric(anime_df['rating'], errors='coerce')
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')



In [92]:
# Drop rows with NaN values in 'rating' or 'episodes'
anime_df = anime_df.dropna(subset=['rating', 'episodes'])
print(anime_df)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama째   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type  episodes  \
0                   Drama, Romance, School, Supernatural  Movie       1.0   
1    

In [93]:
# Decide on features for similarity computation
features = ['genre', 'rating', 'episodes']



In [94]:
# Convert categorical 'genre' to numerical representation using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
genre_tfidf = vectorizer.fit_transform(anime_df['genre'])



In [95]:
# Convert categorical 'type' to numerical representation using LabelEncoder
le = LabelEncoder()
anime_df['type_encoded'] = le.fit_transform(anime_df['type'])



In [96]:
# Create a new dataframe with the selected features
anime_features = pd.DataFrame.sparse.from_spmatrix(genre_tfidf).add_prefix('genre_')
anime_features['rating'] = anime_df['rating']
anime_features['episodes'] = anime_df['episodes']
anime_features['type_encoded'] = anime_df['type_encoded']


In [97]:
# Separate numerical features
numerical_features = anime_df[['rating', 'episodes']]

print(anime_df)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama째   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type  episodes  \
0                   Drama, Romance, School, Supernatural  Movie       1.0   
1    

In [98]:
# Scale numerical features using StandardScaler
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)



In [99]:
# Update anime_features with scaled numerical features
anime_features[['rating', 'episodes']] = scaled_numerical_features

print("Feature Extraction Complete.")
print(anime_features.head())

Feature Extraction Complete.
    genre_0   genre_1  genre_2  genre_3  genre_4  genre_5  genre_6  genre_7  \
0         0         0        0        0        0        0        0        0   
1  0.294928  0.316591        0        0        0        0        0        0   
2  0.251292         0        0        0        0  0.20138        0        0   
3         0         0        0        0        0        0        0        0   
4  0.251292         0        0        0        0  0.20138        0        0   

    genre_8  genre_9  ...  genre_40  genre_41  genre_42  genre_43  genre_44  \
0  0.438406        0  ...         0  0.545939         0         0         0   
1  0.334639        0  ...         0         0         0         0         0   
2         0        0  ...         0         0         0         0         0   
3         0        0  ...         0         0  0.834067         0         0   
4         0        0  ...         0         0         0         0         0   

   genre_45  genre_46

## Task 3: Recommendation System:

In [101]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings("ignore")


# Load preprocessed dataset
anime_df = pd.read_csv('anime.csv')



In [102]:
# Handle missing values in 'genre' column
anime_df['genre'] = anime_df['genre'].fillna('Unknown')

# Convert non-numerical values in 'rating' and 'episodes' to NaN
anime_df['rating'] = pd.to_numeric(anime_df['rating'], errors='coerce')
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')


In [103]:
# Drop rows with NaN values in 'rating' or 'episodes'
anime_df = anime_df.dropna(subset=['rating', 'episodes'])

# Decide on features for similarity computation
features = ['genre', 'rating', 'episodes']

# Convert categorical 'genre' to numerical representation using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
genre_tfidf = vectorizer.fit_transform(anime_df['genre'])

# Convert categorical 'type' to numerical representation using LabelEncoder
le = LabelEncoder()
anime_df['type_encoded'] = le.fit_transform(anime_df['type'])


In [104]:
# Create a new dataframe with the selected features
anime_features = pd.DataFrame.sparse.from_spmatrix(genre_tfidf).add_prefix('genre_')
anime_features['rating'] = anime_df['rating']
anime_features['episodes'] = anime_df['episodes']
anime_features['type_encoded'] = anime_df['type_encoded']


In [105]:
# Scale numerical features using StandardScaler
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(anime_df[['rating', 'episodes']])
anime_features[['rating', 'episodes']] = scaled_numerical_features


# Check for NaN values and replace with mean
imputer = SimpleImputer(strategy='mean')
anime_features = pd.DataFrame(imputer.fit_transform(anime_features), columns=anime_features.columns)


In [106]:
# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(anime_features)


def recommend_anime(anime_id, num_recommendations=5, threshold=0.5):
    """
    Recommend anime based on cosine similarity.

    Parameters:
    anime_id (int): Target anime ID.
    num_recommendations (int, optional): Number of recommendations. Defaults to 5.
    threshold (float, optional): Minimum similarity score. Defaults to 0.5.

    Returns:
    list: Recommended anime IDs.
    """
    # Get the anime's similarity scores
    similarity_scores = cosine_sim[anime_id]
    
    # Exclude the anime itself
    similarity_scores[anime_id] = 0
    
    # Get the top N similar anime IDs above the threshold
    top_similar_ids = np.argsort(-similarity_scores)[similarity_scores > threshold][:num_recommendations]
    
    # Return the recommended anime IDs
    return top_similar_ids


In [107]:
# Test the recommendation function
anime_id = 0  # Replace with the desired anime ID
threshold_values = [0.3, 0.5, 0.7]  # Experiment with different thresholds

for threshold in threshold_values:
    recommended_anime_ids = recommend_anime(anime_id, threshold=threshold)
    print(f"Threshold: {threshold}, Recommended Anime IDs: {recommended_anime_ids}")



Threshold: 0.3, Recommended Anime IDs: [ 15  25  11 399  35]
Threshold: 0.5, Recommended Anime IDs: [  611   460 10319   209   194]
Threshold: 0.7, Recommended Anime IDs: [  460 10319   209   194  8098]


In [108]:
# Get anime titles for recommended IDs
recommended_anime_titles = anime_df.iloc[recommended_anime_ids]['name']

print("Recommended Anime Titles:")
print(recommended_anime_titles)

Recommended Anime Titles:
462                         Nekomonogatari: Kuro
10464    Taka no Tsume 8: Yoshida-kun no X-Files
210                              Nagi no Asukara
195                                 Sennen Joyuu
8152         Aoyo, Kaette Koi: Tokyo Dai Kuushuu
Name: name, dtype: object


## Task 4: Evaluation

In [110]:
def evaluate_recommendation(anime_id, num_recommendations=5, threshold=0.5):
    # Get recommended anime IDs
    recommended_anime_ids = recommend_anime(anime_id, num_recommendations, threshold)
    
    # Filter out-of-bounds indices
    recommended_anime_ids = [id for id in recommended_anime_ids if id < len(test_anime_labels)]
    
    # Get true labels (anime names)
    true_labels = test_anime_labels.iloc[recommended_anime_ids]
    
    # Get predicted labels (recommended anime names)
    predicted_labels = test_anime_labels.iloc[recommended_anime_ids]
    
    # Calculate precision, recall, and F1-score
    precision = precision_score(true_labels, predicted_labels, average='macro')
    recall = recall_score(true_labels, predicted_labels, average='macro')
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    
    return precision, recall, f1


In [111]:
# Evaluate recommendation system
anime_id = 0  # Replace with the desired anime ID
threshold_values = [0.3, 0.5, 0.7]  # Experiment with different thresholds
num_recommendations = 5

for threshold in threshold_values:
    try:
        precision, recall, f1 = evaluate_recommendation(anime_id, num_recommendations, threshold)
        print(f"Threshold: {threshold}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    except Exception as e:
        print(f"Threshold: {threshold}, Error: {str(e)}")

Threshold: 0.3, Error: name 'test_anime_labels' is not defined
Threshold: 0.5, Error: name 'test_anime_labels' is not defined
Threshold: 0.7, Error: name 'test_anime_labels' is not defined
