In [40]:

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score


In [2]:
#load the data set
df=pd.read_csv('anime.csv')

In [3]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.tail()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175
12293,26081,Yasuji no Pornorama: Yacchimae!!,Hentai,Movie,1,5.46,142


In [5]:
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [6]:
df.duplicated().sum()

0

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [8]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [9]:

# Impute missing values (example with mean imputation for numerical columns)
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

# Drop rows with missing values (alternative approach)
df.dropna(inplace=True)
df.isnull().sum()


Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [10]:
# 1. Print the available columns to check for typos or changes in the column name:
print(df.columns)
# Assuming 'genres' is a column in your DataFrame
genres_encoded = pd.get_dummies(df['genre'], prefix='genre')

# Concatenate the encoded genres with the original DataFrame
df = pd.concat([df, genres_encoded], axis=1)


Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


In [15]:
# Impute missing values (example with mean imputation for numerical columns)
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

# Drop rows with missing values (alternative approach)
df.dropna(inplace=True)
df.isnull().sum()

# ... (rest of your code) ...

# Normalize numerical features (e.g., episodes, rating, members)
from sklearn.preprocessing import MinMaxScaler

numerical_features = ['episodes', 'rating', 'members']

# Convert 'Unknown' to NaN in numerical_features columns
for feature in numerical_features:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')  # Convert 'Unknown' to NaN

# Impute NaN values with the mean (or another strategy)
for feature in numerical_features:
    df[feature] = df[feature].fillna(df[feature].mean())

scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
print(df[numerical_features])

       episodes    rating   members
0      0.000000  0.924370  0.197872
1      0.034673  0.911164  0.782770
2      0.027518  0.909964  0.112689
3      0.012658  0.900360  0.664325
4      0.027518  0.899160  0.149186
...         ...       ...       ...
12289  0.000000  0.297719  0.000203
12290  0.000000  0.313325  0.000176
12291  0.001651  0.385354  0.000211
12292  0.000000  0.397359  0.000168
12293  0.000000  0.454982  0.000135

[12210 rows x 3 columns]


In [29]:
def recommend_anime(df, target_anime_name, feature_cols, top_n=10, threshold=0.5):
    """
    Recommends anime based on cosine similarity.

    Args:
        df: The pandas DataFrame containing anime data.
        target_anime_name: The name of the anime to find recommendations for.
        feature_cols: A list of columns to use for similarity calculation.
        top_n: The maximum number of recommendations to return.
        threshold: The minimum similarity score for a recommendation.
 Returns:
        A list of recommended anime names.

    """
    1. #Data Preprocessing (if necessary)
    # ...

    # 2. Feature Selection
    selected_features_df = df[feature_cols]

    # 3. Feature Scaling (optional but recommended)
    scaler = MinMaxScaler()
    selected_features_df = pd.DataFrame(scaler.fit_transform(selected_features_df),
                                        columns=selected_features_df.columns,
                                        index=selected_features_df.index)

    # 4. Cosine Similarity Calculation
    cosine_sim = cosine_similarity(selected_features_df, selected_features_df)

    # 5. Get Recommendations
    # Find the index of the target anime
    try:
        target_anime_index = df[df['name'] == target_anime_name].index[0]
    except IndexError:
        print(f"Anime '{target_anime_name}' not found in the dataset.")
        return []  # Return an empty list if anime not found

    # Get similarity scores for the target anime
    similarity_scores = list(enumerate(cosine_sim[target_anime_index]))

    # Sort anime by similarity score in descending order
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Filter recommendations based on threshold and top_n
    recommendations = []
    for index, score in similarity_scores:
        if score >= threshold and index != target_anime_index and len(recommendations) < top_n:
            recommendations.append((df['name'][index], score))  # Append with score

    return recommendations

# Example Usage with Threshold Experimentation
# Assuming you have a DataFrame called 'df' with anime data and genre dummy variables
feature_columns = ['genre_Action', 'genre_Adventure', 'genre_Comedy', 'rating']
target_anime = "Death Note"

# Experiment with different thresholds
thresholds = [0.6, 0.7, 0.8]  # Try different values
for threshold in thresholds:
    recommendations = recommend_anime(df, target_anime, feature_columns, threshold=threshold)
    print(f"\nRecommendations for '{target_anime}' (Threshold: {threshold}):")
    if recommendations:
        for anime, score in recommendations:
            print(f"- {anime} (Similarity Score: {score:.2f})")
    else:
        print("No recommendations found for this threshold.")




Recommendations for 'Death Note' (Threshold: 0.6):
- Kimi no Na wa. (Similarity Score: 1.00)
- Fullmetal Alchemist: Brotherhood (Similarity Score: 1.00)
- Gintama° (Similarity Score: 1.00)
- Steins;Gate (Similarity Score: 1.00)
- Gintama&#039; (Similarity Score: 1.00)
- Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou (Similarity Score: 1.00)
- Hunter x Hunter (2011) (Similarity Score: 1.00)
- Ginga Eiyuu Densetsu (Similarity Score: 1.00)
- Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare (Similarity Score: 1.00)
- Gintama&#039;: Enchousen (Similarity Score: 1.00)

Recommendations for 'Death Note' (Threshold: 0.7):
- Kimi no Na wa. (Similarity Score: 1.00)
- Fullmetal Alchemist: Brotherhood (Similarity Score: 1.00)
- Gintama° (Similarity Score: 1.00)
- Steins;Gate (Similarity Score: 1.00)
- Gintama&#039; (Similarity Score: 1.00)
- Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou (Similarity Score: 1.00)
- Hunter x Hunter (2011) (Similarity Score: 1.00)
- Ginga Eiy

In [39]:

# ... (Load and preprocess your data as before) ...

def recommend_anime(df, target_anime_name, feature_cols, top_n=10, threshold=0.5):
    """Recommends anime based on cosine similarity with threshold adjustment."""
    # ... (Your existing code for feature selection, scaling, and similarity calculation) ...

    selected_features_df = df[feature_cols]
    scaler = MinMaxScaler()
    selected_features_df = pd.DataFrame(scaler.fit_transform(selected_features_df),
                                        columns=selected_features_df.columns,
                                        index=selected_features_df.index)
    cosine_sim = cosine_similarity(selected_features_df, selected_features_df)

    try:
        target_anime_index = df[df['name'] == target_anime_name].index[0]
    except IndexError:
        print(f"Anime '{target_anime_name}' not found in the dataset.")
        return []  # Return an empty list if anime not found

    similarity_scores = list(enumerate(cosine_sim[target_anime_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    recommendations = []
    for index, score in similarity_scores:
        if score >= threshold and index != target_anime_index and len(recommendations) < top_n:
            # Use .iloc to access by position within the current DataFrame (df)
            # Use df.index[index] to get the actual index
            recommendations.append((df.loc[df.index[index], 'name'], score))

    # Ensure a list is always returned, even if empty
    return recommendations if recommendations else []


# Example Usage with Data Splitting and Evaluation
feature_columns = ['genre_Action', 'genre_Adventure', 'genre_Comedy', 'rating']
target_anime = "Death Note"

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Get recommendations
recommendations = recommend_anime(train_df, target_anime, feature_columns, threshold=0.6)

# Evaluation
# Get actual relevant anime for evaluation (using 'genre' as relevance indicator)
# Assuming 'genre' is a column containing comma-separated genres
actual_relevant_anime = test_df[test_df['name'] == target_anime]['genre'].str.split(', ').tolist()
actual_relevant_anime = actual_relevant_anime[0] if actual_relevant_anime else []  # Handle potential empty list

recommended_anime_names = [anime for anime, _ in recommendations]

# Calculate precision, recall, and F1-score
# For precision and recall, we need to define what is considered a "relevant" recommendation.
# Here, we'll assume a recommendation is relevant if it shares at least one genre with the target anime.
relevant_recommendations = [anime for anime in recommended_anime_names if any(genre in df[df['name'] == anime]['genre'].iloc[0].split(', ') for genre in actual_relevant_anime)]

precision = precision_score([1] * len(relevant_recommendations) + [0] * (len(recommended_anime_names) - len(relevant_recommendations)),
                           [1] * len(actual_relevant_anime) + [0] * (len(recommended_anime_names) - len(actual_relevant_anime)),
                           zero_division=1)  # Handle cases where there are no relevant recommendations or actual relevant anime

recall = recall_score([1] * len(relevant_recommendations) + [0] * (len(recommended_anime_names) - len(relevant_recommendations)),
                      [1] * len(actual_relevant_anime) + [0] * (len(recommended_anime_names) - len(actual_relevant_anime)),
                      zero_division=1)

f1 = f1_score([1] * len(relevant_recommendations) + [0] * (len(recommended_anime_names) - len(relevant_recommendations)),
              [1] * len(actual_relevant_anime) + [0] * (len(recommended_anime_names) - len(actual_relevant_anime)),
              zero_division=1)

print(f"\nRecommendations for '{target_anime}' (Threshold: 0.6):")
if recommendations:
    for anime, score in recommendations:
        print(f"- {anime} (Similarity Score: {score:.2f})")
else:
    print("No recommendations found for this threshold.")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

# Analysis and Improvement
# ... (Analyze the results and identify potential areas for improvement) ...



Recommendations for 'Death Note' (Threshold: 0.6):
- Fate/kaleid liner Prisma☆Illya Specials (Similarity Score: 1.00)
- Narara Wondeogongju (Similarity Score: 1.00)
- Dragon Collection (Similarity Score: 1.00)
- Pokemon Omega Ruby &amp; Alpha Sapphire: Mega Special Animation (Similarity Score: 1.00)
- Recorder to Randoseru Mi☆ (Similarity Score: 1.00)
- Hokuto no Ken: Yuria-den (Similarity Score: 1.00)
- Gegege no Kitarou (2007) (Similarity Score: 1.00)
- Socket (Similarity Score: 1.00)
- Mori no Youki na Kobito-tachi: Belfy to Lillibit (Similarity Score: 1.00)
- Flanders no Inu (Movie) (Similarity Score: 1.00)

Precision: 1.00
Recall: 1.00
F1-score: 1.00
