# Step 1: Load the Dataset

In [11]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [12]:
anime_df = pd.read_csv('anime.csv')

# Step 2: Data Preprocessing

In [13]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [14]:
# Check for missing values
print(anime_df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


# Explore dataset

In [15]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [16]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [17]:
anime_df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


# Step 3: Feature Extraction

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
le = LabelEncoder()

In [25]:
le = LabelEncoder() 
anime_df['type'] = le.fit_transform(anime_df['type'])
anime_df['genre'] = le.fit_transform(anime_df['genre'])

In [27]:
scaler = StandardScaler()
anime_df[['rating', 'members']] = scaler.fit_transform(anime_df[['rating','members']])

In [28]:
# Selecting features for similarity computation
features = anime_df[['type', 'genre', 'episodes', 'rating', 'members']]

# Step 4: Recommendation System

In [29]:
# from sklearn.metrics.pairwise import cosine_similarity

In [30]:
anime_features = pd.concat([pd.DataFrame(genre_matrix.toarray()), anime_df[['episodes', 'rating']]], axis=1)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [37]:
features = anime_df[['rating', 'members']]

In [43]:
anime_df.replace('Unknown', float('nan'), inplace=True)

In [44]:
anime_df.dropna(subset=['rating', 'members'], inplace=True)

In [45]:
features = anime_df[['rating', 'members']].astype(float)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [46]:
cosine_sim = cosine_similarity(features_scaled)

In [50]:
def recommend_anime(anime_title, cosine_sim=cosine_sim, df=anime_df, top_n=10): 
    # Create a reverse mapping of indices and anime titles
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [60]:
indices = {
    'Naruto': 0,
    'One Piece': 1,
    'Attack on Titan': 2
}

anime_title = 'One Piece'
idx = indices[anime_title] 

In [61]:
idx = indices[anime_title]

In [65]:
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [69]:
def get_recommendations(anime_title):
    recommendations = {
        'Naruto': ['Boruto', 'One Piece', 'Bleach'],
        'One Piece': ['Naruto', 'Fairy Tail', 'Attack on Titan'],
        'Attack on Titan': ['Tokyo Ghoul', 'Death Note', 'Fullmetal Alchemist']
    }

In [72]:
print(get_recommendations("Anime Title Example"))

None


# Step 4: Evaluation

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [74]:
train, test = train_test_split(anime_df, test_size=0.2, random_state=42)

In [None]:
# Interview Questions:

# 1. Explain the difference between user-based and item-based collaborative filtering
# User-based focuses on user similarities (users with similar ratings history),
# while item-based focuses on item similarities (items with similar ratings by different users).

# 2. What is collaborative filtering, and how does it work?
# Collaborative filtering uses user behaviors to predict items a user might like,
# based on either similar users (user-based) or similar items (item-based).