### trying out content based filtering 

In [7]:
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
user_input = pd.read_csv('User_input.csv', on_bad_lines='skip')

In [9]:
user_input.head()

Unnamed: 0,Username,Gender,Mean Score,Completed,Birth_Year,user_id,anime_id,Anime Title,rating,Genres,...,Genre_Gourmet,Genre_Hentai,Genre_Horror,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Slice of Life,Genre_Sports,Genre_Supernatural,Genre_Suspense
0,Xinil,Male,7.37,233.0,1985,1,21,One Piece,9,"Action, Adventure, Fantasy",...,0,0,0,0,0,0,0,0,0,0
1,Xinil,Male,7.37,233.0,1985,1,48,.hack//Sign,7,"Adventure, Fantasy, Mystery",...,0,0,0,1,0,0,0,0,0,0
2,Xinil,Male,7.37,233.0,1985,1,320,A Kite,5,"Action, Drama, Hentai",...,0,1,0,0,0,0,0,0,0,0
3,Xinil,Male,7.37,233.0,1985,1,49,Aa! Megami-sama!,8,"Comedy, Romance, Supernatural",...,0,0,0,0,1,0,0,0,1,0
4,Xinil,Male,7.37,233.0,1985,1,304,Aa! Megami-sama! Movie,8,"Comedy, Romance, Supernatural",...,0,0,0,0,1,0,0,0,1,0


In [10]:
df = user_input
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033293 entries, 0 to 3033292
Data columns (total 31 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Username             object 
 1   Gender               object 
 2   Mean Score           float64
 3   Completed            float64
 4   Birth_Year           int64  
 5   user_id              int64  
 6   anime_id             int64  
 7   Anime Title          object 
 8   rating               int64  
 9   Genres               object 
 10  Genre_Action         int64  
 11  Genre_Adventure      int64  
 12  Genre_Avant Garde    int64  
 13  Genre_Award Winning  int64  
 14  Genre_Boys Love      int64  
 15  Genre_Comedy         int64  
 16  Genre_Drama          int64  
 17  Genre_Ecchi          int64  
 18  Genre_Erotica        int64  
 19  Genre_Fantasy        int64  
 20  Genre_Girls Love     int64  
 21  Genre_Gourmet        int64  
 22  Genre_Hentai         int64  
 23  Genre_Horror         int64  
 24

In [11]:
df = pd.get_dummies(df, columns=['Gender'], prefix='Gender_')
# df_new = df.drop(columns=['Gender'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033293 entries, 0 to 3033292
Data columns (total 33 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Username             object 
 1   Mean Score           float64
 2   Completed            float64
 3   Birth_Year           int64  
 4   user_id              int64  
 5   anime_id             int64  
 6   Anime Title          object 
 7   rating               int64  
 8   Genres               object 
 9   Genre_Action         int64  
 10  Genre_Adventure      int64  
 11  Genre_Avant Garde    int64  
 12  Genre_Award Winning  int64  
 13  Genre_Boys Love      int64  
 14  Genre_Comedy         int64  
 15  Genre_Drama          int64  
 16  Genre_Ecchi          int64  
 17  Genre_Erotica        int64  
 18  Genre_Fantasy        int64  
 19  Genre_Girls Love     int64  
 20  Genre_Gourmet        int64  
 21  Genre_Hentai         int64  
 22  Genre_Horror         int64  
 23  Genre_Mystery        int64  
 24

In [20]:
df = df.drop(columns=['Genres'])

In [21]:
df.isna().sum()

Username               0
Mean Score             0
Completed              0
Birth_Year             0
user_id                0
anime_id               0
Anime Title            0
rating                 0
Genre_Action           0
Genre_Adventure        0
Genre_Avant Garde      0
Genre_Award Winning    0
Genre_Boys Love        0
Genre_Comedy           0
Genre_Drama            0
Genre_Ecchi            0
Genre_Erotica          0
Genre_Fantasy          0
Genre_Girls Love       0
Genre_Gourmet          0
Genre_Hentai           0
Genre_Horror           0
Genre_Mystery          0
Genre_Romance          0
Genre_Sci-Fi           0
Genre_Slice of Life    0
Genre_Sports           0
Genre_Supernatural     0
Genre_Suspense         0
Gender__Female         0
Gender__Male           0
Gender__Non-Binary     0
dtype: int64

In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

df['user_id'] = df['user_id'].astype(str)
df['anime_id'] = pd.to_numeric(df['anime_id'], errors='coerce')
df = df.dropna(subset=['anime_id'])  # Drop rows with invalid anime_id
df['anime_id'] = df['anime_id'].astype(int)

# Encode user_id
user_encoder = LabelEncoder()
df['user_id_enc'] = user_encoder.fit_transform(df['user_id'])

# Genre columns
genre_cols = [col for col in df.columns if col.startswith('Genre_')]

# Features and target
X = pd.concat([df[['user_id_enc']], df[genre_cols]], axis=1)
y = df['rating']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Neural network model (Scikit-learn)
model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
model.fit(X_train, y_train)

# Evaluate
preds = model.predict(X_test)
rmse = root_mean_squared_error(y_test, preds)
print(f"Test RMSE: {rmse:.3f}")
def recommend_for_user(user_id, top_n=10):
    user_enc = user_encoder.transform([str(user_id)])[0]

    # Get anime profiles
    anime_profiles = df[['anime_id', 'Anime Title'] + genre_cols].drop_duplicates('anime_id')

    # Filter unseen anime
    seen_anime_ids = df[df['user_id_enc'] == user_enc]['anime_id'].unique()
    unseen = anime_profiles[~anime_profiles['anime_id'].isin(seen_anime_ids)].copy()

    # Build input
    X_unseen = pd.concat([
        pd.Series([user_enc] * len(unseen), name='user_id_enc'),
        unseen[genre_cols].reset_index(drop=True)
    ], axis=1)

    # Predict and sort
    unseen['predicted_rating'] = model.predict(X_unseen)
    return unseen.sort_values(by='predicted_rating', ascending=False)[['Anime Title', 'predicted_rating']].head(top_n)

# 🔍 Example usage
recommendations = recommend_for_user("1")  # Pass user_id as a string
print(recommendations)



Test RMSE: 1.677
                                            Anime Title  predicted_rating
338               Pokemon Movie 01: Mewtwo no Gyakushuu          9.049713
17878                               One Piece Film: Red          9.049713
140201  Chiisana Eiyuu: Kani to Tamago to Toumei Ningen          8.894372
10516                    Lupin III: Cagliostro no Shiro          8.863139
6722                                       Sennen Joyuu          8.774777
321                                  Majo no Takkyuubin          8.751261
7744                                      Golgo 13 (TV)          8.609997
4241                                        Banana Fish          8.609997
22691                                  Mori no Densetsu          8.598336
645494                        Wu Liuqi: Xuanwu Guo Pian          8.589660


In [28]:
import joblib

# Save the trained model
joblib.dump(model, 'anime_rating_model.pkl')

# Also save the label encoder if needed for inference
joblib.dump(user_encoder, 'user_encoder.pkl')


['user_encoder.pkl']

In [29]:
# Load the model
model = joblib.load('anime_rating_model.pkl')

# Load the user encoder
user_encoder = joblib.load('user_encoder.pkl')


In [32]:
model.predict(X_test[:10])

array([7.61205293, 6.53912424, 6.97449499, 7.01766491, 7.67095784,
       7.16203893, 6.61415179, 7.69100461, 7.30512604, 6.65818098])

In [33]:
y_test[:10]

428608     9
1169160    6
2235765    6
1214451    7
2731830    4
2730274    8
651720     9
2476124    8
1988535    6
170489     6
Name: rating, dtype: int64