In [1]:
#pip list

In [2]:
import pandas as pd
# Read the user -> anime ratings table, then print its dimensions followed by
# its first rows.
rating_df = pd.read_csv('rating.csv')
print(rating_df.shape, rating_df.head(), sep='\n')

(7813737, 3)
   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1


In [3]:
# Rescale the ratings onto a 0-5 scale: the -1 entries become 0 and every
# other value is floor-divided by 2.
# The table is large; a user-subsampling step was considered but is not
# applied, so every row of rating_df is kept.
# NOTE(review): a rating of 1 also floor-divides to 0, which makes it
# indistinguishable from the -1 entries -- confirm this is intended.
rating_df['rating'] = rating_df['rating'].where(rating_df['rating'] != -1, 0) // 2
rating_df['rating'].unique()

array([0, 5, 4, 3, 1, 2], dtype=int64)

In [4]:
# Number of distinct anime ids present in the ratings table.
# No subsampling was applied above, so this covers every rated title.
rating_df['anime_id'].drop_duplicates().shape

(11200,)

In [5]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
# Tell Surprise the bounds of the normalised ratings (0 to 5), then wrap the
# ratings frame -- columns passed in user, item, rating order -- as a Dataset.
rating_columns = ['user_id', 'anime_id', 'rating']
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(rating_df[rating_columns], reader)

In [6]:
# Fixed seed so the train/test split and the SVD factor initialisation are
# repeatable after a kernel restart (previously every run gave different
# numbers).
SEED = 42

# Hold out 25% of the ratings, fit SVD on the remainder and score the
# held-out part.
trainset, testset = train_test_split(data, test_size=0.25, random_state=SEED)
model = SVD(random_state=SEED)
model.fit(trainset)
predictions = model.test(testset)

# 5-fold cross-validation of the same algorithm on the full dataset; the
# returned dict of per-fold metrics and timings is the cell's output.
# NOTE(review): the folds created by cv=5 are not seeded here, so fold-level
# numbers may still vary between runs; cross_validate also presumably re-fits
# `model` on each fold -- confirm which fit later cells rely on.
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9245  0.9246  0.9244  0.9245  0.9235  0.9243  0.0004  
MAE (testset)     0.6220  0.6220  0.6222  0.6217  0.6215  0.6219  0.0002  
Fit time          68.70   73.98   73.12   72.51   72.93   72.25   1.84    
Test time         14.23   17.79   15.21   14.42   14.36   15.20   1.34    


{'test_rmse': array([0.92450547, 0.92464905, 0.92441143, 0.92454017, 0.92352689]),
 'test_mae': array([0.62201754, 0.62198109, 0.6221855 , 0.6217394 , 0.62150149]),
 'fit_time': (68.7048921585083,
  73.98269486427307,
  73.12318539619446,
  72.50614404678345,
  72.93399596214294),
 'test_time': (14.227971315383911,
  17.785907983779907,
  15.207601070404053,
  14.422444105148315,
  14.36286449432373)}

In [7]:
'''
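The RMSE (about 0.92) and MAE (about 0.62) are measured on the 0-5 rating scale, so a typical prediction is off by a little more than half a rating point, which is a reasonable result for the SVD model on this dataset.
Both RMSE and MAE have very low standard deviations (0.0004 and 0.0002), which means the model's performance is consistent across all five folds during cross-validation.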
'''
# Spot-check one known (user, anime) pair: look up the stored rating, then
# ask the fitted model for its estimate of the same pair.
is_target_pair = (rating_df['user_id'] == 101) & (rating_df['anime_id'] == 20)
test_rating = rating_df.loc[is_target_pair, 'rating'].values[0]
print(f'The actual rating of user_id (101) and anime_id (20) is {test_rating}')

prediction = model.predict(101, 20)
print(prediction)
print(f'The predicted rating of user_id 101 and anime_id 20 is {prediction.est}')
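# In this single spot check the estimate (3.41) is about 1.6 points below the actual rating (5), so one example cannot confirm accuracy; judge the model by the cross-validated RMSE/MAE above, which are computed over every user id / anime id pair in the test folds.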

The actual rating of user_id (101) and anime_id (20) is 5
user: 101        item: 20         r_ui = None   est = 3.41   {'was_impossible': False}
The predicted rating of user_id 101 and anime_id 20 is 3.414449526436156


Content-Based Filtering

In [8]:
import pandas as pd
# Load the anime catalogue and keep only the titles whose anime_id occurs in
# the ratings table.
df = pd.read_csv('anime.csv')
anime_ids_set = set(rating_df['anime_id'])
has_ratings = df['anime_id'].isin(anime_ids_set)
df = df[has_ratings]
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [9]:
# Column dtypes and non-null counts of the filtered catalogue.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11197 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  11197 non-null  int64  
 1   name      11197 non-null  object 
 2   genre     11165 non-null  object 
 3   type      11196 non-null  object 
 4   episodes  11197 non-null  object 
 5   rating    11194 non-null  float64
 6   members   11197 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 699.8+ KB


In [10]:
# Missing values per column.
df.isna().sum()

anime_id     0
name         0
genre       32
type         1
episodes     0
rating       3
members      0
dtype: int64

In [11]:
# Drop every row that is missing a rating or a genre; both columns are needed
# later (genre feeds the text features built below).
# NOTE(review): the index is not reset, so index labels no longer match row
# positions after this step.
required_columns = ['rating', 'genre']
df = df.dropna(axis=0, how='any', subset=required_columns)

In [12]:
# Distinct values of the `type` column.
df['type'].unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA'], dtype=object)

In [13]:
import re
def replace_comma_with_space(text):
    """Return `text` with every comma separator replaced by a single space.

    A comma together with any whitespace around it collapses to one space, so
    both "Drama, Romance" and "Drama,Romance" become "Drama Romance".
    Previously the comma was simply deleted, which glued neighbouring words
    together whenever no space followed the comma.

    Args:
        text: The comma-separated string to clean.

    Returns:
        The string with comma separators turned into single spaces.
    """
    return re.sub(r'\s*,\s*', ' ', text)
# Clean the separators in every genre string.
df['genre'] = df['genre'].map(replace_comma_with_space)

In [14]:
# Preview the cleaned genre strings.
df['genre']

0                        Drama Romance School Supernatural
1        Action Adventure Drama Fantasy Magic Military ...
2        Action Comedy Historical Parody Samurai Sci-Fi...
3                                          Sci-Fi Thriller
4        Action Comedy Historical Parody Samurai Sci-Fi...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 11162, dtype: object

In [15]:
# One text field per title: the genre words followed by the type, separated
# by a space. This is the document that gets vectorised below.
df['combined_features'] = df['genre'].str.cat(df['type'], sep=' ')

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn each title's combined genre/type text into a TF-IDF vector
# (English stop words removed). Row i of the matrix corresponds to the i-th
# row of df.
feature_corpus = df['combined_features']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(feature_corpus)

# Vocabulary terms, in column order of the matrix.
feature_names = tfidf.get_feature_names_out()

# Dense copy of the sparse TF-IDF matrix.
content_matrix = tfidf_matrix.toarray()

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all titles' TF-IDF vectors; with a
# single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.13288214, 0.        , ..., 0.        , 0.        ,
        0.24334889],
       [0.13288214, 1.        , 0.22204906, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.22204906, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.64264945],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.64264945],
       [0.24334889, 0.        , 0.        , ..., 0.64264945, 0.64264945,
        1.        ]])

In [None]:
def _prompt_menu_choice():
    """Keep asking until the user enters 1, 2 or 3, then return that integer."""
    while True:
        try:
            choice = int(input("Input number to continue: "))
        except ValueError:
            # Only a non-numeric entry is expected here; anything else
            # (e.g. a keyboard interrupt) is allowed to propagate.
            print("Input Error")
            continue
        if 0 < choice < 4:
            return choice
        print("Enter a valid number!")


def _predict_rating_for_user(prediction_model):
    """Ask for a user id and an anime name and print the predicted rating."""
    print('Disclaimer: Only the user registered can perform predictive analysis!')
    try:
        user_id = int(input('Enter your user_id: '))
    except ValueError:
        print("User id doesn't exist")
        return

    # `value in series` tests the index labels, not the stored values, so the
    # membership check has to be made against the column's values.
    if not (rating_df['user_id'] == user_id).any():
        print("User id doesn't exist")
        return

    names_lower = df['name'].str.lower()
    while True:
        anime_name = input('Enter the anime name: ').lower()
        # Select by boolean mask so the ids come from the matching rows
        # themselves; df's index labels do not equal row positions.
        matching_ids = df.loc[names_lower == anime_name, 'anime_id']
        if matching_ids.empty:
            print("Anime name couldnt be found. Try again!")
            continue
        for anime_id in matching_ids:
            prediction = prediction_model.predict(user_id, int(anime_id))
            print(f'The predicted rating of user_id {user_id} for "{anime_name}" is {prediction.est}')
        return


def _recommend_similar_animes():
    """Ask for an anime name and print the five most similar other titles."""
    names_lower = df['name'].str.lower()
    while True:
        anime_name = input('Enter the anime name: ').lower()
        # Row positions (not index labels) of the matching titles:
        # cosine_sim is addressed by position.
        positions = (names_lower == anime_name).to_numpy().nonzero()[0]
        if len(positions) > 0:
            break
        print('Entered Anime not in the list. Try again!')

    anime_pos = int(positions[0])
    scores = cosine_sim[anime_pos]

    # Exclude the query title explicitly; dropping "the first result" is not
    # safe because other titles can tie with it at similarity 1.0.
    ranked_positions = sorted(
        (pos for pos in range(len(scores)) if pos != anime_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    print("Top 5 similar animes to", anime_name, "are:")
    for pos in ranked_positions[:5]:
        print(df['name'].iloc[pos])


def main(prediction_model):
    """Run the interactive text menu of the anime recommendation app.

    Option 1 prints the rating `prediction_model` predicts for a registered
    user and a named anime, option 2 prints the five titles most similar to a
    named anime, and option 3 leaves the app.

    Reads the notebook-level `rating_df`, `df` and `cosine_sim`. Row i of
    `cosine_sim` must describe the i-th row of `df`.

    The function returns instead of calling `exit()`: in the plain
    interpreter `exit()` raises SystemExit, which the former bare `except:`
    blocks swallowed, and in a notebook it asks the kernel to shut down.

    Args:
        prediction_model: Fitted model exposing `predict(user_id, anime_id)`
            whose result has an `est` attribute.
    """
    print("Hello User. Welcome to the Recommendation App for animes")
    print("Press 1 if you want to see a predicted rating of the anime you will be watching")
    print("Press 2 if you want us to recommend the top animes according to liked animes")
    print("Press 3 to exit")

    number_input = _prompt_menu_choice()
    if number_input == 1:
        _predict_rating_for_user(prediction_model)
    elif number_input == 2:
        _recommend_similar_animes()
    else:
        print('Thank you for the visit!')
        
            
            
        

In [None]:
# Start the interactive menu with the SVD model fitted earlier in the notebook.
main(model)