In [None]:

#importing the libraries
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
anime_df = pd.read_csv('/content/anime.csv')

In [None]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
anime_df.shape

(12294, 7)

In [None]:
# Drop the metadata columns, leaving anime_id, name and rating.
anime_df = anime_df.drop(columns=['genre', 'type', 'episodes', 'members'])

In [None]:
anime_df.head()

Unnamed: 0,anime_id,name,rating
0,32281,Kimi no Na wa.,9.37
1,5114,Fullmetal Alchemist: Brotherhood,9.26
2,28977,Gintama°,9.25
3,9253,Steins;Gate,9.17
4,9969,Gintama&#039;,9.16


In [None]:
anime_df_duplicates = anime_df.duplicated(subset='anime_id')

In [None]:
anime_df_duplicates.value_counts()

False    12294
dtype: int64

In [None]:
anime_df_null = pd.isnull(anime_df)

In [None]:
anime_df_null

Unnamed: 0,anime_id,name,rating
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
12289,False,False,False
12290,False,False,False
12291,False,False,False
12292,False,False,False


In [None]:
anime_df_null.value_counts()

anime_id  name   rating
False     False  False     12064
                 True        230
dtype: int64

In [None]:
# dropna() returns a new frame rather than modifying anime_df, so the result has to be
# assigned back for the rows with missing values to actually be removed.
anime_df = anime_df.dropna()
anime_df

Unnamed: 0,anime_id,name,rating
0,32281,Kimi no Na wa.,9.37
1,5114,Fullmetal Alchemist: Brotherhood,9.26
2,28977,Gintama°,9.25
3,9253,Steins;Gate,9.17
4,9969,Gintama&#039;,9.16
...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,4.15
12290,5543,Under World,4.28
12291,5621,Violence Gekiga David no Hoshi,4.88
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,4.98


In [None]:
ratings_df = pd.read_csv('/content/rating.csv')

In [None]:
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [None]:
ratings_df.shape

(7813737, 3)

In [None]:
ratings_df_duplicates = ratings_df.duplicated(subset=['user_id','anime_id'])

In [None]:
ratings_df_duplicates.value_counts()

False    7813730
True           7
dtype: int64

In [None]:
ratings_df = ratings_df.drop_duplicates(subset=['user_id','anime_id'])

In [None]:
ratings_df.shape

(7813730, 3)

In [None]:
ratings_df = ratings_df[ratings_df['rating'] != -1]

In [None]:
ratings_df.shape

(6337234, 3)

In [None]:
# Keep only the users that have at least 200 rows left in ratings_df.
counts = ratings_df['user_id'].value_counts()
frequent_raters = counts[counts >= 200]
is_frequent_rater = ratings_df['user_id'].isin(frequent_raters.index)
ratings_df = ratings_df[is_frequent_rater]

In [None]:
# Remove every row whose rating is -1 or lies in 1..5, leaving only ratings of 6 and above.
mask = ratings_df['rating'].isin([-1, 1, 2, 3, 4, 5])
ratings_df = ratings_df[~mask]


In [None]:
ratings_df.shape

(2896427, 3)

In [None]:
#changing the scale of ratings of 6-10 to 1-5 
def change_rating(rating):
    """Map a rating from the 6-10 scale onto 1-5 (6 -> 1, 7 -> 2, ..., 10 -> 5).

    Any value outside 6-10 has no mapping and yields None.
    """
    rescaled = {6: 1, 7: 2, 8: 3, 9: 4, 10: 5}
    return rescaled.get(rating)
    
# ratings_df is the result of several row filters, so writing a column into it directly can
# raise pandas' SettingWithCopyWarning. assign() builds a new frame with the rescaled
# ratings instead of mutating the filtered one.
ratings_df = ratings_df.assign(rating=ratings_df['rating'].apply(change_rating))

In [None]:
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
302,5,6,3
303,5,15,1
304,5,17,1
305,5,18,1
306,5,20,1


In [None]:
ratings_df.dtypes

user_id     int64
anime_id    int64
rating      int64
dtype: object

In [None]:
#converting the data types from int64 to int32
data_type = {'user_id': 'int32','anime_id':'int32','rating': 'int32'}
ratings_df = ratings_df.astype(data_type)

In [None]:
import re
def text_cleaning(text):
    """Remove HTML-entity residue and a few title quirks from an anime name.

    Parameters
    ----------
    text : str
        Raw title as found in the catalogue.

    Returns
    -------
    str
        The title with `&quot;`, a literal `.hack//` prefix, `A&#039;s`, any
        remaining `&#039;` and the degree sign removed, `I&#039;` turned into
        `I'`, and `&amp;` spelled out as `and`.
    """
    text = re.sub(r'&quot;', '', text)
    # Escape the dot so only a literal ".hack//" is removed, not "<any char>hack//".
    text = re.sub(r'\.hack//', '', text)
    # The two specific apostrophe rules must run before the catch-all removal of
    # "&#039;" below; otherwise the entity is already gone and they never match.
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'°','',text)

    return text

anime_df['name'] = anime_df['name'].apply(text_cleaning)

In [None]:
anime_df.head()

Unnamed: 0,anime_id,name,rating
0,32281,Kimi no Na wa.,9.37
1,5114,Fullmetal Alchemist: Brotherhood,9.26
2,28977,Gintama,9.25
3,9253,Steins;Gate,9.17
4,9969,Gintama,9.16


### Pearson Correlation


In [None]:
# Titles and scores entered by the user we want to generate recommendations for.
# Scores are stored as numbers (not strings) so they can be used directly in arithmetic.
user_input = [{'name':'Plastic Memories','score':9},
              {'name':'Kimi no Na wa.','score':10},
              {'name':'Koe no Katachi','score':9},
              {'name':'Toradora!','score':8},
              {'name':'Nisekoi','score':7}
             ]
anime_input = pd.DataFrame(user_input)
anime_input

Unnamed: 0,name,score
0,Plastic Memories,9
1,Kimi no Na wa.,10
2,Koe no Katachi,9
3,Toradora!,8
4,Nisekoi,7


In [None]:
# Keep only the catalogue rows whose title is one of the titles the user entered.
input_uid = anime_df[anime_df['name'].isin(anime_input['name'].tolist())]
# Merge on 'name' so every entered title gains its anime_id. The merged frame carries both
# the catalogue's 'rating' column and the user's own 'score' column.
anime_input = pd.merge(input_uid, anime_input, on ='name')
anime_input

Unnamed: 0,anime_id,name,rating,score
0,32281,Kimi no Na wa.,9.37,10
1,28851,Koe no Katachi,9.05,9
2,4224,Toradora!,8.45,8
3,27775,Plastic Memories,7.95,9
4,18897,Nisekoi,7.91,7


In [None]:
# Displays anime_input without the catalogue 'rating' column. The result is not assigned,
# so anime_input itself still contains 'rating' after this cell.
anime_input.drop('rating',axis =1)

Unnamed: 0,anime_id,name,score
0,32281,Kimi no Na wa.,10
1,28851,Koe no Katachi,9
2,4224,Toradora!,8
3,27775,Plastic Memories,9
4,18897,Nisekoi,7


In [None]:
user_subset = ratings_df[ratings_df['anime_id'].isin(anime_input['anime_id'].tolist())]

In [None]:
user_subset.head()

Unnamed: 0,user_id,anime_id,rating
449,5,4224,3
667,5,18897,2
1007,7,18897,8
1081,7,27775,8
1708,17,4224,7


In [None]:
user_subset.dtypes

user_id     int64
anime_id    int64
rating      int64
dtype: object

In [None]:
user_subset.sort_values('user_id',axis=0, ascending= True)

Unnamed: 0,user_id,anime_id,rating
449,5,4224,3
667,5,18897,2
1007,7,18897,8
1081,7,27775,8
1708,17,4224,7
...,...,...,...
7812911,73503,27775,6
7813069,73506,4224,8
7813223,73507,4224,10
7813419,73510,4224,8


In [None]:
user_subset_group = user_subset.groupby(['user_id'])

In [None]:
user_subset_group.get_group(5)

Unnamed: 0,user_id,anime_id,rating
449,5,4224,3
667,5,18897,2


In [None]:
# Turn the GroupBy into a list of (key, frame) pairs ordered by the number of rows in each
# user's frame, largest first, i.e. users who rated the most input anime come first.
user_subset_group = sorted(user_subset_group,  key=lambda x: len(x[1]), reverse=True)

In [None]:
user_subset_group[0:3]

[(2378,
          user_id  anime_id  rating
  231112     2378      4224       9
  231381     2378     18897       8
  231534     2378     27775       8
  231571     2378     28851       8
  231665     2378     32281       9),
 (18051,
           user_id  anime_id  rating
  1863757    18051      4224       9
  1863951    18051     18897       8
  1864036    18051     27775       7
  1864056    18051     28851      10
  1864113    18051     32281      10),
 (21856,
           user_id  anime_id  rating
  2268781    21856      4224      10
  2269017    21856     18897       9
  2269050    21856     27775       5
  2269056    21856     28851      10
  2269065    21856     32281      10)]

In [None]:
user_subset_group = user_subset_group[0:150]

In [None]:
# Pearson correlation between the input user's own scores and each candidate user's
# ratings, stored as {user id: coefficient}.
pearsonCorrelationDict = {}

# The input frame does not change between iterations, so sort it once up front.
anime_input = anime_input.sort_values(by='anime_id')

for name, group in user_subset_group:
    # Sort by anime_id so the i-th entry of both lists refers to the same anime.
    group = group.sort_values(by='anime_id')
    nRatings = len(group)
    temp_df = anime_input[anime_input['anime_id'].isin(group['anime_id'].tolist())]
    # Correlate against the scores the user entered ('score'), not the catalogue-wide
    # average held in 'rating'. The cast covers scores that were entered as strings.
    tempRatingList = temp_df['score'].astype(float).tolist()
    tempGroupList = group['rating'].tolist()
    sum_x = sum(tempRatingList)
    sum_y = sum(tempGroupList)
    Sxx = sum(i**2 for i in tempRatingList) - sum_x**2/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - sum_y**2/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum_x*sum_y/float(nRatings)
    # A constant series has zero variance and no defined correlation; use 0 in that case.
    # Testing "> 0" also keeps a tiny negative rounding error away from sqrt().
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [None]:
pearsonCorrelationDict.items()

dict_items([(2378, 0.5094143624775788), (18051, 0.9277115577805838), (21856, 0.6392635264571561), (29577, 0.7885974068408472), (36240, 0.6065120007896642), (40915, 0.8620858441928098), (45583, 0.8292908279004353), (56426, 0.8359986281140589), (63199, 0.08227287400315647), (271, 0.46125005497439786), (598, 0.832904637878156), (786, 0.4612500549744258), (894, 0.46125005497445365), (937, 0.7636408039485945), (996, -0.9891454708864988), (1013, 0.7373707948564309), (1309, 0.7123897432809194), (1441, 0.8533801784550874), (1497, 0.7123897432809194), (1504, 0.7123897432809194), (1522, 0.029441492870681322), (1620, 0.9323139409057453), (1862, 0.3245248565955841), (1963, 0.9891454708864696), (1987, 0.5528942001258226), (2016, 0.7431403796815361), (2025, 0.46125005497445365), (2050, 0.9323139409057453), (2197, 0.8246071096548084), (2562, 0.832904637878156), (2673, 0.832904637878156), (2723, 0.9323139409057453), (2810, 0.9323139409057453), (2820, -0.02403887826635348), (2951, 0.337705962868984), (

In [None]:
pearson_df = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')#keys are the columns of the data frame that's why orient=index
pearson_df.columns = ['similarity_index']
pearson_df['user_id'] = pearson_df.index
pearson_df.index = range(len(pearson_df))
pearson_df.head()

Unnamed: 0,similarity_index,user_id
0,0.509414,2378
1,0.927712,18051
2,0.639264,21856
3,0.788597,29577
4,0.606512,36240


In [None]:
optimal_users=pearson_df.sort_values(by='similarity_index', ascending=False)[0:50]
optimal_users.head()

Unnamed: 0,similarity_index,user_id
149,0.989145,24509
141,0.989145,21842
111,0.989145,17319
61,0.989145,9746
60,0.989145,9722


In [None]:
optimal_users_rating=optimal_users.merge(ratings_df, left_on='user_id', right_on='user_id', how='inner')
optimal_users_rating.head()

Unnamed: 0,similarity_index,user_id,anime_id,rating
0,0.989145,24509,45,7
1,0.989145,24509,59,7
2,0.989145,24509,60,8
3,0.989145,24509,101,8
4,0.989145,24509,104,5


In [None]:
optimal_users_rating['weighted_rating'] = optimal_users_rating['similarity_index']*optimal_users_rating['rating']
optimal_users_rating.head()

Unnamed: 0,similarity_index,user_id,anime_id,rating,weighted_rating
0,0.989145,24509,45,7,6.924018
1,0.989145,24509,59,7,6.924018
2,0.989145,24509,60,8,7.913164
3,0.989145,24509,101,8,7.913164
4,0.989145,24509,104,5,4.945727


In [None]:
# Per anime, add up the similarity values and the similarity-weighted ratings of all
# selected users who rated it.
temp_optimal_users_rating = optimal_users_rating.groupby('anime_id').agg(
    sum_similarity_index=('similarity_index', 'sum'),
    sum_weighted_rating=('weighted_rating', 'sum'),
)
temp_optimal_users_rating.head()

Unnamed: 0_level_0,sum_similarity_index,sum_weighted_rating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,17.554799,148.14116
5,6.537633,53.282107
6,7.635529,63.062523
7,0.989145,5.934873
15,1.872873,14.050669


In [None]:
# Weighted average per anime: sum(similarity * rating) / sum(similarity).
weighted_average = (
    temp_optimal_users_rating['sum_weighted_rating']
    / temp_optimal_users_rating['sum_similarity_index']
)
recommendation_df = weighted_average.to_frame(name='recommendation score')
# Expose the anime_id index as a regular column as well.
recommendation_df['anime_id'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,recommendation score,anime_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8.438784,1
5,8.150061,5
6,8.25909,6
7,6.0,7
15,7.502201,15


In [None]:
recommendation_df = recommendation_df.sort_values(by='recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,recommendation score,anime_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
11441,10.0,11441
1639,10.0,1639
708,10.0,708
864,10.0,864
28851,10.0,28851
46,10.0,46
2294,10.0,2294
2269,10.0,2269
32792,10.0,32792
32281,9.801588,32281


In [None]:
anime_df.loc[anime_df['anime_id'].isin(recommendation_df.head(20)['anime_id'].tolist())]

Unnamed: 0,anime_id,name,rating
0,32281,Kimi no Na wa.,9.37
1,5114,Fullmetal Alchemist: Brotherhood,9.26
2,28977,Gintama°,9.25
3,9253,Steins;Gate,9.17
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,9.1
9,15417,Gintama&#039;: Enchousen,9.11
10,4181,Clannad: After Story,9.06
11,28851,Koe no Katachi,9.05
37,31757,Kizumonogatari II: Nekketsu-hen,8.73
38,19,Monster,8.72









### Model-Based Approach


In [None]:
#installing the surprise library
!pip install surprise



In [None]:
#importing the surprise library
from surprise import Reader, Dataset

In [None]:
ratings_df

Unnamed: 0,user_id,anime_id,rating
302,5,6,3
303,5,15,1
304,5,17,1
305,5,18,1
306,5,20,1
...,...,...,...
7813325,73507,7817,3
7813327,73507,8074,5
7813328,73507,8197,1
7813332,73507,8440,2


In [None]:
# Draw a random sample of 5,000 ratings. The fixed random_state makes the same rows come
# back on every run, so the sample is reproducible.
ratings_df_sample = ratings_df.sample(n = 5000, random_state=42)

In [None]:
ratings_df_sample

Unnamed: 0,user_id,anime_id,rating
4353340,41135,18247,1
1447045,13877,11887,5
3097528,28623,28677,3
7048913,65836,17833,2
1349040,12794,33028,4
...,...,...,...
6270505,58457,5681,4
3394894,31270,20,1
3252313,30026,10793,5
106919,1103,20689,1


In [None]:
ratings_df_sample.dtypes

user_id     int32
anime_id    int32
rating      int32
dtype: object

In [None]:
#setting the rating scale based on which surprise library will read the ratings
reader = Reader(rating_scale=(1, 5))

In [None]:
data = Dataset.load_from_df(ratings_df_sample[['user_id', 'anime_id', 'rating']], reader)

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Build a 5-fold splitter (scikit-learn KFold) and display its number of splits.
# NOTE(review): data_kfolds is not passed to any cross_validate call in the cells shown
# here, so it does not appear to influence the evaluations — confirm and remove if unused.
data_kfolds = KFold(n_splits=5)
data_kfolds.get_n_splits(data)

5

In [None]:
#importing the required models from surprise
from surprise import SVD
from surprise import NMF
from surprise import KNNBasic
from surprise.model_selection import cross_validate

In [None]:
#SVD model on anime dataset
algo = SVD()
cross_validate(algo, data, measures=['RMSE'])

{'fit_time': (0.30190134048461914,
  0.2888014316558838,
  0.288593053817749,
  0.29541611671447754,
  0.29166531562805176),
 'test_rmse': array([1.17185077, 1.17280163, 1.19718909, 1.17346375, 1.16647576]),
 'test_time': (0.007628679275512695,
  0.007535219192504883,
  0.010761737823486328,
  0.0078122615814208984,
  0.007178544998168945)}

In [None]:
#NMF model
algo_nmf = NMF()
cross_validate(algo_nmf, data, measures=['RMSE'])

{'fit_time': (0.715430498123169,
  0.7151010036468506,
  0.7221238613128662,
  0.7319111824035645,
  0.7120246887207031),
 'test_rmse': array([1.30101899, 1.29533914, 1.25096118, 1.29277511, 1.2749393 ]),
 'test_time': (0.00867605209350586,
  0.008176803588867188,
  0.00693202018737793,
  0.0072329044342041016,
  0.0077037811279296875)}

In [None]:
#KNNBasic model
algo_knn = KNNBasic()
cross_validate(algo_knn, data, measures=['RMSE'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'fit_time': (0.3378603458404541,
  0.30477309226989746,
  0.31087207794189453,
  0.455716609954834,
  0.29841184616088867),
 'test_rmse': array([1.19799833, 1.18269195, 1.22387459, 1.21480132, 1.21830251]),
 'test_time': (0.012763261795043945,
  0.01077580451965332,
  0.011993885040283203,
  0.012804746627807617,
  0.011966228485107422)}

In [None]:
from collections import defaultdict

#The function to return the Top n predictions
def get_top_n(predictions, n=20):
    """Return each user's n highest estimated items.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as (user id, item id, true rating, estimate, extra).
    n : int, default 20
        Maximum number of items kept per user.

    Returns
    -------
    defaultdict(list)
        Maps user id -> list of (item id, estimate) pairs, highest estimate
        first, truncated to n entries.
    """
    ranked_by_user = defaultdict(list)

    # Collect every (item, estimate) pair under the user it was predicted for.
    for uid, iid, _true_r, est, _details in predictions:
        ranked_by_user[uid].append((iid, est))

    # Order each user's pairs by estimate, best first, and keep the first n.
    for uid in ranked_by_user:
        best_first = sorted(ranked_by_user[uid], key=lambda pair: pair[1], reverse=True)
        ranked_by_user[uid] = best_first[:n]

    return ranked_by_user

In [None]:
#training the model
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f420d315090>

In [None]:
# Testing on the values not present in the trainset
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [None]:
#creating an empty dataframe in which predictions will be stored
prediction_df = pd.DataFrame()

In [None]:
user_ids = []
anime_ids = []

In [None]:
top_n = get_top_n(predictions, n=20)

# Rebuild both lists from scratch on every run. Appending to lists created in another
# cell would add a second copy of every user each time this cell is re-executed.
user_ids = list(top_n.keys())
anime_ids = [[iid for (iid, _) in user_ratings] for user_ratings in top_n.values()]

In [None]:
#Appending the user_ids and anime_ids as a column in the dataframe
prediction_df['user_id'] = user_ids
prediction_df['predictions'] = anime_ids

In [None]:
prediction_df.sort_values(by = 'user_id')

Unnamed: 0,user_id,predictions
1360,7,"[199, 431, 6880, 9253, 16498, 205, 6547, 5114,..."
2249,17,"[31043, 431, 199, 16067, 164, 2167, 2001, 1981..."
3240,43,"[199, 2167, 6746, 4059, 21939, 9253, 31043, 12..."
2940,46,"[199, 431, 16498, 2001, 6547, 4059, 164, 2904,..."
3250,123,"[6547, 199, 431, 164, 5114, 12189, 9253, 16498..."
...,...,...
3565,73422,"[6547, 19815, 31043, 431, 2904, 199, 31240, 50..."
1245,73457,"[6547, 2904, 431, 164, 16498, 19815, 22297, 20..."
1683,73499,"[2904, 6547, 9253, 199, 431, 31043, 164, 16498..."
1610,73502,"[6547, 2904, 199, 431, 164, 11597, 9253, 6880,..."


In [None]:
prediction_user = prediction_df[prediction_df['user_id'] == 123]

In [None]:
prediction_user

Unnamed: 0,user_id,predictions
3250,123,"[6547, 199, 431, 164, 5114, 12189, 9253, 16498..."


In [None]:
# Take the first row of prediction_user by position. Its index label is whatever position
# the user happened to get in prediction_df, so a hard-coded label breaks on a new run.
recommendations_list = prediction_user['predictions'].iloc[0]

In [None]:
type(recommendations_list)

list

In [None]:
recommendation_df = pd.DataFrame()

In [None]:
recommendation_df['anime_id'] = recommendations_list

In [None]:
anime_name = anime_df[anime_df['anime_id'].isin(recommendation_df['anime_id'].tolist())]

In [None]:
recommendation_for_user = pd.merge(anime_name,recommendation_df,on = 'anime_id') 

In [None]:
#recommendations generated
recommendation_for_user

Unnamed: 0,anime_id,name,rating
0,5114,Fullmetal Alchemist: Brotherhood,9.26
1,9253,Steins;Gate,9.17
2,11061,Hunter x Hunter (2011),9.13
3,4181,Clannad: After Story,9.06
4,2904,Code Geass: Hangyaku no Lelouch R2,8.98
5,199,Sen to Chihiro no Kamikakushi,8.93
6,1575,Code Geass: Hangyaku no Lelouch,8.83
7,164,Mononoke Hime,8.81
8,457,Mushishi,8.78
9,431,Howl no Ugoku Shiro,8.74


In [None]:
# The 20 highest-rated titles of user 123, built here so that this cell does not rely on a
# variable that is only defined further down the notebook (NameError on a fresh kernel).
user_anime_name_head = (
    ratings_df[ratings_df['user_id'] == 123]
    .merge(anime_df, on='anime_id')
    .sort_values(by='rating_x', ascending=False)
    .head(20)
)
# Recommended titles that are also among the user's own top-rated ones.
actually_watched_by_user = pd.merge(user_anime_name_head,recommendation_for_user,how='inner',on = 'anime_id')

In [None]:
actually_watched_by_user

Unnamed: 0,user_id,anime_id,rating_x,name_x,rating_y,name_y,rating
0,123,16498,5,Shingeki no Kyojin,8.54,Shingeki no Kyojin,8.54
1,123,457,5,Mushishi,8.78,Mushishi,8.78


In [None]:
user=ratings_df[ratings_df['user_id']==123]

In [None]:
user = user.sort_values(by='rating',ascending =False)

In [None]:
user.head(20)

Unnamed: 0,user_id,anime_id,rating
9629,123,5300,5
9880,123,22789,5
9618,123,4081,5
9865,123,21939,5
9854,123,21405,5
9849,123,21105,5
9702,123,10379,5
9840,123,20651,5
9834,123,19945,5
9832,123,19775,5


In [None]:
user_anime = anime_df[anime_df['anime_id'].isin(user['anime_id'].tolist())]

In [None]:
user_anime_name = pd.merge(user,anime_df,on = 'anime_id') 

In [None]:
# Order the merged rows by the user's own rating (rating_x), highest first.
user_anime_name = user_anime_name.sort_values(by='rating_x', ascending=False)

In [None]:
user_anime_name_head= user_anime_name.head(20)

In [None]:
user_anime_name_head

Unnamed: 0,user_id,anime_id,rating_x,name,rating_y
0,123,5300,5,Zoku Natsume Yuujinchou,8.64
29,123,24701,5,Mushishi Zoku Shou 2nd Season,8.88
21,123,13125,5,Shinsekai yori,8.53
22,123,12883,5,Tsuritama,7.84
23,123,12431,5,Uchuu Kyoudai,8.59
24,123,11665,5,Natsume Yuujinchou Shi,8.75
25,123,11113,5,Usagi Drop Specials,8.07
26,123,9989,5,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...,8.62
27,123,22583,5,Uchuu Kyoudai: Number Zero,7.22
30,123,30230,5,Diamond no Ace: Second Season,8.5
