In [17]:
import pandas as pd

In [18]:
fileName = "rating"
mangas_df = pd.read_csv('../Collaborative Filtering/mangas.csv')
behaviors_df = pd.read_csv(f'../Collaborative Filtering/behaviors_{fileName}.csv')

In [19]:
behaviors_df[fileName].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  behaviors_df[fileName].fillna(0, inplace=True)


In [20]:
user_id_list = behaviors_df['userId'].unique()
manga_id_list = mangas_df['_id'].unique()
expanded_data = []

In [21]:
for user_id in user_id_list:
    for manga_id in manga_id_list:
        expanded_data.append({'userId': user_id, '_id': manga_id})

In [22]:
expanded_df = pd.DataFrame(expanded_data)


In [23]:
expanded_df.rename(columns={'_id': 'mangaId'}, inplace=True)
merged_df = pd.merge(expanded_df, behaviors_df, on=['userId', 'mangaId'], how='left')


In [24]:
merged_df[fileName] = merged_df[fileName].fillna(0)
merged_df.drop(columns=['updatedAt'], inplace=True)
merged_df

Unnamed: 0,userId,mangaId,rating
0,5f8923f3948be104b0830fd1,662a9e73d2a9173d53c01b9d,0.5
1,5f8923f3948be104b0830fd1,662a9e74d2a9173d53c01b9e,2.5
2,5f8923f3948be104b0830fd1,662a9e75d2a9173d53c01b9f,0.0
3,5f8923f3948be104b0830fd1,662a9e76d2a9173d53c01ba0,0.0
4,5f8923f3948be104b0830fd1,662a9e77d2a9173d53c01ba1,0.0
...,...,...,...
745,5f892410948be104b0830fee,662a9e87d2a9173d53c01bb1,0.0
746,5f892410948be104b0830fee,662a9e88d2a9173d53c01bb2,0.0
747,5f892410948be104b0830fee,662a9e89d2a9173d53c01bb3,0.0
748,5f892410948be104b0830fee,662a9e8ad2a9173d53c01bb4,0.0


In [25]:
mangas_df.rename(columns={'_id': 'mangaId'}, inplace=True)
ratings = pd.merge(mangas_df, merged_df, on='mangaId').drop(['author','genre'],axis=1)

ratings.head()

Unnamed: 0,mangaId,name,userId,rating
0,662a9e73d2a9173d53c01b9d,One Piece,5f8923f3948be104b0830fd1,0.5
1,662a9e73d2a9173d53c01b9d,One Piece,5f8923f4948be104b0830fd2,0.0
2,662a9e73d2a9173d53c01b9d,One Piece,5f8923f5948be104b0830fd3,0.0
3,662a9e73d2a9173d53c01b9d,One Piece,5f8923f6948be104b0830fd4,0.0
4,662a9e73d2a9173d53c01b9d,One Piece,5f8923f7948be104b0830fd5,0.0


In [26]:
userRatings = ratings.pivot_table(index=['userId'],columns=['mangaId'],values='rating')
userRatings.head()
print("Before: ",userRatings.shape)
userRatings = userRatings.dropna(thresh=10, axis=1).fillna(0,axis=1)
#userRatings.fillna(0, inplace=True)
print("After: ",userRatings.shape)

Before:  (30, 25)
After:  (30, 25)


In [27]:
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head(100)

mangaId,662a9e73d2a9173d53c01b9d,662a9e74d2a9173d53c01b9e,662a9e75d2a9173d53c01b9f,662a9e76d2a9173d53c01ba0,662a9e77d2a9173d53c01ba1,662a9e78d2a9173d53c01ba2,662a9e79d2a9173d53c01ba3,662a9e7ad2a9173d53c01ba4,662a9e7bd2a9173d53c01ba5,662a9e7cd2a9173d53c01ba6,...,662a9e82d2a9173d53c01bac,662a9e83d2a9173d53c01bad,662a9e84d2a9173d53c01bae,662a9e85d2a9173d53c01baf,662a9e86d2a9173d53c01bb0,662a9e87d2a9173d53c01bb1,662a9e88d2a9173d53c01bb2,662a9e89d2a9173d53c01bb3,662a9e8ad2a9173d53c01bb4,662a9e8bd2a9173d53c01bb5
mangaId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
662a9e73d2a9173d53c01b9d,1.0,-0.021733,-0.094235,-0.07568,-0.062144,-0.082915,-0.053698,-0.090735,-0.088541,-0.077285,...,-0.07568,-0.077285,-0.076779,-0.077285,,-0.077041,-0.053698,-0.077285,-0.077285,0.689818
662a9e74d2a9173d53c01b9e,-0.021733,1.0,0.653251,-0.070538,-0.057922,-0.077282,-0.05005,-0.08457,-0.082525,-0.072034,...,-0.070538,-0.072034,-0.071562,-0.072034,,-0.071807,-0.05005,-0.072034,-0.072034,-0.05005
662a9e75d2a9173d53c01b9f,-0.094235,0.653251,1.0,-0.085285,-0.070031,-0.093439,-0.060514,-0.102251,-0.099779,-0.087094,...,-0.085285,-0.087094,-0.086523,-0.087094,,-0.086819,-0.060514,-0.087094,-0.087094,-0.060514
662a9e76d2a9173d53c01ba0,-0.07568,-0.070538,-0.085285,1.0,0.040173,-0.075041,-0.048599,-0.082118,-0.080133,-0.069945,...,-0.068493,-0.069945,-0.069487,-0.069945,,-0.069725,-0.048599,-0.069945,-0.069945,-0.048599
662a9e77d2a9173d53c01ba1,-0.062144,-0.057922,-0.070031,0.040173,1.0,0.051559,-0.039906,-0.067431,-0.0658,-0.057435,...,-0.056242,-0.057435,-0.057059,-0.057435,,-0.057254,-0.039906,-0.057435,-0.057435,-0.039906
662a9e78d2a9173d53c01ba2,-0.082915,-0.077282,-0.093439,-0.075041,0.051559,1.0,-0.053245,-0.089969,-0.087793,-0.076632,...,-0.075041,-0.076632,-0.07613,-0.076632,,-0.07639,-0.053245,-0.076632,-0.076632,-0.053245
662a9e79d2a9173d53c01ba3,-0.053698,-0.05005,-0.060514,-0.048599,-0.039906,-0.053245,1.0,0.25955,-0.056857,-0.049629,...,-0.048599,-0.049629,-0.049304,-0.049629,,-0.049473,-0.034483,-0.049629,-0.049629,-0.034483
662a9e7ad2a9173d53c01ba4,-0.090735,-0.08457,-0.102251,-0.082118,-0.067431,-0.089969,0.25955,1.0,0.864659,-0.08386,...,-0.082118,-0.08386,-0.08331,-0.08386,,-0.083595,-0.058266,-0.08386,-0.08386,-0.058266
662a9e7bd2a9173d53c01ba5,-0.088541,-0.082525,-0.099779,-0.080133,-0.0658,-0.087793,-0.056857,0.864659,1.0,0.190941,...,-0.080133,-0.081832,-0.081296,-0.081832,,-0.081574,-0.056857,-0.081832,-0.081832,-0.056857
662a9e7cd2a9173d53c01ba6,-0.077285,-0.072034,-0.087094,-0.069945,-0.057435,-0.076632,-0.049629,-0.08386,0.190941,1.0,...,-0.069945,-0.071429,-0.070961,-0.071429,,-0.071203,-0.049629,-0.071429,-0.071429,-0.049629


In [28]:
def get_similar(manga_Id,rating):
    similar_ratings = corrMatrix[manga_Id]*(rating-2.5)
    similar_ratings = similar_ratings.sort_values(ascending=False)
    #print(type(similar_ratings))
    return similar_ratings

In [29]:
test = [("662a9e8bd2a9173d53c01bb5",5),("662a9e86d2a9173d53c01bb0",3),("662a9e8bd2a9173d53c01bb5",1),("662a9e7ed2a9173d53c01ba8",2)]
# similar_movies = pd.DataFrame()
# similar_movies_list = []
# for manga, rating in test:
#     similar_movies_list.append(get_similar(manga, rating))
# similar_movies = pd.concat(similar_movies_list, axis=0).reset_index()
# similar_movies.dropna()

similar_movies_list = []
for manga, rating in test:
    similar_movies_list.append(get_similar(manga, rating))
similar_movies = pd.concat(similar_movies_list, axis=1).sum(axis=1).reset_index()
similar_movies.columns = ['mangaId', 'score']
similar_movies_cleaned = similar_movies.dropna()
similar_movies_cleaned

Unnamed: 0,mangaId,score
0,662a9e8bd2a9173d53c01bb5,1.022038
1,662a9e73d2a9173d53c01b9d,0.724136
2,662a9e7fd2a9173d53c01ba9,-0.012445
3,662a9e79d2a9173d53c01ba3,-0.012445
4,662a9e80d2a9173d53c01baa,-0.012445
5,662a9e88d2a9173d53c01bb2,-0.012445
6,662a9e77d2a9173d53c01ba1,-0.014402
7,662a9e7ed2a9173d53c01ba8,-0.544076
8,662a9e7dd2a9173d53c01ba7,-0.541643
9,662a9e82d2a9173d53c01bac,-0.017539


In [30]:
result = similar_movies.query("mangaId == '662a9e8bd2a9173d53c01bb5'")

In [31]:
similar_movies.sort_values(by='score',ascending=False)

Unnamed: 0,mangaId,score
0,662a9e8bd2a9173d53c01bb5,1.022038
1,662a9e73d2a9173d53c01b9d,0.724136
24,662a9e86d2a9173d53c01bb0,0.0
3,662a9e79d2a9173d53c01ba3,-0.012445
2,662a9e7fd2a9173d53c01ba9,-0.012445
5,662a9e88d2a9173d53c01bb2,-0.012445
4,662a9e80d2a9173d53c01baa,-0.012445
6,662a9e77d2a9173d53c01ba1,-0.014402
9,662a9e82d2a9173d53c01bac,-0.017539
10,662a9e76d2a9173d53c01ba0,-0.017539


In [32]:
test = [("662a9e7dd2a9173d53c01ba7",5)]
# similar_movies = pd.DataFrame()
# similar_movies_list = []
# for manga, rating in test:
#     similar_movies_list.append(get_similar(manga, rating))
# similar_movies = pd.concat(similar_movies_list, axis=0).reset_index()
# similar_movies.dropna()

similar_movies_list = []
for manga, rating in test:
    similar_movies_list.append(get_similar(manga, rating))
similar_movies = pd.concat(similar_movies_list, axis=1).sum(axis=1).reset_index()
similar_movies.columns = ['mangaId', 'score']
similar_movies.dropna()
similar_movies.sort_values(by='score',ascending=False)

Unnamed: 0,mangaId,score
0,662a9e7dd2a9173d53c01ba7,2.5
1,662a9e7ed2a9173d53c01ba8,2.47364
2,662a9e7cd2a9173d53c01ba6,0.39388
24,662a9e86d2a9173d53c01bb0,0.0
3,662a9e80d2a9173d53c01baa,-0.117288
4,662a9e7fd2a9173d53c01ba9,-0.117288
5,662a9e79d2a9173d53c01ba3,-0.117288
6,662a9e88d2a9173d53c01bb2,-0.117288
7,662a9e8bd2a9173d53c01bb5,-0.117288
8,662a9e77d2a9173d53c01ba1,-0.135735
