In [277]:
%pip install -q keras-nightly
%pip install scipy
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [278]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [279]:
fileName = "isFollow"
mangas_df = pd.read_csv('../Collaborative Filtering/mangas.csv')
behaviors_df = pd.read_csv(f'../Collaborative Filtering/behaviors_{fileName}.csv')

In [280]:
if fileName == "isFollow":
    behaviors_df['isFollow'] = behaviors_df['isFollow'].replace({True: 1, False: 0}).infer_objects(copy=False)

  behaviors_df['isFollow'] = behaviors_df['isFollow'].replace({True: 1, False: 0}).infer_objects(copy=False)


In [281]:
behaviors_df[fileName].fillna(0, inplace=True)
user_id_list = behaviors_df['userId'].unique()
manga_id_list = mangas_df['_id'].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  behaviors_df[fileName].fillna(0, inplace=True)


In [282]:
expanded_data = [{'userId': user_id, '_id': manga_id} for user_id in user_id_list for manga_id in manga_id_list]
expanded_df = pd.DataFrame(expanded_data)
expanded_df.rename(columns={'_id': 'mangaId'}, inplace=True)

In [283]:
merged_df = pd.merge(expanded_df, behaviors_df, on=['userId', 'mangaId'], how='left')
merged_df[fileName] = merged_df[fileName].fillna(0)
merged_df.drop(columns=['updatedAt'], inplace=True)

In [284]:
mangas_df.rename(columns={'_id': 'mangaId'}, inplace=True)
datas = pd.merge(mangas_df, merged_df, on='mangaId').drop(['author','genre'],axis=1)

userDatas = datas.pivot_table(index=['userId'],columns=['mangaId'],values=fileName)

In [285]:
def standardize(row):
    return (row - row.mean()) / (row.max() - row.min())
df_std = userDatas.apply(standardize).T.fillna(0)

In [286]:
sparse_df = sparse.csr_matrix(df_std.values)
cosine_sim_matrix = pd.DataFrame(cosine_similarity(sparse_df), index=userDatas.columns, columns=userDatas.columns)

corrMatrix = userDatas.corr(method='pearson')

In [287]:
def get_recommendations(user_id, userViews, corrMatrix, num_recommendations=36):
    user_data = userViews.loc[user_id]
    watched_items = user_data[user_data > 0].index.tolist()

    recommendations = {}
    for item in watched_items:
        similar_items = corrMatrix[item].drop(item).sort_values(ascending=False)
        for similar_item, score in similar_items.items():
            if np.isnan(score):  # Kiểm tra và loại bỏ giá trị NaN
                continue
            if similar_item in recommendations:
                recommendations[similar_item] += score
            else:
                recommendations[similar_item] = score

    for item in watched_items:
        if item in recommendations:
            recommendations[item] += 1.0
        else:
            recommendations[item] = 1.0 

    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    return sorted_recommendations[:num_recommendations]

In [288]:
user_id = '5f892400948be104b0830fde'
recommendations = get_recommendations(user_id, userDatas, corrMatrix)
recommendations


[('662a9e75d2a9173d53c01b9f', 1.0),
 ('662a9e73d2a9173d53c01b9d', -0.06189844605901726),
 ('662a9e83d2a9173d53c01bad', -0.06189844605901729),
 ('662a9e8ad2a9173d53c01bb4', -0.06189844605901729),
 ('662a9e86d2a9173d53c01bb0', -0.061898446059017294),
 ('662a9e80d2a9173d53c01baa', -0.061898446059017294),
 ('662a9e7cd2a9173d53c01ba6', -0.0618984460590173),
 ('662a9e7ed2a9173d53c01ba8', -0.0618984460590173),
 ('662a9e81d2a9173d53c01bab', -0.061898446059017315),
 ('662a9e84d2a9173d53c01bae', -0.061898446059017315),
 ('662a9e87d2a9173d53c01bb1', -0.06189844605901733),
 ('662a9e8bd2a9173d53c01bb5', -0.061898446059017335),
 ('662a9e7ad2a9173d53c01ba4', -0.08908708063747481),
 ('662a9e7bd2a9173d53c01ba5', -0.08908708063747482),
 ('662a9e7dd2a9173d53c01ba7', -0.08908708063747482),
 ('662a9e7fd2a9173d53c01ba9', -0.08908708063747482),
 ('662a9e76d2a9173d53c01ba0', -0.08908708063747482),
 ('662a9e85d2a9173d53c01baf', -0.08908708063747482),
 ('662a9e79d2a9173d53c01ba3', -0.08908708063747482),
 ('662a

In [289]:
filtered_recommendations = [item_id for item_id, score in recommendations if not np.isnan(score)]
filtered_recommendations

['662a9e75d2a9173d53c01b9f',
 '662a9e73d2a9173d53c01b9d',
 '662a9e83d2a9173d53c01bad',
 '662a9e8ad2a9173d53c01bb4',
 '662a9e86d2a9173d53c01bb0',
 '662a9e80d2a9173d53c01baa',
 '662a9e7cd2a9173d53c01ba6',
 '662a9e7ed2a9173d53c01ba8',
 '662a9e81d2a9173d53c01bab',
 '662a9e84d2a9173d53c01bae',
 '662a9e87d2a9173d53c01bb1',
 '662a9e8bd2a9173d53c01bb5',
 '662a9e7ad2a9173d53c01ba4',
 '662a9e7bd2a9173d53c01ba5',
 '662a9e7dd2a9173d53c01ba7',
 '662a9e7fd2a9173d53c01ba9',
 '662a9e76d2a9173d53c01ba0',
 '662a9e85d2a9173d53c01baf',
 '662a9e79d2a9173d53c01ba3',
 '662a9e89d2a9173d53c01bb3',
 '662a9e77d2a9173d53c01ba1']