In [76]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [39]:
score_df = pd.read_csv("../data/user-bookscore.csv", sep=',')

In [40]:
score_df

Unnamed: 0,user_id,book_id,score
0,2021001,193931483,8
1,2021001,234520993,3
2,2021001,193862021,2
3,2021001,176870,3
4,2021001,281019523,10
5,2021001,281761253,9
6,2021001,157472828,4
7,2021001,183884717,7
8,2021002,187043457,8
9,2021002,257288899,10


In [41]:
score_table = pd.pivot_table(score_df, values='score', index=['book_id'], columns=['user_id'], aggfunc=np.sum)

In [42]:
score_table

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
176870,3.0,9.0,,,9.0
13376773,,9.0,,,
29425554,,,6.0,,
105937256,,,7.0,,
157472828,4.0,,,3.0,
177439970,,,3.0,,
183884717,7.0,,10.0,,
187043457,,8.0,,9.0,3.0
193725370,,,8.0,,9.0
193862021,2.0,10.0,,,9.0


In [43]:
score_table = score_table.fillna(0)

In [44]:
score_table

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
176870,3.0,9.0,0.0,0.0,9.0
13376773,0.0,9.0,0.0,0.0,0.0
29425554,0.0,0.0,6.0,0.0,0.0
105937256,0.0,0.0,7.0,0.0,0.0
157472828,4.0,0.0,0.0,3.0,0.0
177439970,0.0,0.0,3.0,0.0,0.0
183884717,7.0,0.0,10.0,0.0,0.0
187043457,0.0,8.0,0.0,9.0,3.0
193725370,0.0,0.0,8.0,0.0,9.0
193862021,2.0,10.0,0.0,0.0,9.0


In [45]:
with open('../data/booklist_id.csv','r', encoding='UTF-8') as f:
    line = f.read()
booklist_id = line.split(',')

In [46]:
book_df = pd.DataFrame(index=booklist_id)
book_df.index.name = 'book_id'
book_df.columns.name = 'user_id'

In [47]:
book_df

user_id
book_id
193931483
194830103
247882118
280955238
279726923
...
169013713
443125
3599432
2361724


In [48]:
usboscore_df = book_df.join(score_table)

In [49]:
usboscore_df

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
193931483,,,,,
194830103,,,,,
247882118,,,,,
280955238,,,,,
279726923,,,,,
...,...,...,...,...,...
169013713,,,,,
443125,,,,,
3599432,,,,,
2361724,,,,,


In [51]:
usboscore_df=usboscore_df.fillna(0)

In [52]:
usboscore_df

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
193931483,0.0,0.0,0.0,0.0,0.0
194830103,0.0,0.0,0.0,0.0,0.0
247882118,0.0,0.0,0.0,0.0,0.0
280955238,0.0,0.0,0.0,0.0,0.0
279726923,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
169013713,0.0,0.0,0.0,0.0,0.0
443125,0.0,0.0,0.0,0.0,0.0
3599432,0.0,0.0,0.0,0.0,0.0
2361724,0.0,0.0,0.0,0.0,0.0


In [107]:
score_table_T = score_table.transpose()

In [108]:
user_sim = cosine_similarity(score_table_T, score_table_T)

# cosine_similarity()로 반환된 Numpy 행렬을 도서명으로 매핑해 DataFrame으로 변환
user_sim_df = pd.DataFrame(data=user_sim, index=score_table.columns,
                           columns=score_table.columns)
print(user_sim_df.shape)
print(user_sim_df)

(5, 5)
user_id   2021001   2021002   2021003   2021004   2021005
user_id                                                  
2021001  1.000000  0.342049  0.209329  0.554556  0.226091
2021002  0.342049  1.000000  0.000000  0.378314  0.779806
2021003  0.209329  0.000000  1.000000  0.023993  0.164602
2021004  0.554556  0.378314  0.023993  1.000000  0.258840
2021005  0.226091  0.779806  0.164602  0.258840  1.000000


In [67]:
# 예측 평점 계산식 함수 구현
def predict_score(score_arr, user_sim_arr):

    # dot : 내적을 이용한 가중합 계산
    score_pred = score_arr.dot(user_sim_arr) / np.array([np.abs(user_sim_arr).sum(axis=1)])
    return score_pred

score_pred = predict_score(score_table.values, user_sim_df.values)
score_pred_matrix = pd.DataFrame(data=score_pred, index=score_table.index, 
                                   columns = score_table.columns)

In [68]:
# 사용자가 평점을 부여한 도서에 대해서만 예측 성능 평가 MSE를 구한다.
def get_mse(pred, actual):
    # 평점이 있는 실제 도서만 추출
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

print('MSE : ', get_mse(score_pred, score_table.values ))

MSE :  5.554270404761636


In [69]:
# 특정 도서와 가장 비슷한 유사도를 가지는 도서에 대해서만 유사도 벡터를 적용하는 예측 평점 계산식 함수 구현
def predict_score_topsim(scores_arr, user_sim_arr, n=20):

    # 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred = np.zeros(scores_arr.shape)
    
    # 사용자-아이템 평점 행렬의 열 크기만큼 루프 수행.
    for col in range(scores_arr.shape[1]):
        
        # 유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 인덱스 반환
        top_n_user = [np.argsort(user_sim_arr[:, col])[:-n-1:-1]]
        
        # 개인화된 예측 평점을 계산
        for row in range(scores_arr.shape[0]):
            pred[row, col] = user_sim_arr[col, :][top_n_user].dot(scores_arr[row,
                                                                   :][top_n_user].T)
            pred[row, col] /= np.sum(np.abs(user_sim_arr[col, :][top_n_user]))
            
    return pred

In [70]:
score_pred = predict_score_topsim(score_table.values, user_sim_df.values, n=20)

print('아이템 기반 최근접 Top-20 이웃 MSE : ', get_mse(score_pred, score_table.values))

# 계산된 예측 평점 데이터를 DataFrame으로 변경
score_pred_table = pd.DataFrame(data=score_pred, index=score_table.index,
                                   columns=score_table.columns)

아이템 기반 최근접 Top-20 이웃 MSE :  5.554270404761636


  pred[row, col] = user_sim_arr[col, :][top_n_user].dot(scores_arr[row,
  pred[row, col] /= np.sum(np.abs(user_sim_arr[col, :][top_n_user]))


In [72]:
score_table

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
176870,3.0,9.0,0.0,0.0,9.0
13376773,0.0,9.0,0.0,0.0,0.0
29425554,0.0,0.0,6.0,0.0,0.0
105937256,0.0,0.0,7.0,0.0,0.0
157472828,4.0,0.0,0.0,3.0,0.0
177439970,0.0,0.0,3.0,0.0,0.0
183884717,7.0,0.0,10.0,0.0,0.0
187043457,0.0,8.0,0.0,9.0,3.0
193725370,0.0,0.0,8.0,0.0,9.0
193862021,2.0,10.0,0.0,0.0,9.0


In [73]:
user_rating_id = score_table.loc[:, 2021002]
user_rating_id[ user_rating_id > 0 ].sort_values(ascending=False)[:5]

book_id
257288899    10.0
193862021    10.0
216841680     9.0
13376773      9.0
176870        9.0
Name: 2021002, dtype: float64

In [None]:
def get_unseen_books(score_table, user_id):
    # user_id로 입력받은 사용자의 모든 도서 정보를 추출해 Series로 반환함.
    # 반환된 user_scores은 도서명(title)을 인덱스로 가지는 Series 객체임.
    user_score = score_table.loc[user_id, :]
    
    # user_score이 0보다 크면 기존에 읽은 도서. 대상 인덱스를 추출해 list 객체로 만듦.
    already_seen = user_score[ user_score>0 ].index.tolist()
    
    # 모든 도서명을 list 객체로 만듦.
    bobook_list = score_table.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 도서는 books_list에서 제외함.
    unseen_list = [ movie for book in bobook_list if book not in already_seen ]
    
    return unseen_list

In [93]:
user_sim

array([[1.        , 0.34204942, 0.209329  , 0.55455564, 0.22609074],
       [0.        , 0.34204942, 0.37831374, 0.77980572, 1.        ],
       [0.209329  , 0.        , 1.        , 0.02399258, 0.1646025 ],
       [0.55455564, 0.37831374, 0.02399258, 1.        , 0.25883967],
       [0.22609074, 0.77980572, 0.1646025 , 0.25883967, 1.        ]])

In [109]:
user_sim_df

user_id,2021001,2021002,2021003,2021004,2021005
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021001,1.0,0.342049,0.209329,0.554556,0.226091
2021002,0.342049,1.0,0.0,0.378314,0.779806
2021003,0.209329,0.0,1.0,0.023993,0.164602
2021004,0.554556,0.378314,0.023993,1.0,0.25884
2021005,0.226091,0.779806,0.164602,0.25884,1.0


In [123]:
score_table

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
176870,3.0,9.0,0.0,0.0,9.0
13376773,0.0,9.0,0.0,0.0,0.0
29425554,0.0,0.0,6.0,0.0,0.0
105937256,0.0,0.0,7.0,0.0,0.0
157472828,4.0,0.0,0.0,3.0,0.0
177439970,0.0,0.0,3.0,0.0,0.0
183884717,7.0,0.0,10.0,0.0,0.0
187043457,0.0,8.0,0.0,9.0,3.0
193725370,0.0,0.0,8.0,0.0,9.0
193862021,2.0,10.0,0.0,0.0,9.0


In [165]:
know_user = 2021002 # 추천 받고자 하는 user의 id

rec_list = pd.DataFrame(index=score_table.index)

for j in score_table.index:
    sum = 0.0
    count = 0
    for i in user_sim_df.index:
        u = user_sim_df.loc[i, know_user]
        b = score_table.loc[j, i]
        if b != 0:
            result = u * b
            sum+=result
            count+=1
    rec_list.loc[j,know_user] = sum / count

In [166]:
rec_list

Unnamed: 0_level_0,2021002
book_id,Unnamed: 1_level_1
176870,5.681467
13376773,9.0
29425554,0.0
105937256,0.0
157472828,1.251569
177439970,0.0
183884717,1.197173
187043457,4.581414
193725370,3.509126
193862021,5.900783


In [167]:
unread_filter = score_table[know_user] == 0

In [168]:
unread_filter

book_id
176870       False
13376773     False
29425554      True
105937256     True
157472828     True
177439970     True
183884717     True
187043457    False
193725370     True
193862021    False
193931483    False
210545323     True
216841680    False
218571649     True
234520993     True
246391275     True
257288899    False
281019523    False
281761253     True
Name: 2021002, dtype: bool

In [169]:
rec_list = rec_list.loc[unread_filter]

In [170]:
rec_list[know_user].sort_values(ascending=False)

book_id
193725370    3.509126
218571649    3.026510
281761253    2.554855
157472828    1.251569
183884717    1.197173
234520993    0.720363
246391275    0.000000
210545323    0.000000
177439970    0.000000
105937256    0.000000
29425554     0.000000
Name: 2021002, dtype: float64

In [171]:
score_table

user_id,2021001,2021002,2021003,2021004,2021005
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
176870,3.0,9.0,0.0,0.0,9.0
13376773,0.0,9.0,0.0,0.0,0.0
29425554,0.0,0.0,6.0,0.0,0.0
105937256,0.0,0.0,7.0,0.0,0.0
157472828,4.0,0.0,0.0,3.0,0.0
177439970,0.0,0.0,3.0,0.0,0.0
183884717,7.0,0.0,10.0,0.0,0.0
187043457,0.0,8.0,0.0,9.0,3.0
193725370,0.0,0.0,8.0,0.0,9.0
193862021,2.0,10.0,0.0,0.0,9.0
