In [1]:
import pandas as pd
import numpy as np
import random
import os
from scipy.spatial.distance import pdist, squareform
import scipy
from implicit.nearest_neighbours import bm25_weight
import implicit


def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value forwarded unchanged to ``random.seed`` and
            ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): the running interpreter has normally fixed its hash seed
    # at start-up, so this presumably only matters for processes launched
    # afterwards - confirm if hash determinism is required.
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)

# Fix the global seeds before any later cell runs.
seed_everything(42)

# Print the library versions so the recorded outputs can be tied to an environment.
version_report = (
    ('pandas version: ', pd),
    ('numpy version: ', np),
    ('scipy version: ', scipy),
    ('implicit version: ', implicit),
)
for label, module in version_report:
    print(label, module.__version__)

pandas version:  2.0.0
numpy version:  1.24.2
scipy version:  1.10.1
implicit version:  0.7.2


In [2]:
# Load the three input tables from the local `data/` directory.
# `view_log` must provide `userID` and `articleID` columns (the later cells
# group on them); `article_info` and `sample_submission` are loaded but not
# referenced by the cells visible in this notebook.
view_log, article_info, sample_submission = map(
    pd.read_csv,
    ('data/view_log.csv', 'data/article_info.csv', 'data/sample_submission.csv'),
)

In [3]:
# Convert the per-user view counts into BM25 weights.
# The users x articles count pivot is built once and reused for the labels
# below (it was previously recomputed three times).
view_counts = view_log.groupby(['userID', 'articleID']).size().unstack(fill_value=0)

# K1 and B are the BM25 hyper-parameters; how they were chosen is not shown here.
bm25_weights = bm25_weight(view_counts, K1=3.95, B=0.2)

# Labelled sparse frame of the weights (rows: userID, columns: articleID).
user_article_matrix = pd.DataFrame.sparse.from_spmatrix(bm25_weights)
user_article_matrix.index = view_counts.index
user_article_matrix.columns = view_counts.columns

# One dense copy shared by the distance computation and the score product,
# instead of densifying the sparse frame separately for each of them.
weights_dense = bm25_weights.toarray()

# Cosine similarity between users, derived from the BM25 weights.
# `pdist` returns condensed cosine distances; `squareform` expands them to a
# square matrix with a zero diagonal, so every user has similarity 1 to itself.
user_similarity = 1 - squareform(pdist(weights_dense, metric='cosine'))

# Recommendation score for every user/article pair: similarity-weighted sum
# of the other users' weights, normalised by each user's total absolute similarity.
similarity_mass = np.abs(user_similarity).sum(axis=1, keepdims=True)
user_predicted_scores = user_similarity.dot(weights_dense) / similarity_mass

# Rank ALL articles for every user (already-viewed articles are kept),
# highest score first. A row-wise argsort replaces the nested Python loop
# that appended one [user, article] row at a time.
ranked_positions = np.argsort(user_predicted_scores, axis=1)[:, ::-1]
user_labels = view_counts.index.to_numpy()
article_labels = view_counts.columns.to_numpy()

# Long format: each user repeated once per article, articles in ranked order.
sorted_recommendations_bm25 = pd.DataFrame({
    'userID': np.repeat(user_labels, len(article_labels)),
    'articleID': article_labels[ranked_positions].ravel(),
})

In [4]:
# Train a logistic matrix factorization model on the BM25-weighted view counts.
# The users x articles count pivot is built once and reused for the row and
# column labels (it was previously recomputed three times in this cell).
view_counts = view_log.groupby(['userID', 'articleID']).size().unstack(fill_value=0)
user_article_matrix = bm25_weight(view_counts, K1=3.95, B=0.2)
Xui_csr = scipy.sparse.csr_matrix(user_article_matrix)

# Row labels (users) and column labels (articles) of `Xui_csr`; the next cell
# uses them to map matrix positions back to IDs.
enu = view_counts.index
items = list(view_counts.columns)

# NOTE(review): the hyper-parameters below look like tuned values - their
# origin is not shown in this notebook. `random_state` pins the model's RNG.
model = implicit.cpu.lmf.LogisticMatrixFactorization(factors=300,
                                        learning_rate=0.2159180023813538,
                                        regularization=7.44635012137320653,
                                        iterations=250,
                                        neg_prop=100,
                                        random_state=42)

model.fit(Xui_csr)

  0%|          | 0/250 [00:00<?, ?it/s]

In [5]:
# Ask the trained model for a ranking of every article for every user.
# `filter_already_liked_items=False` keeps already-viewed articles in the list,
# and `N=len(items)` requests the full catalogue rather than a short top-N.
n_items = len(items)
recommendations = []
for user_pos, user_id in enumerate(enu):
    # The first element of the result holds item positions; they are mapped
    # back to article IDs through `items`. Presumably ordered best first -
    # the ensemble cell relies on that order when assigning ranks.
    ranked_item_positions = model.recommend(
        user_pos,
        Xui_csr[user_pos],
        N=n_items,
        filter_already_liked_items=False,
    )[0]
    recommendations.extend([user_id, items[pos]] for pos in ranked_item_positions)

sorted_recommendations_lmf = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

In [6]:
# Rank every user/article pair inside each model's list (1 = best), then
# recommend the 5 articles with the best (lowest) combined rank per user.
sorted_recommendations_bm25['rank'] = sorted_recommendations_bm25.groupby('userID').cumcount() + 1
sorted_recommendations_lmf['rank'] = sorted_recommendations_lmf.groupby('userID').cumcount() + 1

# After the merge, `rank_x` is the LMF rank (left frame) and `rank_y` the
# BM25 rank (right frame).
df = sorted_recommendations_lmf.merge(sorted_recommendations_bm25,
                                      how='inner',
                                      on=['userID', 'articleID'])
df['rank_total'] = df['rank_x'] + df['rank_y']

# Sorting on `rank_total` alone leaves the order of tied articles to an
# unstable sort, so the selected top 5 could differ between environments.
# Ties are broken explicitly by the LMF rank, which makes the order fully
# determined. One global sort followed by `groupby().head(5)` also replaces
# the slower per-group `apply`.
top5 = (
    df.sort_values(['userID', 'rank_total', 'rank_x'])
      .groupby('userID')
      .head(5)
      .reset_index(drop=True)
      .drop(columns=['rank_x', 'rank_y', 'rank_total'])
)
top5

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0411
1,USER_0000,ARTICLE_2834
2,USER_0000,ARTICLE_2255
3,USER_0000,ARTICLE_2316
4,USER_0000,ARTICLE_1345
...,...,...
7070,USER_1420,ARTICLE_0030
7071,USER_1420,ARTICLE_0614
7072,USER_1420,ARTICLE_0714
7073,USER_1420,ARTICLE_1732


In [7]:
# Export is currently disabled (commented out); uncommenting the line below
# writes `top5` to the submission CSV without the index column.
# top5.to_csv('submission/bm25_with_cosine_and_lmf_reproduce_with_42_with_pop_and_random.csv', index=False)