In [52]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Load the raw inputs; all paths are relative to the notebook's directory.
article_info = pd.read_csv('../data/article_info.csv')
submission = pd.read_csv('../submission/sample_submission.csv')
# The view log is bound to two names: `view_log_train` and `view_log`
# refer to the same DataFrame object.
view_log = view_log_train = pd.read_csv('../data/view_log.csv')

In [53]:
# Preview the first 100 rows of the view log.
view_log_train.head(100)

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US
...,...,...,...,...
95,USER_0005,ARTICLE_0564,MG,BR
96,USER_0005,ARTICLE_0931,MG,BR
97,USER_0005,ARTICLE_2223,MG,BR
98,USER_0005,ARTICLE_1577,MG,BR


In [55]:
# TF-IDF vectorization of the article text, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['Content'])
# Map each articleID to the row position of that article in `tfidf_matrix`.
# Row i of the matrix corresponds to the i-th row of `article_info`, so the
# positions are stored explicitly instead of relying on the frame's index labels.
indices = pd.Series(range(len(article_info)), index=article_info['articleID'])
# Series.drop_duplicates() compares *values* (the row positions, which are
# always unique), so it never removed repeated article IDs. Deduplicate on the
# index labels instead, keeping the first occurrence, so that
# `indices[article_id]` always yields a single integer.
indices = indices[~indices.index.duplicated(keep='first')]


In [70]:
# Build the user profile vector
def create_user_profile(user_id):
    """Return a user's content profile as the mean TF-IDF vector of viewed articles.

    Reads the module-level ``view_log`` (user/article interactions), ``indices``
    (articleID -> row position in ``tfidf_matrix``) and ``tfidf_matrix``.

    Parameters
    ----------
    user_id :
        Value matched against ``view_log['userID']``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``tfidf_matrix.shape[1]``. An all-zero vector is
        returned when the user has no viewed article present in ``indices``.
    """
    viewed_articles = view_log.loc[view_log['userID'] == user_id, 'articleID']
    # Articles that are missing from the article table are skipped.
    row_positions = [indices[article] for article in viewed_articles if article in indices]

    if not row_positions:
        # Keep the result 1-D, like the non-empty branch below. The previous
        # (1, n_features) shape became 3-D once the caller wrapped it in a list,
        # which made the similarity computation fail for such users.
        return np.zeros(tfidf_matrix.shape[1])

    user_tfidf_matrix = tfidf_matrix[row_positions]
    # The mean over sparse rows comes back 2-D; flatten it to a plain vector.
    return np.asarray(user_tfidf_matrix.mean(axis=0)).flatten()

In [71]:
# Generate recommendations for a specific user
def get_user_recommendations(user_id, n_recommendations=5):
    """Return the article IDs whose TF-IDF vectors best match the user's profile.

    Scores are dot products (``linear_kernel``) between the profile from
    ``create_user_profile`` and every row of the module-level ``tfidf_matrix``;
    the highest-scoring rows are looked up positionally in
    ``article_info['articleID']``.

    Parameters
    ----------
    user_id :
        Identifier passed through to ``create_user_profile``.
    n_recommendations : int, default 5
        Maximum number of article IDs to return.

    Returns
    -------
    list
        Article IDs ordered from most to least similar. Articles with equal
        scores keep their original row order.
    """
    # Force a (1, n_features) row so that both a 1-D profile and a
    # (1, n_features) profile are accepted by the kernel.
    user_profile = np.asarray(create_user_profile(user_id)).reshape(1, -1)

    similarities = linear_kernel(user_profile, tfidf_matrix).flatten()

    # Rank articles by descending score; the stable sort preserves row order
    # among ties, matching sorted(..., reverse=True) on (index, score) pairs.
    ranked_positions = np.argsort(-similarities, kind='stable')

    # Keep the top n articles
    top_article_indices = ranked_positions[:n_recommendations]
    return article_info['articleID'].iloc[top_article_indices].tolist()

In [72]:
import numpy as np
# Generate recommendations for every user in the view log, stored as flat
# [userID, articleID] pairs (five articles requested per user).
recommendations = [
    [user, article]
    for user in view_log['userID'].unique()
    for article in get_user_recommendations(user, n_recommendations=5)
]

In [76]:
# Convert the recommendation pairs into a two-column DataFrame
top_recommendations = pd.DataFrame(
    data=recommendations,
    columns=['userID', 'articleID'],
)

In [77]:
# Display the recommendation table.
top_recommendations

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_2255
1,USER_0000,ARTICLE_0411
2,USER_0000,ARTICLE_1270
3,USER_0000,ARTICLE_2081
4,USER_0000,ARTICLE_1809
...,...,...
7070,USER_1420,ARTICLE_0030
7071,USER_1420,ARTICLE_2622
7072,USER_1420,ARTICLE_1848
7073,USER_1420,ARTICLE_0614


In [78]:
# Write the recommendations to the submission file, without the row index.
top_recommendations.to_csv(
    path_or_buf='../submission/tf_idf_baseline_2.csv',
    index=False,
)