In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

view_log_train = pd.read_csv('../data/view_log.csv')
article_info = pd.read_csv('../data/article_info.csv')
submission = pd.read_csv('../submission/sample_submission.csv')

In [3]:
view_log_train.head(100)

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US
...,...,...,...,...
95,USER_0005,ARTICLE_0564,MG,BR
96,USER_0005,ARTICLE_0931,MG,BR
97,USER_0005,ARTICLE_2223,MG,BR
98,USER_0005,ARTICLE_1577,MG,BR


In [10]:
article_info.head()

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,


In [11]:
# 사용자와 기사 ID 추출
users = view_log_train['userID'].unique()
articles = view_log_train['articleID'].unique()

In [23]:
# 사용자와 기사 ID를 인덱스로 변환
user_id_map = {user: idx for idx, user in enumerate(users)}
article_id_map = {article: idx for idx, article in enumerate(articles)}
reverse_article_id_map = {idx: article for article, idx in article_id_map.items()}

In [13]:
# 사용자-기사 상호작용 매트릭스 생성
num_users = len(users)
num_articles = len(articles)
interaction_matrix = np.zeros((num_users, num_articles))

In [14]:
for _, row in view_log_train.iterrows():
    user_idx = user_id_map[row['userID']]
    article_idx = article_id_map[row['articleID']]
    interaction_matrix[user_idx, article_idx] = 1

In [15]:
class InteractionDataset(Dataset):
    def __init__(self, interaction_matrix):
        self.interaction_matrix = interaction_matrix

    def __len__(self):
        return len(self.interaction_matrix)

    def __getitem__(self, idx):
        return torch.tensor(self.interaction_matrix[idx], dtype=torch.float32)

# 데이터셋 및 데이터로더
dataset = InteractionDataset(interaction_matrix)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)


In [16]:
# 오토인코더 모델 구축
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_dim, encoding_dim)
        self.decoder = nn.Linear(encoding_dim, input_dim)

    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = torch.sigmoid(self.decoder(encoded))
        return decoded

In [17]:
input_dim = num_articles
encoding_dim = 64
model = Autoencoder(input_dim, encoding_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
num_epochs = 10
for epoch in range(num_epochs):
    for data in dataloader:
        output = model(data)
        loss = criterion(output, data)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.6824
Epoch [2/10], Loss: 0.6541
Epoch [3/10], Loss: 0.6040
Epoch [4/10], Loss: 0.5400
Epoch [5/10], Loss: 0.4266
Epoch [6/10], Loss: 0.3910
Epoch [7/10], Loss: 0.3548
Epoch [8/10], Loss: 0.3156
Epoch [9/10], Loss: 0.3709
Epoch [10/10], Loss: 0.2588


In [19]:
# 특정 사용자의 아이템 선호도 예측
user_id = 'USER_0000'
user_idx = user_id_map[user_id]
user_input = torch.tensor(interaction_matrix[user_idx], dtype=torch.float32).unsqueeze(0)
predicted_preferences = model(user_input).detach().numpy().flatten()

In [24]:
# 상위 N개의 추천 아이템 선택
top_n = 10
recommended_articles_idx = np.argsort(predicted_preferences)[-top_n:][::-1]

# 추천 아이템 ID 출력
recommended_article_ids = [reverse_article_id_map[idx] for idx in recommended_articles_idx]
print(f"추천 아이템 ID: {recommended_article_ids}")

추천 아이템 ID: ['ARTICLE_2868', 'ARTICLE_1568', 'ARTICLE_2259', 'ARTICLE_2045', 'ARTICLE_2865', 'ARTICLE_0061', 'ARTICLE_2782', 'ARTICLE_1267', 'ARTICLE_0446', 'ARTICLE_2698']


In [13]:
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

submission['articleID'] = top_recommendations['articleID']

submission.to_csv('../submission/baseline_submission.csv', index=False)


In [14]:
submission

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0411
1,USER_0000,ARTICLE_0664
2,USER_0000,ARTICLE_1568
3,USER_0000,ARTICLE_1230
4,USER_0000,ARTICLE_2255
...,...,...
7070,USER_1420,ARTICLE_0030
7071,USER_1420,ARTICLE_0614
7072,USER_1420,ARTICLE_1901
7073,USER_1420,ARTICLE_0714
