In [52]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Load the raw inputs: the user/article view log, the article info table and
# the sample submission file.
view_log_train = pd.read_csv('../data/view_log.csv')
article_info = pd.read_csv('../data/article_info.csv')
# NOTE(review): `submission` is reassigned before the results are saved, so the
# sample loaded here is not used by the visible cells.
submission = pd.read_csv('../submission/sample_submission.csv')

In [53]:
# Preview the first 100 rows of the view log.
view_log_train.head(100)

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US
...,...,...,...,...
95,USER_0005,ARTICLE_0564,MG,BR
96,USER_0005,ARTICLE_0931,MG,BR
97,USER_0005,ARTICLE_2223,MG,BR
98,USER_0005,ARTICLE_1577,MG,BR


In [54]:
# Keep only the interaction columns. A new frame is built instead of dropping
# in place, which keeps this cell idempotent: re-running it no longer raises
# KeyError, because the raw `view_log_train` still holds the dropped columns.
df = view_log_train.drop(columns=['userRegion', 'userCountry'])

In [55]:
# Encode users and articles as integer codes (one code per distinct ID value,
# assigned over the sorted set of IDs).
df['user_id'] = pd.Categorical(df['userID']).codes
df['article_id'] = pd.Categorical(df['articleID']).codes

In [56]:
# Split the interactions into train (80%) and test (20%) sets with a fixed seed.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [57]:
# PyTorch dataset over the interaction log
class InteractionDataset(Dataset):
    """Expose a user/article interaction frame as tensors.

    Every row of ``df`` becomes one sample ``(user, article, label)``. The
    label is always ``1.0``; no negative samples are generated here.

    Args:
        df: frame holding integer ``user_id`` and ``article_id`` columns.
    """

    def __init__(self, df):
        def as_index_tensor(column):
            # Copy one integer column into an int64 tensor.
            return torch.tensor(df[column].values, dtype=torch.long)

        self.users = as_index_tensor('user_id')
        self.articles = as_index_tensor('article_id')
        # Each logged view counts as a positive example.
        self.labels = torch.ones(len(df), dtype=torch.float32)

    def __len__(self):
        """Return the number of interactions."""
        return self.users.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user, article, label)`` triple stored at ``idx``."""
        user, article, label = self.users[idx], self.articles[idx], self.labels[idx]
        return user, article, label

In [58]:
# Wrap each split in a dataset and a mini-batch loader (64 samples per batch).
# Only the training loader shuffles; the held-out split keeps its order.
train_dataset = InteractionDataset(train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)

test_dataset = InteractionDataset(test)
val_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In [69]:
# BERT4Rec-style model definition
class BERT4Rec(nn.Module):
    """Score sequences of (user, article) indices with a Transformer.

    User, article and position embeddings are summed, passed through an
    ``nn.Transformer`` (the same tensor serves as source and target), averaged
    over the sequence axis and mapped to a single sigmoid score per sample.
    """

    def __init__(self, num_users, num_articles, embedding_size, num_layers, num_heads, dropout_rate, max_seq_length):
        super().__init__()
        self.max_seq_length = max_seq_length
        # Lookup tables for users, articles and sequence positions.
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.article_embedding = nn.Embedding(num_articles, embedding_size)
        self.positional_embedding = nn.Embedding(max_seq_length, embedding_size)
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dropout=dropout_rate,
        )
        self.dropout = nn.Dropout(dropout_rate)
        # Projects the pooled representation to one logit.
        self.fc = nn.Linear(embedding_size, 1)

    def forward(self, user_ids, article_ids):
        """Return one sigmoid score per row of the index tensors.

        Both arguments are index tensors of identical shape; the visible
        training loop passes them as ``(batch, 1)``.
        """
        article_emb = self.article_embedding(article_ids)
        # Positions 0..seq_len-1, repeated for every row of the batch.
        positions = torch.arange(article_emb.size(1), device=article_emb.device)
        position_ids = positions.unsqueeze(0).expand_as(article_ids)

        summed = self.user_embedding(user_ids) + article_emb + self.positional_embedding(position_ids)
        hidden = self.dropout(summed)

        # nn.Transformer is used in its default sequence-first layout here,
        # so swap the batch and sequence axes going in and coming out.
        seq_first = hidden.transpose(0, 1)
        decoded = self.transformer(seq_first, seq_first).transpose(0, 1)

        pooled = decoded.mean(dim=1)
        return torch.sigmoid(self.fc(pooled))

# Hyperparameters
num_users = df['user_id'].nunique()  # number of distinct user codes
num_articles = df['article_id'].nunique()  # number of distinct article codes
embedding_size = 50  # width of every embedding and of the Transformer (d_model)
num_layers = 2  # used for both the encoder and the decoder stack
num_heads = 2
dropout_rate = 0.1
max_seq_length = 10  # rows in the positional embedding table

In [91]:

# Seed PyTorch's global RNG before building the model so that weight
# initialisation, dropout masks and DataLoader shuffling are repeatable across
# "Restart & Run All"; 42 matches the seed used for the train/test split.
torch.manual_seed(42)

model = BERT4Rec(num_users, num_articles, embedding_size, num_layers, num_heads, dropout_rate, max_seq_length)
# The model's forward pass already ends in a sigmoid, so plain BCE is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)



In [97]:
# Model training
def _epoch_loss(model, criterion, batches, optimizer=None):
    """Run one pass over ``batches`` and return the sample-weighted mean loss.

    When ``optimizer`` is given, the pass also back-propagates and updates the
    weights; otherwise it only evaluates. Returns ``nan`` if no sample is seen.
    """
    total_loss = 0.0
    n_samples = 0
    for users, articles, labels in batches:
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(users.unsqueeze(1), articles.unsqueeze(1))
        # reshape(-1) rather than squeeze(): a final batch holding a single
        # sample must stay 1-D, otherwise its shape no longer matches `labels`
        # and BCELoss raises ValueError.
        loss = criterion(outputs.reshape(-1), labels)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        batch_size = users.size(0)
        total_loss += loss.item() * batch_size
        n_samples += batch_size
    return total_loss / n_samples if n_samples else float('nan')


def train_model(model, criterion, optimizer, num_epochs=5, train_data=None, val_data=None):
    """Train ``model`` and print the training / validation loss of each epoch.

    Args:
        model: module called as ``model(users, articles)`` with ``(batch, 1)``
            index tensors; expected to produce one score per sample.
        criterion: loss called as ``criterion(scores, labels)``.
        optimizer: optimizer stepping the parameters of ``model``.
        num_epochs: number of passes over the training batches.
        train_data: iterable of ``(users, articles, labels)`` batches; defaults
            to the notebook-level ``train_loader``.
        val_data: same layout, used for validation; defaults to the
            notebook-level ``val_loader``.

    The model is left in evaluation mode when the function returns.
    """
    if train_data is None:
        train_data = train_loader
    if val_data is None:
        val_data = val_loader

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        epoch_loss = _epoch_loss(model, criterion, train_data, optimizer)

        # Validation phase (no gradients, dropout disabled)
        model.eval()
        with torch.no_grad():
            val_loss = _epoch_loss(model, criterion, val_data)

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

# Train for 10 epochs.
# NOTE(review): every label produced by InteractionDataset is 1, so a loss near
# zero is reachable by always predicting 1 and says little about ranking quality.
train_model(model, criterion, optimizer, num_epochs=10)

Epoch 1/10, Loss: 0.0001, Validation Loss: 0.0000
Epoch 2/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 3/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 4/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 5/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 6/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 7/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 8/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 9/10, Loss: 0.0000, Validation Loss: 0.0000
Epoch 10/10, Loss: 0.0000, Validation Loss: 0.0000


In [98]:

# Generate recommendations for a specific user
def recommend_articles(user_id, num_recommendations=5):
    """Return the best-scored article IDs for one user.

    Every article index in ``range(num_articles)`` is scored for this user
    with the notebook-level ``model``; the highest-scoring indices are mapped
    back to their ``articleID`` values, best first.

    Args:
        user_id: raw ``userID`` value as stored in ``df`` (e.g. ``'USER_0001'``).
        num_recommendations: how many articles to return; values <= 0 give ``[]``.

    Returns:
        list of ``articleID`` values ordered by descending score.

    Raises:
        IndexError: if ``user_id`` does not occur in ``df``.
    """
    if num_recommendations <= 0:
        # A slice of the form [-0:] would select *every* article.
        return []

    user_rows = df.loc[df['userID'] == user_id, 'user_id']
    if user_rows.empty:
        raise IndexError(f"unknown userID: {user_id!r}")
    user_index = int(user_rows.iloc[0])

    article_indices = torch.arange(num_articles, dtype=torch.long)
    user_indices = torch.full((num_articles,), user_index, dtype=torch.long)

    # Score with dropout disabled, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores = model(user_indices.unsqueeze(1), article_indices.unsqueeze(1))
    finally:
        model.train(was_training)
    # reshape(-1) keeps the result 1-D even when there is a single article.
    predictions = scores.reshape(-1).numpy()

    top_articles = predictions.argsort()[-num_recommendations:][::-1]

    # Same category ordering that produced the `article_id` codes.
    article_categories = df['articleID'].astype('category').cat.categories
    return article_categories[top_articles].tolist()

# Recommend articles for an example user and print the resulting list
recommended_articles = recommend_articles('USER_0001', 5)
print(f"추천 기사: {recommended_articles}")

추천 기사: ['ARTICLE_3007', 'ARTICLE_0985', 'ARTICLE_1006', 'ARTICLE_1005', 'ARTICLE_1004']


In [88]:
# Build recommendations for every user as [userID, articleID] rows
unique_users = df['userID'].unique()
recommendations = []

for user in unique_users:
    recommended_articles = recommend_articles(user, 5)
    # One output row per recommended article.
    recommendations.extend([user, article] for article in recommended_articles)

In [89]:
# Turn the recommendation rows into a DataFrame
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# Build the submission frame from the two recommendation columns
submission = top_recommendations[['userID', 'articleID']].copy()

# Save to disk (without the index column)
submission.to_csv('../submission/SASRec3.csv', index=False)

print(submission.head())

      userID     articleID
0  USER_0000  ARTICLE_2202
1  USER_0000  ARTICLE_0437
2  USER_0000  ARTICLE_1746
3  USER_0000  ARTICLE_2409
4  USER_0000  ARTICLE_2837
