In [69]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw inputs (paths are relative to the notebook's directory):
# the user view log, the article metadata, and the sample submission file.
view_log_train = pd.read_csv('../data/view_log.csv')
article_info = pd.read_csv('../data/article_info.csv')
# NOTE(review): `submission` is rebuilt from scratch in the submission cell
# near the end of the notebook, so this frame is only useful for inspection.
submission = pd.read_csv('../submission/sample_submission.csv')

In [70]:
# Number of rows in the view log.
len(view_log_train)

42717

In [71]:
# Count of distinct values in each column of the view log.
view_log_train.nunique()

userID         1415
articleID      2879
userRegion       56
userCountry      21
dtype: int64

In [72]:
# Preview the first 20 rows of the view log.
view_log_train.head(20)

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US
5,USER_0000,ARTICLE_1033,NY,US
6,USER_0000,ARTICLE_1033,NY,US
7,USER_0000,ARTICLE_2255,NY,US
8,USER_0000,ARTICLE_1260,NY,US
9,USER_0000,ARTICLE_0090,NY,US


In [73]:
# Prefix the view-log user columns with "view" so they do not collide with the
# identically named userID / userRegion / userCountry columns of article_info
# when the two frames are merged.
view_log_train = view_log_train.rename(
    columns={
        'userID': 'viewuserID',
        'userRegion': 'viewuserRegion',
        'userCountry': 'viewuserCountry',
    }
)


In [74]:
# Display the article metadata table.
article_info

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,
...,...,...,...,...,...,...,...,...
3003,ARTICLE_3003,Como consumir conteúdo de qualidade em iOS - C...,"Quando iniciei minha jornada em Swift, saindo ...",HTML,pt,USER_0882,BR,MG
3004,ARTICLE_3004,Aurelia 1.0 is Here!!!,It's been an amazing journey to get here and i...,HTML,en,USER_0220,,
3005,ARTICLE_3005,Lessons from converting an app to 100% Kotlin ...,This is part one in a series of posts about Ko...,HTML,en,USER_1010,BR,SP
3006,ARTICLE_3006,ITA está oferecendo 10 cursos gratuitos a dist...,"O Instituto Tecnológico de Aeronáutica (ITA) ,...",HTML,pt,USER_1210,,


In [75]:
# Merge the dataframes: inner-join every view-log row with the metadata of the
# article it refers to, matching on articleID.
merged_df = view_log_train.merge(article_info, how='inner', on='articleID')

In [76]:
# Remove the article text fields and the article-side userCountry/userRegion
# columns; they are not used by anything below.
merged_df = merged_df.drop(columns=['Title', 'Content', 'userCountry', 'userRegion'])

In [77]:
# Display the merged frame after the column drop.
merged_df

Unnamed: 0,viewuserID,articleID,viewuserRegion,viewuserCountry,Format,Language,userID
0,USER_0000,ARTICLE_0661,NY,US,HTML,en,USER_1304
1,USER_0067,ARTICLE_0661,SP,BR,HTML,en,USER_1304
2,USER_0414,ARTICLE_0661,SP,BR,HTML,en,USER_1304
3,USER_0557,ARTICLE_0661,SP,BR,HTML,en,USER_1304
4,USER_1163,ARTICLE_0661,SP,BR,HTML,en,USER_1304
...,...,...,...,...,...,...,...
42712,USER_1420,ARTICLE_0714,SP,BR,HTML,en,USER_1420
42713,USER_1420,ARTICLE_1711,SP,BR,HTML,pt,USER_1420
42714,USER_1420,ARTICLE_1732,SP,BR,HTML,en,USER_1420
42715,USER_1420,ARTICLE_1732,SP,BR,HTML,en,USER_1420


In [78]:
# `df` is a second name for the same DataFrame object as `merged_df` (no copy
# is made), so columns added to `df` later also appear on `merged_df`.
df = merged_df

In [79]:
# Display the working frame (same object as merged_df).
df

Unnamed: 0,viewuserID,articleID,viewuserRegion,viewuserCountry,Format,Language,userID
0,USER_0000,ARTICLE_0661,NY,US,HTML,en,USER_1304
1,USER_0067,ARTICLE_0661,SP,BR,HTML,en,USER_1304
2,USER_0414,ARTICLE_0661,SP,BR,HTML,en,USER_1304
3,USER_0557,ARTICLE_0661,SP,BR,HTML,en,USER_1304
4,USER_1163,ARTICLE_0661,SP,BR,HTML,en,USER_1304
...,...,...,...,...,...,...,...
42712,USER_1420,ARTICLE_0714,SP,BR,HTML,en,USER_1420
42713,USER_1420,ARTICLE_1711,SP,BR,HTML,pt,USER_1420
42714,USER_1420,ARTICLE_1732,SP,BR,HTML,en,USER_1420
42715,USER_1420,ARTICLE_1732,SP,BR,HTML,en,USER_1420


In [80]:
# Build user and article indices: map each string ID to the integer code of its
# category (categories are the sorted unique values), stored in a new column.
for code_column, id_column in (('user_id', 'viewuserID'), ('article_id', 'articleID')):
    df[code_column] = df[id_column].astype('category').cat.codes

In [83]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Split into training and test sets: a random row-level split that holds out
# 20% of the interaction rows, with a fixed seed so the split is repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [96]:
# PyTorch dataset of (user, article, label) interaction triples
class InteractionDataset(Dataset):
    """Tensor-backed dataset of user/article interactions.

    Every row of ``df`` becomes a positive example with label 1.0.  With the
    default ``num_negatives=0`` the dataset contains only those positives,
    which is the original behaviour.  Training a binary classifier on
    positives alone is degenerate (predicting 1 everywhere minimises the
    loss), so negatives can optionally be sampled.

    Args:
        df: DataFrame with integer-coded ``user_id`` and ``article_id`` columns.
        num_negatives: Number of label-0 examples to sample per row of ``df``.
            ``0`` (default) disables negative sampling.
        num_articles: Size of the article-code range ``[0, num_articles)`` to
            sample negatives from.  Defaults to ``max(article_id) + 1`` of
            ``df``.
        seed: Optional seed for the negative sampler; ``None`` draws a fresh
            non-deterministic seed.

    Attributes:
        users: ``torch.long`` tensor of user codes.
        articles: ``torch.long`` tensor of article codes.
        labels: ``torch.float32`` tensor of 1.0 (observed) / 0.0 (sampled).

    Note:
        Negatives are only guaranteed to be absent from the rows of ``df``
        passed in; an interaction held out in a different split can still be
        drawn as a negative.
    """

    def __init__(self, df, num_negatives=0, num_articles=None, seed=None):
        users = torch.tensor(df['user_id'].to_numpy(), dtype=torch.long)
        articles = torch.tensor(df['article_id'].to_numpy(), dtype=torch.long)
        # Label (interaction) for every observed row is 1.
        labels = torch.ones(len(df), dtype=torch.float32)

        if num_negatives > 0 and len(df) > 0:
            neg_users, neg_articles = self._sample_negatives(
                users.tolist(), articles.tolist(), num_negatives, num_articles, seed
            )
            users = torch.cat([users, torch.tensor(neg_users, dtype=torch.long)])
            articles = torch.cat([articles, torch.tensor(neg_articles, dtype=torch.long)])
            labels = torch.cat([labels, torch.zeros(len(neg_users), dtype=torch.float32)])

        self.users = users
        self.articles = articles
        self.labels = labels

    @staticmethod
    def _sample_negatives(users, articles, num_negatives, num_articles, seed):
        """Draw ``num_negatives`` unseen article codes for every positive row."""
        if num_articles is None:
            num_articles = max(articles) + 1

        # Articles each user has interacted with, according to this split.
        seen = {}
        for user, article in zip(users, articles):
            seen.setdefault(user, set()).add(article)

        gen = torch.Generator()
        if seed is None:
            gen.seed()
        else:
            gen.manual_seed(seed)

        # Draw all first-try candidates in one call; only collisions are redrawn.
        draws = torch.randint(
            num_articles, (len(users) * num_negatives,), generator=gen
        ).tolist()

        neg_users, neg_articles = [], []
        for row, user in enumerate(users):
            viewed = seen[user]
            if len(viewed) >= num_articles:
                # No unseen article is guaranteed to exist; skip to avoid an endless redraw.
                continue
            for k in range(num_negatives):
                candidate = draws[row * num_negatives + k]
                while candidate in viewed:
                    candidate = torch.randint(num_articles, (1,), generator=gen).item()
                neg_users.append(user)
                neg_articles.append(candidate)
        return neg_users, neg_articles

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.articles[idx], self.labels[idx]

In [119]:
# Wrap each split in an InteractionDataset, then batch it 64 rows at a time.
# Only the training loader shuffles; the validation loader keeps row order.
train_dataset, test_dataset = InteractionDataset(train), InteractionDataset(test)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [120]:
# SASRec model definition
class SASRec(nn.Module):
    """Transformer-encoder scorer over (user, article) index sequences.

    The user, article and positional embeddings are summed, passed through a
    stack of ``nn.TransformerEncoderLayer`` blocks, mean-pooled over the
    sequence axis and projected to a single sigmoid score.

    Args:
        num_users: Rows in the user embedding table.
        num_articles: Rows in the article embedding table.
        embedding_size: Embedding width, also the encoder ``d_model``.
        max_seq_length: Rows in the positional embedding table, i.e. the
            longest sequence ``forward`` accepts.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        dropout_rate: Dropout applied to the summed embeddings and inside
            every encoder layer.
    """

    def __init__(self, num_users, num_articles, embedding_size, max_seq_length, num_heads, num_layers, dropout_rate):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.article_embedding = nn.Embedding(num_articles, embedding_size)
        self.positional_embedding = nn.Embedding(max_seq_length, embedding_size)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=embedding_size,
                nhead=num_heads,
                dropout=dropout_rate,
                # forward() feeds (batch, seq, emb) tensors. With the default
                # batch_first=False the layer would treat the batch axis as the
                # sequence axis and let the samples of a batch attend to each other.
                batch_first=True,
            )
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(embedding_size, 1)

    def forward(self, user_ids, article_ids):
        """Score each (user, article) sequence.

        Args:
            user_ids: Long tensor of shape ``(batch, seq_len)``.
            article_ids: Long tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with sigmoid scores.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional embedding table.
        """
        seq_len = article_ids.size(1)
        max_len = self.positional_embedding.num_embeddings
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_length {max_len}"
            )

        user_emb = self.user_embedding(user_ids)
        article_emb = self.article_embedding(article_ids)
        # Positions 0..seq_len-1, repeated for every row of the batch.
        position_ids = torch.arange(seq_len, device=article_emb.device).unsqueeze(0).expand_as(article_ids)
        position_emb = self.positional_embedding(position_ids)

        x = user_emb + article_emb + position_emb
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x)
        x = x.mean(dim=1)  # pool over the sequence axis -> (batch, emb)
        x = self.fc(x)
        return torch.sigmoid(x)

In [121]:
# Hyperparameter settings
# Embedding-table sizes: the number of distinct integer codes in each column.
num_users = df['user_id'].nunique()
num_articles = df['article_id'].nunique()
embedding_size = 50  # width of the user, article and positional embeddings
max_seq_length = 10  # rows in the positional-embedding table
num_heads = 2  # attention heads per encoder layer
num_layers = 2  # number of stacked encoder layers
dropout_rate = 0.1

In [130]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch import optim
# Create the model instance together with its loss function and optimizer
model = SASRec(
    num_users=num_users,
    num_articles=num_articles,
    embedding_size=embedding_size,
    max_seq_length=max_seq_length,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [131]:
# Model training
def train_model(model, criterion, optimizer, num_epochs=5):
    """Train ``model`` and print the train/validation loss after every epoch.

    Batches come from the module-level ``train_loader`` and ``val_loader``,
    each yielding ``(users, articles, labels)`` tensors.

    Args:
        model: Module called as ``model(users, articles)`` with index tensors
            that have had a length-1 axis added at dim 1.
        criterion: Loss called as ``criterion(predictions, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported via ``print``.
    """

    def batch_loss(users, articles, labels):
        outputs = model(users.unsqueeze(1), articles.unsqueeze(1))
        # Match the label shape explicitly. A bare squeeze() would also drop the
        # batch axis of a final batch of size 1 and the loss would then reject
        # the input/target size mismatch.
        return criterion(outputs.reshape(labels.shape), labels)

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for users, articles, labels in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(users, articles, labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            running_loss += loss.item() * users.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for users, articles, labels in val_loader:
                loss = batch_loss(users, articles, labels)
                val_loss += loss.item() * users.size(0)

        val_loss /= len(val_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.8f}, Validation Loss: {val_loss:.8f}')

# Train for 5 epochs; train_model prints the per-epoch train/validation loss.
# NOTE(review): InteractionDataset labels every row 1, so a loss close to zero
# can simply mean the model outputs ~1 everywhere — confirm before treating it
# as evidence of ranking quality.
train_model(model, criterion, optimizer, num_epochs=5)

Epoch 1/5, Loss: 0.00339261, Validation Loss: 0.00016737
Epoch 2/5, Loss: 0.00010798, Validation Loss: 0.00004445
Epoch 3/5, Loss: 0.00003620, Validation Loss: 0.00001857
Epoch 4/5, Loss: 0.00001664, Validation Loss: 0.00000924
Epoch 5/5, Loss: 0.00000873, Validation Loss: 0.00000501


In [126]:

# Recommendation function for a single user
def recommend_articles(user_id, num_recommendations=5):
    """Return the highest-scoring article IDs for one user.

    Scores every article code in ``range(num_articles)`` for the user with the
    module-level ``model`` and maps the best codes back to article IDs through
    the sorted categories of ``df['articleID']``.

    Args:
        user_id: A value from ``df['viewuserID']`` (e.g. ``'USER_1420'``).
        num_recommendations: How many article IDs to return.

    Returns:
        List of article IDs ordered from highest to lowest score.

    Raises:
        IndexError: If ``user_id`` has no rows in ``df``.
    """
    user_rows = df.loc[df['viewuserID'] == user_id, 'user_id']
    if user_rows.empty:
        raise IndexError(f"user {user_id!r} does not appear in df['viewuserID']")
    user_index = int(user_rows.iloc[0])

    article_indices = torch.arange(num_articles, dtype=torch.long)
    user_indices = torch.full_like(article_indices, user_index)

    # Inference must not run with dropout active.
    model.eval()
    with torch.no_grad():
        scores = model(user_indices.unsqueeze(1), article_indices.unsqueeze(1))
    # reshape(-1) keeps a 1-D array even when there is only one article.
    predictions = scores.reshape(-1).numpy()

    # Reverse first, then slice: `[-n:]` would return everything for n == 0.
    top_articles = predictions.argsort()[::-1][:num_recommendations]

    article_labels = df['articleID'].astype('category').cat.categories
    return article_labels[top_articles].tolist()

# Recommendations for an example user: top 5 articles for USER_1420.
recommended_articles = recommend_articles('USER_1420', 5)
# The printed prefix is Korean for "recommended articles".
print(f"추천 기사: {recommended_articles}")

추천 기사: ['ARTICLE_3007', 'ARTICLE_0985', 'ARTICLE_1006', 'ARTICLE_1005', 'ARTICLE_1004']


In [127]:
# Generate recommendations for every user
# The original loop iterated over `user_article_matrix.index`, but no cell in
# this notebook defines `user_article_matrix`, so it fails on a fresh kernel.
# Iterate over the sorted unique viewer IDs in `df` instead; these are exactly
# the users that recommend_articles can resolve.
recommendations = []
for user in df['viewuserID'].astype('category').cat.categories:
    recommended_articles = recommend_articles(user, 5)
    recommendations.extend([user, article] for article in recommended_articles)

# Convert the recommendations into a dataframe (one row per user/article pair)
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

In [128]:
# Build the submission table: the userID and articleID columns of the
# recommendation frame, copied into a frame of their own.
submission = top_recommendations.loc[:, ['userID', 'articleID']].copy()

In [129]:
# Write the submission table to CSV, omitting the DataFrame index.
submission.to_csv('../submission/SASRec3.csv', index=False)