In [13]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from datetime import datetime
import networkx as nx
from sklearn.metrics import precision_score, recall_score, f1_score
import os
# from model import GNNRecommender
from util import get_node_encoder, prepare_temporal_graph_data
from tqdm import tqdm



In [14]:
class GNNRecommender(torch.nn.Module):
    def __init__(self, num_features, hidden_channels=128, embedding_dim=64):
        super(GNNRecommender, self).__init__()
        
        # Encoder
        self.encoder = torch.nn.Sequential(
            GATConv(num_features, hidden_channels, heads=4),
            torch.nn.LayerNorm(hidden_channels * 4),
            torch.nn.ELU(),
            torch.nn.Dropout(0.2),
            
            GATConv(hidden_channels * 4, hidden_channels, heads=2),
            torch.nn.LayerNorm(hidden_channels * 2),
            torch.nn.ELU(),
            torch.nn.Dropout(0.2),
            
            GATConv(hidden_channels * 2, embedding_dim),
            torch.nn.LayerNorm(embedding_dim),
            torch.nn.ELU()
        )
        
        # Decoder
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim, hidden_channels),
            torch.nn.LayerNorm(hidden_channels),
            torch.nn.ELU(),
            torch.nn.Dropout(0.2),
            
            torch.nn.Linear(hidden_channels, num_features),
            torch.nn.LayerNorm(num_features),
            torch.nn.ELU()
        )
        
    def encode(self, x, edge_index, edge_attr=None):
        # 提取節點嵌入
        for layer in self.encoder:
            if isinstance(layer, GATConv):
                x = layer(x, edge_index, edge_attr)
            else:
                x = layer(x)
        return x
    
    def decode(self, x):
        # 解碼節點嵌入
        for layer in self.decoder:
            x = layer(x)
        return x
    
    def forward(self, x, edge_index, edge_attr=None):
        # 編碼
        embeddings = self.encode(x, edge_index, edge_attr)
        # 解碼
        reconstructed = self.decode(embeddings)
        return reconstructed, embeddings
    
    def get_embedding(self, x, edge_index, edge_attr=None):
        # 獲取節點嵌入
        with torch.no_grad():
            return self.encode(x, edge_index, edge_attr)
    

In [16]:
WINDOW_SIZE = 60
REMOVED_SAME_GROUP = True

# 加載數據
collaboration_df = pd.read_csv("/home/bl515-ml/Documents/shaio_jie/sma/Kpop_challenge_analyze/data/collaboration_videos.csv")
collaboration_df['timestamp'] = collaboration_df['timestamp'].astype(float)

# 只使用不同組的藝人
if REMOVED_SAME_GROUP:
    collaboration_df = collaboration_df[collaboration_df['source'].str.split('_').str[0] != collaboration_df['target'].str.split('_').str[0]]

# 獲取節點編碼器
node_encoder = get_node_encoder()

# 只使用歷史數據
itter = collaboration_df[collaboration_df['timestamp'] < datetime(2025, 1, 1).timestamp()]

num_nodes = node_encoder.classes_.shape[0]
x = torch.eye(num_nodes)  # 使用 one-hot 編碼作為初始特徵

# 創建時序數據加載器
snapshots = []
for _, row in tqdm(itter.iterrows()):
    start_time = row['timestamp'] - WINDOW_SIZE * 24 * 3600
    # 獲取當前窗口內的數據
    window_data = collaboration_df[
        (collaboration_df['timestamp'] >= start_time) & 
        (collaboration_df['timestamp'] < row['timestamp'])
    ]
        
    if len(window_data) > 0:
        # 創建邊索引和特徵
        edge_index = []
        edge_feature = []
            
        for _, row in window_data.iterrows():
            artist1_idx = node_encoder.transform([row['source']])[0]
            artist2_idx = node_encoder.transform([row['target']])[0]
                
            edge_index.append([artist1_idx, artist2_idx])
                
            # 計算邊特徵Tuet
            views = row['views'] if 'views' in row else 0
            likes = row['likes'] if 'likes' in row else 0
            comments = row['comments'] if 'comments' in row else 0

            # 組合特徵
            feature = [ views, likes, comments]
            edge_feature.append(feature)
            
            
        # 轉換為張量
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_feature = torch.tensor(edge_feature, dtype=torch.float)

        # 創建圖數據
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_feature)
        snapshots.append(data)
        
        

1156it [00:33, 34.55it/s]


In [4]:
model = GNNRecommender(num_features=num_nodes).to('cuda')
num_epochs = 200

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 設定 early stopping 參數
patience = 10
min_delta = 0.001
best_loss = float('inf')
patience_counter = 0

# 修改訓練循環
for epoch in range(num_epochs):
    epoch_loss = 0
    for snapshot in tqdm(snapshots):
        cuda_snapshot = snapshot.to('cuda')
        out, embeddings = model(cuda_snapshot.x, cuda_snapshot.edge_index, cuda_snapshot.edge_attr)
        
        # 計算重構損失
        reconstruction_loss = F.mse_loss(out, cuda_snapshot.x)
        
        # 計算嵌入的相似度損失
        similarity_matrix = F.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)
        
        # 使用邊信息作為正樣本對
        positive_pairs = cuda_snapshot.edge_index.t()
        positive_similarities = similarity_matrix[positive_pairs[:, 0], positive_pairs[:, 1]]
        
        # 計算對比損失
        contrastive_loss = -torch.mean(torch.log(torch.sigmoid(positive_similarities)))
        
        # 總損失
        loss = reconstruction_loss + contrastive_loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(snapshots)
    print(f'Epoch {epoch}, Average Loss: {avg_loss:.4f}')
    
    # Early stopping 檢查
    if avg_loss < best_loss - min_delta:
        best_loss = avg_loss
        patience_counter = 0
        # 保存最佳模型
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        
    if patience_counter >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs')
        break


100%|██████████| 1151/1151 [00:05<00:00, 197.93it/s]


Epoch 0, Average Loss: 0.3514


100%|██████████| 1151/1151 [00:05<00:00, 227.13it/s]


Epoch 1, Average Loss: 0.3211


100%|██████████| 1151/1151 [00:05<00:00, 226.54it/s]


Epoch 2, Average Loss: 0.3175


100%|██████████| 1151/1151 [00:05<00:00, 227.43it/s]


Epoch 3, Average Loss: 0.3157


100%|██████████| 1151/1151 [00:05<00:00, 226.14it/s]


Epoch 4, Average Loss: 0.3157


100%|██████████| 1151/1151 [00:05<00:00, 226.13it/s]


Epoch 5, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 227.62it/s]


Epoch 6, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 226.80it/s]


Epoch 7, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 226.20it/s]


Epoch 8, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 226.42it/s]


Epoch 9, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 226.03it/s]


Epoch 10, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 226.20it/s]


Epoch 11, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 224.64it/s]


Epoch 12, Average Loss: 0.3156


100%|██████████| 1151/1151 [00:05<00:00, 225.40it/s]

Epoch 13, Average Loss: 0.3156
Early stopping triggered after 14 epochs





In [5]:
# 加載最佳模型
model.load_state_dict(torch.load('best_model.pth'))

<All keys matched successfully>

In [6]:

# 只使用歷史數據
test_itter = collaboration_df[collaboration_df['timestamp'] >= datetime(2025, 1, 1).timestamp()]

# 創建時序數據加載器
test_snapshots = []
for _, row in tqdm(test_itter.iterrows()):
    start_time = row['timestamp'] - WINDOW_SIZE * 24 * 3600
    # 獲取當前窗口內的數據
    window_data = collaboration_df[
        (collaboration_df['timestamp'] >= start_time) & 
        (collaboration_df['timestamp'] < row['timestamp'])
    ]
        
    if len(window_data) > 0:
        # 創建邊索引和特徵
        edge_index = []
        edge_feature = []
            
        for _, row in window_data.iterrows():
            artist1_idx = node_encoder.transform([row['source']])[0]
            artist2_idx = node_encoder.transform([row['target']])[0]
                
            edge_index.append([artist1_idx, artist2_idx])
                
            # 計算邊特徵
            views = row['views'] if 'views' in row else 0
            likes = row['likes'] if 'likes' in row else 0
            comments = row['comments'] if 'comments' in row else 0

            # 組合特徵
            feature = [ views, likes, comments]
            edge_feature.append(feature)
            
            
        # 轉換為張量
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_feature = torch.tensor(edge_feature, dtype=torch.float)

        # 創建圖數據
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_feature)
        test_snapshots.append(data)
        
        

113it [00:09, 12.17it/s]


In [7]:
test_itter.iloc[0]['source']


'BOYNEXTDOOR_SUNGHO'

In [8]:
all_recommendations = []
for i, snapshot in enumerate(test_snapshots):
    cuda_snapshot = snapshot.to('cuda')
    with torch.no_grad():
        embeddings = model.get_embedding(cuda_snapshot.x, cuda_snapshot.edge_index, cuda_snapshot.edge_attr)

    label_row = test_itter.iloc[i]
    source = label_row['source']
    
    source_idx = node_encoder.transform([source])[0]
    
    source_embedding = embeddings[source_idx]
    
    similarities = F.cosine_similarity(source_embedding.unsqueeze(0), embeddings).to('cpu')
    top_k_values, top_k_indices = torch.topk(similarities, k=11)
    
    recommendations = []
    for idx in top_k_indices[1:]:
        rec_artist = node_encoder.inverse_transform([idx])[0]
        recommendations.append(rec_artist)
        
    all_recommendations.append({
        'source': source,
        'label': label_row['target'],
        'recommendations': recommendations
    })
                    

In [9]:
# 計算評估指標
true_positives = 0
total_recommendations = 0
reciprocal_ranks = []
hits_at_k = 0
    
for recommendations in all_recommendations:
    if recommendations['label'] in recommendations['recommendations']:
        rank = recommendations['recommendations'].index(recommendations['label']) + 1
        reciprocal_ranks.append(1.0 / rank)
        if rank <= 10:
            hits_at_k += 1
    else:
        reciprocal_ranks.append(0.0)
    
    total_recommendations += len(recommendations['recommendations'])
        
        
mrr = np.mean(reciprocal_ranks) if reciprocal_ranks else 0
hit_at_k = hits_at_k / len(all_recommendations) if len(all_recommendations) > 0 else 0


mrr_percentage = mrr * 100
hit_at_k_percentage = hit_at_k * 100
print(f"MRR: {mrr_percentage:.2f} %, Hit@10: {hit_at_k_percentage:.2f} %")

MRR: 5.30 %, Hit@10: 10.62 %
