In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from datetime import datetime
import networkx as nx
from sklearn.metrics import precision_score, recall_score, f1_score
import os
from gnn_best.model import GNNRecommender

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint that was saved on a GPU load on a CPU-only machine.
# The dict is named `checkpoint` (not `train_data`) so it is not confused with the
# graph data object that a later cell stores under `train_data`.
checkpoint = torch.load('model.pth', map_location='cpu')
model = GNNRecommender(num_features=checkpoint['num_features'])

# Restore the trained weights.
model.load_state_dict(checkpoint['model'])
model.eval()  # switch to evaluation mode

GNNRecommender(
  (conv1): GATConv(405, 128, heads=4)
  (conv2): GATConv(512, 128, heads=2)
  (conv3): GATConv(256, 405, heads=1)
  (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (layer_norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (layer_norm3): LayerNorm((405,), eps=1e-05, elementwise_affine=True)
  (forward_layer): Sequential(
    (0): Linear(in_features=405, out_features=405, bias=True)
    (1): LayerNorm((405,), eps=1e-05, elementwise_affine=True)
    (2): ELU(alpha=1.0)
  )
)

In [3]:
def _edge_arrays(df, node_encoder, time_window, reference_time):
    """
    Build the bidirectional edge index and the raw (unscaled) edge features for df.

    Every row yields two directed edges (source -> target, then target -> source),
    stored next to each other, and both carry the same feature vector
    [time_weight, views, likes, comments].

    Returns:
        (edge_index, features): an int64 array of shape (2, 2 * len(df)) and a
        float array of shape (2 * len(df), 4).
    """
    # Encode whole columns at once instead of one transform() call per row.
    src = node_encoder.transform(df['source'].to_numpy())
    dst = node_encoder.transform(df['target'].to_numpy())

    # Interleave forward and backward edges: even slots a->b, odd slots b->a.
    edge_index = np.empty((2, 2 * len(df)), dtype=np.int64)
    edge_index[0, 0::2], edge_index[1, 0::2] = src, dst
    edge_index[0, 1::2], edge_index[1, 1::2] = dst, src

    # Exponential decay of the edge age, measured in days.
    age_days = (reference_time - df['timestamp'].to_numpy(dtype=float)) / (24 * 3600)
    time_weight = np.exp(-age_days / time_window)

    # views / likes / comments are optional columns; a missing one contributes zeros.
    columns = [time_weight]
    for name in ('views', 'likes', 'comments'):
        if name in df.columns:
            columns.append(df[name].to_numpy(dtype=float))
        else:
            columns.append(np.zeros(len(df)))

    # Both directions of an edge share the same feature row.
    features = np.repeat(np.column_stack(columns), 2, axis=0)
    return edge_index, features


def prepare_graph_data(collaboration_df, time_window=60, split_timestamp=None, reference_time=None):
    """
    Prepare the graph data for the GNN and split it into train/test by timestamp.

    Args:
        collaboration_df: DataFrame with 'source', 'target' and 'timestamp' columns;
            'views', 'likes' and 'comments' are used as edge features when present.
        time_window: decay constant, in days, of the exponential time weight.
        split_timestamp: rows with timestamp < split_timestamp go to the training
            set, rows with timestamp >= split_timestamp go to the test set.
            None (the default) puts every row in the training set.
        reference_time: Unix timestamp that edge ages are measured from. Defaults
            to the current time, read once per call so all edges share one clock.

    Returns:
        (train_data, test_data, node_encoder, test_df). test_data is None when
        the test split is empty. Edge features are min-max scaled with a scaler
        fitted on the training edges only.

    Raises:
        ValueError: if the training split contains no rows.
    """
    if reference_time is None:
        reference_time = datetime.now().timestamp()

    # Encode every artist that appears as either endpoint of a collaboration.
    all_artists = pd.concat([collaboration_df['source'], collaboration_df['target']]).unique()
    node_encoder = LabelEncoder()
    node_encoder.fit(all_artists)

    # One-hot node features.
    x = torch.eye(len(all_artists))

    # `is not None` so that a split timestamp of 0 is not mistaken for "no split".
    if split_timestamp is not None:
        train_df = collaboration_df[collaboration_df['timestamp'] < split_timestamp]
        test_df = collaboration_df[collaboration_df['timestamp'] >= split_timestamp]
    else:
        train_df = collaboration_df
        test_df = pd.DataFrame()

    if train_df.empty:
        raise ValueError(
            "Training split is empty: no rows have a timestamp earlier than split_timestamp."
        )

    # Fit the scaler on the training edges only; the test edges reuse it below.
    edge_scaler = MinMaxScaler()
    train_index, train_features = _edge_arrays(train_df, node_encoder, time_window, reference_time)
    train_edge_attr = torch.tensor(edge_scaler.fit_transform(train_features), dtype=torch.float)
    train_edge_index = torch.tensor(train_index, dtype=torch.long)
    train_data = Data(x=x, edge_index=train_edge_index, edge_attr=train_edge_attr)

    # Build the test graph only when there are test rows.
    test_data = None
    if not test_df.empty:
        test_index, test_features = _edge_arrays(test_df, node_encoder, time_window, reference_time)
        test_edge_attr = torch.tensor(edge_scaler.transform(test_features), dtype=torch.float)
        test_edge_index = torch.tensor(test_index, dtype=torch.long)
        test_data = Data(x=x, edge_index=test_edge_index, edge_attr=test_edge_attr)

    return train_data, test_data, node_encoder, test_df

    
# Relative path instead of a machine-specific absolute home-directory path.
# NOTE(review): assumes '../data/collaboration_videos.csv' (the path a later cell
# in this notebook reads) is the same file as the former absolute path - confirm.
df = pd.read_csv('../data/collaboration_videos.csv')

# Rows from 2025-01-01 onwards form the test split.
train_data, test_data, node_encoder, test_df = prepare_graph_data(
    df, split_timestamp=datetime(2025, 1, 1).timestamp()
)

In [4]:
# Shape of the training edge index: (2, number of directed edges).
train_data.edge_index.size()

torch.Size([2, 44520])

In [5]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    out = model(train_data.x, train_data.edge_index, train_data.edge_attr)
out.shape

torch.Size([405, 405])

In [10]:
# Softmax over the first row of the model output.
F.softmax(out[0], dim=0)

tensor([9.8781e-01, 4.8454e-04, 3.8341e-03, 1.8593e-03, 3.3051e-06, 8.1624e-06,
        7.8343e-06, 6.2922e-06, 3.2411e-06, 9.5504e-06, 4.5667e-06, 7.7281e-06,
        5.2938e-06, 5.3848e-06, 1.1276e-05, 1.2466e-04, 9.6601e-06, 6.1200e-06,
        4.0513e-06, 1.0625e-05, 1.0563e-05, 4.6984e-06, 1.2642e-05, 1.0441e-05,
        3.2985e-06, 2.8046e-06, 5.4756e-06, 4.1925e-06, 3.9124e-06, 3.8688e-06,
        3.1319e-06, 7.6982e-06, 3.5113e-06, 1.0838e-04, 3.8582e-06, 3.5127e-06,
        2.8012e-06, 5.6263e-06, 1.0192e-05, 2.8464e-06, 9.6234e-06, 1.3536e-05,
        3.8481e-06, 5.2545e-06, 4.9485e-06, 3.3008e-06, 4.5305e-06, 2.7508e-05,
        2.4363e-05, 5.5056e-06, 6.1503e-06, 3.0488e-06, 1.3162e-05, 6.1804e-06,
        9.2603e-06, 1.0348e-05, 1.0351e-05, 8.5524e-06, 7.3982e-06, 6.3885e-06,
        3.3795e-06, 1.1800e-05, 5.6298e-06, 1.5104e-05, 5.8854e-06, 7.7402e-06,
        1.6699e-05, 4.8820e-06, 4.7353e-06, 3.5929e-06, 6.4414e-06, 3.7914e-06,
        5.2537e-06, 2.0771e-05, 9.1566e-

In [7]:
# Inference only: compute the embeddings without autograd bookkeeping.
with torch.no_grad():
    embeddings = model.get_embedding(train_data.x, train_data.edge_index, train_data.edge_attr)

In [8]:
# Shape of the embedding matrix.
embeddings.size()

torch.Size([405, 405])

In [9]:
# First row of the embedding matrix.
embeddings[0]

tensor([-9.0662e-01, -7.3545e-01, -1.1635e-01, -2.7983e-01, -2.8207e-01,
        -1.9159e-01, -9.5706e-01,  7.6652e-01,  1.3125e+00, -8.4179e-01,
        -3.1051e-01,  5.3592e-01, -4.1930e-01,  8.4386e-01,  8.6492e-01,
        -6.0051e-01, -5.0340e-01, -6.9413e-01,  6.5344e-01, -4.7714e-01,
        -4.5430e-01,  1.2794e+00, -5.8185e-01, -1.0470e-01,  9.1599e-02,
         6.3130e-01, -1.7859e-01, -1.8663e-01, -6.3641e-01, -6.9175e-01,
         6.1135e-01,  2.4539e-02, -2.8283e-01, -6.9820e-01, -5.1373e-01,
         1.8617e-01,  1.2569e+00, -5.9640e-01, -4.0214e-01, -3.6103e-01,
        -2.3937e-01,  7.4101e-01, -2.2337e-01,  1.2324e+00, -4.2983e-01,
        -5.0144e-03,  1.2977e-01,  2.0723e+00,  3.4802e-01,  7.3130e-01,
         9.6496e-01, -7.9130e-01, -4.8713e-01, -4.7521e-02, -9.3712e-01,
         8.7732e-01,  1.5317e+00, -7.3459e-01, -8.2759e-01, -5.4128e-02,
         1.7123e+00,  1.3960e-01, -8.0496e-01, -3.7677e-01, -7.5774e-02,
         1.4132e+00,  5.1004e-01,  3.5022e-01,  1.2

In [4]:

# Reload the collaboration table with the timestamp column cast to float.
df = pd.read_csv('../data/collaboration_videos.csv').assign(
    timestamp=lambda frame: frame['timestamp'].astype(float)
)

# Split point: the Unix timestamp of 2025-01-01.
split_timestamp = datetime(2025, 1, 1).timestamp()

# Keep the rows at or after the split point.
test_df = df.loc[df['timestamp'] >= split_timestamp]

In [5]:
# Share of test rows whose source and target carry the same prefix before the
# first '_' (the code treats that prefix as the group).
source_groups = test_df['source'].str.split('_').str[0]
target_groups = test_df['target'].str.split('_').str[0]

total = len(test_df)
same_group = int((source_groups == target_groups).sum())

# Guard the ratio so an empty test split prints nan instead of raising ZeroDivisionError.
print(same_group / total if total else float('nan'))


0.7822736030828517
