In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from datetime import datetime
import networkx as nx
from sklearn.metrics import precision_score, recall_score, f1_score
from torch_geometric.loader import LinkNeighborLoader

import os
from model import GNNRecommender
from util import get_node_encoder, prepare_temporal_graph_data
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GATEncoder(torch.nn.Module):
    def __init__(self, num_node, hidden_channels=128, embedding_dim=64):
        super(GATEncoder, self).__init__()
        
        self.encoder = torch.nn.Sequential(
            GATConv(num_node, hidden_channels, heads=4),
            torch.nn.LayerNorm(hidden_channels * 4),
            torch.nn.ELU(),
            torch.nn.Dropout(0.2),
            
            GATConv(hidden_channels * 4, hidden_channels, heads=2),
            torch.nn.LayerNorm(hidden_channels * 2),
            torch.nn.ELU(),
            torch.nn.Dropout(0.2),
            
            GATConv(hidden_channels * 2, embedding_dim),
            torch.nn.LayerNorm(embedding_dim),
            torch.nn.ELU()
        )
        
    def forward(self, x, edge_index, edge_attr=None):
        for layer in self.encoder:
            if isinstance(layer, GATConv):
                x = layer(x, edge_index, edge_attr)
            else:
                x = layer(x)
        return x

class LinkClassifier(torch.nn.Module):
    def __init__(self, hidden_channels=128, embedding_dim=64):
        super(LinkClassifier, self).__init__()
        
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim * 2, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(hidden_channels, hidden_channels * 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(hidden_channels * 2, 2),
        )
        
    def forward(self, u, v):
        h = torch.cat([u, v], dim=1)
        return self.mlp(h).squeeze(1)
        
class LinkRegressor(torch.nn.Module):
    def __init__(self, hidden_channels=128, embedding_dim=64):
        super(LinkRegressor, self).__init__()
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim * 2, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(hidden_channels, hidden_channels * 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(hidden_channels * 2, 1),
        )
        
    def forward(self, u , v):
        h = torch.cat([u, v], dim=1)
        return self.mlp(h).squeeze(1)

class GATLinkModel(torch.nn.Module):
  def __init__(self, in_feats, hidden_feats = 128, embedding_dim = 64):
    super().__init__()
    self.encoder = GATEncoder(in_feats, hidden_feats, embedding_dim)
    self.cls_head = LinkClassifier(hidden_feats, embedding_dim)
    self.reg_head = LinkRegressor(hidden_feats, embedding_dim)
    
  def forward(self, x, edge_index, edge_attr, src_index, dst_index):
    z = self.encoder(x, edge_index, edge_attr)  
    src_z = z[src_index]
    dst_z = z[dst_index]
    
    logits = self.cls_head(dst_z, src_z)
    scores = self.reg_head(src_z, dst_z)
    
    return logits, scores

In [3]:
# 读取 k_pop_group_info 数据
with open("/home/bl515-ml/Documents/shaio_jie/sma/Kpop_challenge_analyze/artist_texts.json", "r") as f:
    node_info = json.load(f)
    
max_length =0
for node in node_info.keys():
    if len(node_info[node]) > max_length:
        max_length = len(node_info[node])
max_length

34717

In [4]:
# 加载 BERT 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased').to('cuda')
model.eval()  # 设置为评估模式

# 获取节点编码器
node_encoder = get_node_encoder()


# 为每个节点生成 BERT embedding
node_embeddings = {}
for node in node_info.keys():
    # 获取节点的描述文本
    if len(node_info[node]) > 0:
        text = node_info[node]
    else:
        text = node  # 如果没有描述，使用节点名称
        
    slice_embeddings = []
    for slice_idx in range(0, len(text), 512):
        slice_text = text[slice_idx:slice_idx+512]
        # 使用 BERT 生成 embedding
        inputs = tokenizer(f"[CLS]{slice_text}[SEP]", return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = inputs.to('cuda')
        with torch.no_grad():
            outputs = model(**inputs)
        slice_embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze())
        
    # 使用 [CLS] token 的 embedding 作为节点特征
    embedding = torch.mean(torch.stack(slice_embeddings), dim=0)
    
    node_embeddings[node.upper()] = embedding

In [None]:
node_features = torch.zeros(node_encoder.classes_.shape[0], 768)
for i in node_encoder.classes_:
    name = i.split('_')[-1]
    if name == 'BAHIYYIH':
        name = 'HUENING_BAHIYYIH'
    elif name == 'KI':
        name = 'NI_KI'
    elif name == 'KIM':
        name = "E_TION"
    elif name == 'N':
        name = "I_N"
        
    if name in node_embeddings:
        node_features[node_encoder.transform([i])[0]] = node_embeddings[name]
    else:
        node_features[node_encoder.transform([i])[0]] = torch.zeros(768)
        
node_features[0,:]

tensor([ 1.9665e-01, -2.4611e-01,  1.2773e-03, -1.3177e-01, -3.3159e-03,
         6.2177e-02, -1.0946e-01,  5.4648e-02, -1.9412e-01,  1.5306e-01,
        -4.5052e-02,  1.5314e-01,  1.1436e-01,  2.8708e-01, -3.6588e-01,
        -7.7544e-02,  1.2886e-01,  6.8055e-02,  1.5622e-01,  3.3720e-01,
         3.1993e-02,  1.5004e-01, -4.6461e-02,  2.3265e-02,  1.8225e-01,
        -1.9584e-01, -1.1333e-01,  2.5803e-01,  3.2474e-01, -4.6997e-02,
         1.0414e-01,  1.8729e-01, -1.9658e-01,  2.2084e-01,  4.7247e-02,
        -6.0015e-03, -1.1794e+00,  2.2172e-01,  7.6610e-02, -5.9659e-02,
        -1.2987e-01, -1.9220e-02, -8.2201e-02,  4.4585e-03, -1.9403e-03,
         7.8023e-01,  2.3173e-01,  2.1108e-01,  8.9115e-01,  1.9869e-01,
         9.4542e-02, -3.4629e-01,  2.7083e-02, -1.0205e+00, -1.4013e-01,
         7.2204e-02, -8.1762e-03, -2.0153e-01, -1.7985e-01,  8.4907e-02,
         9.5718e-03, -1.3761e-01,  5.5709e-02, -4.9836e-02, -1.9562e-01,
         7.0386e-02,  5.6497e-02,  3.2515e-02, -1.3

In [6]:
num_epochs = 200
model = GATLinkModel(768).to('cuda')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)


# 設定 early stopping 參數
patience = 10
min_delta = 0.001
best_loss = float('inf')
patience_counter = 0

WINDOW_SIZE = 180
REMOVED_SAME_GROUP = False

# 加載數據
collaboration_df = pd.read_csv("/home/bl515-ml/Documents/shaio_jie/sma/Kpop_challenge_analyze/data/collaboration_videos.csv")
collaboration_df['timestamp'] = collaboration_df['timestamp'].astype(float)

# 只使用不同組的藝人
if REMOVED_SAME_GROUP:
    collaboration_df = collaboration_df[collaboration_df['source'].str.split('_').str[0] != collaboration_df['target'].str.split('_').str[0]]


for epoch in range(num_epochs):
    epoch_loss = 0
    end_time = collaboration_df['timestamp'].min()
    offset = 10
    total_iter = 0
    while end_time < collaboration_df['timestamp'].max():
        start_time = end_time - WINDOW_SIZE * 24 * 3600
        window_data = collaboration_df[
            (collaboration_df['timestamp'] >= start_time) & 
            (collaboration_df['timestamp'] < end_time)
        ]
        
        offset = len(collaboration_df[collaboration_df['timestamp'] <= end_time]) + 1
        
        if offset >= collaboration_df.shape[0]:
            break
        
        end_time = collaboration_df['timestamp'].iloc[offset]
        
        if len(window_data) < 10:
            continue
        
        # 創建邊索引和特徵
        edge_index = []
        edge_feature = []
        scores = []
        
        
        for _, row in window_data.iterrows():
            artist1_idx = node_encoder.transform([row['source']])[0]
            artist2_idx = node_encoder.transform([row['target']])[0]
                
            edge_index.append([artist1_idx, artist2_idx])
                
            # 計算邊特徵
            views = row['views'] if 'views' in row else 0
            likes = row['likes'] if 'likes' in row else 0
            comments = row['comments'] if 'comments' in row else 0

            # 組合特徵
            feature = [ views, likes, comments]
            edge_feature.append(feature)
            scores.append(row['views'] * 0.5 + row['likes'] * 0.3 + row['comments'] * 0.2)
            
        
        # 轉換為張量
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_feature = torch.tensor(edge_feature, dtype=torch.float)
        scores = torch.tensor(scores, dtype=torch.float).t()
        # 創建圖數據
        data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_feature, scores=scores).to('cuda')
        
        # 使用 LinkNeighborLoader 進行正負取樣
        loader = LinkNeighborLoader(
            data=data,
            num_neighbors=[10, 10],  # 每個節點採樣10個一階鄰居和10個二階鄰居
            batch_size=32,
            edge_label_index=edge_index,  # 正樣本邊
            neg_sampling_ratio=1.0,  # 負樣本比例為1:1
            shuffle=True,
        )

        # 訓練模型
        model.train()
        optimizer.zero_grad()
        for batch in loader:
            batch = batch.to('cuda')
            optimizer.zero_grad()
            
            # 前向傳播
            cl, reg = model(data.x, data.edge_index, data.edge_attr, batch.edge_label_index[0].t(), batch.edge_label_index[1].t())
            
            # 計算損失
            # 計算分類損失
            classification_loss = F.cross_entropy(
                cl, 
                batch.edge_label.long()
            )
            
            # print(batch)
            # 根據 input_id 獲取對應的 scores
            mask_pos = batch.edge_label == 1
            # 計算正樣本的 scores
            mask_reg = reg[mask_pos]
            # 計算回歸損失 (預測互動分數)
            regression_loss = F.mse_loss(
                mask_reg, 
                data.scores[batch.input_id]
            )
            
            # 組合兩種損失
            loss =  classification_loss +  0.01 * regression_loss

            # 反向傳播
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            total_iter += 1
        
    avg_loss = epoch_loss / total_iter
    print(f'Epoch {epoch}, Average Loss: {avg_loss:.4f}')
    
    # Early stopping 檢查
    if avg_loss < best_loss - min_delta:
        best_loss = avg_loss
        patience_counter = 0
        # 保存最佳模型
        torch.save(model.state_dict(), 'best_model_v2_180.pth')
    else:
        patience_counter += 1
        
    if patience_counter >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs')
        break
        

        
    



Epoch 0, Average Loss: 5.9736




Epoch 1, Average Loss: 5.8989




Epoch 2, Average Loss: 5.8920




Epoch 3, Average Loss: 5.8852




Epoch 4, Average Loss: 5.8821




KeyboardInterrupt: 

In [7]:
# 加載最佳模型
model.load_state_dict(torch.load('best_model_v2_180.pth'))



<All keys matched successfully>

In [8]:
# 只使用歷史數據
test_itter = collaboration_df[collaboration_df['timestamp'] >= datetime(2025, 1, 1).timestamp()]

all_recommendations = []
# 創建時序數據加載器
test_snapshots = []
for _, itter_row in tqdm(test_itter.iterrows()):
    start_time = itter_row['timestamp'] - WINDOW_SIZE * 24 * 3600
    # 獲取當前窗口內的數據
    window_data = collaboration_df[
        (collaboration_df['timestamp'] >= start_time) & 
        (collaboration_df['timestamp'] < itter_row['timestamp'])
    ]
        
    if len(window_data) > 0:
        # 創建邊索引和特徵
        edge_index = []
        edge_feature = []
        scores = []
            
        for _, row in window_data.iterrows():
            artist1_idx = node_encoder.transform([row['source']])[0]
            artist2_idx = node_encoder.transform([row['target']])[0]
                
            edge_index.append([artist1_idx, artist2_idx])
                
            # 計算邊特徵
            views = row['views'] if 'views' in row else 0
            likes = row['likes'] if 'likes' in row else 0
            comments = row['comments'] if 'comments' in row else 0

            # 組合特徵
            feature = [ views, likes, comments]
            edge_feature.append(feature)
            scores.append(row['views'] * 0.5 + row['likes'] * 0.3 + row['comments'] * 0.2)
            
        # 轉換為張量
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_feature = torch.tensor(edge_feature, dtype=torch.float)

        
        # 創建圖數據
        data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_feature, scores=scores).to('cuda')
        
        
    
        source = itter_row['source']
        predictions = []
        with torch.no_grad():
            source_idx = node_encoder.transform([source])
            for j in node_encoder.classes_:
                if j == source:
                    continue
                target_idx = node_encoder.transform([j])
                cl, reg = model(data.x, data.edge_index, data.edge_attr, source_idx, target_idx)
            
                softmax_cl = F.softmax(cl, dim=1)
            
            
                is_positive = softmax_cl[:,1].item() > 0.5
            
                if is_positive:
                    predictions.append((j, softmax_cl[:,1].item(), reg.item(), is_positive))

    
    
        predictions.sort(key=lambda x: x[1], reverse=True)
    
        top_k_recommendations = predictions[:10]
    
        top_k_recommendations = top_k_recommendations.sort(key=lambda x: x[2], reverse=True)
    
        all_recommendations.append({
            'source': source,
        'label': itter_row['target'],
        'score': itter_row['views'] * 0.5 + itter_row['likes'] * 0.3 + itter_row['comments'] * 0.2,
        'recommendations': predictions
    })
        
        

519it [09:49,  1.14s/it]


In [7]:
test_itter.iloc[0]['source']


'BOYNEXTDOOR_SUNGHO'

In [9]:
# 計算評估指標
true_positives = 0
total_recommendations = 0
reciprocal_ranks = []
hits_at_k = 0
mses = []
    
for recommendations in all_recommendations:
    rank = 0
    selected_reg = 0
    for i,(n, cl, reg, is_positive) in enumerate(recommendations['recommendations']):
        if n == recommendations['label']:
            rank = i + 1
            selected_reg = reg
            break;
    
    if rank > 0:
        reciprocal_ranks.append(1.0 / rank)
        mses.append((selected_reg - recommendations['score']) ** 2)
        if rank <= 10:
            hits_at_k += 1
    else:
        reciprocal_ranks.append(0.0)
    
    total_recommendations += len(recommendations['recommendations'])
    
        
mrr = np.mean(reciprocal_ranks) if reciprocal_ranks else 0
hit_at_k = hits_at_k / len(all_recommendations) if len(all_recommendations) > 0 else 0


mrr_percentage = mrr * 100
hit_at_k_percentage = hit_at_k * 100
print(f"MRR: {mrr_percentage:.4f} %, Hit@10: {hit_at_k_percentage:.4f} %, MSE: {np.mean(mses):.4f}")

MRR: 9.7201 %, Hit@10: 23.5067 %, MSE: 882.0334


In [17]:
last_df = collaboration_df[collaboration_df['timestamp'] >= collaboration_df['timestamp'].max() - WINDOW_SIZE * 24 * 3600]

result = []
edge_index=[]
edge_feature=[]
scores=[]
for _, row in last_df.iterrows():
        
    artist1_idx = node_encoder.transform([row['source']])[0]
    artist2_idx = node_encoder.transform([row['target']])[0]
                
    edge_index.append([artist1_idx, artist2_idx])
                
    # 計算邊特徵
    views = row['views'] if 'views' in row else 0
    likes = row['likes'] if 'likes' in row else 0
    comments = row['comments'] if 'comments' in row else 0

    # 組合特徵
    feature = [ views, likes, comments]
    edge_feature.append(feature)
    scores.append(row['views'] * 0.5 + row['likes'] * 0.3 + row['comments'] * 0.2)
            
# 轉換為張量
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_feature = torch.tensor(edge_feature, dtype=torch.float)
    
        
# 創建圖數據
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_feature, scores=scores).to('cuda')
    
source = itter_row['source']
predictions = []
with torch.no_grad():
    for i in node_encoder.classes_:
        for j in node_encoder.classes_:
            if j == i:
                continue
            target_idx = node_encoder.transform([j])
            cl, reg = model(data.x, data.edge_index, data.edge_attr, source_idx, target_idx)
            
            softmax_cl = F.softmax(cl, dim=1)
            
            is_positive = softmax_cl[:,1].item() > 0.5
            
            if is_positive:
                predictions.append((j, softmax_cl[:,1].item(), reg.item(), is_positive))

        predictions.sort(key=lambda x: x[1], reverse=True)
    
        top_k_recommendations = predictions[:10]
    
        top_k_recommendations = top_k_recommendations.sort(key=lambda x: x[2], reverse=True)
    
        result.append({
            'source': i,
            'recommendations': predictions
        })
    
# 創建預測結果 DataFrame
    predictions_df = pd.DataFrame(result)
    
    # 保存為 CSV 文件
    output_dir = 'predictions'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    output_file = os.path.join(output_dir, 'predictions_eee.csv')
    predictions_df.to_csv(output_file, index=False)
    print(f"\n預測結果已保存至: {output_file}")
    


預測結果已保存至: predictions/predictions_eee.csv
