In [1]:
import numpy as np
import networkx as nx
import node2vec
import json
import os
from datetime import datetime, timedelta
from collections import Counter, defaultdict
import torch
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import torch.optim as optim
import torch.nn as nn
from gensim.models import Word2Vec

In [2]:
# 获取当前文件的目录
# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# 获取项目根目录
# ROOT_DIR = os.path.dirname(CURRENT_DIR)

def load_collaboration_data():
    """加载已清洗的合作视频数据"""
    # data_path = os.path.join("data", "collaboration_videos.json")
    with open('../data/collaboration_videos.json', 'r', encoding='utf-8') as f:
        return json.load(f)
    
def check_should_be_ignored(video, cut_off_date, time_window):
    """檢查是否應該忽略影片"""
    
    video_date = datetime.fromtimestamp(video['timestamp'])
    if cut_off_date:
        if video_date >= cut_off_date or video_date < cut_off_date - timedelta(days=time_window):
            return True
    return False

    
def build_collaboration_graph(collaboration_data, time_window=30, cut_off_date=None):
    """建立合作网络图"""
    G = nx.DiGraph()
    
    # 建立節點
    for video in collaboration_data:
        source_member = video['source']
        target_member = video['target']
        G.add_node(f"{source_member[0]}_{source_member[1]}")
        G.add_node(f"{target_member[0]}_{target_member[1]}")
    
    # 建立邊
    for video in collaboration_data:
        # 如果時間窗口過期，則跳過
        if check_should_be_ignored(video, cut_off_date, time_window):
            continue

        source_member = video['source']
        target_member = video['target']

        G.add_edge(f"{source_member[0]}_{source_member[1]}", f"{target_member[0]}_{target_member[1]}")
    
    # 计算边的权重
    idol_values = defaultdict(list)
    for video in collaboration_data:
        # 如果時間窗口過期，則跳過
        if check_should_be_ignored(video, cut_off_date, time_window):
            continue
        
        source_member = video['source']
        target_member = video['target']
        effectiveness = video['views'] * 0.5 + video['likes'] * 0.3 + video['comments'] * 0.2
        
        combination = (f"{source_member[0]}_{source_member[1]}", f"{target_member[0]}_{target_member[1]}")
        idol_values[combination].append(effectiveness)
        
    # 计算平均效益
    idol_avg = {name: sum(vals) / len(vals) for name, vals in idol_values.items()}
    
    # 更新图的权重
    for name, effectiveness in idol_avg.items():
        source_name, target_name = name
        
        distance = float('inf')
        if effectiveness and effectiveness > 0:
            distance = 1 / effectiveness
        
        G.add_edge(source_name, target_name, weight=distance, effectiveness=effectiveness)
    return G

In [22]:
collaboration_data = load_collaboration_data()

In [23]:
last_video = collaboration_data[-1]
last_video

{'source': ['IZNA', 'BANGJEEMIN'],
 'target': ['IZNA', 'MAI'],
 'timestamp': 1744344109.0,
 'views': 2.5,
 'likes': 2.5,
 'comments': 2.5,
 'video_id': 'zmFv1Gm4dzw'}

In [24]:
G = build_collaboration_graph(collaboration_data, time_window=60, cut_off_date=datetime.fromtimestamp(last_video['timestamp']))

G

<networkx.classes.digraph.DiGraph at 0x3078afd90>

In [25]:

vec_G = node2vec.Graph(G, is_directed=True, p=1, q=1)
vec_G.preprocess_transition_probs()
walks = vec_G.simulate_walks(num_walks=10, walk_length=80)
walks

[['DKB_HARRY_JUNE'],
 ['FIFTYFIFTY_HANA'],
 ['NCT127_MARK'],
 ['MEOVV_NARIN'],
 ['EXO_SUHO'],
 ['BADVILLAIN_EMMA'],
 ['KICKFLIP_MINJE'],
 ['NMIXX_KYUJIN', 'NMIXX_JIWOO', 'ILLIT_MINJU'],
 ['SF9_YOOTAEYANG'],
 ['APINK_HAYOUNG'],
 ['KEP1ER_HUENING_BAHIYYIH'],
 ['TRIPLES_YUBIN'],
 ['NEWJEANS_HAERIN'],
 ['CRAVITY_TAEYOUNG'],
 ['DREAMCATCHER_DAMI'],
 ['NMIXX_SULLYOON', 'THEBOYZ_YOUNGHOON'],
 ['SEVENTEEN_VERNON'],
 ['BTOB_SEOEUNKWANG'],
 ['RIIZE_EUNSEOK'],
 ['CRAVITY_MINHEE'],
 ['BADVILLAIN_CHLOEYOUNG'],
 ['ENHYPEN_NI_KI'],
 ['THEBOYZ_HYUNJAE'],
 ['EVERGLOW_EU'],
 ['TWS_JIHOON'],
 ['OHMYGIRL_MIMI'],
 ['TOMORROW_X_TOGETHER_SOOBIN'],
 ['NCTDREAM_HAECHAN'],
 ['TREASURE_ASAHI',
  'TREASURE_YOSHI',
  'TREASURE_ASAHI',
  'TREASURE_YOONJAEHYUK',
  'TREASURE_HARUTO',
  'TREASURE_YOONJAEHYUK',
  'TREASURE_ASAHI',
  'TREASURE_YOONJAEHYUK',
  'TREASURE_DOYOUNG',
  'TREASURE_PARKJUNGWOO',
  'TREASURE_HARUTO',
  'LE_SSERAFIM_SAKURA'],
 ['TRIPLES_MAYU'],
 ['YOUNGPOSSE_YEONJUNG'],
 ['TWS_KYUNGMIN'],
 ['ATEE

In [26]:
from gensim.models import Word2Vec
def learn_embeddings(walks):
	'''
	Learn embeddings by optimizing the Skipgram objective using SGD.
	'''
	walks = [list(map(str, walk)) for walk in walks]
	model = Word2Vec(walks, vector_size=256, window=10, min_count=0, sg=1, workers=8, epochs=5)

	return model

model = learn_embeddings(walks)

model.wv.vectors.shape

(405, 256)

In [29]:
avg_embeddings = np.average(model.wv.vectors, axis=0)
print(avg_embeddings.shape)
print(avg_embeddings)

(256,)
[ 1.90104730e-02 -3.88949923e-02  6.47766069e-02  1.10087320e-02
  3.85260917e-02  1.47518935e-02  7.90772494e-03  4.28736024e-03
 -3.15086171e-02 -7.48367468e-03  2.36286949e-02  3.07109226e-02
 -6.39100298e-02 -7.60283470e-02  1.90641563e-02  5.59422523e-02
  6.76890686e-02  2.34608278e-02 -1.81519277e-02  4.78226878e-02
  3.33952019e-04 -1.22195575e-02 -8.79658107e-03 -1.92259867e-02
 -6.88981339e-02 -2.70887259e-02  4.11097333e-02  9.66833439e-03
 -2.48834305e-02 -4.47242782e-02  1.02615065e-03 -4.20387872e-02
 -1.52716208e-02  2.14639436e-02 -9.53542367e-02  5.96036501e-02
  4.74108048e-02 -4.51736674e-02 -5.10457670e-04 -1.95344295e-02
 -8.01449344e-02  7.24741220e-02 -1.64881237e-02  1.23823909e-02
  2.14683060e-02 -6.44139247e-03  6.75367308e-04  9.15765855e-03
 -4.15731072e-02  2.75410973e-02 -1.42327994e-02  3.62768024e-02
  7.78341070e-02 -9.57623404e-03 -6.07678071e-02 -1.94096491e-02
  6.51553199e-02 -1.69388093e-02  1.00147342e-02  2.33654454e-02
 -1.36637641e-02  

In [33]:
concate_embeddings = np.concatenate([avg_embeddings.reshape(1, -1), avg_embeddings.reshape(1, -1)], axis=0)
print(concate_embeddings.shape)
print(concate_embeddings)

(2, 256)
[[ 1.90104730e-02 -3.88949923e-02  6.47766069e-02  1.10087320e-02
   3.85260917e-02  1.47518935e-02  7.90772494e-03  4.28736024e-03
  -3.15086171e-02 -7.48367468e-03  2.36286949e-02  3.07109226e-02
  -6.39100298e-02 -7.60283470e-02  1.90641563e-02  5.59422523e-02
   6.76890686e-02  2.34608278e-02 -1.81519277e-02  4.78226878e-02
   3.33952019e-04 -1.22195575e-02 -8.79658107e-03 -1.92259867e-02
  -6.88981339e-02 -2.70887259e-02  4.11097333e-02  9.66833439e-03
  -2.48834305e-02 -4.47242782e-02  1.02615065e-03 -4.20387872e-02
  -1.52716208e-02  2.14639436e-02 -9.53542367e-02  5.96036501e-02
   4.74108048e-02 -4.51736674e-02 -5.10457670e-04 -1.95344295e-02
  -8.01449344e-02  7.24741220e-02 -1.64881237e-02  1.23823909e-02
   2.14683060e-02 -6.44139247e-03  6.75367308e-04  9.15765855e-03
  -4.15731072e-02  2.75410973e-02 -1.42327994e-02  3.62768024e-02
   7.78341070e-02 -9.57623404e-03 -6.07678071e-02 -1.94096491e-02
   6.51553199e-02 -1.69388093e-02  1.00147342e-02  2.33654454e-02
 

In [26]:
def load_node_list():
    with open('node_list.csv', 'r') as f:
        return [line.strip().split(',')[1] for line in f.readlines()[1:]]
    
    
def train_model(collaboration_data, model):
    node_list = load_node_list()
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for i,video in tqdm(enumerate(collaboration_data)):
        if video['timestamp'] >= datetime.fromisoformat("2025-01-01").timestamp() or video['timestamp'] < datetime.fromisoformat("2020-01-01").timestamp() + 60 * 60 * 24 * 60:
            continue
        
        G = build_collaboration_graph(collaboration_data, time_window=60, cut_off_date=datetime.fromtimestamp(video['timestamp']))
        
        vec_G = node2vec.Graph(G, is_directed=True, p=1, q=1)
        vec_G.preprocess_transition_probs()
        walks = vec_G.simulate_walks(num_walks=1, walk_length=80)
        walks = [list(map(str, walk)) for walk in walks]
        embedding_model = Word2Vec(walks, vector_size=256, window=10, min_count=0, sg=1, workers=8, epochs=5)
        
        source_member = video['source']
        target_member = video['target']
        
        source_name = f"{source_member[0]}_{source_member[1]}"
        target_name = f"{target_member[0]}_{target_member[1]}"

        graph_embedding = np.average(embedding_model.wv.vectors, axis=0).reshape(1, -1)
        
        source_embedding = embedding_model.wv[source_name].reshape(1, -1)
        
        input_embeddings = torch.tensor(np.concatenate([graph_embedding, source_embedding], axis=1))
        output = model(input_embeddings)
        
        target_one_hot = torch.zeros(405)
        target_node_index = node_list.index(target_name)
        target_one_hot[target_node_index] = 1
        target_one_hot = target_one_hot.view(1, -1)        
        
        loss = nn.functional.cross_entropy(output, torch.tensor([target_node_index]))
        loss.backward()
        optimizer.step()

        print(f"Loss = {loss.item():.4f}")
        return
        
collaboration_data = load_collaboration_data()



In [20]:



import torch
import torch.nn as nn

class MLPClassifier(nn.Module):
    def __init__(self, embedding_dim=512, hidden1=1024, hidden2=512, num_classes=405, dropout_p=0.5):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden1)
        self.bn1 = nn.LayerNorm(hidden1)       # 可选
        self.relu1 = nn.ReLU(inplace=True)
        self.drop1 = nn.Dropout(dropout_p)        # 可选

        self.fc2 = nn.Linear(hidden1, hidden2)
        self.bn2 = nn.LayerNorm(hidden2)       # 可选
        self.relu2 = nn.ReLU(inplace=True)
        self.drop2 = nn.Dropout(dropout_p)       # 可选

        self.fc3 = nn.Linear(hidden2, num_classes)

        # 权重初始化（可选）
        nn.init.kaiming_uniform_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.fc2.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc3.weight)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.drop2(x)

        logits = self.fc3(x)
        return nn.functional.softmax(logits, dim=1)

In [25]:
model = MLPClassifier()
train_model(collaboration_data, model)

0it [00:00, ?it/s]

input_embeddings torch.Size([1, 512])
output tensor([[0.0024, 0.0012, 0.0043, 0.0070, 0.0067, 0.0010, 0.0055, 0.0009, 0.0150,
         0.0017, 0.0022, 0.0006, 0.0004, 0.0023, 0.0018, 0.0005, 0.0056, 0.0003,
         0.0025, 0.0022, 0.0006, 0.0026, 0.0012, 0.0007, 0.0016, 0.0023, 0.0046,
         0.0009, 0.0008, 0.0012, 0.0019, 0.0009, 0.0035, 0.0008, 0.0022, 0.0074,
         0.0027, 0.0022, 0.0015, 0.0013, 0.0005, 0.0030, 0.0037, 0.0021, 0.0005,
         0.0012, 0.0028, 0.0065, 0.0037, 0.0017, 0.0008, 0.0004, 0.0049, 0.0030,
         0.0012, 0.0002, 0.0004, 0.0002, 0.0010, 0.0015, 0.0013, 0.0008, 0.0024,
         0.0005, 0.0006, 0.0050, 0.0020, 0.0020, 0.0015, 0.0005, 0.0053, 0.0018,
         0.0048, 0.0025, 0.0031, 0.0074, 0.0003, 0.0015, 0.0009, 0.0054, 0.0010,
         0.0036, 0.0017, 0.0004, 0.0020, 0.0045, 0.0002, 0.0023, 0.0029, 0.0010,
         0.0013, 0.0022, 0.0021, 0.0021, 0.0010, 0.0016, 0.0004, 0.0013, 0.0067,
         0.0043, 0.0011, 0.0009, 0.0001, 0.0012, 0.0026, 0.0039,




In [19]:

import torch.optim as optim

model = MLPClassifier()


def train_model(model, dataset):
    train_loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)  
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(10):
        for batch_idx, (embeddings, source, target) in enumerate(train_loader):
            embeddings = embeddings.view(embeddings.size(0), -1)
            source = source.view(source.size(0), -1)
            target = target.view(target.size(0), -1)
            
            optimizer.zero_grad()
            
            
            
            output = model(embeddings, source)
            loss = nn.functional.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            
            
            

    # 加载已清洗的数据
    collaboration_data = load_collaboration_data()
    time_window = 60
    
    # 为每个视频生成推荐
    output_rows = []
    for video in collaboration_data:
        if video['timestamp'] >= datetime.fromisoformat("2025-01-01").timestamp() or video['timestamp'] < datetime.fromisoformat("2020-05-01").timestamp():
            continue

        G = build_collaboration_graph(collaboration_data, time_window=time_window, cut_off_date=datetime.fromtimestamp(video['timestamp']))
        vec_G = node2vec.Graph(G, is_directed=True, p=1, q=1)
        vec_G.preprocess_transition_probs()
        walks = vec_G.simulate_walks(num_walks=10, walk_length=80)
        walks = [list(map(str, walk)) for walk in walks]
        embedding_model = Word2Vec(walks, vector_size=256, window=10, min_count=0, sg=1, workers=8, epochs=5)
        
        source_member = video['source']
        target_member = video['target']
        
        source_name = f"{source_member[0]}_{source_member[1]}"
        target_name = f"{target_member[0]}_{target_member[1]}"
        
        
        
        if recommendations:
            recommend_idol = []
            for recommend_team in recommendations:
                recommend_idol.append(recommend_team[0])
            
            output_rows.append({
                "date": datetime.fromtimestamp(video['timestamp']).strftime("%Y-%m-%d"),
                "initiator": source_name,
                "collaborator": target_name,
                "recommend_idol": ", ".join(recommend_idol),
                "video_id": video.get("video_id", ""),
            })
    
    # 保存结果
    output_file = os.path.join(CURRENT_DIR, f"predictions_{time_window}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv")
    df_output = pd.DataFrame(output_rows)
    df_output.to_csv(output_file, index=False, encoding='utf-8-sig')

True

In [1]:
import torch
import torch.nn.functional as F
a =  F.softmax(torch.tensor([[0.2,0.1,0.7], [0.1,0.2,0.7]]), dim=1)


In [2]:
for i in range(len(a)):
    print(a[i][1])

tensor(0.2546)
tensor(0.2814)
