In [1]:
import argparse
import pickle

from loader import MoleculeDataset
from torch_geometric.data import DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm
import numpy as np

from model import GNN_graphpred, GNN_Bayes
from sklearn.metrics import roc_auc_score, accuracy_score

from splitters import scaffold_split
import pandas as pd

import os
import shutil

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import pandas 
import math 

import matplotlib.pyplot as plt 

from model import MLPregression 
from sklearn.manifold import TSNE 

criterion = nn.L1Loss()

K_1 = 16846
K_2 = 17.71 # 18

In [7]:
#分开的
def r2_score(y_true, y_pred):
    """
    y_true: torch.Tensor, shape (N,)
    y_pred: torch.Tensor, shape (N,)
    """
    y_true_mean = torch.mean(y_true)
    ss_tot = torch.sum((y_true - y_true_mean) ** 2)
    ss_res = torch.sum((y_true - y_pred) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2.item()

def eval(args, model, dataset, device, loader): # para_contrastive_loss,
    model.eval()

    y_scores = []
    y_scores_cl = []
    y_scores_vdss = []
    y_scores_t1_2 = []
    
    y_cl = []
    y_vdss = []
    y_t1_2 = []
    y_auc = []

    for step, batch in enumerate(tqdm(loader, desc="Iteration")):
        batch = batch.to(device)

        with torch.no_grad():
            pred_log = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            # pred_log, std= model(batch.x, batch.edge_index, batch.edge_attr, batch.batch,1, False)
            # cl_pred_log = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch, 0, True)
            # vdss_pred_log = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch, 1, True)
            # t1_2_pred_log = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch, 2, True)
            # pred_log = torch.log(pred)
            
            cl_y = batch.cl_y.view(batch.cl_y.size(0), 1)
            vdss_y = batch.vdss_y.view(batch.vdss_y.size(0), 1)
            t1_2_y = batch.t1_2_y.view(batch.t1_2_y.size(0), 1)

            y_scores.append(pred_log)
            # y_scores_cl.append(cl_pred_log)
            # y_scores_vdss.append(vdss_pred_log)
            # y_scores_t1_2.append(t1_2_pred_log)
            
            y_cl.append(cl_y)
            y_vdss.append(vdss_y)
            y_t1_2.append(t1_2_y)
            
    y_scores = torch.cat(y_scores, dim=0)
    # y_scores_cl = torch.cat(y_scores_cl, dim=0)
    # y_scores_vdss = torch.cat(y_scores_vdss, dim=0)
    # y_scores_t1_2 = torch.cat(y_scores_t1_2, dim=0)
    
    y_cl = torch.cat(y_cl, dim=0)
    y_vdss = torch.cat(y_vdss, dim=0)
    y_t1_2 = torch.cat(y_t1_2, dim=0)
    
    if dataset == 'cl':
        loss = criterion(y_scores, y_cl).cpu().detach().item()
        r2 = r2_score(y_cl, y_scores)
    elif dataset == 'vdss':
        loss = criterion(y_scores, y_vdss).cpu().detach().item()
        r2 = r2_score(y_vdss, y_scores)
    elif dataset == 't1_2':
        loss = criterion(y_scores, y_t1_2).cpu().detach().item()
        r2 = r2_score(y_t1_2, y_scores)
    elif dataset == '4':
        loss_cl = criterion(y_scores_cl, y_cl).cpu().detach().item()
        r2_cl = r2_score(y_cl, y_scores_cl)
        loss_vdss = criterion(y_scores_vdss, y_vdss).cpu().detach().item()
        r2_vdss =r2_score(y_vdss, y_scores_vdss)
        loss_t1_2 = criterion(y_scores_t1_2, y_t1_2).cpu().detach().item()
        r2_t1_2 =r2_score(y_t1_2, y_scores_t1_2)
    
    # return loss_cl,loss_vdss,loss_t1_2
    return loss,r2

def main():
    parser = argparse.ArgumentParser(description='PyTorch implementation of pre-training of graph neural networks')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size', type=int, default=32, ## 32: 1.009 # 64: 
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100, # 60 # 1500
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr', type=float, default=0.001, # 0.0001
                        help='learning rate (default: 0.001)')
    parser.add_argument('--lr_scale', type=float, default=1,
                        help='relative learning rate for the feature extraction layer (default: 1)')
    parser.add_argument('--decay', type=float, default=1e-9,
                        help='weight decay (default: 0)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5).')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='embedding dimensions (default: 300)')
    parser.add_argument('--dropout_ratio', type=float, default=0.0,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--graph_pooling', type=str, default="mean",
                        help='graph level pooling (sum, mean, max, set2set, attention)')
    parser.add_argument('--JK', type=str, default="last",
                        help='how the node features across layers are combined. last, sum, max or concat')
    parser.add_argument('--gnn_type', type=str, default="gin")
    parser.add_argument('--dataset', type=str, default = '4', help='root directory of dataset. For now, only classification.')
    
    parser.add_argument('--split', type = str, default="scaffold", help = "random or scaffold or random_scaffold")
    parser.add_argument('--eval_train', type=int, default=0, help='evaluating training or not')
    parser.add_argument('--num_workers', type=int, default=4, help='number of workers for dataset loading')
    parser.add_argument('--scheduler', action="store_true", default=False)
    parser.add_argument('--experiment_name', type=str, default="graphmae")
    parser.add_argument('--seed', type=int, default=42, help = "Seed for splitting the dataset.")
    parser.add_argument('--runseed', type=int, default=42, help = "Seed for minibatch selection, random initialization.")
    
    parser.add_argument('--input_vdss_model_file', type=str, default = 
                        '../results/20250324/finetune_mask/vdss/lr_0.001_decay_1e-05_bz_64_seed_42_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    parser.add_argument('--input_t1_2_model_file', type=str, default = 
                        '../results/20250312/finetune_l2/t1_2/lr_0.001_decay_1e-05_bz_128_seed_42_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    parser.add_argument('--input_cl_model_file', type=str, default = 
                        '../results/20250312/finetune_l2/cl/lr_0.001_decay_1e-05_bz_128_seed_42_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    parser.add_argument('--input_model_file', type=str, default = 
                        '../checkpoint/20250309/lr_0.001_decay_0_bz_256_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    
    ## add some argument 
    parser.add_argument('--dataset_type', type=int, default=1)
    parser.add_argument('--save', type=str, default='../results/20250407/finetune_mask/')
    args, unknown = parser.parse_known_args()

    torch.manual_seed(args.runseed)
    np.random.seed(args.runseed)
    # args.seed = args.runseed 
    args.experiment_name = 'lr'+'_'+str(args.lr)+'_'+'decay'+'_'+str(args.decay)+'_'+'bs'+'_'+str(args.batch_size)+'_'+'drop'+'_'+str(args.dropout_ratio)
    # os.makedirs(args.save+args.experiment_name, exist_ok= True)
    
    motif_list_path = '../dataset_reg/motif_list.pkl'
    print(f"文件 {motif_list_path} 存在，从文件中加载 motif_list...")
    with open(motif_list_path, 'rb') as f:
        motif_list = pickle.load(f)

    device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.runseed)

    if args.dataset == "4":
        num_tasks = 1
        valid_dataset_name = "4_valid"
    else:
        raise ValueError("Invalid dataset name.")

    ## set up pk dataset 
    valid_dataset = MoleculeDataset("../dataset_reg/"+valid_dataset_name, dataset=valid_dataset_name, motif_list=motif_list)
     
    val_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False, num_workers = args.num_workers)
    
    model = GNN_graphpred(args.num_layer, args.emb_dim, num_tasks, JK = args.JK, drop_ratio = args.dropout_ratio, graph_pooling = args.graph_pooling, gnn_type = args.gnn_type, classnum=args.dataset_type)
    # t1_2_model = GNN_graphpred(args.num_layer, args.emb_dim, num_tasks, JK = args.JK, drop_ratio = args.dropout_ratio, graph_pooling = args.graph_pooling, gnn_type = args.gnn_type, classnum=args.dataset_type)
    # cl_model = GNN_graphpred(args.num_layer, args.emb_dim, num_tasks, JK = args.JK, drop_ratio = args.dropout_ratio, graph_pooling = args.graph_pooling, gnn_type = args.gnn_type, classnum=args.dataset_type)
    
    root_folder = args.save
    sub_folders = ['cl', 'vdss', 't1_2']

    for sub in sub_folders:
        sub_path = os.path.join(root_folder, sub)
        # 遍历 sub_path 内的每个超参数模型文件夹
        for model_folder in os.listdir(sub_path):
            model_path = os.path.join(sub_path, model_folder)
            if os.path.isdir(model_path):
                best_model_file = os.path.join(model_path, 'best_model.pth')
                if os.path.exists(best_model_file):
                    print("load pretrained model from:", best_model_file)
                    model.load_state_dict(torch.load(best_model_file, map_location=device)[f'model_{sub}'])
                    model.to(device)
                    loss = eval(args, model, sub, device, val_loader)
                    new_data = pd.DataFrame({f'{sub}_loss':[loss], 
                             }, index=[f'{sub}+{model_folder}'])
    
                    save_path = args.save+"result.csv"

                    # 检查文件是否存在，存在则读取并追加新数据，否则新建
                    if os.path.exists(save_path):
                        existing_data = pd.read_csv(save_path, index_col='experiment_name')
                        # 使用 pd.concat 合并数据
                        updated_data = pd.concat([existing_data, new_data])
                    else:
                        updated_data = new_data

                    # 保存到CSV，保留索引（experiment_name作为行标签）
                    updated_data.to_csv(save_path, index_label='experiment_name')
                else:
                    print("没有找到 best_model.pth:", model_path)
    

if __name__ == "__main__":
    main()

In [3]:
#合并的
def r2_score(y_true, y_pred):
    """
    y_true: torch.Tensor, shape (N,)
    y_pred: torch.Tensor, shape (N,)
    """
    y_true_mean = torch.mean(y_true)
    ss_tot = torch.sum((y_true - y_true_mean) ** 2)
    ss_res = torch.sum((y_true - y_pred) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2.item()

def eval(args, model, device, loader): # para_contrastive_loss,
    model.eval()

    y_scores = []
    y_scores_cl = []
    y_scores_vdss = []
    y_scores_t1_2 = []
    
    y_cl = []
    y_vdss = []
    y_t1_2 = []
    y_auc = []

    for step, batch in enumerate(tqdm(loader, desc="Iteration")):
        batch = batch.to(device)

        with torch.no_grad():
            pred_log= model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            # pred_log, std= model(batch.x, batch.edge_index, batch.edge_attr, batch.batch,1, False)
            cl_pred_log = pred_log[:,[0]]
            vdss_pred_log = pred_log[:,[1]]
            t1_2_pred_log = pred_log[:,[2]]
            # pred_log = torch.log(pred)
            
            cl_y = batch.cl_y.view(batch.cl_y.size(0), 1)
            vdss_y = batch.vdss_y.view(batch.vdss_y.size(0), 1)
            t1_2_y = batch.t1_2_y.view(batch.t1_2_y.size(0), 1)

            # y_scores.append(pred_log)
            y_scores_cl.append(cl_pred_log)
            y_scores_vdss.append(vdss_pred_log)
            y_scores_t1_2.append(t1_2_pred_log)
            
            y_cl.append(cl_y)
            y_vdss.append(vdss_y)
            y_t1_2.append(t1_2_y)
            
    # y_scores = torch.cat(y_scores, dim=0)
    y_scores_cl = torch.cat(y_scores_cl, dim=0)
    y_scores_vdss = torch.cat(y_scores_vdss, dim=0)
    y_scores_t1_2 = torch.cat(y_scores_t1_2, dim=0)
    
    y_cl = torch.cat(y_cl, dim=0)
    y_vdss = torch.cat(y_vdss, dim=0)
    y_t1_2 = torch.cat(y_t1_2, dim=0)
    
    loss_cl = criterion(y_scores_cl, y_cl).cpu().detach().item()
    r2_cl = r2_score(y_cl, y_scores_cl)
    loss_vdss = criterion(y_scores_vdss, y_vdss).cpu().detach().item()
    r2_vdss =r2_score(y_vdss, y_scores_vdss)
    loss_t1_2 = criterion(y_scores_t1_2, y_t1_2).cpu().detach().item()
    r2_t1_2 =r2_score(y_t1_2, y_scores_t1_2)
    
    return loss_cl,loss_vdss,loss_t1_2
    # return loss,r2

def main():
    parser = argparse.ArgumentParser(description='PyTorch implementation of pre-training of graph neural networks')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size', type=int, default=32, ## 32: 1.009 # 64: 
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100, # 60 # 1500
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr', type=float, default=0.001, # 0.0001
                        help='learning rate (default: 0.001)')
    parser.add_argument('--lr_scale', type=float, default=1,
                        help='relative learning rate for the feature extraction layer (default: 1)')
    parser.add_argument('--decay', type=float, default=1e-9,
                        help='weight decay (default: 0)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5).')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='embedding dimensions (default: 300)')
    parser.add_argument('--dropout_ratio', type=float, default=0.0,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--graph_pooling', type=str, default="mean",
                        help='graph level pooling (sum, mean, max, set2set, attention)')
    parser.add_argument('--JK', type=str, default="last",
                        help='how the node features across layers are combined. last, sum, max or concat')
    parser.add_argument('--gnn_type', type=str, default="gin")
    parser.add_argument('--dataset', type=str, default = '4', help='root directory of dataset. For now, only classification.')
    
    parser.add_argument('--split', type = str, default="scaffold", help = "random or scaffold or random_scaffold")
    parser.add_argument('--eval_train', type=int, default=0, help='evaluating training or not')
    parser.add_argument('--num_workers', type=int, default=4, help='number of workers for dataset loading')
    parser.add_argument('--scheduler', action="store_true", default=False)
    parser.add_argument('--experiment_name', type=str, default="graphmae")
    parser.add_argument('--seed', type=int, default=42, help = "Seed for splitting the dataset.")
    parser.add_argument('--runseed', type=int, default=42, help = "Seed for minibatch selection, random initialization.")
    
    parser.add_argument('--input_vdss_model_file', type=str, default = 
                        '../results/20250324/finetune_mask/vdss/lr_0.001_decay_1e-05_bz_64_seed_42_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    parser.add_argument('--input_t1_2_model_file', type=str, default = 
                        '../results/20250312/finetune_l2/t1_2/lr_0.001_decay_1e-05_bz_128_seed_42_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    parser.add_argument('--input_cl_model_file', type=str, default = 
                        '../results/20250312/finetune_l2/cl/lr_0.001_decay_1e-05_bz_128_seed_42_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    parser.add_argument('--input_model_file', type=str, default = 
                        '../checkpoint/20250309/lr_0.001_decay_0_bz_256_dropout_0.0/best_model.pth', help='filename to read the model (if there is any)')
    
    ## add some argument 
    parser.add_argument('--dataset_type', type=int, default=1)
    parser.add_argument('--save', type=str, default='../results/20250703/finetune_1+2_mask0.5_together/')
    args, unknown = parser.parse_known_args()

    torch.manual_seed(args.runseed)
    np.random.seed(args.runseed)
    # args.seed = args.runseed 
    args.experiment_name = 'lr'+'_'+str(args.lr)+'_'+'decay'+'_'+str(args.decay)+'_'+'bs'+'_'+str(args.batch_size)+'_'+'drop'+'_'+str(args.dropout_ratio)
    # os.makedirs(args.save+args.experiment_name, exist_ok= True)
    
    motif_list_path = '../dataset_reg/motif_list.pkl'
    print(f"文件 {motif_list_path} 存在，从文件中加载 motif_list...")
    with open(motif_list_path, 'rb') as f:
        motif_list = pickle.load(f)

    device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.runseed)

    if args.dataset == "4":
        num_tasks = 3
        valid_dataset_name = "4_valid"
    else:
        raise ValueError("Invalid dataset name.")

    ## set up pk dataset 
    valid_dataset = MoleculeDataset("../dataset_new_desc/"+valid_dataset_name, dataset=valid_dataset_name, motif_list=motif_list)
     
    val_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False, num_workers = args.num_workers)
    
    model = GNN_graphpred(args.num_layer, args.emb_dim, num_tasks, JK = args.JK, drop_ratio = args.dropout_ratio, graph_pooling = args.graph_pooling, gnn_type = args.gnn_type, classnum=args.dataset_type)
    
    root_folder = args.save


    for model_folder in os.listdir(root_folder):
        model_path = os.path.join(root_folder,model_folder)
        if os.path.isdir(model_path):
            best_model_file = os.path.join(model_path, 'best_model.pth')
            if os.path.exists(best_model_file):
                print("load pretrained model from:", best_model_file)
                model.load_state_dict(torch.load(best_model_file, map_location=device))
                model.to(device)
                loss_cl,loss_vdss,loss_t1_2 = eval(args, model, device, val_loader)
                new_data = pd.DataFrame({'loss_cl':loss_cl, 
                                         'loss_vdss':loss_vdss,
                                         'loss_t1_2':loss_t1_2
                         }, index=[model_folder])

                save_path = args.save+"result.csv"

                # 检查文件是否存在，存在则读取并追加新数据，否则新建
                if os.path.exists(save_path):
                    existing_data = pd.read_csv(save_path, index_col='experiment_name')
                    # 使用 pd.concat 合并数据
                    updated_data = pd.concat([existing_data, new_data])
                else:
                    updated_data = new_data

                # 保存到CSV，保留索引（experiment_name作为行标签）
                updated_data.to_csv(save_path, index_label='experiment_name')
            else:
                print("没有找到 best_model.pth:", model_path)


if __name__ == "__main__":
    main()

文件 ../dataset_reg/motif_list.pkl 存在，从文件中加载 motif_list...


  loaded_data = torch.load(self.processed_paths[0])
  model.load_state_dict(torch.load(best_model_file, map_location=device))


load pretrained model from: ../results/20250703/finetune_1+2_mask0.5_together/lr_0.001_decay_1e-05_bs_64_dropout_0.0_beta_1.0/best_model.pth


Iteration: 100%|██████████| 2/2 [00:00<00:00,  2.23it/s]


load pretrained model from: ../results/20250703/finetune_1+2_mask0.5_together/lr_0.001_decay_1e-05_bs_32_dropout_0.0_beta_0.0/best_model.pth


Iteration: 100%|██████████| 2/2 [00:00<00:00, 16.21it/s]
  model.load_state_dict(torch.load(best_model_file, map_location=device))


load pretrained model from: ../results/20250703/finetune_1+2_mask0.5_together/lr_0.001_decay_1e-05_bs_64_dropout_0.0_beta_0.0/best_model.pth


Iteration: 100%|██████████| 2/2 [00:00<00:00, 13.80it/s]
  model.load_state_dict(torch.load(best_model_file, map_location=device))


load pretrained model from: ../results/20250703/finetune_1+2_mask0.5_together/lr_0.001_decay_1e-05_bs_32_dropout_0.0_beta_1.0/best_model.pth


Iteration: 100%|██████████| 2/2 [00:00<00:00, 15.23it/s]


In [4]:
import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('../results/20250327/finetune_0/result.csv')  # 这里替换为你的 CSV 文件路径

# 处理 NaN 值，填充成一个很大的数，确保不会影响 min 操作
df.fillna(float('inf'), inplace=True)

# 找到每种实验的最小 loss 及其 index
best_cl = df.loc[df['cl_loss'].idxmin(), ['experiment_name', 'cl_loss']]
best_vdss = df.loc[df['vdss_loss'].idxmin(), ['experiment_name', 'vdss_loss']]
best_t1_2 = df.loc[df['t1_2_loss'].idxmin(), ['experiment_name', 't1_2_loss']]

# 打印最佳实验的 index 和 loss
print(f"最佳 cl_loss 实验: {best_cl['experiment_name']}，Loss: {best_cl['cl_loss']}")
print(f"最佳 vdss_loss 实验: {best_vdss['experiment_name']}，Loss: {best_vdss['vdss_loss']}")
print(f"最佳 t1_2_loss 实验: {best_t1_2['experiment_name']}，Loss: {best_t1_2['t1_2_loss']}")


最佳 cl_loss 实验: cl+lr_0.01_decay_1e-06_bz_32_seed_42_dropout_0.0，Loss: 0.6789669990539551
最佳 vdss_loss 实验: vdss+lr_0.01_decay_1e-10_bz_32_seed_42_dropout_0.3，Loss: 0.6753852963447571
最佳 t1_2_loss 实验: t1_2+lr_0.0001_decay_1e-06_bz_32_seed_42_dropout_0.1，Loss: 0.7618190050125122


In [None]:
from loader import MoleculeDataset
motif_list=[]
cl_train_dataset = MoleculeDataset("dataset_reg/cl_train", dataset="cl_train", motif_list=motif_list)
cl_valid_dataset = MoleculeDataset("dataset_reg/cl_valid", dataset="cl_valid", motif_list=motif_list)

vdss_train_dataset = MoleculeDataset("dataset_reg/vdss_train", dataset="vdss_train", motif_list=motif_list)
vdss_valid_dataset = MoleculeDataset("dataset_reg/vdss_valid", dataset="vdss_valid", motif_list=motif_list)

t1_2_train_dataset = MoleculeDataset("dataset_reg/t1_2_train", dataset="t1_2_train", motif_list=motif_list)
t1_2_valid_dataset = MoleculeDataset("dataset_reg/t1_2_valid", dataset="t1_2_valid", motif_list=motif_list)

print(len(cl_train_dataset))
print(len(vdss_train_dataset))
print(len(t1_2_train_dataset))