In [2]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
from ase.db import connect
from src import data_proc

# Select the compute device.
# GPU index 2 is preferred (the original setting), but a host with fewer than
# three visible GPUs would fail on "cuda:2" at the first .to(device) call, so
# fall back to GPU 0 in that case and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    _gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device(f"cuda:{_gpu_index}")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Reduce a 3x3 tensor (used for the targets) to a single scalar.
# NOTE: despite its name, this function performs no Gram-Schmidt
# orthogonalization; it only averages the diagonal of the matrix.
def schmidt_orthogonalization(vectors):
    """Return the mean of the diagonal of ``vectors`` viewed as a 3x3 matrix.

    Args:
        vectors: NumPy array holding exactly nine elements, in any shape that
            ``reshape(3, 3)`` accepts.

    Returns:
        The arithmetic mean of the three diagonal entries.
    """
    matrix = vectors.reshape(3, 3)
    return np.mean(np.diagonal(matrix))

# MLP regression head
class MLP(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Size of the output vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names (fc1/fc2/fc3) are the state_dict keys of saved
        # checkpoints, so they must stay as they are.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the two hidden layers with ReLU, then the linear output layer."""
        for hidden_layer in (self.fc1, self.fc2):
            x = self.relu(hidden_layer(x))
        return self.fc3(x)

# Dataset over the rows of an ASE database
class CrystalDataset(Dataset):
    """Yields ``(atom_feature, target)`` pairs built from ASE database rows.

    All rows are read once in ``__init__`` and kept in ``self.entries``.
    An item whose feature/target construction raises ``ValueError`` is
    reported on stdout and returned as ``None``; ``collate_fn`` filters those.
    """

    def __init__(self, db_path):
        self.db = connect(db_path)
        # Materialise every selected row up front; indexing then works on
        # this list.
        self.entries = [row for row in self.db.select()]

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        """Return ``(atom_feature, target)`` float32 tensors, or ``None``.

        The target is read from ``row.data['dielectric']``. The feature comes
        from ``data_proc.get_crystal_path_muhead``; collate_fn indexes it as a
        3-D array, presumably (num_heads, num_atoms, embed_dim) — TODO confirm.
        """
        row = self.entries[idx]
        structure = row.toatoms()
        try:
            raw_feature = data_proc.get_crystal_path_muhead(ase_obj=True, stru=structure, num_heads=1)
            target = torch.tensor(row.data['dielectric'], dtype=torch.float32)  # keep the target property
            atom_feature = torch.tensor(raw_feature, dtype=torch.float32)
        except ValueError as e:
            print(f"Skipping structure {idx} due to error: {e}")
            return None
        return atom_feature, target

# Custom collate function
def collate_fn(batch):
    """Pad variable-length samples into dense tensors, one row per (sample, head).

    Args:
        batch: List of ``(atom_feature, target)`` tuples or ``None`` entries
            (failed samples). Each ``atom_feature`` is indexed as
            (num_heads, num_atoms, embed_dim); the head count and embedding
            size are taken from the first valid sample.

    Returns:
        ``None`` when no valid sample remains, otherwise a tuple of
        ``padded_features`` (batch*heads, max_atoms, embed_dim),
        ``attention_masks`` (batch*heads, max_atoms; 1 for real atoms, 0 for
        padding) and ``flattened_targets`` (batch*heads, *target_shape).
    """
    valid_samples = [sample for sample in batch if sample is not None]
    if not valid_samples:
        return None

    atom_features, targets = zip(*valid_samples)
    first_feature = atom_features[0]
    num_heads, embed_dim = first_feature.shape[0], first_feature.shape[2]
    max_atoms = max(feat.shape[1] for feat in atom_features)
    total_rows = len(atom_features) * num_heads

    # Zero-initialised buffers: anything not overwritten below is padding.
    padded_features = torch.zeros((total_rows, max_atoms, embed_dim), dtype=torch.float32)
    attention_masks = torch.zeros((total_rows, max_atoms), dtype=torch.float32)
    flattened_targets = torch.zeros((total_rows, *targets[0].shape), dtype=torch.float32)

    # Rows are laid out sample-major: all heads of sample 0, then sample 1, ...
    row = 0
    for feat, target in zip(atom_features, targets):
        num_atoms = feat.shape[1]
        for head in range(num_heads):
            padded_features[row, :num_atoms, :] = feat[head]
            attention_masks[row, :num_atoms] = 1
            flattened_targets[row] = target
            row += 1

    return padded_features, attention_masks, flattened_targets

# GRU encoder
class GRUEncoder(nn.Module):
    """Encode a sequence with a stacked GRU and project its final state.

    Args:
        input_dim: Feature size of each sequence element.
        hidden_dim: GRU hidden size.
        output_dim: Size of the projected output vector.
        num_layers: Number of stacked GRU layers (default 2).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` (batch-first sequence) to one ``output_dim`` vector per sequence."""
        # The GRU returns (all_outputs, h_n); h_n is (num_layers, batch, hidden_dim).
        _, final_states = self.gru(x)
        # Use only the top layer's final hidden state as the sequence summary.
        top_layer_state = final_states[-1]
        return self.fc(top_layer_state)

class ContrastiveModel(nn.Module):
    """Wraps a shared ``GRUEncoder`` for single-input or paired encoding.

    Also owns ``logit_scale``, a scalar parameter initialised to
    ``log(1 / 0.07)``; it is not used in ``forward``.
    """

    def __init__(self, feature_dim, hidden_dim, output_dim):
        super().__init__()
        self.encoder = GRUEncoder(feature_dim, hidden_dim, output_dim)
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

    def forward(self, x1, x2=None):
        """Encode ``x1``; if ``x2`` is given, encode both and return ``(z1, z2)``."""
        z1 = self.encoder(x1)
        if x2 is None:
            return z1
        return z1, self.encoder(x2)
# Load the pretrained contrastive encoder
feature_dim = 384
hidden_dim = 256
output_dim = 128
model = ContrastiveModel(feature_dim, hidden_dim, output_dim).to(device)
# map_location remaps the stored tensors onto the current device, so the
# checkpoint loads even when the GPU it was saved from is not present here.
# weights_only=True restricts unpickling to tensors/containers, which is all a
# state_dict needs, and silences the torch.load FutureWarning.
model.load_state_dict(
    torch.load("best_path_feature.pth", map_location=device, weights_only=True)
)
model.eval()

# Build the dataset and loader.
# shuffle=False keeps database order, which the later 80/20 splits rely on.
# NOTE(review): the database path is an absolute, machine-specific path.
dataset = CrystalDataset("/data/home/hzw1010/suth/elec_gw/dbs/clean.db")
dataloader = DataLoader(dataset, batch_size=64, num_workers=2, pin_memory=False,
                        shuffle=False, collate_fn=collate_fn, drop_last=False)

# Feature extraction
def extract_features(model, dataloader, device):
    """Run ``model`` over every batch and collect its outputs with the targets.

    Args:
        model: Callable applied to the padded feature tensor of each batch.
        dataloader: Iterable yielding ``(features, masks, targets)`` tuples,
            or ``None`` for batches in which every sample was skipped
            (this is what ``collate_fn`` returns for an empty batch).
        device: Device the feature tensor is moved to before the forward pass.

    Returns:
        ``(features, targets)``: CPU tensors concatenated along dim 0.

    Raises:
        RuntimeError: If the loader produced no valid batch at all.
    """
    features = []
    targets = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting Features"):
            # Check for None BEFORE unpacking: unpacking in the for-header
            # would raise TypeError on an empty (None) batch.
            if batch is None:
                continue
            batch_features, _batch_masks, batch_targets = batch
            # NOTE(review): the masks are not passed to the model, so padded
            # positions are fed to it as zeros — confirm this matches how the
            # encoder was trained.
            z = model(batch_features.to(device))
            features.append(z.cpu())
            targets.append(batch_targets.cpu())
    if not features:
        raise RuntimeError("extract_features: the dataloader produced no valid batches")
    return torch.cat(features, dim=0), torch.cat(targets, dim=0)

# Target post-processing
def process_targets(targets):
    """Reduce every target in ``targets`` to a scalar.

    Each element obtained by iterating ``targets`` is converted to a NumPy
    array and passed to ``schmidt_orthogonalization``.

    Args:
        targets: Tensor (or iterable of CPU tensors) with one target per
            entry along the first dimension.

    Returns:
        1-D float32 tensor with one scalar per input target.
    """
    scalar_targets = [schmidt_orthogonalization(entry.numpy()) for entry in targets]
    return torch.tensor(scalar_targets, dtype=torch.float32)

# Extract encoder features for the whole dataset, then reduce each raw target
# to a scalar. `features` and `processed_targets` are reused by the later cells.
features, targets = extract_features(model, dataloader, device)
processed_targets = process_targets(targets)

Using device: cuda:2


  model.load_state_dict(torch.load("best_path_feature.pth"))


Extracting Features:   0%|          | 0/114 [00:00<?, ?it/s]

In [9]:
import pandas as pd
from autogluon.tabular import TabularPredictor

# Sequential (unshuffled) 80/20 split of the extracted features.
dataset_size = len(features)
train_size = int(0.8 * dataset_size)

# Labels for both splits.
train_labels = pd.Series(processed_targets[:train_size].numpy())
val_labels = pd.Series(processed_targets[train_size:].numpy())

# Feature frames; the training frame also carries the label column "target".
# The validation frame gets its label column later, in the evaluation cell.
train_data = pd.DataFrame(features[:train_size].numpy()).assign(target=train_labels)
val_data = pd.DataFrame(features[train_size:].numpy())

predictor = TabularPredictor(label="target", eval_metric="mean_squared_error")

No path specified. Models will be saved in: "AutogluonModels/ag-20250116_060244"


In [10]:
# Fit AutoGluon on the training frame (label column "target").
# time_limit is in seconds: 36000 s = 10 h.
predictor.fit(
    train_data, 
    presets="best_quality",
    time_limit=36000
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.9.21
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Jun 1 16:14:33 UTC 2021
CPU Count:          96
Memory Avail:       234.09 GB / 251.30 GB (93.2%)
Disk Space Avail:   529624.39 GB / 596060.00 GB (88.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 9000s of the 36000s

[36m(_ray_fit pid=2100138)[0m [1000]	valid_set's l2: 21.1798
[36m(_ray_fit pid=2100138)[0m [2000]	valid_set's l2: 21.1433


[36m(_dystack pid=2088280)[0m 	-23.6093	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	5.86s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.04s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 5959.26s of the 8955.03s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.02%)
[36m(_dystack pid=2088280)[0m 	-25.0074	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	3.17s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.02s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 5935.10s of the 8930.87s of remaining time.
[36m(_dystack pid=2088280)[0m 	-27.1788	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	5.03s	 = Training   

[36m(_ray_fit pid=2131446)[0m [1000]	valid_set's l2: 27.2697
[36m(_ray_fit pid=2131453)[0m [1000]	valid_set's l2: 24.2149
[36m(_ray_fit pid=2131446)[0m [2000]	valid_set's l2: 27.2692


[36m(_dystack pid=2088280)[0m 	-27.3011	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	22.74s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.06s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r177_BAG_L1 ... Training model for up to 5526.50s of the 8522.27s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.03%)
[36m(_dystack pid=2088280)[0m 	-24.8064	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	71.9s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.04s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetTorch_r79_BAG_L1 ... Training model for up to 5429.52s of the 8425.29s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, 

[36m(_ray_fit pid=2140729)[0m [1000]	valid_set's l2: 22.291


[36m(_dystack pid=2088280)[0m 	-24.8941	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	9.07s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.06s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetFastAI_r191_BAG_L1 ... Training model for up to 5266.45s of the 8262.22s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_ray_fit pid=2142435)[0m No improvement since epoch 20: early stopping
[36m(_dystack pid=2088280)[0m 	-22.0727	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	43.92s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.1s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r9_BAG_L1 ... Training model for up to 5193.79s of the 8189.56s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child mode

[36m(_ray_fit pid=2177008)[0m [1000]	valid_set's l2: 25.88[32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=2177009)[0m [7000]	valid_set's l2: 21.0293[32m [repeated 28x across cluster][0m


[36m(_dystack pid=2088280)[0m 	-23.6188	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	11.65s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.18s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetTorch_r22_BAG_L1 ... Training model for up to 4767.25s of the 7763.02s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-18.059	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	141.02s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.07s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: XGBoost_r33_BAG_L1 ... Training model for up to 4599.19s of the 7594.96s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, p

[36m(_ray_fit pid=2197181)[0m [1000]	valid_set's l2: 25.8943[32m [repeated 9x across cluster][0m


[36m(_dystack pid=2088280)[0m 	-23.8715	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	13.59s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.05s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetFastAI_r145_BAG_L1 ... Training model for up to 4119.46s of the 7115.23s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-21.6528	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	47.31s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.14s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: XGBoost_r89_BAG_L1 ... Training model for up to 4041.65s of the 7037.42s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers,

[36m(_ray_fit pid=2276365)[0m [1000]	valid_set's l2: 22.3275


[36m(_dystack pid=2088280)[0m 	-24.9069	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	7.54s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.05s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetTorch_r86_BAG_L1 ... Training model for up to 3827.56s of the 6823.33s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-25.6908	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	85.94s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.11s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r50_BAG_L1 ... Training model for up to 3714.21s of the 6709.98s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, p

[36m(_ray_fit pid=2393376)[0m [1000]	valid_set's l2: 24.9381


[36m(_dystack pid=2088280)[0m 	-24.8513	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	15.93s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.09s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetFastAI_r143_BAG_L1 ... Training model for up to 3195.71s of the 6191.48s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_ray_fit pid=2394984)[0m No improvement since epoch 9: early stopping
[36m(_dystack pid=2088280)[0m 	-24.881	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	27.23s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.07s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r70_BAG_L1 ... Training model for up to 3149.62s of the 6145.39s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child mod

[36m(_ray_fit pid=2405410)[0m [1000]	valid_set's l2: 15.078
[36m(_ray_fit pid=2405407)[0m [1000]	valid_set's l2: 21.6014
[36m(_ray_fit pid=2405407)[0m [2000]	valid_set's l2: 20.928[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=2405411)[0m [3000]	valid_set's l2: 34.1131[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=2405405)[0m [4000]	valid_set's l2: 25.4984[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=2405405)[0m [6000]	valid_set's l2: 25.4811[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=2405405)[0m [7000]	valid_set's l2: 25.4762
[36m(_ray_fit pid=2405405)[0m [8000]	valid_set's l2: 25.474
[36m(_ray_fit pid=2405405)[0m [9000]	valid_set's l2: 25.4738
[36m(_ray_fit pid=2405405)[0m [10000]	valid_set's l2: 25.4729


[36m(_dystack pid=2088280)[0m 	-23.5651	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	56.28s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.29s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: RandomForest_r39_BAG_L1 ... Training model for up to 2794.10s of the 5789.87s of remaining time.
[36m(_dystack pid=2088280)[0m 	-26.2651	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	3.03s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.27s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r167_BAG_L1 ... Training model for up to 2790.57s of the 5786.34s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.05%)
[36m(_dystack pid=2088280)[0m 	-24.7583	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	86.78s	 = Tra

[36m(_ray_fit pid=2494537)[0m [1000]	valid_set's l2: 16.6906
[36m(_ray_fit pid=2494539)[0m [5000]	valid_set's l2: 22.168[32m [repeated 22x across cluster][0m
[36m(_ray_fit pid=2494536)[0m [10000]	valid_set's l2: 27.4734[32m [repeated 13x across cluster][0m


[36m(_dystack pid=2088280)[0m 	-25.6312	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	15.98s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.17s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetTorch_r158_BAG_L1 ... Training model for up to 2224.59s of the 5220.36s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-28.6687	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	96.54s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.08s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r86_BAG_L1 ... Training model for up to 2102.00s of the 5097.77s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers,

[36m(_ray_fit pid=2514786)[0m [1000]	valid_set's l2: 22.4115[32m [repeated 3x across cluster][0m


[36m(_dystack pid=2088280)[0m 	-25.0164	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	17.93s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.08s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: RandomForest_r127_BAG_L1 ... Training model for up to 1596.08s of the 4591.85s of remaining time.
[36m(_dystack pid=2088280)[0m 	-26.5047	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	3.18s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.24s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetFastAI_r134_BAG_L1 ... Training model for up to 1592.42s of the 4588.19s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-22.8532	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	38.78

[36m(_ray_fit pid=2519324)[0m [1000]	valid_set's l2: 27.263[32m [repeated 2x across cluster][0m


[36m(_dystack pid=2088280)[0m 	-23.5106	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	7.98s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.06s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetTorch_r143_BAG_L1 ... Training model for up to 1485.20s of the 4480.97s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-18.9871	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	197.8s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.09s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r128_BAG_L1 ... Training model for up to 1251.07s of the 4246.84s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers,

[36m(_ray_fit pid=2545618)[0m [1000]	valid_set's l2: 20.5562[32m [repeated 12x across cluster][0m
[36m(_ray_fit pid=2545619)[0m [2000]	valid_set's l2: 21.2223[32m [repeated 8x across cluster][0m


[36m(_dystack pid=2088280)[0m 	-23.4622	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	17.11s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.12s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: XGBoost_r49_BAG_L1 ... Training model for up to 539.37s of the 3535.14s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.05%)
[36m(_dystack pid=2088280)[0m 	-25.5289	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	15.4s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.11s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r5_BAG_L1 ... Training model for up to 494.21s of the 3489.98s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1

KeyboardInterrupt: 

[36m(_dystack pid=2088280)[0m 	-21.1266	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	173.62s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.08s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: NeuralNetTorch_r71_BAG_L1 ... Training model for up to 194.05s of the 3189.82s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=12, gpus=0, memory=0.01%)
[36m(_dystack pid=2088280)[0m 	-21.1389	 = Validation score   (-mean_squared_error)
[36m(_dystack pid=2088280)[0m 	77.52s	 = Training   runtime
[36m(_dystack pid=2088280)[0m 	0.08s	 = Validation runtime
[36m(_dystack pid=2088280)[0m Fitting model: CatBoost_r143_BAG_L1 ... Training model for up to 85.09s of the 3080.86s of remaining time.
[36m(_dystack pid=2088280)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, p

In [8]:
# Display the predictor object (only its repr is shown).
predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7ff448f70d00>

In [6]:
# Evaluate the fitted predictor on the held-out validation split.
# The predictor was created with label="target", so attach that column first.
val_data['target'] = val_labels
performance = predictor.evaluate(val_data)
# Error metrics appear with a negative sign in the printed dict, matching the
# "-mean_squared_error" validation scores in the fit log.
print(performance)

{'mean_absolute_error': -1.754349708557129, 'root_mean_squared_error': -5.314374923706055, 'mean_squared_error': -28.242578506469727, 'r2': 0.5191915035247803, 'pearsonr': 0.7241694160215834, 'median_absolute_error': -0.5566833019256592}


In [2]:
# MLP head sizes: input = encoder feature size, one hidden width, scalar output.
# NOTE: the training cell further down defines the same three sizes and
# rebuilds mlp_model, so this cell is redundant with it.
mlp_input_dim, mlp_hidden_dim, mlp_output_dim = output_dim, 128, 1
mlp_model = MLP(
    input_dim=mlp_input_dim,
    hidden_dim=mlp_hidden_dim,
    output_dim=mlp_output_dim,
).to(device)

In [None]:
# Sequential 80/20 split of the extracted features.
dataset_size = len(features)
train_size = int(0.8 * dataset_size)  # 80% training
val_size = dataset_size - train_size  # 20% validation

# Train ONLY on the first 80%. Building the training set from the full tensors
# would leak the validation samples into training, making the validation loss
# and the best-checkpoint selection meaningless.
train_dataset = TensorDataset(features[:train_size], processed_targets[:train_size])
val_dataset = TensorDataset(features[train_size:], processed_targets[train_size:])
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

mlp_input_dim = output_dim  # input size = encoder feature size
mlp_hidden_dim = 128
mlp_output_dim = 1  # scalar regression output
mlp_model = MLP(mlp_input_dim, mlp_hidden_dim, mlp_output_dim).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-4)
num_epochs = 2000
from tqdm.auto import tqdm

import logging

# Send training progress to a log file.
logging.basicConfig(filename='training.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

best_val_loss = float('inf')

for epoch in range(num_epochs):
    mlp_model.train()
    train_loss = 0.0

    # Training phase
    with tqdm(train_dataloader, desc=f"Epoch {epoch+1} - Training", unit="batch") as t:
        for batch_features, batch_targets in t:
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            # squeeze(-1) drops only the output dimension. A bare squeeze()
            # would also drop the batch dimension of a size-1 batch and make
            # the prediction shape disagree with the target shape.
            predictions = mlp_model(batch_features).squeeze(-1)
            loss = criterion(predictions, batch_targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            t.set_postfix({'Batch Train Loss': f"{loss.item():.4f}"})

    # Validation phase
    mlp_model.eval()
    val_loss = 0.0
    with tqdm(val_dataloader, desc=f"Epoch {epoch+1} - Validation", unit="batch") as v:
        with torch.no_grad():
            for batch_features, batch_targets in v:
                batch_features = batch_features.to(device)
                batch_targets = batch_targets.to(device)
                predictions = mlp_model(batch_features).squeeze(-1)
                loss = criterion(predictions, batch_targets)
                val_loss += loss.item()
                v.set_postfix({'Batch Val Loss': f"{loss.item():.4f}"})

    # Average the per-batch losses and record them for this epoch.
    train_loss /= len(train_dataloader)
    val_loss /= len(val_dataloader)
    logging.info(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Keep the checkpoint with the lowest validation loss so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(mlp_model.state_dict(), "best_mlp_model.pth")
        logging.info(f"Best MLP model saved with Val Loss: {best_val_loss:.4f}")

    # Blank line between the progress bars of consecutive epochs.
    if epoch < num_epochs - 1:
        tqdm.write("")

In [None]:
%config InlineBackend.figure_format = 'svg'
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
def evaluate_model(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and compute MAE and R² of its predictions.

    Args:
        model: Module mapping a feature batch to predictions whose last
            dimension has size 1.
        dataloader: Iterable of ``(batch_features, batch_targets)`` pairs.
        device: Device the features are moved to for the forward pass.

    Returns:
        ``(all_targets, all_predictions, mae, r2)`` where the first two are
        1-D NumPy arrays covering every sample in the loader.
    """
    model.eval()
    all_targets = []
    all_predictions = []
    with torch.no_grad():
        for batch_features, batch_targets in tqdm(dataloader, desc="Evaluating"):
            batch_features = batch_features.to(device)

            # squeeze(-1) removes only the output dimension. A bare squeeze()
            # turns a size-1 final batch into a 0-dim tensor, which torch.cat
            # below refuses to concatenate.
            predictions = model(batch_features).squeeze(-1)
            all_predictions.append(predictions.cpu())
            all_targets.append(batch_targets.cpu())

    # Merge all batches
    all_predictions = torch.cat(all_predictions).numpy()
    all_targets = torch.cat(all_targets).numpy()

    # Compute MAE and R²
    mae = mean_absolute_error(all_targets, all_predictions)
    r2 = r2_score(all_targets, all_predictions)
    return all_targets, all_predictions, mae, r2
def plot_scatter(targets, predictions, mae=None, r2=None):
    """Scatter-plot predictions against true values with an ideal y = x line.

    Args:
        targets: Array-like of true values.
        predictions: Array-like of predicted values, same length as ``targets``.
        mae: Mean absolute error shown in the title. Computed from the inputs
            when omitted, so the function no longer reads a global ``mae``.
        r2: R² score shown in the title. Computed from the inputs when omitted.
    """
    if mae is None:
        mae = mean_absolute_error(targets, predictions)
    if r2 is None:
        r2 = r2_score(targets, predictions)

    # End points of the diagonal reference line.
    lowest, highest = np.min(targets), np.max(targets)

    plt.figure(figsize=(8, 8))
    plt.scatter(targets, predictions, alpha=0.6, label="Predictions vs Targets")
    plt.plot([lowest, highest], [lowest, highest], color="red", linestyle="--", label="Ideal")
    plt.xlabel("True Values")
    plt.ylabel("Predicted Values")
    plt.title(f'Scatter Plot of True vs Predicted Values\nMAE: {mae:.4f}, R²: {r2:.4f}')
    plt.legend()
    plt.grid(True)
    plt.show()
# Load the best checkpoint saved by the training loop.
# map_location keeps the load working when the checkpoint's original device is
# not available here; weights_only=True limits unpickling to what a
# state_dict needs and avoids the torch.load FutureWarning.
mlp_model.load_state_dict(
    torch.load("best_mlp_model.pth", map_location=device, weights_only=True)
)

# Evaluate on the validation loader (the variables are named test_*, but the
# data is the validation split; no separate test split exists in this notebook).
test_targets, test_predictions, mae, r2 = evaluate_model(mlp_model, val_dataloader, device)
print(f"MAE: {mae:.4f}, R²: {r2:.4f}")

# Scatter plot of predictions vs. true values
plot_scatter(test_targets, test_predictions)