In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder

# ========== 数据读取 ==========
main_data_path = r"D:\ML-3DPrinting-Project\data\7.7\2_regression_original.xlsx"
smiles_path = r"D:\\ML-3DPrinting-Project\\data\\smiles.xlsx"
df_main = pd.read_excel(main_data_path)
df_smiles = pd.read_excel(smiles_path)

# ===== 第一步：提取 API 名称 =====
api_names = df_smiles[df_smiles["is_API"] == 1]["material_name"].tolist()
excipient_names = df_smiles[df_smiles["is_API"] == 0]["material_name"].tolist()
api_names = [col for col in api_names if col in df_main.columns]
excipient_names = [col for col in excipient_names if col in df_main.columns]

# ===== 第二步：构造主 API 列 =====
api_dose_df = df_main[api_names].copy()
api_dose_df["API_name"] = api_dose_df.apply(
    lambda row: row[row > 0].index[0] if any(row > 0) else "Unknown", axis=1
)

# ===== 第三步：构造 API_dose 列 =====
api_dose_df["API_dose"] = [
    row[api] if api in row else 0
    for row, api in zip(api_dose_df[api_names].to_dict(orient="records"), api_dose_df["API_name"])
]

# ===== 第四步：构造过滤条件（统一过滤）=====
valid_mask = (
    (api_dose_df["API_name"] != "Unknown") &
    (df_main["printability"].isin(["yes", "no"]))
)

# ===== 第五步：过滤所有相关数据 =====
df_main = df_main[valid_mask].copy()
api_dose_df = api_dose_df[valid_mask].copy()

X = df_main[excipient_names].fillna(0).values  # ✅ filtered!


# ===== 第六步：API名称独热编码 + 用量向量 =====
api_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
api_onehot = api_encoder.fit_transform(api_dose_df[["API_name"]])
api_dose_vector = np.array(api_dose_df["API_dose"].fillna(0)).reshape(-1, 1)

# ===== 第七步：处理标签并构造最终训练输入 =====
printability = df_main["printability"].map({"yes": 1, "no": 0}).values.reshape(-1, 1)
cond = np.hstack([api_onehot, api_dose_vector, printability])
assert X.shape[0] == cond.shape[0], "❌ X 与 cond 样本数不一致！"



In [2]:
# ========== 2. Dataset + mask ==========
def mask_x(x, mask_prob=0.3):
    mask = torch.bernoulli(torch.ones_like(x) * (1 - mask_prob))
    return x * mask

class FormulationDataset(Dataset):
    def __init__(self, X, cond):
        self.x = torch.FloatTensor(X)
        self.cond = torch.FloatTensor(cond)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return {"x": self.x[idx], "cond": self.cond[idx]}


In [3]:
# ========== 3. Conditional VAE ==========
class ConditionalVAE(nn.Module):
    def __init__(self, input_dim, cond_dim, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim + cond_dim, 256), nn.ReLU(),
            nn.Linear(256, 128), nn.ReLU()
        )
        self.fc_mu = nn.Linear(128, latent_dim)
        self.fc_logvar = nn.Linear(128, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim + cond_dim, 128), nn.ReLU(),
            nn.Linear(128, 256), nn.ReLU(),
            nn.Linear(256, input_dim), nn.Sigmoid()
        )

    def encode(self, x, cond):
        h = self.encoder(torch.cat([x, cond], dim=1))
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, cond):
        return self.decoder(torch.cat([z, cond], dim=1))

    def forward(self, x, cond):
        mu, logvar = self.encode(x, cond)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, cond), mu, logvar

In [4]:
# ========== 4. Loss ==========
def vae_loss(recon_x, x_true, mu, logvar):
    mse = F.mse_loss(recon_x, x_true, reduction='sum')  # 预测excipient用量
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return mse + kld


In [5]:
print(f"X shape: {X.shape}, cond shape: {cond.shape}")


X shape: (1116, 269), cond shape: (1116, 57)


In [6]:
# ========== 5. Train ==========
BATCH_SIZE = 64
EPOCHS = 50
LATENT_DIM = 64
LEARNING_RATE = 1e-3

dataset = FormulationDataset(X, cond)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model = ConditionalVAE(input_dim=X.shape[1], cond_dim=cond.shape[1], latent_dim=LATENT_DIM)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in dataloader:
        x = mask_x(batch['x'])
        cond_batch = batch['cond']
        recon, mu, logvar = model(x, cond_batch)
        loss = vae_loss(recon, batch['x'], mu, logvar)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.2f}")


Epoch 1/50, Loss: 46443.98
Epoch 2/50, Loss: 2242.57
Epoch 3/50, Loss: 633.04
Epoch 4/50, Loss: 558.32
Epoch 5/50, Loss: 530.29
Epoch 6/50, Loss: 521.67
Epoch 7/50, Loss: 518.05
Epoch 8/50, Loss: 518.26
Epoch 9/50, Loss: 514.94
Epoch 10/50, Loss: 513.17
Epoch 11/50, Loss: 512.05
Epoch 12/50, Loss: 510.77
Epoch 13/50, Loss: 510.47
Epoch 14/50, Loss: 510.08
Epoch 15/50, Loss: 508.83
Epoch 16/50, Loss: 507.71
Epoch 17/50, Loss: 507.93
Epoch 18/50, Loss: 507.98
Epoch 19/50, Loss: 506.89
Epoch 20/50, Loss: 506.74
Epoch 21/50, Loss: 506.39
Epoch 22/50, Loss: 507.06
Epoch 23/50, Loss: 505.59
Epoch 24/50, Loss: 506.43
Epoch 25/50, Loss: 505.64
Epoch 26/50, Loss: 505.20
Epoch 27/50, Loss: 504.72
Epoch 28/50, Loss: 504.74
Epoch 29/50, Loss: 504.90
Epoch 30/50, Loss: 505.11
Epoch 31/50, Loss: 504.31
Epoch 32/50, Loss: 503.91
Epoch 33/50, Loss: 504.20
Epoch 34/50, Loss: 504.69
Epoch 35/50, Loss: 504.08
Epoch 36/50, Loss: 504.41
Epoch 37/50, Loss: 503.89
Epoch 38/50, Loss: 503.39
Epoch 39/50, Loss:

In [7]:
@torch.no_grad()
def generate_dosed_formulation(model, cond_vector):
    model.eval()
    cond_tensor = torch.FloatTensor(cond_vector).unsqueeze(0)
    z = torch.randn(1, model.fc_mu.out_features)
    output = model.decode(z, cond_tensor).squeeze().numpy()
    return output  # 直接返回每个 excipient 的预测用量


In [10]:
# ========== 7. 示例: 按照 API + 剂量 + printability 生成配方 ==========
new_cond = np.zeros(cond.shape[1])
new_cond[api_encoder.categories_[0].tolist().index("Paracetamol")] = 1
new_cond[-2] = 0.65  # dose
new_cond[-1] = 1     # printable

output = generate_dosed_formulation(model, new_cond)

recommended = [(name, round(output[i], 3)) 
               for i, name in enumerate(excipient_names) if output[i] > 0.01]

recommended_sorted = sorted(recommended, key=lambda x: -x[1])[:5]
print("推荐配方及用量：")
for name, dose in recommended_sorted:
    print(f"{name}: {dose}")



推荐配方及用量：
Mannitol: 0.06400000303983688
HydroxypropylcelluloseKlucelEF: 0.032999999821186066
PolyvinylalcoholPVAfilament: 0.032999999821186066
PolyvinylalcoholParteckMXP: 0.03200000151991844
HydroxypropylmethylcelluloseAffinisol15LV: 0.02500000037252903
