In [4]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from rdkit import Chem
import pickle
from tqdm import tqdm

# ========== File paths ==========
file_path = r"D:\ML-3DPrinting-Project\data\smiles.xlsx"
embedding_output_path = r"D:\ML-3DPrinting-Project\data\chemberta_embeddings.csv"
graph_output_path = r"D:\ML-3DPrinting-Project\data\molecular_graphs.pkl"

# ========== Load the material / SMILES table ==========
df = pd.read_excel(file_path)
name_col = "material_name"
smiles_col = "SMILES"

# ========== Load the ChemBERTa model ==========
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
device = torch.device("cpu")
model.to(device)
model.eval()  # inference mode: disables dropout

# ========== Accumulators ==========
embedding_list = []  # one pooled embedding vector per embedded material
valid_names = []     # material names, aligned index-for-index with embedding_list
graph_dict = {}      # material name -> {"num_atoms", "edge_index"}

# ========== Process every SMILES ==========
for _, row in tqdm(df.iterrows(), total=len(df)):
    name = row[name_col]
    smiles = row[smiles_col]

    # Skip missing, non-text and blank SMILES. The isinstance check matters:
    # this test runs outside the try block, so calling .strip() on a numeric
    # Excel cell would otherwise abort the whole loop with AttributeError.
    if pd.isna(smiles) or not isinstance(smiles, str) or smiles.strip() == "":
        continue

    try:
        # ====== ChemBERTa embedding ======
        inputs = tokenizer(smiles, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            # Mean-pool the last hidden state over the token axis.
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

        embedding_list.append(embedding)
        valid_names.append(name)

        # ====== Molecular graph extraction (RDKit) ======
        # The embedding above is kept even when RDKit cannot parse the SMILES;
        # only the graph entry is omitted in that case.
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            edge_index = []
            for bond in mol.GetBonds():
                a1 = bond.GetBeginAtomIdx()
                a2 = bond.GetEndAtomIdx()
                # Store both directions so the edge list is undirected.
                edge_index.append((a1, a2))
                edge_index.append((a2, a1))
            graph_dict[name] = {
                "num_atoms": mol.GetNumAtoms(),
                "edge_index": edge_index
            }

    except Exception as e:
        # Best effort: report the failing material and carry on with the rest.
        print(f"[ERROR] {name} skipped: {e}")

# ========== Save embeddings ==========
# Fail with a clear message instead of an IndexError on embedding_list[0]
# when no row produced an embedding.
if not embedding_list:
    raise ValueError(
        f"No SMILES from {file_path} could be embedded; nothing to save."
    )

latent_cols = [f"latent_{i}" for i in range(len(embedding_list[0]))]
embedding_df = pd.DataFrame(embedding_list, columns=latent_cols)
embedding_df.insert(0, "material_name", valid_names)
embedding_df.to_csv(embedding_output_path, index=False)
print(f"✅ Embedding saved to: {embedding_output_path}")

# ========== Save molecular graphs ==========
with open(graph_output_path, "wb") as f:
    pickle.dump(graph_dict, f)
print(f"✅ Molecular graphs saved to: {graph_output_path}")



Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 61%|████████████████████████████████████████████████▌                              | 207/337 [00:01<00:00, 131.34it/s][19:48:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
 91%|███████████████████████████████████████████████████████████████████████▋       | 306/337 [00:02<00:00, 144.22it/s][19:48:37] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:48:37] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
 95%|███████████████████████████████████████████████████████████████████████████▏   | 321/337 [00:02<00:00, 134.30it/s][19:48:37] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:48:37] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:48:37] Can't kekuliz

✅ Embedding saved to: D:\ML-3DPrinting-Project\data\chemberta_embeddings.csv
✅ Molecular graphs saved to: D:\ML-3DPrinting-Project\data\molecular_graphs.pkl


In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# === Paths ===
main_path = r"D:\ML-3DPrinting-Project\data\7.7\2_regression_original.xlsx"
embedding_path = r"D:\ML-3DPrinting-Project\data\chemberta_embeddings.csv"
output_path = r"D:\ML-3DPrinting-Project\data\7.9_datasets\regression_final_all_with_weighted_embedding.xlsx"

# === Load data ===
df_main = pd.read_excel(main_path)
df_embed = pd.read_csv(embedding_path)

# === Build the material-name -> embedding lookup (float arrays) ===
# Read the latent columns from the CSV itself instead of assuming a fixed
# width of 384, so this cell keeps working if the embedding model changes.
latent_cols = [c for c in df_embed.columns if str(c).startswith("latent_")]
if not latent_cols:
    raise ValueError(f"No 'latent_*' columns found in {embedding_path}")
embedding_dim = len(latent_cols)

# One bulk conversion instead of a per-row iterrows() pass. If a material
# name is duplicated, the last row wins.
embedding_matrix = df_embed[latent_cols].to_numpy(dtype=float)
embedding_dict = dict(zip(df_embed["material_name"], embedding_matrix))

# Columns of the main table whose header is a known material name. This does
# not change from row to row, so compute it once.
material_cols = [col for col in df_main.columns if col in embedding_dict]

# === Per-formulation output embeddings ===
weighted_embeddings = []

# === Walk over every formulation (row) ===
for _, row in tqdm(df_main.iterrows(), total=len(df_main)):
    weighted_sum = np.zeros(embedding_dim, dtype=float)
    total_weight = 0.0

    for col in material_cols:
        amount = row[col]
        # Only strictly positive, non-missing amounts contribute.
        if pd.notna(amount) and amount > 0:
            weighted_sum += embedding_dict[col] * amount
            total_weight += amount

    if total_weight > 0:
        weighted_avg = weighted_sum / total_weight
    else:
        # No known material present in this row: fall back to a zero vector.
        weighted_avg = np.zeros(embedding_dim, dtype=float)

    weighted_embeddings.append(weighted_avg)

# === Assemble the embedding frame and append it to the main table ===
df_embedding = pd.DataFrame(weighted_embeddings, columns=latent_cols)
df_final = pd.concat([df_main.reset_index(drop=True), df_embedding], axis=1)

# === Save the result ===
df_final.to_excel(output_path, index=False)
print(f"✅ 加权 embedding 已保存到：{output_path}")



100%|█████████████████████████████████████████████████████████████████████████████| 1520/1520 [00:03<00:00, 484.52it/s]


✅ 加权 embedding 已保存到：D:\ML-3DPrinting-Project\data\7.9_datasets\regression_final_all_with_weighted_embedding.xlsx


In [6]:
from rdkit.Chem import AllChem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

# === Input / output locations ===
embedding_main_path = r"D:\ML-3DPrinting-Project\data\7.9_datasets\regression_final_all_with_weighted_embedding.xlsx"
smiles_path = r"D:\ML-3DPrinting-Project\data\smiles.xlsx"
output_path = r"D:\ML-3DPrinting-Project\data\7.9_datasets\regression_with_chemberta_rdkit.xlsx"

# === Morgan fingerprint settings ===
fp_size = 128
fp_radius = 2

# === Load the formulation table and the SMILES table ===
df_main = pd.read_excel(embedding_main_path)
df_smiles = pd.read_excel(smiles_path)

# === Map each material name to its Morgan fingerprint as a float vector ===
fp_dict = {}
generator = GetMorganGenerator(radius=fp_radius, fpSize=fp_size)

for _, row in df_smiles.iterrows():
    name = row["material_name"]
    smiles = row["SMILES"]

    # Guard clauses: ignore missing, non-text or blank SMILES entries.
    if pd.isna(smiles) or not isinstance(smiles, str) or smiles.strip() == "":
        continue

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # RDKit could not parse this SMILES; the material gets no fingerprint.
        continue

    fp_dict[name] = np.array(generator.GetFingerprint(mol).ToList()).astype(float)

# === Amount-weighted average fingerprint for every formulation ===
fp_list = []

# Columns of the main table that are named after a fingerprinted material,
# kept in the table's column order.
fingerprint_cols = [col for col in df_main.columns if col in fp_dict]

for _, row in tqdm(df_main.iterrows(), total=len(df_main)):
    weighted_sum = np.zeros(fp_size, dtype=float)
    total_weight = 0.0

    for col in fingerprint_cols:
        amount = row[col]
        if pd.isna(amount) or not amount > 0:
            continue
        weighted_sum += fp_dict[col] * amount
        total_weight += amount

    # Rows containing none of the fingerprinted materials get an all-zero vector.
    avg_fp = weighted_sum / total_weight if total_weight > 0 else np.zeros(fp_size, dtype=float)
    fp_list.append(avg_fp)

# === Turn the vectors into columns and append them to the main table ===
fp_cols = [f"rdkit_{i}" for i in range(fp_size)]
df_fp = pd.DataFrame(fp_list, columns=fp_cols)
df_final = pd.concat([df_main.reset_index(drop=True), df_fp], axis=1)

# === Save ===
df_final.to_excel(output_path, index=False)
print(f"✅ 最终数据表（含 ChemBERTa + RDKit 等特征）已保存到: {output_path}")


[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[19:50:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
100%|█████████████████████████████████████████████████████████████████████████████| 1520/1520 [00:02<00:00, 539.07it/s]


✅ 最终数据表（含 ChemBERTa + RDKit 等特征）已保存到: D:\ML-3DPrinting-Project\data\7.9_datasets\regression_with_chemberta_rdkit.xlsx


In [7]:
import pandas as pd
import numpy as np
from rdkit import Chem
from mordred import Calculator, descriptors
from tqdm import tqdm

# === Paths ===
# NOTE(review): the input is the weighted-embedding table, which carries no
# rdkit_* columns, yet the output file name says "mordred_rdkit" - confirm
# that this is the intended input.
main_path = r"D:\ML-3DPrinting-Project\data\7.9_datasets\regression_final_all_with_weighted_embedding.xlsx"
smiles_path = r"D:\ML-3DPrinting-Project\data\smiles.xlsx"
output_path = r"D:\ML-3DPrinting-Project\data\regression_with_mordred_rdkit.xlsx"

# === Load the main dataset and the SMILES table ===
df_main = pd.read_excel(main_path)
df_smiles = pd.read_excel(smiles_path)

# === Set up the Mordred calculator (2D descriptors only) ===
calc = Calculator(descriptors, ignore_3D=True)

# === Build the descriptor lookup (material name -> Mordred vector) ===
desc_dict = {}
valid_features = None  # descriptor names shared by every vector; set below

print("🧪 正在提取 Mordred 特征...")
for _, row in tqdm(df_smiles.iterrows(), total=len(df_smiles)):
    name = row["material_name"]
    smiles = row["SMILES"]
    if pd.isna(smiles) or not isinstance(smiles, str) or smiles.strip() == "":
        continue

    mol = Chem.MolFromSmiles(smiles)
    if mol:
        try:
            desc = calc(mol)
            # Map +/-inf to NaN so they are handled like missing values below.
            desc_values = pd.Series(desc.asdict()).replace([np.inf, -np.inf], np.nan).astype(float)

            # The first molecule processed successfully fixes the feature set:
            # only descriptors that are non-NaN for it are kept.
            # NOTE(review): this makes the columns depend on row order in the
            # SMILES table - confirm that is acceptable.
            if valid_features is None:
                valid_features = desc_values.dropna().index.tolist()

            # Same dimensionality for every material; missing values become 0.
            vector = desc_values[valid_features].fillna(0).values.astype(float)
            desc_dict[name] = vector
        except Exception as e:
            # Best effort: report the failing material and continue.
            print(f"[❌] {name} 跳过: {e}")

# Without this guard, len(valid_features) below raises an opaque TypeError
# when no molecule produced descriptors.
if valid_features is None:
    raise ValueError(
        f"No Mordred descriptors could be computed from {smiles_path}; "
        "cannot build formulation-level features."
    )

n_features = len(valid_features)

# Columns of the main table named after a material with descriptors. This is
# the same for every row, so compute it once.
material_cols = [col for col in df_main.columns if col in desc_dict]

# === Amount-weighted fusion into formulation-level features ===
final_features = []

for _, row in tqdm(df_main.iterrows(), total=len(df_main)):
    weighted_sum = np.zeros(n_features, dtype=float)
    total_weight = 0.0

    for col in material_cols:
        amount = row[col]
        # Only strictly positive, non-missing amounts contribute.
        if pd.notna(amount) and amount > 0:
            weighted_sum += desc_dict[col] * amount
            total_weight += amount

    if total_weight > 0:
        avg_vector = weighted_sum / total_weight
    else:
        # No described material in this formulation: use a zero vector.
        avg_vector = np.zeros(n_features, dtype=float)

    final_features.append(avg_vector)

# === Build the feature table and append it to the main dataset ===
desc_feature_names = [f"mordred_{name}" for name in valid_features]
df_desc = pd.DataFrame(final_features, columns=desc_feature_names)

df_final = pd.concat([df_main.reset_index(drop=True), df_desc], axis=1)

# === Save the output ===
df_final.to_excel(output_path, index=False)
print(f"✅ 融合后数据集已保存到：{output_path}")


🧪 正在提取 Mordred 特征...


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

✅ 融合后数据集已保存到：D:\ML-3DPrinting-Project\data\regression_with_mordred_rdkit.xlsx
