In [7]:
!pip install transformers torch



In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1. Load the raw materials spreadsheet.
file_path = "D:/ML-3DPrinting-Project/data/smiles.xlsx"
df = pd.read_excel(file_path)

# 2. Keep only homopolymer rows (is_homopolymer == 1), restricted to the
#    name and BigSMILES columns; rows missing either value are dropped.
#    The result keeps df's index labels for the surviving rows.
homopolymer_df = (
    df.loc[df["is_homopolymer"] == 1, ["material_name", "BigSMILES"]]
    .dropna()
)

# 3. Load the PolyTAO tokenizer and seq2seq model from the same checkpoint,
#    and put the model in evaluation mode (eval() returns the module itself).
tokenizer = AutoTokenizer.from_pretrained("hkqiu/PolyTAO-BigSMILES_Version")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "hkqiu/PolyTAO-BigSMILES_Version"
).eval()

# 4. Embedding extraction helper.
def get_embedding(bigsmiles):
    """Return a pooled encoder embedding for one BigSMILES string.

    The input is tokenized with the module-level ``tokenizer`` and passed
    through ``model.encoder`` with gradient tracking disabled. The encoder's
    ``last_hidden_state`` is averaged over dimension 1 (presumably the token
    axis -- confirm against the model's output layout), size-1 dimensions
    are squeezed away, and the result is returned as a NumPy array.

    Args:
        bigsmiles: Text to embed; passed to the tokenizer as-is.

    Returns:
        The pooled hidden state as a NumPy array.
    """
    encoded = tokenizer(bigsmiles, return_tensors="pt")
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        hidden_states = model.encoder(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()

# 5. Compute one embedding per homopolymer row and expand it into columns.
#    homopolymer_df is a row subset of df, so its index labels identify the
#    originating rows of df; they are recorded so the embeddings can be
#    attached back to exactly those rows.
embeddings = []
material_names = []
embedded_rows = []

for row_label, name, smile in zip(
    homopolymer_df.index,
    homopolymer_df["material_name"],
    homopolymer_df["BigSMILES"],
):
    try:
        emb = get_embedding(smile)
    except Exception as exc:
        # Best-effort: skip entries the model cannot process, but report why.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        print(f"⚠️ Failed for: {name}")
        print(f"    reason: {exc!r}")
        continue
    embeddings.append(emb)
    material_names.append(name)
    embedded_rows.append(row_label)

if not embeddings:
    # Without at least one embedding the column count is unknown; fail with
    # a clear message instead of an IndexError on embeddings[0].
    raise ValueError(
        "No embeddings could be computed; nothing to merge or save."
    )

embedding_df = pd.DataFrame(
    embeddings,
    index=embedded_rows,
    columns=[f"polyTAO_emb_{i}" for i in range(len(embeddings[0]))],
)
embedding_df["material_name"] = material_names

# 6. Attach the embeddings to the original DataFrame by row index.
#    Merging on material_name would multiply rows whenever a name occurs
#    more than once, and would also attach embeddings to rows that were
#    filtered out but happen to share a name. The index-aligned left merge
#    keeps df's row count and order; rows without an embedding get NaN.
merged_df = df.merge(
    embedding_df.drop(columns="material_name"),
    left_index=True,
    right_index=True,
    how="left",
)

# 7. Save the result to a new file.
merged_df.to_excel("D:/ML-3DPrinting-Project/data/smiles_with_polyTAO_embedding.xlsx", index=False)
