In [5]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
import os

# 显示可用 GPU 信息
if torch.cuda.is_available():
    print(f"Available GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available.")

# 加载模型和分词器
model_name_or_id = "/share/home/pwmat/Huggingface_Model_Downloades/ChemDFM-13B-v1.0"
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_id)
model = LlamaForCausalLM.from_pretrained(model_name_or_id, torch_dtype=torch.float16, device_map="auto")

# 查看模型实际分配到的 GPU
print("\nModel loaded on devices:")
for param in model.parameters():
    if param.device.type == "cuda":
        print(f" - {param.device}")
        device = "cuda"
    break  # 只需查看一个参数的设备分配



Available GPUs: 4
GPU 0: NVIDIA GeForce RTX 4090 D
GPU 1: NVIDIA GeForce RTX 4090 D
GPU 2: NVIDIA GeForce RTX 4090 D
GPU 3: NVIDIA GeForce RTX 4090 D


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]


Model loaded on devices:
 - cuda:0


In [6]:
# 获取模型支持的最大 token 数
max_tokens = model.config.max_position_embeddings
def get_text_vector(text, model, tokenizer, max_tokens=2048, window_size=512, stride=256):
    """
    获取文本的向量化表示，根据长度选择滑动窗口法或直接编码。
    
    参数:
        text (str): 输入文本
        model: 已加载的语言模型
        tokenizer: 模型的分词器
        max_tokens (int): 模型支持的最大 token 数
        window_size (int): 滑动窗口的大小（适用于长文本）
        stride (int): 滑动窗口的步长（适用于长文本）
        
    返回:
        vector (np.ndarray): 文本的向量化表示
    """
    # 分词并计算 token 数
    inputs = tokenizer(text, return_tensors="pt", truncation=False, padding=False)
    token_count = inputs["input_ids"].shape[1]
    
    if token_count <= max_tokens:
        # 短文本直接编码
        print("短文本直接编码......")
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_tokens).to(device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]
        vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
    else:
        print("使用滑动窗口法处理长文本......")
        # 长文本使用滑动窗口法
        tokens = inputs["input_ids"][0]
        segments = [
            tokens[i:i + window_size] for i in range(0, len(tokens), stride) if i + window_size <= len(tokens)
        ]
        
        segment_vectors = []
        for segment in segments:
            segment_input = {"input_ids": segment.unsqueeze(0).to(device)}
            with torch.no_grad():
                outputs = model(**segment_input, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]
            segment_vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
            segment_vectors.append(segment_vector)
        
        # 聚合所有子段的向量表示
        vector = np.mean(segment_vectors, axis=0)
    
    return vector


In [7]:
import numpy as np
# 示例文本
short_text = "This is a short example of photovoltaic materials description."
long_text = "Mg7Cu17O24 crystallizes in the orthorhombic Pmm2 space group. There are five inequivalent Mg sites. In the first Mg site, Mg(1) is bonded to one O(5), one O(9), two equivalent O(1), and two equivalent O(2) atoms to form MgO6 octahedra that share corners with two equivalent Mg(1)O6 octahedra, edges with two equivalent Mg(2)O6 octahedra, and edges with two equivalent Cu(6)O5 square pyramids. The corner-sharing octahedral tilt angles range from 18-20°. The Mg(1)-O(5) bond length is 2.05 Å. The Mg(1)-O(9) bond length is 2.05 Å. Both Mg(1)-O(1) bond lengths are 2.15 Å. Both Mg(1)-O(2) bond lengths are 2.20 Å. In the second Mg site, Mg(2) is bonded to one O(10), one O(6), two equivalent O(1), and two equivalent O(2) atoms to form MgO6 octahedra that share corners with two equivalent Mg(2)O6 octahedra, edges with two equivalent Mg(1)O6 octahedra, and edges with two equivalent Cu(6)O5 square pyramids. The corner-sharing octahedral tilt angles are 20°. The Mg(2)-O(10) bond length is 2.05 Å. The Mg(2)-O(6) bond length is 2.06 Å. Both Mg(2)-O(1) bond lengths are 2.20 Å. Both Mg(2)-O(2) bond lengths are 2.15 Å. In the third Mg site, Mg(3) is bonded to two equivalent O(3), two equivalent O(7), and two equivalent O(8) atoms to form a mixture of edge and corner-sharing MgO6 octahedra. The corner-sharing octahedral tilt angles are 20°. Both Mg(3)-O(3) bond lengths are 2.05 Å. Both Mg(3)-O(7) bond lengths are 2.20 Å. Both Mg(3)-O(8) bond lengths are 2.15 Å. In the fourth Mg site, Mg(4) is bonded to two equivalent O(4), two equivalent O(7), and two equivalent O(8) atoms to form MgO6 octahedra that share corners with two equivalent Cu(7)O6 octahedra and edges with two equivalent Mg(3)O6 octahedra. The corner-sharing octahedral tilt angles are 22°. Both Mg(4)-O(4) bond lengths are 2.10 Å. Both Mg(4)-O(7) bond lengths are 2.13 Å. Both Mg(4)-O(8) bond lengths are 2.17 Å. In the fifth Mg site, Mg(5) is bonded to two equivalent O(11), two equivalent O(12), and two equivalent O(3) atoms to form MgO6 octahedra that share corners with two equivalent Mg(3)O6 octahedra, corners with two equivalent Cu(6)O5 square pyramids, and edges with two equivalent Cu(7)O6 octahedra. The corner-sharing octahedral tilt angles are 20°. Both Mg(5)-O(11) bond lengths are 2.19 Å. Both Mg(5)-O(12) bond lengths are 2.15 Å. Both Mg(5)-O(3) bond lengths are 2.05 Å. There are seven inequivalent Cu sites. In the first Cu site, Cu(1) is bonded in a distorted rectangular see-saw-like geometry to one O(1), one O(11), one O(3), and one O(7) atom. The Cu(1)-O(1) bond length is 1.95 Å. The Cu(1)-O(11) bond length is 2.02 Å. The Cu(1)-O(3) bond length is 1.92 Å. The Cu(1)-O(7) bond length is 2.04 Å. In the second Cu site, Cu(2) is bonded in a distorted rectangular see-saw-like geometry to one O(12), one O(2), one O(4), and one O(8) atom. The Cu(2)-O(12) bond length is 2.02 Å. The Cu(2)-O(2) bond length is 1.95 Å. The Cu(2)-O(4) bond length is 1.92 Å. The Cu(2)-O(8) bond length is 2.04 Å. In the third Cu site, Cu(3) is bonded in a distorted rectangular see-saw-like geometry to one O(5), one O(8), and two equivalent O(2) atoms. The Cu(3)-O(5) bond length is 1.93 Å. The Cu(3)-O(8) bond length is 1.97 Å. Both Cu(3)-O(2) bond lengths are 2.04 Å. In the fourth Cu site, Cu(4) is bonded in a distorted rectangular see-saw-like geometry to one O(6), one O(7), and two equivalent O(1) atoms. The Cu(4)-O(6) bond length is 1.93 Å. The Cu(4)-O(7) bond length is 1.96 Å. Both Cu(4)-O(1) bond lengths are 2.03 Å. In the fifth Cu site, Cu(5) is bonded in a distorted rectangular see-saw-like geometry to one O(12), one O(9), and two equivalent O(2) atoms. The Cu(5)-O(12) bond length is 1.93 Å. The Cu(5)-O(9) bond length is 1.91 Å. Both Cu(5)-O(2) bond lengths are 2.03 Å. In the sixth Cu site, Cu(6) is bonded to one O(10), one O(11), one O(9), and two equivalent O(1) atoms to form distorted CuO5 square pyramids that share  a cornercorner with one Mg(5)O6 octahedra,  a cornercorner with one Cu(7)O6 octahedra, edges with two equivalent Mg(1)O6 octahedra, edges with two equivalent Mg(2)O6 octahedra, and  an edgeedge with one Cu(6)O5 square pyramid. The corner-sharing octahedral tilt angles range from 15-78°. The Cu(6)-O(10) bond length is 1.92 Å. The Cu(6)-O(11) bond length is 1.95 Å. The Cu(6)-O(9) bond length is 2.58 Å. Both Cu(6)-O(1) bond lengths are 2.03 Å. In the seventh Cu site, Cu(7) is bonded to two equivalent O(11), two equivalent O(12), and two equivalent O(4) atoms to form CuO6 octahedra that share corners with two equivalent Mg(4)O6 octahedra, corners with two equivalent Cu(6)O5 square pyramids, and edges with two equivalent Mg(5)O6 octahedra. The corner-sharing octahedral tilt angles are 22°. Both Cu(7)-O(11) bond lengths are 2.20 Å. Both Cu(7)-O(12) bond lengths are 2.28 Å. Both Cu(7)-O(4) bond lengths are 2.02 Å. There are twelve inequivalent O sites. In the first O site, O(1) is bonded to one Mg(1), one Mg(2), one Cu(1), one Cu(4), and one Cu(6) atom to form OMg2Cu3 square pyramids that share corners with two equivalent O(1)Mg2Cu3 square pyramids, corners with four equivalent O(2)Mg2Cu3 square pyramids,  an edgeedge with one O(1)Mg2Cu3 square pyramid,  an edgeedge with one O(7)Mg2Cu3 square pyramid, and  an edgeedge with one O(11)MgCu4 square pyramid. In the second O site, O(2) is bonded to one Mg(1), one Mg(2), one Cu(2), one Cu(3), and one Cu(5) atom to form OMg2Cu3 square pyramids that share corners with two equivalent O(2)Mg2Cu3 square pyramids, corners with four equivalent O(1)Mg2Cu3 square pyramids,  an edgeedge with one O(2)Mg2Cu3 square pyramid,  an edgeedge with one O(8)Mg2Cu3 square pyramid, and  an edgeedge with one O(12)MgCu4 square pyramid. In the third O site, O(3) is bonded in a 4-coordinate geometry to one Mg(3), one Mg(5), and two equivalent Cu(1) atoms. In the fourth O site, O(4) is bonded in a 4-coordinate geometry to one Mg(4), one Cu(7), and two equivalent Cu(2) atoms. In the fifth O site, O(5) is bonded in a 4-coordinate geometry to two equivalent Mg(1) and two equivalent Cu(3) atoms. In the sixth O site, O(6) is bonded in a 4-coordinate geometry to two equivalent Mg(2) and two equivalent Cu(4) atoms. In the seventh O site, O(7) is bonded to one Mg(3), one Mg(4), one Cu(4), and two equivalent Cu(1) atoms to form OMg2Cu3 square pyramids that share corners with two equivalent O(11)MgCu4 square pyramids, corners with four equivalent O(8)Mg2Cu3 square pyramids,  an edgeedge with one O(7)Mg2Cu3 square pyramid, and edges with two equivalent O(1)Mg2Cu3 square pyramids. In the eighth O site, O(8) is bonded to one Mg(3), one Mg(4), one Cu(3), and two equivalent Cu(2) atoms to form OMg2Cu3 square pyramids that share corners with two equivalent O(12)MgCu4 square pyramids, corners with four equivalent O(7)Mg2Cu3 square pyramids,  an edgeedge with one O(8)Mg2Cu3 square pyramid, and edges with two equivalent O(2)Mg2Cu3 square pyramids. In the ninth O site, O(9) is bonded in a 6-coordinate geometry to two equivalent Mg(1), two equivalent Cu(5), and two equivalent Cu(6) atoms. In the tenth O site, O(10) is bonded in a distorted see-saw-like geometry to two equivalent Mg(2) and two equivalent Cu(6) atoms. In the eleventh O site, O(11) is bonded to one Mg(5), one Cu(6), one Cu(7), and two equivalent Cu(1) atoms to form distorted OMgCu4 square pyramids that share corners with two equivalent O(7)Mg2Cu3 square pyramids, corners with four equivalent O(12)MgCu4 square pyramids,  an edgeedge with one O(11)MgCu4 square pyramid, and edges with two equivalent O(1)Mg2Cu3 square pyramids. In the twelfth O site, O(12) is bonded to one Mg(5), one Cu(5), one Cu(7), and two equivalent Cu(2) atoms to form distorted OMgCu4 square pyramids that share corners with two equivalent O(8)Mg2Cu3 square pyramids, corners with four equivalent O(11)MgCu4 square pyramids,  an edgeedge with one O(12)MgCu4 square pyramid, and edges with two equivalent O(2)Mg2Cu3 square pyramids."  # 超长文本示例

# 获取短文本向量表示
short_vector = get_text_vector(short_text, model, tokenizer, max_tokens=max_tokens)
print(short_vector)
print("短文本向量维度:", short_vector.shape)

# 获取长文本向量表示
long_vector = get_text_vector(long_text, model, tokenizer, max_tokens=max_tokens)
print(long_vector)
print("长文本向量维度:", long_vector.shape)

短文本直接编码......
[ 0.51   -0.8125 -0.575  ...  0.2129 -0.6196  0.4744]
短文本向量维度: (5120,)
使用滑动窗口法处理长文本......
[ 0.01031 -0.4126  -0.981   ...  0.7314   0.375    0.6035 ]
长文本向量维度: (5120,)


In [8]:
import ast
import pandas as pd
# 读取文本文件
def read_texts_from_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()
    texts = ast.literal_eval(content)  # 将字符串解析为 Python 列表
    return texts

# 主函数：处理文本并保存到 CSV 文件
def process_texts_to_csv(input_file, output_csv, model, tokenizer, max_tokens=2048):
    # 读取文本列表
    texts = read_texts_from_file(input_file)
    
    # 初始化存储向量的列表
    data = []

    for text in texts:
        # 获取向量表示
        print(f"正在处理文本: {text[:30]}...")  # 打印前30字符用于调试
        vector = get_text_vector(text, model, tokenizer, max_tokens=max_tokens)
        
        # 保存文本及其对应向量
        data.append([text] + vector.tolist())
    
    # 创建列名：第一列为 "Word"，后续为 "dim_0" 到 "dim_5119"
    column_names = ["Word"] + [f"dim_{i}" for i in range(5120)]
    
    # 将数据转换为 DataFrame
    df = pd.DataFrame(data, columns=column_names)
    
    # 保存为 CSV 文件
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"结果已保存到 {output_csv}")

# 设置输入文件路径和输出文件路径
input_file = "mat.txt"  # 替换为你的文本文件名
output_csv = "transparent.csv"

# 调用主函数
process_texts_to_csv(input_file, output_csv, model, tokenizer, max_tokens=2048)

正在处理文本: PMMA...
短文本直接编码......
正在处理文本: PC...
短文本直接编码......
正在处理文本: PET...
短文本直接编码......
正在处理文本: PACM12...
短文本直接编码......
正在处理文本: C28H44N2O23S·Na...
短文本直接编码......
正在处理文本: (C₂H₃Cl)ₙ...
短文本直接编码......
正在处理文本: Al23O27N5...
短文本直接编码......
正在处理文本: Na2O·CaO·6SiO2...
短文本直接编码......
结果已保存到 transparent.csv
