In [1]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
import os

# 显示可用 GPU 信息
if torch.cuda.is_available():
    print(f"Available GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available.")

# 加载模型和分词器
model_name_or_id = "/share/home/pwmat/Huggingface_Model_Downloades/ChemDFM-13B-v1.0"
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_id)
model = LlamaForCausalLM.from_pretrained(model_name_or_id, torch_dtype=torch.float16, device_map="auto")

# 查看模型实际分配到的 GPU
print("\nModel loaded on devices:")
for param in model.parameters():
    if param.device.type == "cuda":
        print(f" - {param.device}")
    break  # 只需查看一个参数的设备分配


Available GPUs: 4
GPU 0: NVIDIA GeForce RTX 4090 D
GPU 1: NVIDIA GeForce RTX 4090 D
GPU 2: NVIDIA GeForce RTX 4090 D
GPU 3: NVIDIA GeForce RTX 4090 D


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]


Model loaded on devices:
 - cuda:0


In [2]:
# 定义 Prompt
input_text = (
    "Provide a highly detailed and mechanistic description of photovoltaic materials, "
    "focusing exclusively on their structural and functional aspects. "
    "Include information about crystal structures (e.g., symmetry, space group, lattice constants), "
    "bond lengths, bond angles, and their electronic and optical properties relevant to the photovoltaic effect. "
    "Discuss specific mechanisms such as charge separation, exciton dynamics, and bandgap engineering. "
    "Avoid general and superficial explanations. Only include the most technical and professional content."
)
input_text = f"[Round 0]\nHuman: {input_text}\nAssistant:"

# 分词输入
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# 配置生成参数
generation_config = GenerationConfig(
    do_sample=True,  # 开启随机采样生成，允许模型以非确定性方式生成文本。
    top_k=50,   # 限制模型在每一步生成时仅考虑概率前 20 的候选词（减少搜索空间）
    top_p=0.7,  # 采用 核采样 (nucleus sampling)，仅累积概率前 90% 的词作为候选。
    temperature=0.9, # 控制生成的随机性，值越低生成越保守，值越高生成越多样。
    max_new_tokens=1024, # 允许生成的最大 token 数。
    repetition_penalty=1.05, # 惩罚重复生成的词，使得生成的文本更多样。
    eos_token_id=tokenizer.eos_token_id  # 生成遇到结束标记时停止。
)

# 生成文本
with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)

# 解码生成结果
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(input_text):]

print("生成的描述:")
print(generated_text)

生成的描述:
 Photovoltaic materials are materials that convert light energy into electrical energy through the photovoltaic effect. The most commonly used photovoltaic materials are silicon-based, but there are also other materials such as thin-film technologies based on cadmium telluride (CdTe), copper indium gallium selenide (CIGS), and perovskites.
The crystal structure of photovoltaic materials is important because it determines the electronic and optical properties of the material. Silicon, for example, has a diamond-like crystal structure with a cubic lattice. The silicon atoms are arranged in a regular pattern, with each atom surrounded by four other atoms at the corners of a square. This arrangement results in a high degree of order and periodicity, which is important for the efficient movement of electrons through the material.
The electronic properties of photovoltaic materials are determined by the bond lengths and bond angles between the atoms. In silicon, for example, the bond 

In [3]:
import numpy as np

# 生成多次描述
num_generations = 20
description_vectors = []
description_texts = []  # 存储描述的文本
for i in range(num_generations):
    with torch.no_grad():
        inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, generation_config=generation_config)
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(input_text):]
    print(f"生成的描述{i}")
    print(generated_text)
    # 保存描述文本
    description_texts.append(generated_text)
    
    # 对每次生成的描述进行编码
    inputs = tokenizer(generated_text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-1]
    vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
    description_vectors.append(vector)

print("20个代表性词向量:", description_vectors)
print("每个代表性词向量维度:", len(description_vectors[0]))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


生成的描述0
 Photovoltaic materials are semiconductors that convert sunlight into electricity through the photovoltaic effect. The efficiency of a photovoltaic material is determined by its ability to absorb light and generate charge carriers. The most commonly used photovoltaic materials are silicon (Si) and thin-film technologies such as cadmium telluride (CdTe) and copper indium gallium selenide (CIGS).
Si is a crystalline material with a diamond cubic structure (space group Fd3m). The Si atoms are arranged in a regular pattern, with each atom surrounded by four neighbors at the corners of a tetrahedron. The Si-Si bond length is approximately 2.35 angstroms, and the bond angle is 109.47 degrees. The electronic band structure of Si has an indirect bandgap of 1.12 eV, which means that it can only absorb light with wavelengths less than 1100 nm. This limits the efficiency of Si-based solar cells, as they can only capture a small portion of the solar spectrum.
CdTe is a direct bandgap semico

In [4]:
import pandas as pd
# 保存结果到 CSV 文件
embedding_columns = [f"Dim_{i}" for i in range(len(description_vectors[0]))]
df = pd.DataFrame(description_vectors, columns=embedding_columns)
df.insert(0, "Description", description_texts)
output_file = "expertise_description_vectors.csv"
df.to_csv(output_file, index=False)

print(f"独立生成了 {len(description_texts)} 个描述，并保存到 {output_file}")

独立生成了 20 个描述，并保存到 expertise_description_vectors.csv
