In [1]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
import os

# 显示可用 GPU 信息
if torch.cuda.is_available():
    print(f"Available GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available.")

# 加载模型和分词器
model_name_or_id = "/share/home/pwmat/Huggingface_Model_Downloades/ChemDFM-13B-v1.0"
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_id)
model = LlamaForCausalLM.from_pretrained(model_name_or_id, torch_dtype=torch.float16, device_map="auto")

# 查看模型实际分配到的 GPU
print("\nModel loaded on devices:")
for param in model.parameters():
    if param.device.type == "cuda":
        print(f" - {param.device}")
    break  # 只需查看一个参数的设备分配


Available GPUs: 4
GPU 0: NVIDIA GeForce RTX 4090 D
GPU 1: NVIDIA GeForce RTX 4090 D
GPU 2: NVIDIA GeForce RTX 4090 D
GPU 3: NVIDIA GeForce RTX 4090 D


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]


Model loaded on devices:
 - cuda:0


In [5]:
# 定义 prompt
input_text = (
    "List strictly the chemical formulas of photovoltaic materials or potential photovoltaic materials. "
    "Do not include names, examples, or descriptions. Only provide chemical formulas."
)
input_text = f"[Round 0]\nHuman: {input_text}\nAssistant:"

# 分词输入
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# 配置生成参数
generation_config = GenerationConfig(
    do_sample=False,  # 禁用随机采样
    temperature=0.5,  # 更保守的生成
    max_new_tokens=256,  # 限制生成长度
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id,
)

# 生成文本
with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)

# 解码生成结果
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(input_text):]
print(generated_text)



 Sure! Here are some common materials used in solar cells and their corresponding chemical formulas:
1. Silicon (Si) - this is the most commonly used material for solar cells due to its high efficiency and abundance. It forms tetrahedral molecules with four bonds to other silicon atoms.
2. Gallium Arsenide (GaAs) - this material has a higher absorption coefficient than silicon, making it more efficient at converting sunlight into electricity. Its formula is GaAs.
3. Cadmium Telluride (CdTe) - this material also has a high absorption coefficient and is less expensive to produce than silicon-based cells. Its formula is CdTe.
4. Copper Indium Gallium Selenide (CIGS) - this material has similar properties to cadmium telluride but is even cheaper to produce. Its formula is Cu(In, Ga)(Se)2.
5. Perovskite - this material has gained attention recently as a low-cost alternative to traditional solar cell materials. Its formula varies depending on the specific perovskite compound, but generally i

In [2]:
# 定义 Prompt
input_text = (
    "Describe the key characteristics, properties, and components of photovoltaic materials in a concise, "
    "representative manner. Use technical language, and highlight the most significant aspects."
)
input_text = f"[Round 0]\nHuman: {input_text}\nAssistant:"

# 分词输入
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# 配置生成参数
generation_config = GenerationConfig(
    do_sample=True,  # 开启随机采样生成，允许模型以非确定性方式生成文本。
    top_k=20,   # 限制模型在每一步生成时仅考虑概率前 20 的候选词（减少搜索空间）
    top_p=0.9,  # 采用 核采样 (nucleus sampling)，仅累积概率前 90% 的词作为候选。
    temperature=0.9, # 控制生成的随机性，值越低生成越保守，值越高生成越多样。
    max_new_tokens=1024, # 允许生成的最大 token 数。
    repetition_penalty=1.05, # 惩罚重复生成的词，使得生成的文本更多样。
    eos_token_id=tokenizer.eos_token_id  # 生成遇到结束标记时停止。
)

# 生成文本
with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)

# 解码生成结果
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(input_text):]

print("生成的描述:")
print(generated_text)

生成的描述:
 Photovoltaic materials are characterized by their ability to convert sunlight directly into electricity. Key properties include high absorption coefficients, low resistivity, and long carrier lifetimes. Common components include silicon-based materials, thin-film technologies such as cadmium telluride (CdTe) or copper indium gallium selenide (CIGS), and perovskite compounds. These materials undergo photovoltaic processes like photoexcitation, carrier transport, and collection, resulting in electric current.


In [10]:
# 对生成的描述进行编码
inputs = tokenizer(generated_text, return_tensors="pt", padding=True, truncation=True).to("cuda")

with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# 提取最后一层隐藏状态，并计算描述的整体词向量
hidden_states = outputs.hidden_states[-1]  # (batch_size, seq_len, hidden_size)
description_vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
print(description_vector)
print("描述的词向量维度:", description_vector.shape)


[ 0.49    -0.07166 -0.3694  ...  0.872    0.1774   0.641  ]
描述的词向量维度: (5120,)


In [5]:
import numpy as np

# 生成多次描述
num_generations = 20
description_vectors = []
description_texts = []  # 存储描述的文本
for i in range(num_generations):
    with torch.no_grad():
        inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, generation_config=generation_config)
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(input_text):]
    print(f"生成的描述{i}")
    print(generated_text)
    # 保存描述文本
    description_texts.append(generated_text)
    
    # 对每次生成的描述进行编码
    inputs = tokenizer(generated_text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-1]
    vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
    description_vectors.append(vector)

print("20个代表性词向量:", description_vectors)
print("每个代表性词向量维度:", len(description_vectors[0]))

生成的描述0
 Photovoltaic materials are characterized by their ability to convert sunlight directly into electricity through the photovoltaic effect. These materials are typically based on semiconductors, such as silicon, and exhibit a range of properties that make them suitable for this purpose. These include:
1. Bandgap energy: The bandgap energy determines the amount of energy required to excite an electron from the valence band to the conduction band, which is crucial for generating electric current.
2. Conductivity type: Photovoltaic materials can be either n-type (containing excess electrons) or p-type (containing excess holes). The combination of these types forms a pn junction, which is essential for the photovoltaic effect.
3. Carrier mobility: High carrier mobility enables efficient transport of charge carriers, reducing recombination losses and improving overall efficiency.
4. Transparency: Transparent conductive oxides like indium tin oxide (ITO) allow light to pass through whil

In [7]:
import pandas as pd
# 保存结果到 CSV 文件
embedding_columns = [f"Dim_{i}" for i in range(len(description_vectors[0]))]
df = pd.DataFrame(description_vectors, columns=embedding_columns)
df.insert(0, "Description", description_texts)
output_file = "generalization_description_vectors.csv"
df.to_csv(output_file, index=False)

print(f"独立生成了 {len(description_texts)} 个描述，并保存到 {output_file}")

独立生成了 20 个描述，并保存到 generalization_description_vectors.csv


In [14]:
print(model.config.max_position_embeddings)
# 输入段落
input_text = (
    "Mg7Cu17O24 crystallizes in the orthorhombic Pmm2 space group. There are five inequivalent Mg sites. In the first Mg site, Mg(1) is bonded to one O(5), one O(9), two equivalent O(1), and two equivalent O(2) atoms to form MgO6 octahedra that share corners with two equivalent Mg(1)O6 octahedra, edges with two equivalent Mg(2)O6 octahedra, and edges with two equivalent Cu(6)O5 square pyramids. The corner-sharing octahedral tilt angles range from 18-20°. The Mg(1)-O(5) bond length is 2.05 Å. The Mg(1)-O(9) bond length is 2.05 Å. Both Mg(1)-O(1) bond lengths are 2.15 Å. Both Mg(1)-O(2) bond lengths are 2.20 Å. In the second Mg site, Mg(2) is bonded to one O(10), one O(6), two equivalent O(1), and two equivalent O(2) atoms to form MgO6 octahedra that share corners with two equivalent Mg(2)O6 octahedra, edges with two equivalent Mg(1)O6 octahedra, and edges with two equivalent Cu(6)O5 square pyramids. The corner-sharing octahedral tilt angles are 20°. The Mg(2)-O(10) bond length is 2.05 Å. The Mg(2)-O(6) bond length is 2.06 Å. Both Mg(2)-O(1) bond lengths are 2.20 Å. Both Mg(2)-O(2) bond lengths are 2.15 Å. In the third Mg site, Mg(3) is bonded to two equivalent O(3), two equivalent O(7), and two equivalent O(8) atoms to form a mixture of edge and corner-sharing MgO6 octahedra. The corner-sharing octahedral tilt angles are 20°. Both Mg(3)-O(3) bond lengths are 2.05 Å. Both Mg(3)-O(7) bond lengths are 2.20 Å. Both Mg(3)-O(8) bond lengths are 2.15 Å. In the fourth Mg site, Mg(4) is bonded to two equivalent O(4), two equivalent O(7), and two equivalent O(8) atoms to form MgO6 octahedra that share corners with two equivalent Cu(7)O6 octahedra and edges with two equivalent Mg(3)O6 octahedra. The corner-sharing octahedral tilt angles are 22°. Both Mg(4)-O(4) bond lengths are 2.10 Å. Both Mg(4)-O(7) bond lengths are 2.13 Å. Both Mg(4)-O(8) bond lengths are 2.17 Å. In the fifth Mg site, Mg(5) is bonded to two equivalent O(11), two equivalent O(12), and two equivalent O(3) atoms to form MgO6 octahedra that share corners with two equivalent Mg(3)O6 octahedra, corners with two equivalent Cu(6)O5 square pyramids, and edges with two equivalent Cu(7)O6 octahedra. The corner-sharing octahedral tilt angles are 20°. Both Mg(5)-O(11) bond lengths are 2.19 Å. Both Mg(5)-O(12) bond lengths are 2.15 Å. Both Mg(5)-O(3) bond lengths are 2.05 Å. There are seven inequivalent Cu sites. In the first Cu site, Cu(1) is bonded in a distorted rectangular see-saw-like geometry to one O(1), one O(11), one O(3), and one O(7) atom. The Cu(1)-O(1) bond length is 1.95 Å. The Cu(1)-O(11) bond length is 2.02 Å. The Cu(1)-O(3) bond length is 1.92 Å. The Cu(1)-O(7) bond length is 2.04 Å. In the second Cu site, Cu(2) is bonded in a distorted rectangular see-saw-like geometry to one O(12), one O(2), one O(4), and one O(8) atom. The Cu(2)-O(12) bond length is 2.02 Å. The Cu(2)-O(2) bond length is 1.95 Å. The Cu(2)-O(4) bond length is 1.92 Å. The Cu(2)-O(8) bond length is 2.04 Å. In the third Cu site, Cu(3) is bonded in a distorted rectangular see-saw-like geometry to one O(5), one O(8), and two equivalent O(2) atoms. The Cu(3)-O(5) bond length is 1.93 Å. The Cu(3)-O(8) bond length is 1.97 Å. Both Cu(3)-O(2) bond lengths are 2.04 Å. In the fourth Cu site, Cu(4) is bonded in a distorted rectangular see-saw-like geometry to one O(6), one O(7), and two equivalent O(1) atoms. The Cu(4)-O(6) bond length is 1.93 Å. The Cu(4)-O(7) bond length is 1.96 Å. Both Cu(4)-O(1) bond lengths are 2.03 Å. In the fifth Cu site, Cu(5) is bonded in a distorted rectangular see-saw-like geometry to one O(12), one O(9), and two equivalent O(2) atoms. The Cu(5)-O(12) bond length is 1.93 Å. The Cu(5)-O(9) bond length is 1.91 Å. Both Cu(5)-O(2) bond lengths are 2.03 Å. In the sixth Cu site, Cu(6) is bonded to one O(10), one O(11), one O(9), and two equivalent O(1) atoms to form distorted CuO5 square pyramids that share  a cornercorner with one Mg(5)O6 octahedra,  a cornercorner with one Cu(7)O6 octahedra, edges with two equivalent Mg(1)O6 octahedra, edges with two equivalent Mg(2)O6 octahedra, and  an edgeedge with one Cu(6)O5 square pyramid. The corner-sharing octahedral tilt angles range from 15-78°. The Cu(6)-O(10) bond length is 1.92 Å. The Cu(6)-O(11) bond length is 1.95 Å. The Cu(6)-O(9) bond length is 2.58 Å. Both Cu(6)-O(1) bond lengths are 2.03 Å. In the seventh Cu site, Cu(7) is bonded to two equivalent O(11), two equivalent O(12), and two equivalent O(4) atoms to form CuO6 octahedra that share corners with two equivalent Mg(4)O6 octahedra, corners with two equivalent Cu(6)O5 square pyramids, and edges with two equivalent Mg(5)O6 octahedra. The corner-sharing octahedral tilt angles are 22°. Both Cu(7)-O(11) bond lengths are 2.20 Å. Both Cu(7)-O(12) bond lengths are 2.28 Å. Both Cu(7)-O(4) bond lengths are 2.02 Å. There are twelve inequivalent O sites. In the first O site, O(1) is bonded to one Mg(1), one Mg(2), one Cu(1), one Cu(4), and one Cu(6) atom to form OMg2Cu3 square pyramids that share corners with two equivalent O(1)Mg2Cu3 square pyramids, corners with four equivalent O(2)Mg2Cu3 square pyramids,  an edgeedge with one O(1)Mg2Cu3 square pyramid,  an edgeedge with one O(7)Mg2Cu3 square pyramid, and  an edgeedge with one O(11)MgCu4 square pyramid. In the second O site, O(2) is bonded to one Mg(1), one Mg(2), one Cu(2), one Cu(3), and one Cu(5) atom to form OMg2Cu3 square pyramids that share corners with two equivalent O(2)Mg2Cu3 square pyramids, corners with four equivalent O(1)Mg2Cu3 square pyramids,  an edgeedge with one O(2)Mg2Cu3 square pyramid,  an edgeedge with one O(8)Mg2Cu3 square pyramid, and  an edgeedge with one O(12)MgCu4 square pyramid. In the third O site, O(3) is bonded in a 4-coordinate geometry to one Mg(3), one Mg(5), and two equivalent Cu(1) atoms. In the fourth O site, O(4) is bonded in a 4-coordinate geometry to one Mg(4), one Cu(7), and two equivalent Cu(2) atoms. In the fifth O site, O(5) is bonded in a 4-coordinate geometry to two equivalent Mg(1) and two equivalent Cu(3) atoms. In the sixth O site, O(6) is bonded in a 4-coordinate geometry to two equivalent Mg(2) and two equivalent Cu(4) atoms. In the seventh O site, O(7) is bonded to one Mg(3), one Mg(4), one Cu(4), and two equivalent Cu(1) atoms to form OMg2Cu3 square pyramids that share corners with two equivalent O(11)MgCu4 square pyramids, corners with four equivalent O(8)Mg2Cu3 square pyramids,  an edgeedge with one O(7)Mg2Cu3 square pyramid, and edges with two equivalent O(1)Mg2Cu3 square pyramids. In the eighth O site, O(8) is bonded to one Mg(3), one Mg(4), one Cu(3), and two equivalent Cu(2) atoms to form OMg2Cu3 square pyramids that share corners with two equivalent O(12)MgCu4 square pyramids, corners with four equivalent O(7)Mg2Cu3 square pyramids,  an edgeedge with one O(8)Mg2Cu3 square pyramid, and edges with two equivalent O(2)Mg2Cu3 square pyramids. In the ninth O site, O(9) is bonded in a 6-coordinate geometry to two equivalent Mg(1), two equivalent Cu(5), and two equivalent Cu(6) atoms. In the tenth O site, O(10) is bonded in a distorted see-saw-like geometry to two equivalent Mg(2) and two equivalent Cu(6) atoms. In the eleventh O site, O(11) is bonded to one Mg(5), one Cu(6), one Cu(7), and two equivalent Cu(1) atoms to form distorted OMgCu4 square pyramids that share corners with two equivalent O(7)Mg2Cu3 square pyramids, corners with four equivalent O(12)MgCu4 square pyramids,  an edgeedge with one O(11)MgCu4 square pyramid, and edges with two equivalent O(1)Mg2Cu3 square pyramids. In the twelfth O site, O(12) is bonded to one Mg(5), one Cu(5), one Cu(7), and two equivalent Cu(2) atoms to form distorted OMgCu4 square pyramids that share corners with two equivalent O(8)Mg2Cu3 square pyramids, corners with four equivalent O(11)MgCu4 square pyramids,  an edgeedge with one O(12)MgCu4 square pyramid, and edges with two equivalent O(2)Mg2Cu3 square pyramids."
)

# 分词并转为张量
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to("cuda")

# 获取模型的隐藏状态
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# 提取最后一层隐藏状态
hidden_states = outputs.hidden_states[-1]  # (batch_size, seq_len, hidden_size)

# 计算段落的整体表示（取所有 token 的隐藏状态均值）
paragraph_vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()

print("段落的向量表示:")
print(paragraph_vector)
print("向量维度:", paragraph_vector.shape)

2048
段落的向量表示:
[ 0.0595 -0.4854 -0.5864 ...  0.564   0.2854  0.519 ]
向量维度: (5120,)


In [15]:
import numpy as np

window_size = 512  # 每段的 token 数
stride = 256       # 滑动窗口的步长

# 分割文本为多个子段
tokens = tokenizer(input_text, return_tensors="pt", truncation=False)["input_ids"][0]
segments = [tokens[i:i + window_size] for i in range(0, len(tokens), stride) if i + window_size <= len(tokens)]

# 对每个子段进行编码
segment_vectors = []
for segment in segments:
    inputs = {"input_ids": segment.unsqueeze(0).to("cuda")}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-1]
    vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
    segment_vectors.append(vector)

# 聚合子段向量为全局表示
overall_vector = np.mean(segment_vectors, axis=0)
print(overall_vector)
print("整体段落向量维度:", overall_vector.shape)


[ 0.01031 -0.4126  -0.981   ...  0.7314   0.375    0.6035 ]
整体段落向量维度: (5120,)
