In [1]:
import numpy as np
from sklearn.manifold import TSNE
import torch
from transformers import BertConfig, BertModel, BertTokenizer
from transformers import BertForMaskedLM, BertTokenizerFast, pipeline
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
model_id = "/share/home/pwmat/Huggingface_Model_Downloades/MatBERT_model/matbert-base-cased"
model = BertForMaskedLM.from_pretrained(model_id)
tokenizer = BertTokenizerFast.from_pretrained(model_id, do_lower_case=False)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [3]:
# 提取所有词向量
all_word_embeddings = []
all_words = []
for word in tokenizer.vocab.keys():
    if word.isalnum():  # 只选择字母数字组合的词，排除特殊标记
        token_id = tokenizer.encode(word, add_special_tokens=False)
        if len(token_id) == 1:  # 确保词没有分词成多个子词
            token_id = torch.tensor(token_id).unsqueeze(0)
            embeddings = model.bert.embeddings.word_embeddings(token_id)
            all_word_embeddings.append(embeddings.squeeze().detach().numpy())
            all_words.append(word)

all_word_embeddings = np.array(all_word_embeddings)

In [4]:
%%time
from umap import UMAP
# 使用UMAP进行降维
reducer = UMAP()
embeddings_umap = reducer.fit_transform(all_word_embeddings)

CPU times: user 2min 23s, sys: 5.82 s, total: 2min 29s
Wall time: 22.4 s


In [5]:
 # 目标词汇1
word3_token = tokenizer.encode("photovoltaic", add_special_tokens=False)
word3_embedding = model.bert.embeddings.word_embeddings(torch.tensor(word3_token)).squeeze().detach().numpy()
cos_word3 = cosine_similarity(all_word_embeddings, [word3_embedding])

In [6]:
# 定义要比较的词
target_word1 = "photovoltaic"
# 计算目标词与所有其他词的余弦相似度
target_token_id = tokenizer.encode(target_word1, add_special_tokens=False)
target_embedding = model.bert.embeddings.word_embeddings(torch.tensor(target_token_id)).squeeze().detach().numpy()
cos_similarities2 = cosine_similarity(all_word_embeddings, [target_embedding])
# 对相似度进行排序并获取排序后的索引
sorted_indices = np.argsort(cos_similarities2.flatten())[::-1]

# 打印排名和相似度
sum = []
# print("cosine similarity for '{}':".format(target_word1))
i = 0
for rank, idx in enumerate(sorted_indices):
    word = all_words[idx]
    similarity = cos_similarities2[idx][0]
    # print("{}: {}(Cosine Similarity: {:.4f})".format(rank + 1, word, similarity))
    sum.append(word)
    i += 1
    if i == 500:
        break

In [11]:
# 定义要比较的词
target_word1 = "photovoltaic"
# 计算目标词与所有其他词的余弦相似度
target_token_id = tokenizer.encode(target_word1, add_special_tokens=False)
target_embedding = model.bert.embeddings.word_embeddings(torch.tensor(target_token_id)).squeeze().detach().numpy()
cos_similarities2 = cosine_similarity(all_word_embeddings, [target_embedding])

# 对相似度进行排序并获取排序后的索引
sorted_indices = np.argsort(cos_similarities2.flatten())[::-1]

# 保存词汇及其余弦相似度和排名
word_rankings = {}

# 打印排名和相似度，并将其保存到字典中
for rank, idx in enumerate(sorted_indices):
    word = all_words[idx]
    similarity = cos_similarities2[idx][0]
    word_rankings[word] = {"rank": rank + 1, "similarity": similarity}  # 保存排名和相似度
    # 如果只需要前500个，可以加一个限制
    if rank == 499:
        break

words = ['DSSCs', 'DSSC', 'DSCs', 'SOFCs', 'MFCs', 'DMFC', 'PSCs', 'OLEDs', 'LIBs', 'SOFC', 'OPV', 'OLED', 'PEMFC', 'PV',
         'LEDs', 'PMT', 'PEC', 'PECVD', 'LiCoO2', 'LiNbO3', 'Li4Ti5O12', 'LiClO4', 'Li', 'Ni', 'SnS2', 'TiO2', 'MoSe2', 'SiH4', 
         'CdSe', 'Na2HPO4', 'Bi2Te3', 'SnS', 'Bi2Se3', 'PCBM', 'PC71BM', 'Si', 'GaSb', 'PbSe', 'PbTe', 'CuInS2', 'CuInSe2', 'CIGS', 
         'MAPbI3', 'CH3NH3PbI3', 'Bi2S3', 'Bi2WO6', 'BiVO4', 'Al2O3', 'ZnO', 'CuO', 'CdTe', 'ZnS', 'SnO2', 'Na2Ti3O7', 'MnO2', 'P25', 
         'Y2O3', 'Sb2S3', 'SiO2']


# 示例：查询某个词的排名和相似度
for word in words:
    word_to_query = word  # 你想查询的词
    if word_to_query in word_rankings:
        print(f"'{word_to_query}' - Rank: {word_rankings[word_to_query]['rank']}, Cosine Similarity: {word_rankings[word_to_query]['similarity']:.4f}")
    else:
        print(f"'{word_to_query}' not found in the rankings.")

# 如果你想查看所有词的排名和相似度，可以遍历字典
# for word, info in word_rankings.items():
#     print(f"{word}: Rank {info['rank']}, Similarity {info['similarity']:.4f}")


'DSSCs' - Rank: 11, Cosine Similarity: 0.5105
'DSSC' - Rank: 7, Cosine Similarity: 0.5387
'DSCs' - Rank: 53, Cosine Similarity: 0.4643
'SOFCs' - Rank: 129, Cosine Similarity: 0.4270
'MFCs' - Rank: 419, Cosine Similarity: 0.3974
'DMFC' - Rank: 286, Cosine Similarity: 0.4053
'PSCs' - Rank: 33, Cosine Similarity: 0.4793
'OLEDs' - Rank: 278, Cosine Similarity: 0.4058
'LIBs' not found in the rankings.
'SOFC' - Rank: 97, Cosine Similarity: 0.4367
'OPV' - Rank: 10, Cosine Similarity: 0.5187
'OLED' - Rank: 122, Cosine Similarity: 0.4279
'PEMFC' not found in the rankings.
'PV' - Rank: 16, Cosine Similarity: 0.4984
'LEDs' - Rank: 452, Cosine Similarity: 0.3956
'PMT' not found in the rankings.
'PEC' not found in the rankings.
'PECVD' - Rank: 259, Cosine Similarity: 0.4075
'LiCoO2' - Rank: 191, Cosine Similarity: 0.4164
'LiNbO3' - Rank: 469, Cosine Similarity: 0.3947
'Li4Ti5O12' - Rank: 221, Cosine Similarity: 0.4124
'LiClO4' - Rank: 334, Cosine Similarity: 0.4019
'Li' not found in the rankings.
'

In [18]:
print(sum)

['photovoltaic', 'photovoltaics', 'photoelectrochemical', 'optoelectronic', 'photoelectric', 'photovoltage', 'DSSC', 'photoelectro', 'photoconduc', 'OPV', 'DSSCs', 'photochromic', 'IPCE', 'photodetectors', 'thermoelectric', 'PV', 'photoanodes', 'photophysical', 'optoelectronics', 'photodiode', 'Jsc', 'photoconductivity', 'photodetector', 'photocurrent', 'photoactivity', 'photosensitiz', 'microalgal', 'photogen', 'photoanode', 'supercapacitor', 'Photoluminescence', 'electroluminescence', 'PSCs', 'electrochromic', 'antiproliferative', 'supercapacitors', 'photoactive', 'Electrochemical', 'sensitizers', 'photosensitizer', 'photosensi', 'photothermal', 'electroly', 'CuInS2', 'Solar', 'photoexc', 'CuIn', 'pervaporation', 'photovolta', 'Electricity', 'pyroelectric', 'supercapac', 'DSCs', 'electrol', 'Photoc', 'photocatalytic', 'CIGS', 'Josephson', 'photoan', 'polyvinyl', 'electropor', 'microalg', 'photoresponse', 'electrocatalytic', 'photocatalysts', 'Neural', 'photoproduc', 'JSC', 'semicondu

#### ChatGPT提取出的化学式（余弦相似性排名前500）  
['DSSC', 'OPV', 'DSSCs', 'IPCE', 'PV', 'PSCs', 'CuInS2', 'CuIn', 'DSCs', 'CIGS', 'JSC', 'BHJ', 'CH3NH3PbI3', 'SOFC', 
'InGa', 'OLED', 'HVAC', 'Bi2WO6', 'MAPbI3', 'SOFCs', 'Bi2Te3', 'EQE', 'BiVO4', 'Sb2', 'PC71BM', 'SHEL', 'GaSb', 'ZnPc', 
'GaIn', 'PbI2', 'LiCoO2', 'PbTe', 'BiOCl', 'PCBM', 'PtRu', 'TFTs', 'HeLa', 'Li4Ti5O12', 'CZTS', 'S14', 'LaNi', 'MoSe2', 
'HEK293', 'FETs', 'PECVD', 'OLEDs', 'DMFC', 'CuPc', 'Bi2S3', 'SiNWs', 'PDT', 'LiClO4', 'MEH', 'PbSe', 'SnS', 'Ti6', 'Bi2Se3', 
'MFCs', 'PPh2', 'MOSFETs', 'CdSe', 'PVT', 'SiH4', 'Na2HPO4', 'LEDs', 'MWh', 'SnS2', 'LiNbO3', 'Mo6']


In [19]:
 # 目标词汇1
word3_token = tokenizer.encode("perovskite", add_special_tokens=False)
word3_embedding = model.bert.embeddings.word_embeddings(torch.tensor(word3_token)).squeeze().detach().numpy()
cos_word3 = cosine_similarity(all_word_embeddings, [word3_embedding])

In [20]:
# 定义要比较的词
target_word1 = "perovskite"
# 计算目标词与所有其他词的余弦相似度
target_token_id = tokenizer.encode(target_word1, add_special_tokens=False)
target_embedding = model.bert.embeddings.word_embeddings(torch.tensor(target_token_id)).squeeze().detach().numpy()
cos_similarities2 = cosine_similarity(all_word_embeddings, [target_embedding])
# 对相似度进行排序并获取排序后的索引
sorted_indices = np.argsort(cos_similarities2.flatten())[::-1]

# 打印排名和相似度
sum = []
# print("cosine similarity for '{}':".format(target_word1))
i = 0
for rank, idx in enumerate(sorted_indices):
    word = all_words[idx]
    similarity = cos_similarities2[idx][0]
    # print("{}: {}(Cosine Similarity: {:.4f})".format(rank + 1, word, similarity))
    sum.append(word)
    i += 1
    if i == 500:
        break

In [21]:
print(sum)

['perovskite', 'perovskites', 'pyrochlore', 'CH3NH3PbI3', 'MAPbI3', 'spinel', 'fluorite', 'perovsk', 'ferroelectricity', 'BaTiO3', 'PbI2', 'SrTiO3', 'CH3NH3', 'ferroelectrics', 'ilmenite', 'BiFeO3', 'ferroelectric', 'manganites', 'lanthanum', 'wurtzite', 'LSCF', 'PSCs', 'apatite', 'superlattice', 'hematite', 'lanthan', 'Heusler', 'La2', 'chalcopyrite', 'La2O3', 'relaxor', 'octahedra', 'CZTS', 'Keggin', 'rutile', 'mullite', 'Sr2', 'commercialization', 'BFO', 'PZT', 'titanate', 'rhombohedral', 'BTO', 'PbSe', 'La0', 'perchlor', 'La3', 'limestones', 'photosynthesis', 'orthorhombic', 'Bi2WO6', 'PbO', 'strontium', 'FOX', 'hydrotalcite', 'BHJ', 'mesoporous', 'irreduc', 'ferrites', 'Nb2O5', 'SOFCs', 'photosynthetic', 'anatase', 'phosphines', 'photoanode', 'olivine', 'HfO2', 'SrO', 'CIGS', 'CoFe2O4', 'PANalytical', 'PbTe', 'LiCoO2', 'sple', 'opal', 'auth', 'photoble', 'MnOx', 'pseudoc', 'BiOCl', 'faradaic', 'pyroelectric', 'poled', 'zeolitic', 'superstruct', 'ineff', 'heterostructures', 'L10', 