In [1]:
!pip install flash_attn

[0m

In [3]:
import copy
import json
import os
from pathlib import Path
import sys
import warnings

import torch
from anndata import AnnData
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import tqdm
import gseapy as gp

from torchtext.vocab import Vocab
from torchtext._torchtext import (
    Vocab as VocabPybind,
)

sys.path.insert(0, "../")
import scgpt as scg
from scgpt.tasks import GeneEmbedding
from scgpt.tokenizer.gene_tokenizer import GeneVocab
from scgpt.model import TransformerModel
from scgpt.preprocess import Preprocessor
from scgpt.utils import set_seed 

os.environ["KMP_WARNINGS"] = "off"
warnings.filterwarnings('ignore')



In [4]:
set_seed(42)
pad_token = "<pad>" # 用于在处理文本数据时填充（pad）短于最大长度的序列。这样可以保持序列的统一长度

'''
special_tokens = [pad_token, "<cls>", "<eoc>"]：
    special_tokens 是一个包含特殊标记的列表。除了上面提到的 pad_token，列表中还包括：
    "<cls>"：通常用于表示一个序列的开始。
    "<eoc>"：代表一个序列的结束（End Of Content）
'''
# n_hvg 代表“高变异基因”的数量。在生物统计分析中，选择高变异基因（Highly Variable Genes）是为了捕捉细胞间的重要变异特征
special_tokens = [pad_token, "<cls>", "<eoc>"]
n_hvg = 1200
n_bins = 51
mask_value = -1
pad_value = -2
n_input_bins = n_bins

## Load pre-trained model¶

In [5]:
# here we load the pre-trained scGPT blood model

model_dir = "../pre_trained_model/scGPT_blood/"
print(model_dir)

model_config_file = model_dir +"args.json"
model_file = model_dir + "best_model.pt"
vocab_file = model_dir + "vocab.json"

# 加载和更新词汇表
vocab = GeneVocab.from_file(vocab_file)

# 检查特殊标记 special_tokens 是否存在于词汇表中，如果不存在，则添加到词汇表中
for s in special_tokens:
    if s not in vocab:
        vocab.append_token(s)

# Retrieve model parameters from config files
with open(model_config_file, "r") as f:
    model_configs = json.load(f)
print(
    f"Resume model from {model_file}, the model args will override the "
    f"config {model_config_file}."
)

'''
提取模型配置参数：
    embsize：模型中嵌入向量的大小。
    nhead：在模型中，多头注意力机制中的头数。
    d_hid：隐藏层的维度。
    nlayers：模型中的层数。
    n_layers_cls：分类层的数量，特定于模型的某些设计。
'''

embsize = model_configs["embsize"]
nhead = model_configs["nheads"]
d_hid = model_configs["d_hid"]
nlayers = model_configs["nlayers"]
n_layers_cls = model_configs["n_layers_cls"]

gene2idx = vocab.get_stoi()

../pre_trained_model/scGPT_blood/
Resume model from ../pre_trained_model/scGPT_blood/best_model.pt, the model args will override the config ../pre_trained_model/scGPT_blood/args.json.


In [6]:
print(len(gene2idx))

36574


### Check the model genes embeddings

In [15]:
gene2idx

{'hsa-mir-423': 36570,
 'ZZEF1': 36564,
 'ZYX': 36563,
 'ZYG11A': 36561,
 'ZXDB': 36559,
 'ZXDA': 36558,
 'ZW10': 36555,
 'ZUP1': 36554,
 'ZSWIM9': 36553,
 'ZSWIM3': 36546,
 'ZSCAN5C': 36542,
 'ZSCAN5B': 36541,
 'ZSCAN26': 36533,
 'ZSCAN23': 36531,
 'ZSCAN22': 36530,
 'ZSCAN21': 36529,
 'ZSCAN2': 36527,
 'ZSCAN16-AS1': 36525,
 'ZSCAN16': 36524,
 'ZSCAN12': 36523,
 'ZRANB3': 36519,
 'ZRANB1': 36515,
 'ZPLD1': 36513,
 'ZP4': 36510,
 'ZP3': 36509,
 'ZP1': 36507,
 'ZNRF3': 36503,
 'ZNRF1': 36501,
 'ZNRD2-AS1': 36500,
 'ZNNT1': 36498,
 'ZNHIT3': 36496,
 'ZNHIT2': 36495,
 'ZNF99': 36492,
 'ZNF880': 36484,
 'ZNF879': 36483,
 'ZNF875': 36481,
 'ZNF865': 36480,
 'ZNF860': 36478,
 'ZNF853': 36477,
 'ZNF85': 36474,
 'ZNF846': 36473,
 'ZNF841': 36469,
 'ZNF84': 36467,
 'ZNF837': 36465,
 'ZNF835': 36463,
 'ZNF831': 36462,
 'ZNF830': 36461,
 'ZNF813': 36453,
 'ZNF808': 36451,
 'ZNF804B': 36449,
 'ZNF80': 36446,
 'ZNF8-DT': 36445,
 'ZNF8': 36444,
 'ZNF799': 36443,
 'ZNF793-AS1': 36442,
 'ZNF791': 364

In [28]:
import pandas as pd

# 导入 CSV 文件
df = pd.read_csv('scaled_OLINK_data.csv')
header = df.columns # 取出表头（列名）
header_list = header.tolist() # 将表头转换为列表
header_list.pop(0) # 去除 header_list 中的第一个元素
processed_header_list = [element.split('_')[1] for element in header_list] # 处理每个元素，删除 'P_' 和 '_0'
protein_name_list = list(set(processed_header_list)) # 去重（不保留顺序）

print(type(protein_name_list))

<class 'list'>


In [29]:
protein_name_list

['TBL1X',
 'GASK1A',
 'MYH9',
 'ZP4',
 'COL3A1',
 'COL4A4',
 'ADA',
 'ITM2A',
 'AIF1L',
 'TGFBR2',
 'BEX3',
 'RBM17',
 'MAGEA3',
 'LAMP2',
 'TOP1MT',
 'APOE',
 'GPI',
 'FABP2',
 'LILRA2',
 'DNAJC21',
 'CFI',
 'IL10RB',
 'STC1',
 'COL5A1',
 'OGFR',
 'LAT2',
 'SH2D1A',
 'MORF4L1',
 'CD36',
 'ERBIN',
 'CD58',
 'SERPING1',
 'TP53',
 'AFAP1',
 'CUZD1',
 'GPR158',
 'GPKOW',
 'PFDN4',
 'TNFRSF11B',
 'SCARF2',
 'PXDNL',
 'ADGRB3',
 'MRC1',
 'TRIAP1',
 'PPP1R9B',
 'BNIP3L',
 'PCDHB15',
 'CD99',
 'NOTCH3',
 'JAM2',
 'EXOSC10',
 'PNLIPRP2',
 'TMPRSS11D',
 'FUT3',
 'SLC12A2',
 'PKLR',
 'EGF',
 'HEPH',
 'ACTN2',
 'RNF43',
 'TMED8',
 'SIGLEC10',
 'CASP8',
 'SLITRK6',
 'RAC3',
 'UBXN1',
 'CCL16',
 'DDHD2',
 'SPAG1',
 'TCL1B',
 'MCTS1',
 'PAIP2B',
 'NFKB1',
 'ARHGAP5',
 'ESPL1',
 'RNF41',
 'SETMAR',
 'CEP43',
 'PSMD1',
 'AK1',
 'TNFSF12',
 'SKAP2',
 'RILP',
 'OGA',
 'MGLL',
 'SKAP1',
 'KITLG',
 'SRPK2',
 'PPBP',
 'NAA10',
 'AMY2B',
 'KAZN',
 'PTS',
 'VPS28',
 'CD70',
 'TFF2',
 'AK2',
 'TERF1',
 'MAP1L

In [16]:
# 检查每个元素是否在 gene2idx 中找到对应的键
i = 0
for gene in unique_processed_header_list:
    if gene in gene2idx:
        i += 1
        # print(f"'{gene}' found in gene2idx with value {gene2idx[gene]}")
    else:
        print(f"'{gene}' not found in gene2idx")
print(len(unique_processed_header_list))
print(i)
print(len(unique_processed_header_list) - i)

'CD99' not found in gene2idx
'PNLIPRP2' not found in gene2idx
'LEG1' not found in gene2idx
'IL3RA' not found in gene2idx
'AKR7L' not found in gene2idx
'KIR2DS4' not found in gene2idx
'KIR2DL2' not found in gene2idx
'FHIP2A' not found in gene2idx
'BAP18' not found in gene2idx
'BTNL10' not found in gene2idx
'ANP32C' not found in gene2idx
'MENT' not found in gene2idx
'SARG' not found in gene2idx
'CERT' not found in gene2idx
'GATD3' not found in gene2idx
'HCG22' not found in gene2idx
'SIGLEC5' not found in gene2idx
'CSF2RA' not found in gene2idx
'GPR15L' not found in gene2idx
'NTproBNP' not found in gene2idx
'PALM2' not found in gene2idx
'LILRA3' not found in gene2idx
'WARS' not found in gene2idx
2920
2897
23


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 计算词汇表的长度，即模型需要处理的不同词汇的数量
ntokens = len(vocab)  # size of vocabulary

model = TransformerModel(
    ntokens,
    embsize,
    nhead,
    d_hid,
    nlayers,
    vocab=vocab,
    pad_value=pad_value,
    n_input_bins=n_input_bins,
)

try:
    model.load_state_dict(torch.load(model_file))
    print(f"Loading all model params from {model_file}")
except:
    # only load params that are in the model and match the size
    model_dict = model.state_dict()
    pretrained_dict = torch.load(model_file)
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items()
        if k in model_dict and v.shape == model_dict[k].shape
    }
    for k, v in pretrained_dict.items():
        print(f"Loading params {k} with shape {v.shape}")
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)

model.to(device)

Loading params encoder.embedding.weight with shape torch.Size([36574, 512])
Loading params encoder.enc_norm.weight with shape torch.Size([512])
Loading params encoder.enc_norm.bias with shape torch.Size([512])
Loading params value_encoder.linear1.weight with shape torch.Size([512, 1])
Loading params value_encoder.linear1.bias with shape torch.Size([512])
Loading params value_encoder.linear2.weight with shape torch.Size([512, 512])
Loading params value_encoder.linear2.bias with shape torch.Size([512])
Loading params value_encoder.norm.weight with shape torch.Size([512])
Loading params value_encoder.norm.bias with shape torch.Size([512])
Loading params transformer_encoder.layers.0.self_attn.out_proj.weight with shape torch.Size([512, 512])
Loading params transformer_encoder.layers.0.self_attn.out_proj.bias with shape torch.Size([512])
Loading params transformer_encoder.layers.0.linear1.weight with shape torch.Size([512, 512])
Loading params transformer_encoder.layers.0.linear1.bias with 

TransformerModel(
  (encoder): GeneEncoder(
    (embedding): Embedding(36574, 512, padding_idx=36571)
    (enc_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (value_encoder): ContinuousValueEncoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (linear1): Linear(in_features=1, out_features=512, bias=True)
    (activation): ReLU()
    (linear2): Linear(in_features=512, out_features=512, bias=True)
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, el

### Retrieve scGPT's gene embeddings

In [17]:
# Retrieve the data-independent gene embeddings from scGPT

gene_ids = np.array([id for id in gene2idx.values()]) # 提取基因 ID 和相应的索引
gene_embeddings = model.encoder(torch.tensor(gene_ids, dtype=torch.long).to(device)) # 使用模型编码器获取基因嵌入向量
gene_embeddings = gene_embeddings.detach().cpu().numpy()

In [18]:
len(gene_embeddings)

36574

In [21]:
gene_embeddings = {gene: gene_embeddings[i] for i, gene in enumerate(gene2idx.keys()) if gene in unique_processed_header_list}
print('Retrieved gene embeddings for {} genes.'.format(len(gene_embeddings)))

Retrieved gene embeddings for 2897 genes.


In [24]:
gene_embeddings['ZP4'].shape

(512,)

### Combine gene embeddings for every individual