In [1]:
import tempfile
import os

# Check which directory the tempfile module uses for temporary files
temp_dir = tempfile.gettempdir()
print("Updated temp directory:", temp_dir)

import sys
import importlib

# Put the local project code directory at the front of sys.path (only if it is
# not already listed) so the `import scgpt` below searches this directory first.
new_path = '/data/mr423/project/code/'
if new_path not in sys.path:
    sys.path.insert(0, new_path)

# Reload the scgpt package and print where it was loaded from.
# NOTE: importlib.reload() re-executes only the module object it is given
# (here the package's __init__); submodules that were already imported are
# not reloaded by this call.
import scgpt
print("scgpt location: ", scgpt.__file__)
importlib.reload(scgpt)

Updated temp directory: /data/mr423/tmp
scgpt location:  /data/mr423/project/code/scgpt/__init__.py


<module 'scgpt' from '/data/mr423/project/code/scgpt/__init__.py'>

In [2]:
import json
import copy
import os
from pathlib import Path
import shutil
import sys
import time
from typing import List, Tuple, Dict, Union, Optional
import warnings
import pandas as pd
# from . import asyn
import torch
from anndata import AnnData
import scanpy as sc
import seaborn as sns
import numpy as np
import wandb
from scipy.sparse import issparse
import matplotlib.pyplot as plt
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from torchtext.vocab import Vocab
from torchtext._torchtext import (
    Vocab as VocabPybind,
)

# Make the parent directory importable as well, then pull in the scGPT pieces
# used by this notebook.
# NOTE(review): scgpt was already imported in the first cell, so this path
# entry presumably does not change which scgpt copy is used — confirm.
sys.path.insert(0, "../")
import scgpt as scg
from scgpt.model import TransformerModel
from scgpt.tokenizer import tokenize_and_pad_batch

from scgpt.tokenizer.gene_tokenizer import GeneVocab
from scgpt.preprocess import Preprocessor
from scgpt.utils import set_seed

# Default figure size for scanpy plots.
sc.set_figure_params(figsize=(6, 6))

# Presumably silences OpenMP (KMP) runtime warnings — TODO confirm.
os.environ["KMP_WARNINGS"] = "off"
# os.environ["WANDB_MODE"]= "offline"

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Fix the seed via scgpt's helper (presumably for reproducibility — see scgpt.utils.set_seed).
set_seed(0)

In [3]:
######################################################################
# Settings for input and preprocessing
######################################################################

# Special vocabulary tokens; the padding token always comes first.
pad_token = "<pad>"
special_tokens = [pad_token] + ["<cls>", "<eoc>"]

# Placeholder for masked values; it should currently always be "auto".
mask_value = "auto"

# Maximum sequence length and number of expression bins.
max_seq_len, n_bins = 3001, 101

In [4]:
# input/output representation
input_style = "binned"  # "normed_raw", "log1p", or "binned"; decides the type of the input

input_emb_style = "category"  # "category" or "continuous" or "scaling"
cell_emb_style = "w-pool"  # "avg-pool" or "w-pool" or "cls"


# %% validate settings
# Fail fast, with a readable message, on a typo in any of the three settings.
assert input_style in ["normed_raw", "log1p", "binned"], (
    f"unknown input_style: {input_style!r}"
)
assert input_emb_style in ["category", "continuous", "scaling"], (
    f"unknown input_emb_style: {input_emb_style!r}"
)
assert cell_emb_style in ["avg-pool", "w-pool", "cls"], (
    f"unknown cell_emb_style: {cell_emb_style!r}"
)

# Compatibility check between input_style and input_emb_style, currently disabled:
# if input_style == "binned":
#     if input_emb_style == "scaling":
#         raise ValueError("input_emb_style `scaling` is not supported for binned input.")
# elif input_style == "log1p" or input_style == "normed_raw":
#     if input_emb_style == "category":
#         raise ValueError(
#             "input_emb_style `category` is not supported for log1p or normed_raw input."
#         )

if input_emb_style == "category":
    # Two extra ids are reserved on top of the n_bins expression bins:
    # n_bins for padding and n_bins + 1 for masking.
    mask_value = n_bins + 1
    pad_value = n_bins  # for padding gene expr values
    n_input_bins = n_bins + 2
else:
    # Non-categorical inputs use negative sentinels for mask and padding.
    mask_value = -1
    pad_value = -2
    n_input_bins = n_bins

In [5]:
######################################################################
# Data loading
######################################################################
adata = sc.read("/data/mr423/project/data/3-OLINK_data_train_withOutlier_all.h5ad")
adata_test = sc.read("/data/mr423/project/data/3-OLINK_data_test_withOutlier_all.h5ad")

print(adata.shape)
print(adata_test.shape)

# Tag the two splits: "0" = train, "1" = test.
adata.obs["batch_id"]  = adata.obs["str_batch"] = "0"
adata_test.obs["batch_id"]  = adata_test.obs["str_batch"] = "1" 

# Index each var table by its OWN gene names. The test set previously reused
# the training set's gene names, which labels the test columns by position and
# silently mislabels them if the two files ever differ in gene order.
adata.var.set_index(adata.var["gene_name"], inplace=True)
adata_test.var.set_index(adata_test.var["gene_name"], inplace=True)

data_is_raw = False
filter_gene_by_counts = False
# Keep an untouched copy of the test set before it is merged into `adata`.
adata_test_raw = adata_test.copy()
# Stack train and test into one object; "str_batch" records the split of each row.
adata = adata.concatenate(adata_test, batch_key="str_batch")


# make the batch category column
batch_id_labels = adata.obs["str_batch"].astype("category").cat.codes.values
adata.obs["batch_id"] = batch_id_labels

# Restore the gene_name column from the (gene-name) var index after the merge.
adata.var["gene_name"] = adata.var.index.tolist()

(37304, 2919)
(4145, 2919)


In [6]:
######################################################################
# set up the preprocessor, use the args to config the workflow
######################################################################
preprocessor = Preprocessor(
    use_key="X",  # the key in adata.layers to use as raw data
    filter_gene_by_counts=filter_gene_by_counts,  # step 1
    filter_cell_by_counts=False,  # step 2
    normalize_total=3000,  # 3. whether to normalize the raw data and to what sum
    result_normed_key="X_normed",  # the key in adata.layers to store the normalized data
    log1p=data_is_raw,  # 4. whether to log1p the normalized data
    result_log1p_key="X_log1p",
    subset_hvg=False,  # 5. whether to subset the raw data to highly variable genes
    hvg_flavor="seurat_v3" if data_is_raw else "cell_ranger",
    binning=n_bins,  # 6. whether to bin the raw data and to what number of bins
    result_binned_key="X_binned",  # the key in adata.layers to store the binned data
)


# Split the merged object back into test ("1") and train ("0").
# Boolean slicing returns AnnData *views*; take explicit copies so the
# preprocessor writes its result layers into real objects instead of relying
# on anndata's implicit copy-on-write of a view.
# The test split must be taken first, because `adata` is rebound on the next line.
adata_test = adata[adata.obs["str_batch"] == "1"].copy()
adata = adata[adata.obs["str_batch"] == "0"].copy()

# Each split is preprocessed independently.
preprocessor(adata, batch_key=None)
preprocessor(adata_test, batch_key=None)

scGPT - INFO - Normalizing total counts ...
scGPT - INFO - Binning data ...
scGPT - INFO - Normalizing total counts ...
scGPT - INFO - Binning data ...


In [9]:
# Inspect the binned matrix stored in the "X_binned" layer (the preprocessor's result_binned_key).
adata.layers['X_binned']

array([[24, 80, 14, ..., 36,  4, 75],
       [42, 49, 23, ..., 42, 12, 70],
       [65, 92, 12, ..., 41,  2, 54],
       ...,
       [57, 73, 52, ..., 31, 30, 68],
       [47, 91, 28, ..., 49, 94, 53],
       [63, 91, 14, ..., 42, 92, 45]])

In [10]:
# Shape of the "X_binned" layer, as a quick sanity check.
adata.layers['X_binned'].shape

(37304, 2919)

In [12]:
# Column labels for the exported tables: the gene names of each split.
col_name = adata.var['gene_name'].values
col_name_test = adata_test.var['gene_name'].values

# The earlier `concatenate` call appends "-<batch>" to every obs name
# (anndata's default index_unique="-"). Strip only that trailing suffix: the
# pattern is anchored with `$` so a "-0" / "-1" inside an id is left intact,
# and regex=True is explicit because the pandas default changed across versions.
adata.obs.index = adata.obs.index.str.replace(r'-0$', '', regex=True)
adata_test.obs.index = adata_test.obs.index.str.replace(r'-1$', '', regex=True)

# One row per sample id, one column per gene, values from the "X_binned" layer.
binned_data = pd.DataFrame(adata.layers['X_binned'], columns=col_name, index = adata.obs.index)
binned_data_test = pd.DataFrame(adata_test.layers['X_binned'], columns=col_name_test, index = adata_test.obs.index)


In [13]:
# Display the binned table built from `adata` (rows: obs index, columns: gene names).
binned_data

Unnamed: 0_level_0,EIF4EBP1,EIF4G1,EIF5A,ENAH,ENG,ENPP2,ENPP5,ENPP7,ENTPD5,EGLN1,...,CHCHD6,CHM,CHP1,CHMP6,CHMP1A,CHGB,CHGA,CHRM1,KLK1,WFDC2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2144829,24,80,14,75,86,90,61,10,99,15,...,39,64,2,91,73,82,32,36,4,75
3154285,42,49,23,58,92,99,82,92,95,27,...,58,17,0,87,87,54,38,42,12,70
1679423,65,92,12,44,83,78,56,0,96,14,...,53,52,5,98,64,54,29,41,2,54
1172610,88,53,29,72,83,58,50,95,99,52,...,45,22,1,90,33,75,27,43,99,74
4011532,75,93,29,45,83,78,75,24,90,44,...,30,42,19,95,92,63,10,38,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1137580,72,84,21,57,83,76,78,54,98,31,...,36,32,12,97,84,47,19,30,3,44
3378384,70,12,38,52,95,90,83,60,98,25,...,53,7,17,63,25,69,39,42,60,69
1220136,57,73,52,43,94,92,70,84,99,18,...,39,24,3,95,75,78,17,31,30,68
4988172,47,91,28,44,90,86,83,6,99,10,...,42,41,10,94,91,70,21,49,94,53


In [15]:
# Write the binned train and test tables to CSV, one file per split.
for frame, csv_path in (
    (binned_data, "/data/mr423/project/data/all_train_binned_data.csv"),
    (binned_data_test, "/data/mr423/project/data/all_test_binned_data.csv"),
):
    frame.to_csv(csv_path)