In [1]:
import pyarrow # must occur prior to ray import
import ray
from ray import tune
from ray.tune import ExperimentAnalysis
from ray.tune.search.hyperopt import HyperOptSearch
import datetime
import numpy as np
import pandas as pd
import random
import seaborn as sns; sns.set()
from collections import Counter
from datasets import load_from_disk
from scipy.stats import ranksums
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertForSequenceClassification
from transformers import Trainer
from transformers.training_args import TrainingArguments

from geneformer import DataCollatorForCellClassification
from geneformer import TranscriptomeTokenizer
import anndata as ad
import pandas as pd

In [2]:
file_path = "/home/wangxihe/AF_atlas/Data/单细胞分析/AF_renamed_ensembl.h5ad"
# 使用 read_h5ad 函数读取文件
HLCA = ad.read_h5ad(file_path)

In [3]:
HLCA

AnnData object with n_obs × n_vars = 36327 × 23568
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'condition', 'nCount_SCT', 'nFeature_SCT', 'pANN_0.25_0.21_278', 'DF.classifications_0.25_0.21_278', 'pANN_0.25_0.19_10', 'DF.classifications_0.25_0.19_10', 'pANN_0.25_0.02_89', 'DF.classifications_0.25_0.02_89', 'pANN_0.25_0.03_62', 'DF.classifications_0.25_0.03_62', 'pANN_0.25_0.19_483', 'DF.classifications_0.25_0.19_483', 'pANN_0.25_0.25_276', 'DF.classifications_0.25_0.25_276', 'pANN_0.25_0.01_608', 'DF.classifications_0.25_0.01_608', 'pANN_0.25_0.01_369', 'DF.classifications_0.25_0.01_369', 'pANN_0.25_0.21_187', 'DF.classifications_0.25_0.21_187', 'pANN_0.25_0.005_202', 'DF.classifications_0.25_0.005_202', 'pANN_0.25_0.27_140', 'DF.classifications_0.25_0.27_140', 'pANN_0.25_0.005_257', 'DF.classifications_0.25_0.005_257', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'CC.Difference', 'integrated_snn_res.0.5', 'seurat_clusters', 'cell_type'
    var: 'features'

In [4]:
HLCA.var.features

ENSG00000237613    ENSG00000237613
ENSG00000186092    ENSG00000186092
ENSG00000284733    ENSG00000284733
ENSG00000284662    ENSG00000284662
ENSG00000177757    ENSG00000177757
                        ...       
ENSG00000099725    ENSG00000099725
ENSG00000291456    ENSG00000291456
ENSG00000237802    ENSG00000237802
ENSG00000169763    ENSG00000169763
ENSG00000274847    ENSG00000274847
Name: features, Length: 23568, dtype: object

In [4]:
total_read_counts = HLCA.X.sum(axis=1)
HLCA.obs['n_counts'] = total_read_counts
HLCA.var['ensembl_id'] = HLCA.var.index

In [5]:
condition_counts = HLCA.obs['condition'].value_counts()
cell_type_counts = HLCA.obs['cell_type'].value_counts()

In [6]:
condition_counts

condition
AF      22096
Ctrl    14231
Name: count, dtype: int64

In [7]:
cell_type_counts #5: T cell 6:MP  0:EC  7:Neutrophil  3:FB  2:SMC  1:DC  4:B cel1

cell_type
5    20152
6     4369
0     3142
7     2235
3     2182
2     1963
1     1903
4      381
Name: count, dtype: int64

In [9]:
obs_df = HLCA.obs
SMC_obs = obs_df.loc[obs_df['cell_type'] == 2]
SMC= HLCA[SMC_obs.index]

In [10]:
SMC

View of AnnData object with n_obs × n_vars = 1963 × 23568
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'condition', 'nCount_SCT', 'nFeature_SCT', 'pANN_0.25_0.21_278', 'DF.classifications_0.25_0.21_278', 'pANN_0.25_0.19_10', 'DF.classifications_0.25_0.19_10', 'pANN_0.25_0.02_89', 'DF.classifications_0.25_0.02_89', 'pANN_0.25_0.03_62', 'DF.classifications_0.25_0.03_62', 'pANN_0.25_0.19_483', 'DF.classifications_0.25_0.19_483', 'pANN_0.25_0.25_276', 'DF.classifications_0.25_0.25_276', 'pANN_0.25_0.01_608', 'DF.classifications_0.25_0.01_608', 'pANN_0.25_0.01_369', 'DF.classifications_0.25_0.01_369', 'pANN_0.25_0.21_187', 'DF.classifications_0.25_0.21_187', 'pANN_0.25_0.005_202', 'DF.classifications_0.25_0.005_202', 'pANN_0.25_0.27_140', 'DF.classifications_0.25_0.27_140', 'pANN_0.25_0.005_257', 'DF.classifications_0.25_0.005_257', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'CC.Difference', 'integrated_snn_res.0.5', 'seurat_clusters', 'cell_type', 'n_counts'
    va

In [12]:
SMC.__dict__['_raw'].__dict__['_var'] = SMC.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
save_path = "/home/wangxihe/AF_atlas/Data/单细胞分析/SMC/SMC.h5ad"
# 使用 write_h5ad 方法保存 AnnData 对象
SMC.write_h5ad(save_path)

In [13]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type","condition": "condition"}, nproc=1)

In [14]:
tk.tokenize_data("/home/wangxihe/AF_atlas/Data/单细胞分析/SMC", 
                 "/home/wangxihe/AF_atlas/Data/单细胞分析/SMC/", 
                 "SMC",
                file_format="h5ad")

Tokenizing /home/wangxihe/AF_atlas/Data/单细胞分析/SMC/SMC.h5ad
/home/wangxihe/AF_atlas/Data/单细胞分析/SMC/SMC.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map:   0%|          | 0/1963 [00:00<?, ? examples/s]

Map:   0%|          | 0/1963 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1963 [00:00<?, ? examples/s]