In [16]:
from pathlib import Path

import anndata as ad
import pandas as pd
import scanpy as sc


In [17]:
# Directory holding the spatial transcriptomics (ST) data; the count file is
# read from here and the converted .h5ad file is written to its 'h5ad' subfolder.
data_path = './data/ST_PancreaticCancer'

In [31]:

# File name of the count matrix (gzip-compressed, tab-separated text),
# looked up relative to data_path.
count_file = 'GSM3405534_PDAC-B-ST1-filtered.txt.gz'


In [19]:
# Load the tab-separated count table into a DataFrame.
count_path = f'{data_path}/{count_file}'
count = pd.read_csv(count_path, sep='\t')

In [20]:

# Preview the loaded table: a 'Genes' column followed by one column per spot ID.
count

Unnamed: 0,Genes,21x2,23x2,27x2,22x2,24x2,25x2,26x2,20x2,28x2,...,24x18,22x19,23x19,24x19,25x19,22x20,23x20,24x20,23x21,24x21
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1CF,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A2M,3,0,2,0,1,0,1,2,1,...,0,0,2,1,0,0,0,2,1,1
3,A2ML1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A3GALT2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19733,ZYG11A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19734,ZYG11B,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
19735,ZYX,1,0,2,0,0,2,1,0,0,...,0,0,1,0,1,0,0,6,3,1
19736,ZZEF1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Spot IDs are every column label after the leading 'Genes' column.
cell_id_array = count.iloc[:, 1:].columns
cell_id_array

Index(['21x2', '23x2', '27x2', '22x2', '24x2', '25x2', '26x2', '20x2', '28x2',
       '29x2',
       ...
       '24x18', '22x19', '23x19', '24x19', '25x19', '22x20', '23x20', '24x20',
       '23x21', '24x21'],
      dtype='object', length=224)

In [22]:
# Gene identifiers come from the 'Genes' column of the count table.
gene_id_array = count['Genes']

In [23]:
# Inspect the gene identifiers pulled from the 'Genes' column.
gene_id_array

0           A1BG
1           A1CF
2            A2M
3          A2ML1
4        A3GALT2
          ...   
19733     ZYG11A
19734     ZYG11B
19735        ZYX
19736      ZZEF1
19737       ZZZ3
Name: Genes, Length: 19738, dtype: object

In [24]:
# Build the count matrix: drop the leading 'Genes' column, transpose so rows
# follow the spot columns and columns follow the genes, then cast to integers.
table_values = count.to_numpy()
spot_by_gene = table_values[:, 1:].T
count_X = spot_by_gene.astype('int')

In [25]:
# Wrap the count matrix in an AnnData object, then label its two axes:
# observations with the spot IDs and variables with the gene identifiers.
adata = ad.AnnData(X=count_X)
adata.obs_names = cell_id_array
adata.var_names = gene_id_array

In [26]:
# Check that the observation names match the spot IDs assigned above.
adata.obs_names

Index(['21x2', '23x2', '27x2', '22x2', '24x2', '25x2', '26x2', '20x2', '28x2',
       '29x2',
       ...
       '24x18', '22x19', '23x19', '24x19', '25x19', '22x20', '23x20', '24x20',
       '23x21', '24x21'],
      dtype='object', length=224)

In [27]:
# get spot coordinate
# Each spot ID has the form '<x>x<y>' (e.g. '21x2'). Split it once on 'x' and
# cast both parts to int so the coordinates are numeric, not strings.
spot_coordinate = pd.DataFrame({'coord_ID': cell_id_array})
xy_parts = spot_coordinate['coord_ID'].str.split('x', expand=True)
spot_coordinate['x'] = xy_parts[0].astype(int)
spot_coordinate['y'] = xy_parts[1].astype(int)
# Index by spot ID so rows can later be looked up by adata.obs_names.
spot_coordinate = spot_coordinate.set_index('coord_ID')
spot_coordinate

Unnamed: 0_level_0,x,y
coord_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
21x2,21,2
23x2,23,2
27x2,27,2
22x2,22,2
24x2,24,2
...,...,...
22x20,22,20
23x20,23,20
24x20,24,20
23x21,23,21


In [28]:
# Attach the coordinates to the AnnData object, row-aligned to adata.obs_names.
# They are stored as a plain integer array of shape (n_obs, 2) in (x, y) order
# rather than as a DataFrame; a numeric array under obsm['spatial'] is the form
# scanpy-style spatial tooling conventionally reads.
spatial_xy = spot_coordinate.loc[adata.obs_names, ['x', 'y']]
adata.obsm['spatial'] = spatial_xy.to_numpy().astype(int)
adata

AnnData object with n_obs × n_vars = 224 × 19738
    obsm: 'spatial'

In [29]:
# Save the AnnData object under <data_path>/h5ad/, named after the input count
# file ('.txt.gz' swapped for '.h5ad') so the output always tracks count_file.
h5ad_dir = Path(data_path) / 'h5ad'
# Create the output folder up front; the write is not relied on to create it.
h5ad_dir.mkdir(parents=True, exist_ok=True)
adata.write_h5ad(h5ad_dir / count_file.replace('.txt.gz', '.h5ad'))