In [1]:
from pathlib import Path

import anndata as ad
import pandas as pd
import scanpy as sc

In [2]:
# Root directory of the ST dataset: the count file is read from here and the
# .h5ad output is written beneath it.
data_path = "./data/ST_PancreaticCancer"

In [3]:
# Name of the gzipped, tab-separated count matrix file located in data_path.
count_file = "GSM3405534_PDAC-B-ST1.tsv.gz"

In [4]:
# Load the tab-separated count table into a pandas DataFrame.
count = pd.read_csv(
    data_path + '/' + count_file,
    sep='\t',
)

In [5]:
# Preview the raw table. The 'Unnamed: 0' column holds the spot ID written as
# '<x>x<y>' (e.g. '7x2'); every other column is a gene.
count

Unnamed: 0.1,Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAT,...,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,7x2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8x2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9x2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,10x2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11x2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,29x34,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
992,30x34,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
993,31x34,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,32x34,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Spot IDs are stored in the 'Unnamed: 0' column; they are used as the
# observation names of the AnnData object further down.
cell_id_array = count.loc[:, 'Unnamed: 0']
cell_id_array


0        7x2
1        8x2
2        9x2
3       10x2
4       11x2
       ...  
991    29x34
992    30x34
993    31x34
994    32x34
995      6x2
Name: Unnamed: 0, Length: 996, dtype: object

In [8]:
# Gene IDs are the column headers after the first column (the spot-ID column).
# NOTE: this is a positional slice, so it must run while 'Unnamed: 0' is still
# the first column of `count`, i.e. before the cell that re-indexes `count`.
gene_id_array = count.columns[1:]
gene_id_array

Index(['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A4GALT', 'A4GNT', 'AAAS', 'AACS',
       'AADAT', 'AAED1',
       ...
       'ZW10', 'ZWILCH', 'ZWINT', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX',
       'ZZEF1', 'ZZZ3'],
      dtype='object', length=16528)

In [9]:
# Build the count matrix from every column after the leading spot-ID column.
# The frame is sliced *before* conversion: `count.values` would first turn the
# whole mixed-type table (string IDs + numeric counts) into a single object
# array, which is slow and memory-hungry for a table this wide.
# DataFrame.astype still raises on missing or non-numeric entries.
count_X = count.iloc[:, 1:].astype('int').to_numpy()

In [10]:
# Wrap the count matrix in an AnnData container and label both axes:
# observations (rows) get the spot IDs, variables (columns) get the gene IDs.
adata = ad.AnnData(X=count_X)
adata.obs_names = cell_id_array
adata.var_names = gene_id_array

In [11]:
# Show the AnnData summary to check n_obs × n_vars after construction.
adata

AnnData object with n_obs × n_vars = 996 × 16528

In [15]:
# Parse spot coordinates out of the spot IDs, which look like '<x>x<y>'
# (e.g. '7x2'). The pieces are converted to numbers here: the raw result of
# splitting is text, and coordinates kept as text are not usable as numeric
# 'spatial' positions later on.
spot_xy = count['Unnamed: 0'].str.split('x', expand=True)
count['x'] = pd.to_numeric(spot_xy[0])
count['y'] = pd.to_numeric(spot_xy[1])
# NOTE: this re-indexes `count` by spot ID, so 'Unnamed: 0' stops being a
# column afterwards. Run this cell once, after the cells that read that column
# or slice the columns positionally.
count = count.set_index('Unnamed: 0')


In [16]:
# Attach the spot coordinates as a plain numeric array of shape (n_obs, 2),
# with rows ordered to match adata.obs_names. A single .loc[rows, cols] lookup
# replaces the chained .loc calls, and the explicit float conversion ensures
# the stored coordinates are numeric even if the 'x'/'y' columns hold text.
adata.obsm['spatial'] = count.loc[adata.obs_names, ['x', 'y']].astype(float).to_numpy()
adata

AnnData object with n_obs × n_vars = 996 × 16528
    obsm: 'spatial'

In [17]:
# Save the assembled AnnData object as <data_path>/h5ad/GSM3405534_PDAC-B-ST1.h5ad.
# The output folder is created first so the write does not depend on it
# already existing (write_h5ad is not relied on to create parent folders).
h5ad_path = Path(data_path) / 'h5ad' / 'GSM3405534_PDAC-B-ST1.h5ad'
h5ad_path.parent.mkdir(parents=True, exist_ok=True)
adata.write_h5ad(h5ad_path)