In [2]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from scipy.io import mmread

In [None]:
# Load the AnnData object from counts.h5ad; gene names, cell names and obs metadata are attached in later cells.
adata = sc.read_h5ad("counts.h5ad")

load gene, cell, and sample info

In [3]:
# Gene names and cell names are parsed from plain-text files into string arrays
# (features first, then barcodes); the per-cell annotation is a CSV table.
gene_names, cell_names = (
    np.loadtxt(path, dtype=str)
    for path in ("features.tsv.gz", "barcodes.tsv.gz")
)
cell_annotation = pd.read_csv("cell_annotation.csv.gz")

In [6]:
# Display a 100-entry slice of the parsed gene names as a quick sanity check.
gene_names[50:150]

array(['ABCA9-AS1', 'ABCB1', 'ABCB10', 'ABCB11', 'ABCB4', 'ABCB5',
       'ABCB6', 'ABCB7', 'ABCB8', 'ABCB9', 'ABCC1', 'ABCC10', 'ABCC11',
       'ABCC2', 'ABCC3', 'ABCC4', 'ABCC5', 'ABCC5-AS1', 'ABCC6', 'ABCC8',
       'ABCC9', 'ABCD1', 'ABCD2', 'ABCD3', 'ABCD4', 'ABCE1', 'ABCF1',
       'ABCF2', 'ABCF2.1', 'ABCF3', 'ABCG1', 'ABCG2', 'ABCG5', 'ABCG8',
       'ABHD1', 'ABHD10', 'ABHD11', 'ABHD12', 'ABHD12B', 'ABHD13',
       'ABHD14A', 'ABHD14A-ACY1', 'ABHD14B', 'ABHD15', 'ABHD15-AS1',
       'ABHD16A', 'ABHD17A', 'ABHD17B', 'ABHD17C', 'ABHD18', 'ABHD2',
       'ABHD3', 'ABHD4', 'ABHD5', 'ABHD6', 'ABHD8', 'ABI1', 'ABI2',
       'ABI3', 'ABI3BP', 'ABL1', 'ABL2', 'ABLIM1', 'ABLIM2', 'ABLIM3',
       'ABO', 'ABR', 'ABRA', 'ABRACL', 'ABRAXAS1', 'ABRAXAS2', 'ABT1',
       'ABTB1', 'ABTB2', 'AC000036.1', 'AC000058.1', 'AC000068.1',
       'AC000068.2', 'AC000093.1', 'AC000099.1', 'AC000123.1',
       'AC000403.1', 'AC001226.1', 'AC001226.2', 'AC002064.1',
       'AC002064.2', 'AC002066.1', '

In [None]:
# Read the per-sample metadata table from the Excel sheet.
# NOTE(review): skiprows=20 / skipfooter=37 presumably trim non-table rows above and
# below the sample table in the spreadsheet -- confirm against the file if it changes.
sample_info = pd.read_excel("sample_metadata.xlsx", skiprows=20, skipfooter=37)
# Disabled on purpose: several of the columns listed below are used later in this notebook
# (e.g. the sequencing-platform filter and the str casts before saving), so they must be kept.
# sample_info.drop(columns=['title', 'organism',
#                           'characteristics: geo accession', 'characteristics: instrument model',
#                           'characteristics: Unpublished', 'characteristics: City', 'characteristics: Age',
#                           'characteristics: Sex', 'characteristics: Single cell sequencing platform',
#                           'characteristics: Sampling day (Days after symptom onset)', 'characteristics: Leukocytes [G/L]',
#                           'characteristics: Neutrophils [G/L]', 'characteristics: Lymphocytes [G/L]'], inplace=True)

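Check whether `cell_names` matches the `cellName` column in `cell_annotation` row by row. The cell below counts the rows where the entry of `cell_names` is not a prefix of the corresponding `cellName`; a count of 0 means the two are in the same order: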

In [None]:
# Count the rows where the entry of `cell_names` is NOT a prefix of the value in
# the first column of `cell_annotation`, comparing position by position.
# A result of 0 means both sources list the cells in the same order.

# Pull the column out once instead of doing a scalar `.iloc` lookup per row.
annotated_names = cell_annotation.iloc[:, 0].to_numpy()

# Fail loudly on a length mismatch rather than ignoring trailing annotation rows.
assert len(cell_names) == len(annotated_names), (
    f"{len(cell_names)} cell names vs {len(annotated_names)} annotation rows"
)

# `annotated.startswith(name)` is equivalent to `annotated[:len(name)] == name`.
num = sum(
    not annotated.startswith(name)
    for name, annotated in zip(cell_names, annotated_names)
)
num

merge cell info and sample info

In [None]:
# Attach the per-sample metadata to every cell (each cell's `sampleID` must match
# at most one `Sample name`, enforced by validate='many_to_one'), drop the now
# redundant join key, and index the result by cell name.
cell_info = (
    cell_annotation
    .merge(
        sample_info,
        left_on='sampleID',
        right_on='Sample name',
        how='left',
        validate='many_to_one',
    )
    .drop(columns=['Sample name'])
    .set_index('cellName')
)

Store the gene names, cell names, and merged cell/sample info in the `anndata` object

In [None]:
# Attach gene names, cell names and the merged per-cell metadata to the AnnData object.
# NOTE(review): all three assignments rely on positional alignment -- this assumes the rows/columns
# of the matrix are in the same order as the loaded name files and `cell_annotation`; confirm.
adata.var_names = gene_names
adata.obs_names = cell_names
# NOTE(review): `cell_info` is indexed by `cellName`; presumably assigning it as `obs` makes that
# index the obs names, superseding the line above -- confirm this is intended.
adata.obs = cell_info

Save the raw `anndata` object. Before writing, the columns that contain mixed data types are converted to `str`.

In [None]:
# Cast the obs columns that hold mixed value types to str before the object is written to disk.
# One column per line, each followed by a comma: adjacent string literals without a comma are
# silently concatenated into a single (non-existent) column name.
for col in [
    'characteristics: Age',
    'characteristics: Sampling day (Days after symptom onset)',
    'characteristics: Leukocytes [G/L]',
    'characteristics: Neutrophils [G/L]',
    'characteristics: Lymphocytes [G/L]',
]:
    adata.obs[col] = adata.obs[col].astype(str)

In [None]:
# Persist the full annotated object; the subsetting section below reloads it from this file.
adata.write_h5ad("covid19_GSE158055_raw.h5ad")

subset the `anndata` object

In [13]:
# Reload the saved raw object from disk as the starting point for subsetting.
adata = sc.read_h5ad("covid19_GSE158055_raw.h5ad")

In [14]:
# Per-cell boolean filters defining the target subset. The resulting subset holds
# 8 severe/critical samples (from 7 patients) and 5 control samples, i.e. a more balanced design.
# keep frozen PBMC samples only
is_frozen_pbmc = adata.obs['characteristics: Sample type'].isin(['frozen PBMC'])
# sampled during COVID-19 progression, or from controls
# NOTE: despite the variable name, this keeps 'progression' and 'control' sample times
is_recovered_or_healthy = adata.obs['characteristics: Sample time'].isin(['progression', 'control'])
# severe/critical symptoms, or control
is_severe_or_ctrl = adata.obs['characteristics: CoVID-19 severity'].isin(['severe/critical', 'control'])
# no other diseases
only_covid_or_ctrl = adata.obs['characteristics: Comorbidities'].isin(['none', 'control'])
# most cells are sequenced by 10X 5'
is_tgt_protocol = adata.obs['characteristics: Single cell sequencing platform'] == "10X 5'"
# exclude 'Neu' and 'Macro': these 2 cell types contain < 100 cells
is_tgt_majorType = ~adata.obs['majorType'].isin(['Neu', 'Macro'])
# a cell is selected only if it passes every filter above
is_tgt_samples = is_frozen_pbmc & is_severe_or_ctrl & is_recovered_or_healthy & only_covid_or_ctrl & is_tgt_protocol & is_tgt_majorType

In [15]:
# Select the cells (rows) that pass all filters; the displayed repr shows the result is a view of `adata`.
subset_adata = adata[is_tgt_samples,]
subset_adata

View of AnnData object with n_obs × n_vars = 94652 × 27943
    obs: 'sampleID', 'celltype', 'majorType', 'title', 'organism', 'Patients', 'characteristics:  Datasets', 'characteristics: City', 'characteristics: Age', 'characteristics: Sex', 'characteristics: Sample type', 'characteristics: CoVID-19 severity', 'characteristics: Sample time', 'characteristics: Sampling day (Days after symptom onset)', 'characteristics: SARS-CoV-2', 'characteristics: Single cell sequencing platform', 'characteristics: BCR single cell sequencing', 'characteristics: TCR single cell sequencing', 'characteristics: Outcome', 'characteristics: Comorbidities', 'characteristics: COVID-19-related medication and anti-microbials', 'characteristics: Leukocytes [G/L]', 'characteristics: Neutrophils [G/L]', 'characteristics: Lymphocytes [G/L]', 'characteristics: Unpublished', 'characteristics: geo accession', 'characteristics: instrument model'

In [19]:
# Save the filtered subset (94652 cells per the repr above) to its own file.
subset_adata.write_h5ad("covid19_GSE158055_subset100k.h5ad")

In [3]:
# Reload the saved subset from disk for the summary checks below.
subset_adata = sc.read_h5ad("covid19_GSE158055_subset100k.h5ad")

In [4]:
# Number of cells per major cell type in the subset.
subset_adata.obs['majorType'].value_counts()

majorType
CD8       30714
CD4       27128
Mono      16707
B         12038
NK         4567
Mega       1598
DC         1028
Plasma      872
Name: count, dtype: int64

In [16]:
# Number of distinct patients in each severity group.
subset_adata.obs.groupby('characteristics: CoVID-19 severity')['Patients'].nunique()

characteristics: CoVID-19 severity
control            5
severe/critical    7
Name: Patients, dtype: int64

In [18]:
# Number of distinct samples in each severity group (can exceed the patient count
# when a patient contributes more than one sample).
subset_adata.obs.groupby('characteristics: CoVID-19 severity')['sampleID'].nunique()

characteristics: CoVID-19 severity
control            5
severe/critical    8
Name: sampleID, dtype: int64

In [17]:
# Distinct samples broken down by sample type and severity; confirms which sample types remain after filtering.
subset_adata.obs.groupby(['characteristics: Sample type', 'characteristics: CoVID-19 severity'])['sampleID'].nunique()

characteristics: Sample type  characteristics: CoVID-19 severity
frozen PBMC                   control                               5
                              severe/critical                       8
Name: sampleID, dtype: int64