This notebook exports the relevant contents of the **HSCA extended** AnnData file produced after transfer learning (count matrix, cell metadata and embeddings) as plain files, so that they can later be assembled into an **RDS file** for further analysis of the HSCA extended in **R**.

In [None]:
from scipy import io
import pandas as pd
import os
import anndata
import scanpy as sc
import numpy as np

In [None]:
# Switch the working directory to the project's scripts folder; the relative
# path used to read the AnnData file in the next code cell resolves against it.
os.chdir("/data/projects/Bioinformatics/human_skin_cell_atlas/scripts")

**Note:** Cell numbers here differ from the final HSCA extended, because de novo annotation and integration guided by label-transfer uncertainties in **R** revealed low-quality cells that were not detectable during standalone preprocessing.

In [3]:
# Load the HSCA extended AnnData file (path is relative to the current working
# directory) and show its summary as the cell output.
adata = sc.read("./scarches_output/HSCA/SCVI/HSCA_extended/atlas_with_uncerts.h5ad")
adata

AnnData object with n_obs × n_vars = 900321 × 48905
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'subject_ID', 'age_years', 'age_range', 'sex', 'ethnicity', 'anatomical_region_level1', 'anatomical_region_level2', 'anatomical_region_level3', 'sequencing_platform', 'reference_genome_compact', 'cell_ranger_version_compact', 'single_cell_platform', 'tissue_sampling_type', 'Accession_source', 'Dataset', 'Condition', 'Core', 'percent_mito', 'soup_group', 'nCount_original.counts', 'nFeature_original.counts', 'scDblFinder.class', 'genes', 'RNA_snn_res.1.2', 'seurat_clusters', 'orig_celltype_lvl_3', 'RNA_snn_res.0.8', 'RNA_snn_res.1.6', 'fresh_or_frozen', 'RNA_snn_res.1.8', 'RNA_snn_res.1.5', 'seurat_clusters_full', 'RNA_snn_res.2', 'inherited_celltype_lvl_1', 'inherited_celltype_lvl_2', 'inherited_celltype_lvl_3', 'inherited_celltype_lvl_4', 'inherited_celltype_lvl_5', 'barcode', 'UMAP_1', 'UMAP_2', 'celltype_lvl_1', 'celltype_lvl_2', 'celltype_lvl_3', 'celltype_lvl_4', 'cell

In [4]:
# Move into the output folder: every file written below uses a path relative to it.
# NOTE(review): the root folder here ("human single cell skin atlas", with spaces)
# differs from the one in the earlier os.chdir ("human_skin_cell_atlas") — confirm
# that both locations are intended.
os.chdir("/data/projects/Bioinformatics/human single cell skin atlas/scripts/atlas_seurat_output")

In [18]:
# Create the folder that receives the barcodes/features/matrix files.
# exist_ok=True keeps this cell re-runnable when the folder is already there
# (a plain `mkdir` reports an error in that case).
os.makedirs("matrix_files", exist_ok=True)

In [5]:
# Show the AnnData summary again before exporting.
adata

AnnData object with n_obs × n_vars = 900321 × 48905
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'subject_ID', 'age_years', 'age_range', 'sex', 'ethnicity', 'anatomical_region_level1', 'anatomical_region_level2', 'anatomical_region_level3', 'sequencing_platform', 'reference_genome_compact', 'cell_ranger_version_compact', 'single_cell_platform', 'tissue_sampling_type', 'Accession_source', 'Dataset', 'Condition', 'Core', 'percent_mito', 'soup_group', 'nCount_original.counts', 'nFeature_original.counts', 'scDblFinder.class', 'genes', 'RNA_snn_res.1.2', 'seurat_clusters', 'orig_celltype_lvl_3', 'RNA_snn_res.0.8', 'RNA_snn_res.1.6', 'fresh_or_frozen', 'RNA_snn_res.1.8', 'RNA_snn_res.1.5', 'seurat_clusters_full', 'RNA_snn_res.2', 'inherited_celltype_lvl_1', 'inherited_celltype_lvl_2', 'inherited_celltype_lvl_3', 'inherited_celltype_lvl_4', 'inherited_celltype_lvl_5', 'barcode', 'UMAP_1', 'UMAP_2', 'celltype_lvl_1', 'celltype_lvl_2', 'celltype_lvl_3', 'celltype_lvl_4', 'cell

In [None]:
# Maximum value in adata.X (compare with the maximum of the "counts" layer in
# the next cell).
np.max(adata.X)

In [None]:
# Maximum value in the "counts" layer, i.e. the layer that is exported as
# matrix.mtx further down.
np.max(adata.layers["counts"])

125537

In [8]:
# Write the observation names (cell barcodes) to barcodes.tsv, one per line.
with open('matrix_files/barcodes.tsv', 'w') as barcode_file:
    barcode_file.writelines(barcode + '\n' for barcode in adata.obs_names)

In [9]:
# Write one line per variable name to features.tsv with three tab-separated
# fields: the name, the name again, and the literal "Gene Expression".
with open('matrix_files/features.tsv', 'w') as feature_file:
    for gene in adata.var_names:
        feature_file.write('\t'.join((gene, gene, 'Gene Expression')) + '\n')

In [None]:
# Export the "counts" layer in Matrix Market format (written as
# matrix_files/matrix.mtx). The layer is transposed before writing.
io.mmwrite(
    target='matrix_files/matrix',
    a=adata.layers["counts"].T,
)

In [None]:
# List the exported files before compression.
!ls matrix_files/

barcodes.tsv  features.tsv  matrix.mtx


In [None]:
!gzip matrix_files/*

In [None]:
# List the folder again to check that the files now carry a .gz suffix.
!ls matrix_files/

barcodes.tsv.gz  features.tsv.gz  matrix.mtx.gz


In [None]:
# Save the per-cell annotation table (adata.obs) as CSV in the working directory.
adata.obs.to_csv(path_or_buf='metadata.csv')

In [15]:
# Inspect the array stored under obsm["X_scvi_emb"] (exported as CSV below).
adata.obsm["X_scvi_emb"]

array([[ 0.08039778,  0.1574584 ,  0.8148092 , ...,  0.87250125,
        -0.7238407 ,  1.6856034 ],
       [-1.185399  ,  0.20160827, -0.9421859 , ...,  0.17223558,
         0.24679837, -1.0805833 ],
       [ 0.42882103,  0.18065396, -0.6045707 , ...,  0.5154194 ,
        -1.8228576 ,  0.01332785],
       ...,
       [ 0.37651467, -0.8964641 , -1.4499615 , ...,  0.09810773,
        -0.6619447 ,  1.8233206 ],
       [-0.2672233 , -1.8265716 , -0.09668534, ..., -2.5268192 ,
        -0.3920045 ,  1.1951497 ],
       [ 0.38987154, -0.1339483 , -0.08489776, ...,  1.2982671 ,
         2.1678464 , -0.9686131 ]], dtype=float32)

In [None]:
# Wrap the scVI embedding in a DataFrame indexed by the observation names so
# that each row stays attached to its cell barcode after export.
# pandas is already imported in the notebook's import cell, so it is not
# imported again here.
scvi_df = pd.DataFrame(adata.obsm["X_scvi_emb"], index=adata.obs_names)

# Display the frame as the cell output.
scvi_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HRS118996_AAACCTGAGCGATATA-1,0.080398,0.157458,0.814809,1.555954,0.415151,0.485750,-1.817345,0.872501,-0.723841,1.685603
HRS118996_AAACCTGCAAGGTTTC-1,-1.185399,0.201608,-0.942186,0.355930,-0.054668,-0.863082,0.068611,0.172236,0.246798,-1.080583
HRS118996_AAACCTGCACAGTCGC-1,0.428821,0.180654,-0.604571,0.372192,-0.759331,-0.852488,-0.157817,0.515419,-1.822858,0.013328
HRS118996_AAACCTGCATCACCCT-1,0.258077,-2.235218,0.225615,0.160601,-0.628965,-0.885455,1.893266,1.335566,0.226430,-0.741456
HRS118996_AAACCTGCATGCATGT-1,-0.761781,1.213483,0.069246,-2.235992,-1.022571,2.190440,-0.434807,0.019359,0.515292,-0.565891
...,...,...,...,...,...,...,...,...,...,...
EMTAB13084_1_WS_SKN_KCL10525741_TTTGTTGGTGTCCGTG-1,0.654932,-0.914868,-1.395637,-0.338357,1.116864,-0.167566,0.781047,-0.518363,1.531658,1.582730
EMTAB13084_1_WS_SKN_KCL10525741_TTTGTTGGTGTGTGGA-1,1.577358,-0.661898,-1.478666,0.052076,1.315349,-0.466588,0.140877,0.387793,1.111544,1.466529
EMTAB13084_1_WS_SKN_KCL10525741_TTTGTTGGTTGCGGAA-1,0.376515,-0.896464,-1.449962,-0.362395,1.200107,0.476014,1.044285,0.098108,-0.661945,1.823321
EMTAB13084_1_WS_SKN_KCL10525741_TTTGTTGTCGCATTAG-1,-0.267223,-1.826572,-0.096685,-0.495930,-1.413955,0.198055,0.457041,-2.526819,-0.392004,1.195150


In [17]:
# Save the scVI embedding table as CSV in the working directory.
scvi_df.to_csv(path_or_buf="scvi_embedding.csv")

In [None]:
# Extract the UMAP coordinates from the AnnData object into a DataFrame indexed
# by the observation names. The two column labels are set directly in the
# constructor; pandas is already imported in the notebook's import cell.
umap_df = pd.DataFrame(
    adata.obsm["X_umap"],
    index=adata.obs_names,
    columns=["UMAP_1", "UMAP_2"],
)

# Save the coordinates as CSV in the working directory.
umap_df.to_csv("umap_coords.csv")


In [19]:
# Export the array stored under obsm["X_pca"] as CSV, indexed by the
# observation names. pandas is already imported in the notebook's import cell,
# so it is not imported again here.
pca_df = pd.DataFrame(adata.obsm["X_pca"], index=adata.obs_names)
pca_df.to_csv("pca_embedding.csv")
