In [1]:
#Importing packages # Make sure that all required packages are downloaded in given environment 
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix

In [2]:
#INITIALIZING ANNDATA#

#Reading h5ad data
immune = ad.read_h5ad("./hca_heart_immune_download.h5ad")

#Data structure of anndata file i.e., summary stastics of the data
ad.AnnData(immune)

AnnData object with n_obs × n_vars = 40868 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_states', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'type', 'version', 'scNym', 'scNym_confidence'
    uns: 'cell_states_colors', 'scNym_colors', 'scNym_probabilities'
    obsm: 'X_pca', 'X_scnym', 'X_umap'

In [3]:
immune.obs

Unnamed: 0,NRP,age_group,cell_source,cell_states,donor,gender,n_counts,n_genes,percent_mito,percent_ribo,region,sample,scrublet_score,type,version,scNym,scNym_confidence
AAAGTGAAGTCGGCCT-1-H0015_apex,No,50-55,Harvard-Nuclei,CD4+T_cytox,H5,Female,724.717285,588,0.023481,0.001381,AX,H0015_apex,0.026029,DBD,V3,CD4+T_cell,0.797180
AAATGGAAGGTCCCTG-1-H0015_apex,No,50-55,Harvard-Nuclei,CD4+T_cytox,H5,Female,668.059509,515,0.006289,0.001572,AX,H0015_apex,0.074830,DBD,V3,CD4+T_cell,0.999248
AAATGGAGTTGTCTAG-1-H0015_apex,No,50-55,Harvard-Nuclei,doublets,H5,Female,670.216309,504,0.011200,0.000000,AX,H0015_apex,0.071618,DBD,V3,NK,0.680673
AACAACCGTAATTGGA-1-H0015_apex,No,50-55,Harvard-Nuclei,DOCK4+MØ1,H5,Female,730.082947,578,0.004231,0.002821,AX,H0015_apex,0.085546,DBD,V3,CD14+Monocyte,0.538159
AAGACTCTCAGGACGA-1-H0015_apex,No,50-55,Harvard-Nuclei,Mast,H5,Female,612.323425,428,0.001835,0.005505,AX,H0015_apex,0.048649,DBD,V3,Mast,0.990977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGATCGTGTCATGT-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,CD4+T_cytox,D11,Female,631.149170,715,0.156748,0.095643,AX,HCAHeart8102862,0.023596,DCD,V3,CD8+T_cell,0.756579
TTTGATCGTTCTCCTG-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,LYVE1+MØ1,D11,Female,819.040100,2526,0.138066,0.086377,AX,HCAHeart8102862,0.197861,DCD,V3,CD8+T_cell,0.269561
TTTGGAGGTCGCTCGA-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,MØ_AgP,D11,Female,757.455505,1350,0.151764,0.024610,AX,HCAHeart8102862,0.085546,DCD,V3,M3,0.585436
TTTGGTTTCAGTGTTG-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,LYVE1+MØ3,D11,Female,815.372131,2507,0.085284,0.055148,AX,HCAHeart8102862,0.132150,DCD,V3,MØ,0.968681


In [4]:
#SUBSETTING DATA WHEN METimmune IS AVAILABLE#

#Subsetting data when gender is female
bdata  = immune[immune.obs.cell_source == "Female"]
bdata

#Subsetting data when cell_states is CD4+T_cytox
cdata  = immune[immune.obs.cell_states == "CD4+T_cytox"]
cdata


View of AnnData object with n_obs × n_vars = 3113 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_states', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'type', 'version', 'scNym', 'scNym_confidence'
    uns: 'cell_states_colors', 'scNym_colors', 'scNym_probabilities'
    obsm: 'X_pca', 'X_scnym', 'X_umap'

In [5]:
#SUBSETTING DATA WHEN METimmune IS NOT AVAILABLE#

# Make a copy of data
testdata = immune.copy()

# Set custom observation and variable names
testdata.obs_names = [f"Cell_{i:d}" for i in range(testdata.n_obs)]
testdata.var_names = [f"Gene_{i:d}" for i in range(testdata.n_vars)]

# To view the observation names
print(testdata.obs_names[:10])

Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',
       'Cell_7', 'Cell_8', 'Cell_9'],
      dtype='object')


In [13]:
#ADDING ALIGNED METimmune#   

#Adding more information to the dataset which is aligned 

#1. Obs/Var level - Adding data on one dimension

#Adding randomly to obs
ct = np.random.choice(["B", "T", "Monocyte"], size=(immune.n_obs))
immune.obs["cell_type"] = pd.Categorical(ct)  # Categoricals are preferred for efficiency
immune.obs

#Adding specifically
#Making a new column cs_short and assigning np.nan values to it
immune.obs['cs_short'] = np.nan

#When I want to assign SC for cell source Sanger-CD45  #Specific to obs of data
sanger =  immune.obs['cell_source'] == 'Sanger-CD45'
immune.obs.loc[sanger, 'cs_short'] = 'SC'
immune.obs

#When I want to assign apex for multiple barcode names #Specific to identifier name
apex = immune.obs.index.str.contains('H0015_apex')
immune.obs.loc[apex,'cs_short'] = 'Apex'
immune.obs



  immune.obs.loc[sanger, 'cs_short'] = 'SC'


Unnamed: 0,NRP,age_group,cell_source,cell_states,donor,gender,n_counts,n_genes,percent_mito,percent_ribo,region,sample,scrublet_score,type,version,scNym,scNym_confidence,cell_type,cs_short
AAAGTGAAGTCGGCCT-1-H0015_apex,No,50-55,Harvard-Nuclei,CD4+T_cytox,H5,Female,724.717285,588,0.023481,0.001381,AX,H0015_apex,0.026029,DBD,V3,CD4+T_cell,0.797180,Monocyte,Apex
AAATGGAAGGTCCCTG-1-H0015_apex,No,50-55,Harvard-Nuclei,CD4+T_cytox,H5,Female,668.059509,515,0.006289,0.001572,AX,H0015_apex,0.074830,DBD,V3,CD4+T_cell,0.999248,B,Apex
AAATGGAGTTGTCTAG-1-H0015_apex,No,50-55,Harvard-Nuclei,doublets,H5,Female,670.216309,504,0.011200,0.000000,AX,H0015_apex,0.071618,DBD,V3,NK,0.680673,T,Apex
AACAACCGTAATTGGA-1-H0015_apex,No,50-55,Harvard-Nuclei,DOCK4+MØ1,H5,Female,730.082947,578,0.004231,0.002821,AX,H0015_apex,0.085546,DBD,V3,CD14+Monocyte,0.538159,Monocyte,Apex
AAGACTCTCAGGACGA-1-H0015_apex,No,50-55,Harvard-Nuclei,Mast,H5,Female,612.323425,428,0.001835,0.005505,AX,H0015_apex,0.048649,DBD,V3,Mast,0.990977,Monocyte,Apex
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGATCGTGTCATGT-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,CD4+T_cytox,D11,Female,631.149170,715,0.156748,0.095643,AX,HCAHeart8102862,0.023596,DCD,V3,CD8+T_cell,0.756579,T,SC
TTTGATCGTTCTCCTG-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,LYVE1+MØ1,D11,Female,819.040100,2526,0.138066,0.086377,AX,HCAHeart8102862,0.197861,DCD,V3,CD8+T_cell,0.269561,T,SC
TTTGGAGGTCGCTCGA-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,MØ_AgP,D11,Female,757.455505,1350,0.151764,0.024610,AX,HCAHeart8102862,0.085546,DCD,V3,M3,0.585436,Monocyte,SC
TTTGGTTTCAGTGTTG-1-HCAHeart8102862,Yes,60-65,Sanger-CD45,LYVE1+MØ3,D11,Female,815.372131,2507,0.085284,0.055148,AX,HCAHeart8102862,0.132150,DCD,V3,MØ,0.968681,Monocyte,SC


In [7]:
#2.Obs/Var level matrices - Adding data on multi dimensions(both obs and var)
immune.obsm["X_umap"] = np.random.normal(0, 1, size=(immune.n_obs))
immune.varm["gene_data"] = np.random.normal(0, 1, size=(immune.n_vars))
immune

AnnData object with n_obs × n_vars = 40868 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_states', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'type', 'version', 'scNym', 'scNym_confidence', 'cell_type', 'cs_short'
    uns: 'cell_states_colors', 'scNym_colors', 'scNym_probabilities'
    obsm: 'X_pca', 'X_scnym', 'X_umap'
    varm: 'gene_data'

In [8]:
#3. Unstructred metimmune
immune.uns["random"] = [1, 2, 3]
immune

AnnData object with n_obs × n_vars = 40868 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_states', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'type', 'version', 'scNym', 'scNym_confidence', 'cell_type', 'cs_short'
    uns: 'cell_states_colors', 'scNym_colors', 'scNym_probabilities', 'random'
    obsm: 'X_pca', 'X_scnym', 'X_umap'
    varm: 'gene_data'

In [9]:
#MAKING LAYERS OF DATA

#Making Normalized data using log transformation and storing as a layer
immune.layers["log_transformed"] = np.log1p(immune.X)
immune

#Making a dataframe of log values
logdata = immune.to_df(layer="log_transformed")
logdata

#To view the layers in anndata
ad.AnnData(immune)

AnnData object with n_obs × n_vars = 40868 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_states', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'type', 'version', 'scNym', 'scNym_confidence', 'cell_type', 'cs_short'
    uns: 'cell_states_colors', 'scNym_colors', 'scNym_probabilities', 'random'
    obsm: 'X_pca', 'X_scnym', 'X_umap'
    varm: 'gene_data'
    layers: 'log_transformed'

In [10]:
#Saving data into file
immune.write('./outputs/my_results.h5ad', compression="gzip")
logdata.to_csv('./outputs/logdata.txt')