# Merge IHD with HCA AP V3 controls
## Analysis date 2022/02/15

In [1]:
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
import seaborn as sns
import harmonypy as hm
import scrublet as scr
import matplotlib.pyplot as plt
from collections import OrderedDict

sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=120, color_map='viridis')
sc.logging.print_header()

scanpy==1.8.2 anndata==0.7.6 umap==0.5.2 numpy==1.20.1 scipy==1.7.1 pandas==1.3.4 scikit-learn==1.0.1 statsmodels==0.13.0 python-igraph==0.9.8 pynndescent==0.5.4


## Read data

In [2]:
INDIR='..'
OUTDIR='.'

In [3]:
LVAD_orig = sc.read_h5ad(INDIR + '/CABG_merged_filtered_2022-02-15.h5ad')

In [5]:
LVAD_orig.obs.columns

Index(['n_genes', 'percent_mito', 'n_counts', 'log_counts', 'percent_ribo',
       'scrublet_score', 'predicted_doublets', 'Sample', 'Patient', 'Source',
       'Region', 'Group', 'Sample_type', 'Location', 'batch'],
      dtype='object')

In [7]:
LVAD_orig.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.5', 'AL627309.4', 'AP006222.2', 'AL732372.1',
       ...
       'AC133551.1', 'AC136612.1', 'AC136616.1', 'AC136616.3', 'AC136616.2',
       'AC141272.1', 'AC023491.2', 'AC007325.1', 'AC007325.4', 'AC007325.2'],
      dtype='object', length=36601)

In [8]:
LVAD_orig.var.head()

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression


In [9]:
LVAD_orig.var.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36601 entries, MIR1302-2HG to AC007325.2
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   gene_ids       36601 non-null  object  
 1   feature_types  36601 non-null  category
dtypes: category(1), object(1)
memory usage: 1.6+ MB


In [10]:
LVAD_orig.var = LVAD_orig.var.reset_index().set_index('gene_ids')

In [11]:
LVAD_orig.var_names

Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
       'ENSG00000241860', 'ENSG00000241599', 'ENSG00000286448',
       'ENSG00000236601',
       ...
       'ENSG00000274175', 'ENSG00000275869', 'ENSG00000273554',
       'ENSG00000278782', 'ENSG00000277761', 'ENSG00000277836',
       'ENSG00000278633', 'ENSG00000276017', 'ENSG00000278817',
       'ENSG00000277196'],
      dtype='object', name='gene_ids', length=36601)

In [12]:
LVAD_orig.var

Unnamed: 0_level_0,index,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression
...,...,...
ENSG00000277836,AC141272.1,Gene Expression
ENSG00000278633,AC023491.2,Gene Expression
ENSG00000276017,AC007325.1,Gene Expression
ENSG00000278817,AC007325.4,Gene Expression


In [13]:
LVAD_orig.var.columns = ['gene_name_IC','feature_type']

In [14]:
LVAD_orig.var.index.value_counts()

ENSG00000243485    1
ENSG00000125952    1
ENSG00000070182    1
ENSG00000258289    1
ENSG00000176153    1
                  ..
ENSG00000223821    1
ENSG00000135297    1
ENSG00000164430    1
ENSG00000080007    1
ENSG00000277196    1
Name: gene_ids, Length: 36601, dtype: int64

### HCA

In [15]:
HCA_orig = sc.read_h5ad('/home/mlee/RDS/projects/cardiac_single_cell_biology/live/HCA/hca_heart_global_ctl200723_freeze.h5ad')

In [17]:
HCA_orig.obs.columns

Index(['NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender',
       'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region',
       'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states',
       'Used'],
      dtype='object')

In [18]:
HCA_orig.obs.columns = ['NRP', 'age_group', 'cell_source', 'HCA_cell_type', 'Patient', 'gender',
       'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'Region',
       'Sample', 'scrublet_score', 'Source', 'type', 'version', 'HCA_cell_states',
       'Used']

In [20]:
HCA_orig.obs.Region.cat.categories

Index(['AX', 'LA', 'LV', 'RA', 'RV', 'SP'], dtype='object')

In [21]:
HCA_orig_AP = HCA_orig[HCA_orig.obs.Region.isin(['LV'])]

In [22]:
HCA_orig_AP.obs.Region

AAACCCAAGAGAATCT-1-H0015_LV           LV
AAACCCAAGTCCCTAA-1-H0015_LV           LV
AAACCCACAAGTTGGG-1-H0015_LV           LV
AAACCCACACCGAATT-1-H0015_LV           LV
AAACCCACATGGGTTT-1-H0015_LV           LV
                                      ..
TTTGATCTCATTCTTG-1-HCAHeart8102859    LV
TTTGGTTAGGAGTACC-1-HCAHeart8102859    LV
TTTGGTTAGGATCATA-1-HCAHeart8102859    LV
TTTGGTTAGGCATGGT-1-HCAHeart8102859    LV
TTTGGTTCACCGTACG-1-HCAHeart8102859    LV
Name: Region, Length: 107261, dtype: category
Categories (1, object): ['LV']

In [23]:
HCA_orig_AP_nuclei = HCA_orig_AP[(HCA_orig_AP.obs.Source == 'Nuclei') & (HCA_orig_AP.obs.version == 'V3')]

In [24]:
HCA_orig_AP_nuclei

View of AnnData object with n_obs × n_vars = 53559 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'HCA_cell_type', 'Patient', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'Region', 'Sample', 'scrublet_score', 'Source', 'type', 'version', 'HCA_cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'
    uns: 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

In [25]:
HCA_orig_AP_nuclei.obs['Group'] = 'control' 
HCA_orig_AP_nuclei.obs['Location'] = 'Control' 

Trying to set attribute `.obs` of view, copying.


In [27]:
HCA_orig_AP_nuclei.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.4', 'AL732372.1', 'OR4F29', 'AC114498.1',
       ...
       'AC007325.2', 'BX072566.1', 'AL354822.1', 'AC023491.2', 'AC004556.1',
       'AC233755.2', 'AC233755.1', 'AC240274.1', 'AC213203.1', 'FAM231C'],
      dtype='object', length=33538)

In [28]:
HCA_orig_AP_nuclei.var.columns

Index(['gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei',
       'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei',
       'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells',
       'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'],
      dtype='object')

In [29]:
HCA_orig_AP_nuclei.var

Unnamed: 0,gene_ids-Harvard-Nuclei,feature_types-Harvard-Nuclei,gene_ids-Sanger-Nuclei,feature_types-Sanger-Nuclei,gene_ids-Sanger-Cells,feature_types-Sanger-Cells,gene_ids-Sanger-CD45,feature_types-Sanger-CD45
MIR1302-2HG,ENSG00000243485,Gene Expression,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0
FAM138A,ENSG00000237613,Gene Expression,ENSG00000237613,0,ENSG00000237613,0,ENSG00000237613,0
OR4F5,ENSG00000186092,Gene Expression,ENSG00000186092,0,ENSG00000186092,0,ENSG00000186092,0
AL627309.1,ENSG00000238009,Gene Expression,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0
AL627309.3,ENSG00000239945,Gene Expression,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0
...,...,...,...,...,...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0
AC233755.1,ENSG00000275063,Gene Expression,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0
AC240274.1,ENSG00000271254,Gene Expression,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0
AC213203.1,ENSG00000277475,Gene Expression,ENSG00000277475,0,ENSG00000277475,0,ENSG00000277475,0


In [30]:
HCA_orig_AP_nuclei.var.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33538 entries, MIR1302-2HG to FAM231C
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   gene_ids-Harvard-Nuclei       33538 non-null  object  
 1   feature_types-Harvard-Nuclei  33538 non-null  category
 2   gene_ids-Sanger-Nuclei        33538 non-null  object  
 3   feature_types-Sanger-Nuclei   33538 non-null  int8    
 4   gene_ids-Sanger-Cells         33538 non-null  object  
 5   feature_types-Sanger-Cells    33538 non-null  int8    
 6   gene_ids-Sanger-CD45          33538 non-null  object  
 7   feature_types-Sanger-CD45     33538 non-null  int8    
dtypes: category(1), int8(3), object(4)
memory usage: 2.4+ MB


In [31]:
HCA_orig_AP_nuclei.var.iloc[:,0:2]

Unnamed: 0,gene_ids-Harvard-Nuclei,feature_types-Harvard-Nuclei
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC233755.2,ENSG00000277856,Gene Expression
AC233755.1,ENSG00000275063,Gene Expression
AC240274.1,ENSG00000271254,Gene Expression
AC213203.1,ENSG00000277475,Gene Expression


In [32]:
HCA_orig_AP_nuclei.var = HCA_orig_AP_nuclei.var.iloc[:,0:2]

In [33]:
HCA_orig_AP_nuclei.var.columns = ['gene_ids', 'feature_types']

In [34]:
HCA_orig_AP_nuclei.var = HCA_orig_AP_nuclei.var.reset_index().set_index('gene_ids')

In [35]:
HCA_orig_AP_nuclei.var.columns = ['gene_name_HCA', 'feature_types']

In [36]:
HCA_orig_AP_nuclei.var 

Unnamed: 0_level_0,gene_name_HCA,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression
...,...,...
ENSG00000277856,AC233755.2,Gene Expression
ENSG00000275063,AC233755.1,Gene Expression
ENSG00000271254,AC240274.1,Gene Expression
ENSG00000277475,AC213203.1,Gene Expression


In [37]:
temp1 = ~HCA_orig_AP_nuclei.var_names.isin(LVAD_orig.var_names)
temp3 = HCA_orig_AP_nuclei.var[temp1]
temp3.shape
temp3

Unnamed: 0_level_0,gene_name_HCA,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000251503,CENPS-CORT,Gene Expression
ENSG00000277726,AL109811.4,Gene Expression
ENSG00000242349,NPPA-AS1,Gene Expression
ENSG00000255275,AL080251.1,Gene Expression
ENSG00000270136,MINOS1-NBL1,Gene Expression
...,...,...
ENSG00000235609,AF127577.4,Gene Expression
ENSG00000265590,AP000275.2,Gene Expression
ENSG00000249624,AP000295.1,Gene Expression
ENSG00000249209,AP000311.1,Gene Expression


In [38]:
temp1 = ~LVAD_orig.var_names.isin(HCA_orig_AP_nuclei.var_names)
temp3 = LVAD_orig.var[temp1]
temp3.shape

(3869, 2)

In [39]:
temp3

Unnamed: 0_level_0,gene_name_IC,feature_type
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000241860,AL627309.5,Gene Expression
ENSG00000286448,AP006222.2,Gene Expression
ENSG00000228794,LINC01128,Gene Expression
ENSG00000242590,AL645608.5,Gene Expression
ENSG00000285812,AL390719.3,Gene Expression
...,...,...
ENSG00000288049,AC010889.2,Gene Expression
ENSG00000286247,AC009494.2,Gene Expression
ENSG00000288057,AC010086.3,Gene Expression
ENSG00000286187,AC024236.1,Gene Expression


### Merge LVAD and HCA

In [40]:
adata_orig = anndata.AnnData.concatenate(LVAD_orig, HCA_orig_AP_nuclei, join = 'inner')

In [41]:
adata_orig.var.columns

Index(['gene_name_IC-0', 'feature_type-0', 'gene_name_HCA-1',
       'feature_types-1'],
      dtype='object')

In [42]:
adata_orig.var.columns = ['gene_name_IC', 'feature_type-0', 'gene_name_HCA',
       'feature_types-1']
adata_orig.var

Unnamed: 0_level_0,gene_name_IC,feature_type-0,gene_name_HCA,feature_types-1
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression,AL627309.3,Gene Expression
...,...,...,...,...
ENSG00000277836,AC141272.1,Gene Expression,AC141272.1,Gene Expression
ENSG00000278633,AC023491.2,Gene Expression,AC023491.2,Gene Expression
ENSG00000276017,AC007325.1,Gene Expression,AC007325.1,Gene Expression
ENSG00000278817,AC007325.4,Gene Expression,AC007325.4,Gene Expression


In [43]:
adata_orig.var[adata_orig.var.gene_name_IC.isna()]

Unnamed: 0_level_0,gene_name_IC,feature_type-0,gene_name_HCA,feature_types-1
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [44]:
adata_orig.var = adata_orig.var.reset_index()

In [45]:
adata_orig.var['gene_name'] = adata_orig.var['gene_name_IC'].fillna(adata_orig.var['gene_ids'])
adata_orig.var[adata_orig.var.gene_name.isna()]

Unnamed: 0,gene_ids,gene_name_IC,feature_type-0,gene_name_HCA,feature_types-1,gene_name


In [46]:
adata_orig.var['feature_type'] = adata_orig.var['feature_type-0'].fillna(adata_orig.var['feature_types-1'])
adata_orig.var[adata_orig.var.feature_type.isna()]

Unnamed: 0,gene_ids,gene_name_IC,feature_type-0,gene_name_HCA,feature_types-1,gene_name,feature_type


In [47]:
adata_orig.var = adata_orig.var.reset_index().set_index('gene_name')

In [48]:
adata_orig.var = adata_orig.var.iloc[:,1:]

In [49]:
adata_orig.obs.Group = adata_orig.obs.Group.astype('category')
adata_orig.obs.Group.cat.reorder_categories(['control','IHD'], inplace=True)

  adata_orig.obs.Group.cat.reorder_categories(['control','IHD'], inplace=True)


In [51]:
adata_orig.write(OUTDIR + '/CABG_HCA-LV-V3_RAW_2022-02-15_inner.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Location' as categorical
