# Notebook to run data integration of 'GSE143437' dataset 

- **Developed by**: Srivalli Kolla
- **Created date** : 03 July, 2024
- **Modification date** : 03 July, 2024
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

# Import packages

In [1]:
import anndata as ad
import scanpy as sc

In [2]:
path = '../data/'

# Cardiomyocytes

## Data loading

In [3]:
cmc = ad.read_h5ad(path + 'heart_mm_nuclei-23-0092_CMC_states_ctl240131.raw.h5ad')
GSE143437 = ad.read_h5ad(path + 'GSE143437_QC_03_07_2024_raw.h5ad')


print("Columns in cmc.obs:", cmc.obs.columns)
print("Columns in GSE143437.obs:", GSE143437.obs.columns)
print("Columns in cmc.var:", cmc.var.columns)
print("Columns in GSE143437.var:", GSE143437.var.columns)

Columns in cmc.obs: Index(['cell_source', 'cell_type', 'donor', 'n_counts', 'n_genes',
       'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score',
       'cell_states', 'seed_labels', 'genotype', 'batch', 'doublet_scores',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY',
       'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels',
       'C_scANVI'],
      dtype='object')
Columns in GSE143437.obs: Index(['sample_id', 'sample_identifier', 'nUMI', 'nGene', 'percent_mito',
       'injury', 'cell_annotation', 'doublet_scores', 'predicted_doublets',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'percent_chrY', 'XIST-counts',
       'XIST-percentage', 'gender_check_cov', 'S_score', 'G2M_score', 'phase'],
      dtype='object')
Columns in cmc.var: Index(['gene_ids', 'feature

## Column renaming

##### Steps

1. Create and delete columns required
2. Mapping for columns that have the same information but different names
3. Rename columns in cmc to match those in GSE143437
4. Handle duplicate columns by making them unique
5. Find common columns and convert to list
6. Filter and rename columns in cmc and GSE143437 to match AnnData expectations
7. Concatenate the two datasets with inner join

In [4]:
cmc.obs['gender_check_cov'] = 'Male' 

del cmc.obs['cell_type']

obs_rename_mapping = {
    'donor' : 'sample_identifier' ,
    'sample': 'sample_id',
    'n_genes': 'nGene',
    'percent_mito': 'pct_counts_mt',
    'percent_ribo': 'pct_counts_ribo',
    'pct_counts_mt' : 'pct_counts_mt', 
    'C_scANVI' : 'cell_annotation'
}


cmc.obs.rename(columns=obs_rename_mapping, inplace=True)


def make_unique(column_names):
    seen = set()
    for idx, col in enumerate(column_names):
        while col in seen:
            col += '_dup'
        seen.add(col)
        column_names[idx] = col
    return column_names

cmc.obs.columns = make_unique(list(cmc.obs.columns))
GSE143437.obs.columns = make_unique(list(GSE143437.obs.columns))

cmc.var.columns = make_unique(list(cmc.var.columns))
GSE143437.var.columns = make_unique(list(GSE143437.var.columns))

common_obs_columns = list(set(cmc.var.columns).intersection(set(GSE143437.var.columns)))
common_var_columns = list(set(cmc.var.columns).intersection(set(GSE143437.var.columns)))

cmc = cmc[:, cmc.var_names.isin(common_var_columns)].copy()
GSE143437 = GSE143437[:, GSE143437.var_names.isin(common_var_columns)].copy()

## Integration

In [5]:
integrated_cmc = ad.concat([cmc, GSE143437], join='inner', index_unique=None)
integrated_cmc.obs

  warn(


Unnamed: 0,sample_identifier,nGene,pct_counts_mt,pct_counts_ribo,sample_id,doublet_scores,n_genes_by_counts,total_counts,total_counts_mt,total_counts_ribo,percent_chrY,XIST-counts,S_score,G2M_score,cell_annotation,gender_check_cov
ACACTGATCATTATCC-1-A9_2,A9_2,3851.0,,,A9_2,0.049904,3851.0,14981.0,4.0,6.0,0.046726,0.0,-0.218301,-0.450239,vCM2,Male
TCTTCCTGTCATAACC-1-A9_2,A9_2,3577.0,,,A9_2,0.048159,3577.0,14906.0,4.0,10.0,0.080504,0.0,0.084806,-0.395886,vCM2,Male
ATCTCTAGTTTCAGAC-1-A9_2,A9_2,3632.0,,,A9_2,0.043243,3632.0,14969.0,1.0,9.0,0.053444,0.0,-0.352962,-0.206640,vCM2,Male
TGAATGCAGCTCCATA-1-A9_2,A9_2,3691.0,,,A9_2,0.057495,3691.0,14086.0,23.0,18.0,0.078092,1.0,-0.159373,-0.494394,vCM2,Male
GCAGCTGCACAAGTGG-1-A9_2,A9_2,3778.0,,,A9_2,0.061702,3778.0,14125.0,15.0,11.0,0.028319,0.0,0.351369,-0.141495,vCM2,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D7_D_TTTGTCAAGGTGTTAA,D7_D,1173.0,3.552554,12.536700,SRR10870305,0.063719,1166.0,3406.0,121.0,427.0,0.000000,2.0,-0.210280,-0.512819,Anti-inflammatory macrophages,Female
D7_D_TTTGTCACAGTCAGCC,D7_D,3268.0,1.923886,12.352817,SRR10870305,0.053763,3370.0,9512.0,183.0,1175.0,0.021026,37.0,-0.064807,3.910752,MuSCs and progenitors,Female
D7_D_TTTGTCACATCGATGT,D7_D,2432.0,0.779578,9.859370,SRR10870305,0.034757,2456.0,6542.0,51.0,645.0,0.015286,4.0,-0.206855,-0.657905,FAPs,Female
D7_D_TTTGTCAGTCCGTTAA,D7_D,1407.0,1.275917,7.934609,SRR10870305,0.006983,1420.0,2508.0,32.0,199.0,0.000000,0.0,-0.480220,-0.241352,Endothelial,


In [6]:
cmc.write(path + 'cmc_integrated_gse143437.h5ad')

# Fibroblasts

## Data loading

In [7]:
fb = ad.read_h5ad(path + 'heart_mm_nuclei-23-0092_FB_states_ctl240131.raw.h5ad')
GSE143437 = ad.read_h5ad(path + 'GSE143437_QC_03_07_2024_raw.h5ad')

print("Columns in fb.obs:", fb.obs.columns)
print("Columns in GSE143437.obs:", GSE143437.obs.columns)

Columns in fb.obs: Index(['cell_source', 'cell_type', 'donor', 'n_counts', 'n_genes',
       'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score',
       'cell_states', 'seed_labels', 'genotype', 'batch', 'doublet_scores',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY',
       'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels',
       'C_scANVI'],
      dtype='object')
Columns in GSE143437.obs: Index(['sample_id', 'sample_identifier', 'nUMI', 'nGene', 'percent_mito',
       'injury', 'cell_annotation', 'doublet_scores', 'predicted_doublets',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'percent_chrY', 'XIST-counts',
       'XIST-percentage', 'gender_check_cov', 'S_score', 'G2M_score', 'phase'],
      dtype='object')


## Column renaming

##### Steps

1. Mapping for columns that have the same information but different names
2. Rename columns in cmc to match those in GSE143437
3. Handle duplicate columns by making them unique
4. Find common columns and convert to list
5. Filter and rename columns in cmc and GSE143437 to match AnnData expectations
6. Concatenate the two datasets with inner join

In [8]:
fb.obs['gender_check_cov'] = 'Male' 

del fb.obs['cell_type']

obs_rename_mapping = {
    'donor' : 'sample_identifier' ,
    'sample': 'sample_id',
    'n_genes': 'nGene',
    'percent_mito': 'pct_counts_mt',
    'percent_ribo': 'pct_counts_ribo',
    'pct_counts_mt' : 'pct_counts_mt', 
    'C_scANVI' : 'cell_annotation'
}


fb.obs.rename(columns=obs_rename_mapping, inplace=True)


def make_unique(column_names):
    seen = set()
    for idx, col in enumerate(column_names):
        while col in seen:
            col += '_dup'
        seen.add(col)
        column_names[idx] = col
    return column_names

fb.obs.columns = make_unique(list(fb.obs.columns))
GSE143437.obs.columns = make_unique(list(GSE143437.obs.columns))

fb.var.columns = make_unique(list(fb.var.columns))
GSE143437.var.columns = make_unique(list(GSE143437.var.columns))

common_obs_columns = list(set(fb.var.columns).intersection(set(GSE143437.var.columns)))
common_var_columns = list(set(fb.var.columns).intersection(set(GSE143437.var.columns)))

fb = fb[:, fb.var_names.isin(common_var_columns)].copy()
GSE143437 = GSE143437[:, GSE143437.var_names.isin(common_var_columns)].copy()

## Integration

In [9]:
integrated_fb = ad.concat([fb, GSE143437], join='inner', index_unique=None)
integrated_fb.obs

  warn(


Unnamed: 0,sample_identifier,nGene,pct_counts_mt,pct_counts_ribo,sample_id,doublet_scores,n_genes_by_counts,total_counts,total_counts_mt,total_counts_ribo,percent_chrY,XIST-counts,S_score,G2M_score,cell_annotation,gender_check_cov
TCGCAGGAGGCTTCCG-1-A9_2,A9_2,4100.0,,,A9_2,0.286585,4100.0,9536.0,493.0,208.0,0.052433,0.0,0.138445,-0.249553,FB6,Male
CCGTTCACAAATTGCC-1-A9_2,A9_2,3727.0,,,A9_2,0.130000,3727.0,8218.0,147.0,42.0,0.158189,0.0,2.531231,4.013015,FB4,Male
CCAAGCGGTTTCGTAG-1-A9_2,A9_2,3080.0,,,A9_2,0.204651,3080.0,7227.0,365.0,288.0,0.083022,0.0,-0.197844,-0.295802,FB6,Male
AACGGGACACTCAAGT-1-A9_2,A9_2,3271.0,,,A9_2,0.269565,3271.0,6503.0,165.0,113.0,0.107643,0.0,0.319994,0.019119,FB6,Male
GGAATGGAGACAACAT-1-A9_2,A9_2,2987.0,,,A9_2,0.194631,2987.0,6277.0,375.0,135.0,0.111518,0.0,0.042216,-0.392916,FB6,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D7_D_TTTGTCAAGGTGTTAA,D7_D,1173.0,3.552554,12.536700,SRR10870305,0.063719,1166.0,3406.0,121.0,427.0,0.000000,2.0,-0.210280,-0.512819,Anti-inflammatory macrophages,Female
D7_D_TTTGTCACAGTCAGCC,D7_D,3268.0,1.923886,12.352817,SRR10870305,0.053763,3370.0,9512.0,183.0,1175.0,0.021026,37.0,-0.064807,3.910752,MuSCs and progenitors,Female
D7_D_TTTGTCACATCGATGT,D7_D,2432.0,0.779578,9.859370,SRR10870305,0.034757,2456.0,6542.0,51.0,645.0,0.015286,4.0,-0.206855,-0.657905,FAPs,Female
D7_D_TTTGTCAGTCCGTTAA,D7_D,1407.0,1.275917,7.934609,SRR10870305,0.006983,1420.0,2508.0,32.0,199.0,0.000000,0.0,-0.480220,-0.241352,Endothelial,


In [10]:
integrated_fb.write(path +'fb_integrated_gse143437.h5ad')

# Lymphoid

## Data loading

In [11]:
lym = ad.read_h5ad(path + 'heart_mm_nuclei-23-0092_scANVI-Lymphoid_states_ctl240527.raw.h5ad')
GSE143437 = ad.read_h5ad(path + 'GSE143437_QC_03_07_2024_raw.h5ad')

print("Columns in lym.obs:", lym.obs.columns)
print("Columns in GSE143437.obs:", GSE143437.obs.columns)

Columns in lym.obs: Index(['orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status',
       'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed',
       'sample_id', 'seed_labels', 'donor', 'cell_states', 'sample',
       'cell_source', 'genotype', 'nCount_HTO', 'nFeature_HTO',
       'HTO_classification', 'Library', 'CD45_Annotation', 'cell_type',
       'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region',
       'scrublet_score', 'batch', 'doublet_scores', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts',
       'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'MPC_Annotation',
       'compartment'],
      dtype='object')
Columns in GSE143437.obs: Index(['sample_id', 'sample_identifier', 'nUMI', 'nGene', 'percent_mito',
       'injury', 'cell_annotation', 'doublet_scores', 'predicted_doublets',
       'n_genes_by_coun

In [12]:
lym.obs['cell_type']

AAACGCTGTTATAGAG-1_1-MDX_POOL_NEW      CD8Tctl
AAAGAACTCCCGTGTT-1_1-MDX_POOL_NEW            T
AACAAAGAGCGTTGTT-1_1-MDX_POOL_NEW      CD8Tctl
AACAAAGTCAAGCCTA-1_1-MDX_POOL_NEW    CD8Tnaive
AACAAGAAGTTGAATG-1_1-MDX_POOL_NEW      CD8Tctl
                                       ...    
TTCTAACAGTCACACT-1_2-MDX_MPC                NK
TTGAGTGCACTCCGAG-1_2-MDX_MPC               gdT
TTGGGCGTCCCGTGAG-1_2-MDX_MPC                NK
TTTACTGAGATCACTC-1_2-MDX_MPC                NK
TTTGGAGAGGATAATC-1_2-MDX_MPC                NK
Name: cell_type, Length: 65544, dtype: category
Categories (18, object): ['B_cells', 'B_mem', 'B_naive', 'CD4Tctl', ..., 'Plasma_cells', 'T', 'Treg', 'gdT']

## Column renaming

##### Steps

1. Mapping for columns that have the same information but different names
2. Rename columns in cmc to match those in GSE143437
3. Handle duplicate columns by making them unique
4. Find common columns and convert to list
5. Filter and rename columns in cmc and GSE143437 to match AnnData expectations
6. Concatenate the two datasets with inner join

In [13]:
lym.obs['gender_check_cov'] = 'Male' 

obs_rename_mapping = {
    'donor' : 'sample_identifier' ,
    'sample': 'sample_id',
    'n_genes': 'nGene',
    'percent_mito': 'pct_counts_mt',
    'percent_ribo': 'pct_counts_ribo',
    'pct_counts_mt' : 'pct_counts_mt', 
    'cell_type' : 'cell_annotation'
}


lym.obs.rename(columns=obs_rename_mapping, inplace=True)


def make_unique(column_names):
    seen = set()
    for idx, col in enumerate(column_names):
        while col in seen:
            col += '_dup'
        seen.add(col)
        column_names[idx] = col
    return column_names

lym.obs.columns = make_unique(list(lym.obs.columns))
GSE143437.obs.columns = make_unique(list(GSE143437.obs.columns))

lym.var.columns = make_unique(list(lym.var.columns))
GSE143437.var.columns = make_unique(list(GSE143437.var.columns))

common_obs_columns = list(set(lym.var.columns).intersection(set(GSE143437.var.columns)))
common_var_columns = list(set(lym.var.columns).intersection(set(GSE143437.var.columns)))

lym = lym[:, lym.var_names.isin(common_var_columns)].copy()
GSE143437 = GSE143437[:, GSE143437.var_names.isin(common_var_columns)].copy()

## Integration

In [14]:
integrated_lym = ad.concat([lym, GSE143437], join='inner', index_unique=None)
integrated_lym.obs

  warn(


Unnamed: 0,sample_id,sample_identifier,cell_annotation,nGene,pct_counts_mt,pct_counts_ribo,doublet_scores,n_genes_by_counts,total_counts,total_counts_mt,total_counts_ribo,percent_chrY,XIST-counts,S_score,G2M_score,gender_check_cov
AAACGCTGTTATAGAG-1_1-MDX_POOL_NEW,,Ctrl5,CD8Tctl,,,,,,,,,,,,,Male
AAAGAACTCCCGTGTT-1_1-MDX_POOL_NEW,,Ctrl2,T,,,,,,,,,,,,,Male
AACAAAGAGCGTTGTT-1_1-MDX_POOL_NEW,,Ctrl5,CD8Tctl,,,,,,,,,,,,,Male
AACAAAGTCAAGCCTA-1_1-MDX_POOL_NEW,,Ctrl5,CD8Tnaive,,,,,,,,,,,,,Male
AACAAGAAGTTGAATG-1_1-MDX_POOL_NEW,,Ctrl4,CD8Tctl,,,,,,,,,,,,,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D7_D_TTTGTCAAGGTGTTAA,SRR10870305,D7_D,Anti-inflammatory macrophages,1173.0,3.552554,12.536700,0.063719,1166.0,3406.0,121.0,427.0,0.000000,2.0,-0.210280,-0.512819,Female
D7_D_TTTGTCACAGTCAGCC,SRR10870305,D7_D,MuSCs and progenitors,3268.0,1.923886,12.352817,0.053763,3370.0,9512.0,183.0,1175.0,0.021026,37.0,-0.064807,3.910752,Female
D7_D_TTTGTCACATCGATGT,SRR10870305,D7_D,FAPs,2432.0,0.779578,9.859370,0.034757,2456.0,6542.0,51.0,645.0,0.015286,4.0,-0.206855,-0.657905,Female
D7_D_TTTGTCAGTCCGTTAA,SRR10870305,D7_D,Endothelial,1407.0,1.275917,7.934609,0.006983,1420.0,2508.0,32.0,199.0,0.000000,0.0,-0.480220,-0.241352,


In [15]:
integrated_lym.write(path + 'lym_integrated_gse143437.h5ad')

# Myeloid

## Data loading

In [16]:
mye = ad.read_h5ad(path + 'heart_mm_nuclei-23-0092_scANVI-Myeloid_states_ctl240502.raw.h5ad')
GSE143437 = ad.read_h5ad(path + 'GSE143437_QC_03_07_2024_raw.h5ad')

print("Columns in mye.obs:", mye.obs.columns)
print("Columns in GSE143437.obs:", GSE143437.obs.columns)

Columns in mye.obs: Index(['orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status',
       'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed',
       'sample_id', 'seed_labels', 'donor', 'cell_states', 'sample',
       'cell_source', 'genotype', 'nCount_HTO', 'nFeature_HTO',
       'HTO_classification', 'Library', 'CD45_Annotation', 'cell_type',
       'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region',
       'scrublet_score', 'batch', 'doublet_scores', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts',
       'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'MPC_Annotation',
       'compartment'],
      dtype='object')
Columns in GSE143437.obs: Index(['sample_id', 'sample_identifier', 'nUMI', 'nGene', 'percent_mito',
       'injury', 'cell_annotation', 'doublet_scores', 'predicted_doublets',
       'n_genes_by_coun

In [17]:
mye.obs['cell_type']

AAACCCAGTCAAGCGA-1_1-MDX_POOL_NEW           TLF+MØ
AAACCCAGTGGCTTGC-1_1-MDX_POOL_NEW           TLF+MØ
AAACGCTGTAGTTACC-1_1-MDX_POOL_NEW           TLF+MØ
AAAGGATGTTATCTGG-1_1-MDX_POOL_NEW         Ly6CloMo
AAAGGATTCGTAGTCA-1_1-MDX_POOL_NEW              DC2
                                         ...      
TTTGGAGAGCGACTGA-1_2-MDX_MPC-1          MHCII+MØtr
TTTGGAGAGGATAATC-1_2-MDX_MPC-1              TLF+MØ
TTTGGAGTCTCGGCTT-1_2-MDX_MPC-1          MHCII+MØtr
TTTGTTGAGATTCGAA-1_2-MDX_MPC-1          MHCII+MØtr
TTTGTTGGTGATATAG-1_2-MDX_MPC-1       Ccr2+MHCII+MØ
Name: cell_type, Length: 41716, dtype: category
Categories (12, object): ['Ccr2+MHCII+MØ', 'DC', 'DC2', 'Isg15+MØ', ..., 'MØinf', 'NØ', 'Spp1+Gpnmb+MØ', 'TLF+MØ']

## Column renaming

##### Steps

1. Mapping for columns that have the same information but different names
2. Rename columns in cmc to match those in GSE143437
3. Handle duplicate columns by making them unique
4. Find common columns and convert to list
5. Filter and rename columns in cmc and GSE143437 to match AnnData expectations
6. Concatenate the two datasets with inner join

In [18]:
mye.obs['gender_check_cov'] = 'Male' 

obs_rename_mapping = {
    'donor' : 'sample_identifier' ,
    'sample': 'sample_id',
    'n_genes': 'nGene',
    'percent_mito': 'pct_counts_mt',
    'percent_ribo': 'pct_counts_ribo',
    'pct_counts_mt' : 'pct_counts_mt', 
    'cell_type' : 'cell_annotation'
}


mye.obs.rename(columns=obs_rename_mapping, inplace=True)


def make_unique(column_names):
    seen = set()
    for idx, col in enumerate(column_names):
        while col in seen:
            col += '_dup'
        seen.add(col)
        column_names[idx] = col
    return column_names

mye.obs.columns = make_unique(list(mye.obs.columns))
GSE143437.obs.columns = make_unique(list(GSE143437.obs.columns))

mye.var.columns = make_unique(list(mye.var.columns))
GSE143437.var.columns = make_unique(list(GSE143437.var.columns))

common_obs_columns = list(set(mye.var.columns).intersection(set(GSE143437.var.columns)))
common_var_columns = list(set(mye.var.columns).intersection(set(GSE143437.var.columns)))

mye = mye[:, mye.var_names.isin(common_var_columns)].copy()
GSE143437 = GSE143437[:, GSE143437.var_names.isin(common_var_columns)].copy()

## Integration

In [19]:
integrated_mye= ad.concat([mye, GSE143437], join='inner', index_unique=None)
integrated_mye.obs

  warn(


Unnamed: 0,sample_id,sample_identifier,cell_annotation,nGene,pct_counts_mt,pct_counts_ribo,doublet_scores,n_genes_by_counts,total_counts,total_counts_mt,total_counts_ribo,percent_chrY,XIST-counts,S_score,G2M_score,gender_check_cov
AAACCCAGTCAAGCGA-1_1-MDX_POOL_NEW,,Ctrl2,TLF+MØ,,,,,,,,,,,,,Male
AAACCCAGTGGCTTGC-1_1-MDX_POOL_NEW,,Ctrl3,TLF+MØ,,,,,,,,,,,,,Male
AAACGCTGTAGTTACC-1_1-MDX_POOL_NEW,,Ctrl4,TLF+MØ,,,,,,,,,,,,,Male
AAAGGATGTTATCTGG-1_1-MDX_POOL_NEW,,Ctrl1,Ly6CloMo,,,,,,,,,,,,,Male
AAAGGATTCGTAGTCA-1_1-MDX_POOL_NEW,,Ctrl5,DC2,,,,,,,,,,,,,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D7_D_TTTGTCAAGGTGTTAA,SRR10870305,D7_D,Anti-inflammatory macrophages,1173.0,3.552554,12.536700,0.063719,1166.0,3406.0,121.0,427.0,0.000000,2.0,-0.210280,-0.512819,Female
D7_D_TTTGTCACAGTCAGCC,SRR10870305,D7_D,MuSCs and progenitors,3268.0,1.923886,12.352817,0.053763,3370.0,9512.0,183.0,1175.0,0.021026,37.0,-0.064807,3.910752,Female
D7_D_TTTGTCACATCGATGT,SRR10870305,D7_D,FAPs,2432.0,0.779578,9.859370,0.034757,2456.0,6542.0,51.0,645.0,0.015286,4.0,-0.206855,-0.657905,Female
D7_D_TTTGTCAGTCCGTTAA,SRR10870305,D7_D,Endothelial,1407.0,1.275917,7.934609,0.006983,1420.0,2508.0,32.0,199.0,0.000000,0.0,-0.480220,-0.241352,


In [20]:
integrated_mye.write(path + 'mye_integrated_gse143437.h5ad')

# Vascular

## Data loading

In [21]:
vas = ad.read_h5ad(path + 'heart_mm_nuclei-23-0092_scANVI-vascular_states_ctl240131.raw.h5ad')
GSE143437 = ad.read_h5ad(path + 'GSE143437_QC_03_07_2024_raw.h5ad')

print("Columns in vas.obs:", vas.obs.columns)
print("Columns in GSE143437.obs:", GSE143437.obs.columns)

Columns in vas.obs: Index(['cell_source', 'cell_type', 'donor', 'n_counts', 'n_genes',
       'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score',
       'cell_states', 'seed_labels', 'genotype', 'batch', 'doublet_scores',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY',
       'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels',
       'C_scANVI'],
      dtype='object')
Columns in GSE143437.obs: Index(['sample_id', 'sample_identifier', 'nUMI', 'nGene', 'percent_mito',
       'injury', 'cell_annotation', 'doublet_scores', 'predicted_doublets',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'percent_chrY', 'XIST-counts',
       'XIST-percentage', 'gender_check_cov', 'S_score', 'G2M_score', 'phase'],
      dtype='object')


## Column renaming

##### Steps

1. Mapping for columns that have the same information but different names
2. Rename columns in cmc to match those in GSE143437
3. Handle duplicate columns by making them unique
4. Find common columns and convert to list
5. Filter and rename columns in cmc and GSE143437 to match AnnData expectations
6. Concatenate the two datasets with inner join

In [22]:
vas.obs['gender_check_cov'] = 'Male' 

del vas.obs['cell_type']


obs_rename_mapping = {
    'donor' : 'sample_identifier' ,
    'sample': 'sample_id',
    'n_genes': 'nGene',
    'percent_mito': 'pct_counts_mt',
    'percent_ribo': 'pct_counts_ribo',
    'pct_counts_mt' : 'pct_counts_mt', 
    'C_scANVI' : 'cell_annotation'
}


vas.obs.rename(columns=obs_rename_mapping, inplace=True)


def make_unique(column_names):
    seen = set()
    for idx, col in enumerate(column_names):
        while col in seen:
            col += '_dup'
        seen.add(col)
        column_names[idx] = col
    return column_names

vas.obs.columns = make_unique(list(vas.obs.columns))
GSE143437.obs.columns = make_unique(list(GSE143437.obs.columns))

vas.var.columns = make_unique(list(vas.var.columns))
GSE143437.var.columns = make_unique(list(GSE143437.var.columns))

common_obs_columns = list(set(vas.var.columns).intersection(set(GSE143437.var.columns)))
common_var_columns = list(set(vas.var.columns).intersection(set(GSE143437.var.columns)))

vas = vas[:, vas.var_names.isin(common_var_columns)].copy()
GSE143437 = GSE143437[:, GSE143437.var_names.isin(common_var_columns)].copy()

## Integration

In [23]:
integrated_vas= ad.concat([vas, GSE143437], join='inner', index_unique=None)
integrated_vas.obs

  warn(


Unnamed: 0,sample_identifier,nGene,pct_counts_mt,pct_counts_ribo,sample_id,doublet_scores,n_genes_by_counts,total_counts,total_counts_mt,total_counts_ribo,percent_chrY,XIST-counts,S_score,G2M_score,cell_annotation,gender_check_cov
CTCCAACCACAAGTTC-1-A9_2,A9_2,4489.0,,,A9_2,0.286585,4489.0,13466.0,220.0,117.0,0.051983,0.0,-0.355854,-0.387663,SMC2_art,Male
AGGTTGTCAGCACGAA-1-A9_2,A9_2,4408.0,,,A9_2,0.326531,4408.0,12946.0,274.0,59.0,0.046346,0.0,-0.264880,-0.409038,SMC2_art,Male
AGGCCACCAGAAGCTG-1-A9_2,A9_2,4665.0,,,A9_2,0.269565,4665.0,13071.0,754.0,282.0,0.045903,3.0,0.299016,-0.284226,SMC2_art,Male
AGTCTCCCACGATAGG-1-A9_2,A9_2,4438.0,,,A9_2,0.305466,4438.0,12511.0,282.0,97.0,0.103909,0.0,-0.133833,-0.517638,SMC2_art,Male
GGACGTCCAATCCAGT-1-A9_2,A9_2,4604.0,,,A9_2,0.254144,4604.0,12519.0,984.0,320.0,0.031951,0.0,-0.502134,-0.341586,EC7_atria,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D7_D_TTTGTCAAGGTGTTAA,D7_D,1173.0,3.552554,12.536700,SRR10870305,0.063719,1166.0,3406.0,121.0,427.0,0.000000,2.0,-0.210280,-0.512819,Anti-inflammatory macrophages,Female
D7_D_TTTGTCACAGTCAGCC,D7_D,3268.0,1.923886,12.352817,SRR10870305,0.053763,3370.0,9512.0,183.0,1175.0,0.021026,37.0,-0.064807,3.910752,MuSCs and progenitors,Female
D7_D_TTTGTCACATCGATGT,D7_D,2432.0,0.779578,9.859370,SRR10870305,0.034757,2456.0,6542.0,51.0,645.0,0.015286,4.0,-0.206855,-0.657905,FAPs,Female
D7_D_TTTGTCAGTCCGTTAA,D7_D,1407.0,1.275917,7.934609,SRR10870305,0.006983,1420.0,2508.0,32.0,199.0,0.000000,0.0,-0.480220,-0.241352,Endothelial,


In [24]:
integrated_vas.write(path + 'vas_integrated_gse143437.h5ad')