In [1]:
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import scanpy as sc
from sklearn.linear_model import LinearRegression

# Load adult data

In [2]:
# Load the adult heart atlas (.h5ad) as the working AnnData object `adata`.
adata = sc.read_h5ad(
    filename="/data3/cyx/ForDatabase0606/Adult.Heart.SCVI.h5ad",
)

In [3]:
# Display the AnnData summary (dimensions plus the obs/var/uns/obsm/obsp fields available).
adata

AnnData object with n_obs × n_vars = 908378 × 43878
    obs: 'cell_ID', 'donor_ID', 'donor_gender', 'donor_age', 'original_name', 'organ', 'region', 'subregion', 'sample_status', 'seq_tech', 'cell_type', 'if_patient', 'donor_status', 'treatment', 'ethnicity', 'Ref', 'MCT', 'develop_stage', '_scvi_batch', '_scvi_labels', 'leiden', 'UMAP_1', 'UMAP_2'
    var: '0'
    uns: 'MCT_colors', 'Ref_colors', '_scvi_manager_uuid', '_scvi_uuid', 'cell_type_colors', 'leiden', 'neighbors', 'region_colors', 'subregion_colors', 'umap'
    obsm: 'X_scVI', 'X_umap'
    obsp: 'connectivities', 'distances'

# Generate pseudobulk data from all cells (one profile per donor × subregion sample)

In [74]:
# Label every cell with its pseudobulk sample: "<donor_ID>_<subregion>".
# The label is built column-wise instead of with a per-row `Series[i]` lookup: integer keys
# on a non-integer obs index rely on pandas' deprecated positional fallback, and the
# Python-level loop is very slow for an atlas of this size.
# NOTE: a missing donor_ID/subregion becomes the text "nan" here, whereas the per-row
# string concatenation would have raised a TypeError.
sample_list = (
    adata.obs["donor_ID"].astype(str) + "_" + adata.obs["subregion"].astype(str)
).tolist()
adata.obs["sample"] = sample_list

In [75]:
# Tally how many cells belong to each pseudobulk sample (sorted sample names, cell counts).
sample_names, cells_per_sample = np.unique(adata.obs["sample"], return_counts=True)
(sample_names, cells_per_sample)

(array(['AdultHeart1_52Y_Left ventricle', 'AdultHeart2_47Y_Left ventricle',
        'Hua2020 Donor1_Interatrial septum',
        'Hua2020 Donor1_Interventricular septum',
        'Hua2020 Donor1_Left atria', 'Hua2020 Donor1_Left ventricle',
        'Hua2020 Donor1_Right atria', 'Hua2020 Donor1_Right ventricle',
        'Hua2020 Donor2_Interatrial septum',
        'Hua2020 Donor2_Interventricular septum',
        'Hua2020 Donor2_Left atria', 'Hua2020 Donor2_Left ventricle',
        'Hua2020 Donor2_Right atria', 'Hua2020 Donor2_Right ventricle',
        'N10_Left atria', 'N11_Left atria', 'N12_Left atria',
        'N1_Left ventricle', 'N2_Left ventricle', 'N3_Left ventricle',
        'N4_Left ventricle', 'N5_Left ventricle', 'N6_Left atria',
        'N7_Left atria', 'N8_Left atria', 'N9_Left atria',
        'Nature_D11_Interventricular septum', 'Nature_D11_Left atria',
        'Nature_D11_Left ventricle', 'Nature_D11_Right atria',
        'Nature_D11_Right ventricle', 'Nature_D1_Left atr

In [76]:
# Allocate the per-sample containers that the following cells fill in.
# Dimensions are derived from the data (number of distinct samples, adata.n_vars) rather
# than hard-coded, so the cell keeps working if the atlas or the sample definition changes.
sample_names = np.unique(adata.obs["sample"])
meta_columns = ["donor_gender", "donor_age", "subregion", "Ref"]

# Expression: one row per sample, one column per gene, zero-initialised.
matrix_pseudobulk = pd.DataFrame(
    np.zeros((len(sample_names), adata.n_vars)),
    index=sample_names,
    columns=adata.var.index.values,
)

# Metadata: object dtype because the rows receive text labels; a float frame would have to
# be up-cast on assignment, which recent pandas versions warn about.
meta_pseudobulk = pd.DataFrame(index=sample_names, columns=meta_columns, dtype=object)

### bulk metadata

In [80]:
# Copy each sample's metadata from the first cell that belongs to that sample.
meta_fields = ["donor_gender", "donor_age", "subregion", "Ref"]
for sample_name in np.unique(adata.obs["sample"]):
    belongs_to_sample = adata.obs["sample"] == sample_name
    first_cell = adata.obs.loc[belongs_to_sample, meta_fields].iloc[0]
    meta_pseudobulk.loc[sample_name, :] = first_cell

In [81]:
# Recode the sample metadata into numeric covariates.
# Every assignment goes through `.loc[mask, column]`: the chained form `df.col[mask] = value`
# writes into a temporary object, triggers SettingWithCopyWarning and leaves the frame
# unchanged when pandas Copy-on-Write is active.

# donor_gender: Male -> 0, Female -> 1 (any other value is left as is).
gender_codes = {"Male": 0, "Female": 1}
for gender_label, gender_code in gender_codes.items():
    meta_pseudobulk.loc[meta_pseudobulk["donor_gender"] == gender_label, "donor_gender"] = gender_code

# donor_age: ordinal bin -> age labels that fall into it.
# Labels not listed in any bin are left unchanged.
age_bins = {
    0: ['21yr', '33yr'],
    1: ['36yr', '39yr'],
    2: ['40-45yr', '42yr', '43yr'],
    3: ['45-50yr', '45yr', '46yr', '47yr', '48yr'],
    4: ['50-55yr', '50yr', '51yr', '52yr', '54yr'],
    5: ['55-60yr', '59yr'],
    6: ['60-65yr', '60yr'],
    7: ['65-70yr'],
    8: ['70-75yr'],
}
for age_code, age_labels in age_bins.items():
    meta_pseudobulk.loc[meta_pseudobulk["donor_age"].isin(age_labels), "donor_age"] = age_code

# subregion is left as its text label. A disabled draft encoding was:
# Left ventricle=0, Interventricular septum=1, Right ventricle=2,
# Left atria=3, Interatrial septum=4, Right atria=5.

In [82]:
# Write the encoded adult sample metadata to CSV (sample names are the index column).
meta_pseudobulk.to_csv(
    path_or_buf="/data3/cyx/HHCAd_PB4GLM/PseudoBulk.metadata.csv",
)

### bulk expression matrix

In [31]:
# Collapse every sample's cells into one log1p(CPM) expression profile.
# expm1 undoes a natural-log log1p transform — assumes adata.X holds such values; TODO confirm.
# NOTE(review): this matrix is stored as log1p(CPM), whereas the fetal and per-cell-type
# matrices written elsewhere in this notebook are plain CPM — confirm this is intended.
expression = adata.X
for ss in np.unique(adata.obs["sample"]):
    in_sample = (adata.obs["sample"] == ss).to_numpy()
    # Aggregate on the sparse matrix: expm1 maps 0 to 0, so sparsity is preserved and the
    # (cells x genes) block of the sample never has to be materialised as a dense array.
    vector_sample = np.asarray(expression[in_sample, :].expm1().sum(axis=0)).ravel()
    # Scale the summed profile to counts per million, then log1p-transform it.
    matrix_pseudobulk.loc[ss, :] = np.log1p(vector_sample / vector_sample.sum() * 1e6)

In [35]:
# Write the adult sample-by-gene pseudobulk matrix to CSV.
matrix_pseudobulk.to_csv(
    path_or_buf="/data3/cyx/HHCAd_PB4GLM/PseudoBulk.expression.csv",
)

# Generate pseudobulk data for each cell type

In [83]:
# Cell-type labels (values of obs["MCT"]) for which separate pseudobulk tables are written.
cell_types = [
    "Adipocyte",
    "Cardiomyocyte cell",
    "Endothelial cell",
    "Fibroblast",
    "Lymphoid cell",
    "Myeloid cell",
    "Neuron",
    "Pericyte",
    "Smooth muscle cell",
]

In [84]:
# Build and save one metadata table and one CPM expression table per cell type (adult data).
# Within a cell type, only samples with at least `min_cells_per_sample` cells are kept.
meta_columns = ["donor_gender", "donor_age", "subregion", "Ref"]
min_cells_per_sample = 10

# donor_gender: Male -> 0, Female -> 1 (any other value is left as is).
gender_codes = {"Male": 0, "Female": 1}

# donor_age: ordinal bin -> age labels that fall into it (unlisted labels are left unchanged).
age_bins = {
    0: ['21yr', '33yr'],
    1: ['36yr', '39yr'],
    2: ['40-45yr', '42yr', '43yr'],
    3: ['45-50yr', '45yr', '46yr', '47yr', '48yr'],
    4: ['50-55yr', '50yr', '51yr', '52yr', '54yr'],
    5: ['55-60yr', '59yr'],
    6: ['60-65yr', '60yr'],
    7: ['65-70yr'],
    8: ['70-75yr'],
}

for ct in cell_types:
    adata_ct = adata[adata.obs.MCT == ct, :]
    sample_of_cell = adata_ct.obs["sample"]

    # Keep the samples that contribute enough cells of this type.
    names_ct, n_cells_ct = np.unique(sample_of_cell, return_counts=True)
    sample_list_ct = names_ct[n_cells_ct >= min_cells_per_sample]

    # Generate the metadata dataframe of the cell type; each row is taken from the first
    # cell of the sample. Object dtype because the rows receive text labels.
    meta_pseudobulk = pd.DataFrame(index=sample_list_ct, columns=meta_columns, dtype=object)
    for ss in sample_list_ct:
        meta_pseudobulk.loc[ss, :] = adata_ct.obs.loc[sample_of_cell == ss, meta_columns].iloc[0, :]

    # Encode through `.loc[mask, column]`; the chained form `df.col[mask] = value` writes to
    # a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
    for gender_label, gender_code in gender_codes.items():
        meta_pseudobulk.loc[meta_pseudobulk["donor_gender"] == gender_label, "donor_gender"] = gender_code
    for age_code, age_labels in age_bins.items():
        meta_pseudobulk.loc[meta_pseudobulk["donor_age"].isin(age_labels), "donor_age"] = age_code

    # subregion is left as its text label. A disabled draft encoding was:
    # Left ventricle=0, Interventricular septum=1, Right ventricle=2,
    # Left atria=3, Interatrial septum=4, Right atria=5.

    # Generate the expression dataframe of the cell type.
    matrix_pseudobulk = pd.DataFrame(
        np.zeros((len(sample_list_ct), adata_ct.n_vars)),
        index=sample_list_ct,
        columns=adata_ct.var.index.values,
    )
    # Fetch the cell-type expression matrix once instead of once per sample.
    expression_ct = adata_ct.X
    for ss in sample_list_ct:
        # The mask is computed on adata_ct.obs, so it must index adata_ct's matrix.
        # Indexing the full adata.X with it selects rows that do not correspond to these
        # cells (or fails on the length mismatch).
        in_sample = (sample_of_cell == ss).to_numpy()
        # expm1 undoes a natural-log log1p transform — assumes X holds such values; TODO confirm.
        # Done on the sparse matrix (expm1 maps 0 to 0) to avoid densifying the block.
        vector_sample = np.asarray(expression_ct[in_sample, :].expm1().sum(axis=0)).ravel()
        # Scale the summed profile to counts per million.
        matrix_pseudobulk.loc[ss, :] = vector_sample / vector_sample.sum() * 1e6

    meta_pseudobulk.to_csv("/data3/cyx/HHCAd_PB4GLM/PseudoBulk"+ct+".metadata.csv")
    matrix_pseudobulk.to_csv("/data3/cyx/HHCAd_PB4GLM/PseudoBulk"+ct+".expression.csv")

# Load fetal data

In [85]:
# Load the fetal heart atlas (.h5ad); this rebinds `adata`, replacing the adult object.
adata = sc.read_h5ad(
    filename="/data3/cyx/ForDatabase0606/Fetal.Heart.SCVI.h5ad",
)

In [86]:
# Display the AnnData summary of the fetal object (dimensions plus available fields).
adata

AnnData object with n_obs × n_vars = 117361 × 43878
    obs: 'cell_ID', 'donor_ID', 'donor_gender', 'donor_age', 'original_name', 'organ', 'region', 'subregion', 'sample_status', 'seq_tech', 'cell_type', 'if_patient', 'donor_status', 'treatment', 'ethnicity', 'Ref', 'MCT', 'develop_stage', 'PC_1', 'PC_2', 'TSNE_1', 'TSNE_2', 'UMAP_1', 'UMAP_2', '_scvi_batch', '_scvi_labels', 'leiden'
    var: '0'
    uns: 'Ref_colors', '_scvi_manager_uuid', '_scvi_uuid', 'cell_type_colors', 'leiden', 'neighbors', 'umap'
    obsm: 'X_scVI', 'X_umap'
    obsp: 'connectivities', 'distances'

# Generate pseudobulk data from all cells (one profile per donor × subregion sample)

In [87]:
# Label every cell with its pseudobulk sample: "<donor_ID>_<subregion>".
# The label is built column-wise instead of with a per-row `Series[i]` lookup: integer keys
# on a non-integer obs index rely on pandas' deprecated positional fallback, and the
# Python-level loop is slow for large atlases.
# NOTE: a missing donor_ID/subregion becomes the text "nan" here, whereas the per-row
# string concatenation would have raised a TypeError.
sample_list = (
    adata.obs["donor_ID"].astype(str) + "_" + adata.obs["subregion"].astype(str)
).tolist()
adata.obs["sample"] = sample_list

In [91]:
# Allocate the per-sample containers that the following cells fill in.
# Dimensions are derived from the data (number of distinct samples, adata.n_vars) rather
# than hard-coded, so the cell keeps working if the atlas or the sample definition changes.
sample_names = np.unique(adata.obs["sample"])
meta_columns = ["donor_gender", "donor_age", "subregion", "Ref"]

# Expression: one row per sample, one column per gene, zero-initialised.
matrix_pseudobulk = pd.DataFrame(
    np.zeros((len(sample_names), adata.n_vars)),
    index=sample_names,
    columns=adata.var.index.values,
)

# Metadata: object dtype because the rows receive text labels; a float frame would have to
# be up-cast on assignment, which recent pandas versions warn about.
meta_pseudobulk = pd.DataFrame(index=sample_names, columns=meta_columns, dtype=object)

### bulk metadata

In [92]:
# Copy each sample's metadata from the first cell that belongs to that sample.
meta_fields = ["donor_gender", "donor_age", "subregion", "Ref"]
for sample_name in np.unique(adata.obs["sample"]):
    belongs_to_sample = adata.obs["sample"] == sample_name
    first_cell = adata.obs.loc[belongs_to_sample, meta_fields].iloc[0]
    meta_pseudobulk.loc[sample_name, :] = first_cell

In [93]:
# Inspect the raw (not yet encoded) fetal sample metadata.
meta_pseudobulk

Unnamed: 0,donor_gender,donor_age,subregion,Ref
Asp_donor1_NA,Female,HE7W,,10.1016/j.cell.2019.11.025
FetalHeart2_GW11_NA,Female,HE11W,,10.1038/s41586-020-2157-4
FetalHeart_12W_NA,,HE12W,,10.1038/s41586-020-2157-4
H26547_NA,Female,HE17W,,10.1126/science.aba7721
H27098_NA,Male,HE18W,,10.1126/science.aba7721
...,...,...,...,...
HE7W_3_Right ventricle,,HE7W,Right ventricle,10.1016/j.celrep.2019.01.079
HE9W_1_Left atria,Male,HE9W,Left atria,10.1016/j.celrep.2019.01.079
HE9W_1_Left ventricle,Male,HE9W,Left ventricle,10.1016/j.celrep.2019.01.079
HE9W_1_Right atria,Male,HE9W,Right atria,10.1016/j.celrep.2019.01.079


In [102]:
# Recode the fetal sample metadata into numeric covariates.
# Assignments go through `.loc[mask, column]`: the chained form `df.col[mask] = value`
# writes into a temporary object, triggers SettingWithCopyWarning and leaves the frame
# unchanged when pandas Copy-on-Write is active.

# donor_gender: Male -> 0, Female -> 1 (any other value is left as is).
gender_codes = {"Male": 0, "Female": 1}
for gender_label, gender_code in gender_codes.items():
    meta_pseudobulk.loc[meta_pseudobulk["donor_gender"] == gender_label, "donor_gender"] = gender_code

# donor_age: drop the first two and the last character and parse the rest as an integer
# (e.g. "HE11W" -> 11). Values that are already integers are kept, so the cell can be
# re-run without failing on its own output.
meta_pseudobulk["donor_age"] = [
    age if isinstance(age, (int, np.integer)) else int(age[2:-1])
    for age in meta_pseudobulk["donor_age"]
]

# subregion is left as its text label. A disabled draft encoding was:
# Left ventricle=0, Interventricular septum=1, Right ventricle=2,
# Left atria=3, Interatrial septum=4, Right atria=5.

In [103]:
# Write the encoded fetal sample metadata to CSV (sample names are the index column).
meta_pseudobulk.to_csv(
    path_or_buf="/data3/cyx/HHCAd_PB4GLM/Fetal.PseudoBulk.metadata.csv",
)

### bulk expression matrix

In [104]:
# Collapse every sample's cells into one CPM expression profile.
# expm1 undoes a natural-log log1p transform — assumes adata.X holds such values; TODO confirm.
expression = adata.X
for ss in np.unique(adata.obs["sample"]):
    in_sample = (adata.obs["sample"] == ss).to_numpy()
    # Aggregate on the sparse matrix: expm1 maps 0 to 0, so sparsity is preserved and the
    # (cells x genes) block of the sample never has to be materialised as a dense array.
    vector_sample = np.asarray(expression[in_sample, :].expm1().sum(axis=0)).ravel()
    # Scale the summed profile to counts per million.
    matrix_pseudobulk.loc[ss, :] = vector_sample / vector_sample.sum() * 1e6

In [105]:
# Write the fetal sample-by-gene pseudobulk matrix to CSV.
matrix_pseudobulk.to_csv(
    path_or_buf="/data3/cyx/HHCAd_PB4GLM/Fetal.PseudoBulk.expression.csv",
)

# Generate pseudobulk data for each cell type

In [106]:
# Cell-type labels (values of obs["MCT"]) for which separate pseudobulk tables are written.
cell_types = [
    "Adipocyte",
    "Cardiomyocyte cell",
    "Endothelial cell",
    "Fibroblast",
    "Lymphoid cell",
    "Myeloid cell",
    "Neuron",
    "Pericyte",
    "Smooth muscle cell",
]

In [107]:
# Build and save one metadata table and one CPM expression table per cell type (fetal data).
# Within a cell type, only samples with at least `min_cells_per_sample` cells are kept.
meta_columns = ["donor_gender", "donor_age", "subregion", "Ref"]
min_cells_per_sample = 10

# donor_gender: Male -> 0, Female -> 1 (any other value is left as is).
gender_codes = {"Male": 0, "Female": 1}

for ct in cell_types:
    adata_ct = adata[adata.obs.MCT == ct, :]
    sample_of_cell = adata_ct.obs["sample"]

    # Keep the samples that contribute enough cells of this type.
    names_ct, n_cells_ct = np.unique(sample_of_cell, return_counts=True)
    sample_list_ct = names_ct[n_cells_ct >= min_cells_per_sample]

    # Generate the metadata dataframe of the cell type; each row is taken from the first
    # cell of the sample. Object dtype because the rows receive text labels.
    meta_pseudobulk = pd.DataFrame(index=sample_list_ct, columns=meta_columns, dtype=object)
    for ss in sample_list_ct:
        meta_pseudobulk.loc[ss, :] = adata_ct.obs.loc[sample_of_cell == ss, meta_columns].iloc[0, :]

    # Encode through `.loc[mask, column]`; the chained form `df.col[mask] = value` writes to
    # a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
    for gender_label, gender_code in gender_codes.items():
        meta_pseudobulk.loc[meta_pseudobulk["donor_gender"] == gender_label, "donor_gender"] = gender_code

    # donor_age: drop the first two and the last character and parse the rest as an integer
    # (e.g. "HE11W" -> 11).
    meta_pseudobulk["donor_age"] = [int(age[2:-1]) for age in meta_pseudobulk["donor_age"]]

    # subregion is left as its text label. A disabled draft encoding was:
    # Left ventricle=0, Interventricular septum=1, Right ventricle=2,
    # Left atria=3, Interatrial septum=4, Right atria=5.

    # Generate the expression dataframe of the cell type.
    matrix_pseudobulk = pd.DataFrame(
        np.zeros((len(sample_list_ct), adata_ct.n_vars)),
        index=sample_list_ct,
        columns=adata_ct.var.index.values,
    )
    # Fetch the cell-type expression matrix once instead of once per sample.
    expression_ct = adata_ct.X
    for ss in sample_list_ct:
        # The mask is computed on adata_ct.obs, so it must index adata_ct's matrix.
        # Indexing the full adata.X with it selects rows that do not correspond to these
        # cells (or fails on the length mismatch).
        in_sample = (sample_of_cell == ss).to_numpy()
        # expm1 undoes a natural-log log1p transform — assumes X holds such values; TODO confirm.
        # Done on the sparse matrix (expm1 maps 0 to 0) to avoid densifying the block.
        vector_sample = np.asarray(expression_ct[in_sample, :].expm1().sum(axis=0)).ravel()
        # Scale the summed profile to counts per million.
        matrix_pseudobulk.loc[ss, :] = vector_sample / vector_sample.sum() * 1e6

    meta_pseudobulk.to_csv("/data3/cyx/HHCAd_PB4GLM/Fetal.PseudoBulk"+ct+".metadata.csv")
    matrix_pseudobulk.to_csv("/data3/cyx/HHCAd_PB4GLM/Fetal.PseudoBulk"+ct+".expression.csv")