# Prepare CABG for EdgeR - Celltype
## Analysis date 2022/02/23

In [1]:
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
import seaborn as sns
import harmonypy as hm
import scrublet as scr
import matplotlib.pyplot as plt
from collections import OrderedDict

from scipy.stats.stats import pearsonr   
import re
from scipy.stats import rankdata 
import scipy
import scipy.sparse

sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=120, color_map='viridis')
sc.logging.print_versions()

%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [5, 5]

import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
anndata     0.7.6
scanpy      1.8.2
sinfo       0.3.4
-----
PIL                         8.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.2
cloudpickle                 2.0.0
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2021.10.0
dateutil                    2.8.2
debugpy                     1.4.1
decorator                   5.1.0
defusedxml

In [2]:
def stacked_barplot(adata, xaxis, stack, xaxis_label, legend_label, shift_top=0.2, xaxis_rotation=45, figsize=(10,10), percent=True, save=None):
    """Draw a 100%-stacked bar chart of `stack` composition within each `xaxis` group.

    Parameters
    ----------
    adata
        Object exposing an `.obs` table that contains the `xaxis` and `stack` columns.
    xaxis : str
        `.obs` column whose values each become one bar.
    stack : str
        `.obs` column whose values become the stacked segments of every bar.
    xaxis_label : str
        Label written on the x axis.
    legend_label : str
        Title of the legend.
    shift_top : float
        Horizontal offset subtracted from the bar position when placing the
        per-bar total count above the bar.
    xaxis_rotation : float
        Rotation (degrees) of the x tick labels.
    figsize : tuple
        Figure size forwarded to the pandas bar plot.
    percent : bool
        If True, write the rounded percentage in the middle of every segment.
    save : str or None
        If not None, the figure is saved to this path before it is shown.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    # Build the contingency table once and derive both totals and percentages from it.
    counts = pd.crosstab(adata.obs[xaxis], adata.obs[stack])
    totals = counts.sum(axis=1)
    percentages = counts.apply(lambda r: r/r.sum()*100, axis=1)

    bar_axes = percentages.plot.bar(figsize=figsize, stacked=True, rot=0)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title=legend_label)
    plt.xlabel(xaxis_label, fontweight="bold")
    plt.ylabel('Percent Distribution', fontweight="bold")

    if percent:
        # One rectangle per (bar, segment); label each at its vertical centre.
        for rec in bar_axes.patches:
            height = rec.get_height()
            bar_axes.text(rec.get_x() + rec.get_width() / 2, rec.get_y() + height/2, "{:.0f}%".format(height),
                          ha='center', va='center', fontsize=14.5, weight='bold')

    # Write the absolute number of observations above every bar (y=110 is above the 100% mark).
    for position in range(percentages.shape[0]):
        # `.iloc` makes the lookup explicitly positional: `totals` is indexed by the
        # group labels, so `totals[position]` would be a label lookup for integer
        # labels and relies on a deprecated fallback for non-integer labels.
        bar_axes.text(position-shift_top, 110, totals.iloc[position], va='center', rotation=45, fontsize=14.5, weight='bold')

    plt.xticks(rotation=xaxis_rotation, ha='right')

    bar_axes.grid(False)
    if save is not None:
        plt.savefig(save, bbox_inches='tight')

    plt.show()

In [3]:
def correlation_analysis(scanpy_object, column_tosplitby, sum_or_mean):
    """Aggregate `.raw.X` per group of observations.

    Despite its name, this function computes no correlation: for every distinct
    value of `scanpy_object.obs[column_tosplitby]` it subsets the object to the
    matching observations and reduces `.raw.X` over axis 0.

    Parameters
    ----------
    scanpy_object
        Object with an `.obs` table, boolean-mask indexing over observations,
        and a `.raw.X` matrix on the resulting subset.
    column_tosplitby : str
        Name of the `.obs` column that defines the groups.
    sum_or_mean : {"sum", "mean"}
        Reduction applied over the observations of each group.

    Returns
    -------
    dict
        Maps each group value (in `np.unique` order) to the squeezed array
        of per-variable sums or means.

    Raises
    ------
    ValueError
        If `sum_or_mean` is neither "sum" nor "mean". The original silently
        returned an empty dict in that case.
    """
    if sum_or_mean not in ("sum", "mean"):
        raise ValueError(
            "sum_or_mean must be 'sum' or 'mean', got {!r}".format(sum_or_mean)
        )

    d = {}
    for cluster_number in np.unique(scanpy_object.obs[column_tosplitby].values):
        scanpy_object_subset = scanpy_object[scanpy_object.obs[column_tosplitby].isin([cluster_number])]
        raw_matrix = scanpy_object_subset.raw.X

        if sum_or_mean == "mean":
            reduced = raw_matrix.mean(axis=0)
        else:
            reduced = raw_matrix.sum(axis=0)

        # Reductions may return a 1 x n matrix; flatten to a plain array.
        d[cluster_number] = np.squeeze(np.asarray(reduced))

    return d

## Read data

In [4]:
import os

# INDIR: directory holding the annotated .h5ad input.
# OUTDIR: directory that receives the CSV tables written below.
INDIR='..'
OUTDIR='celltype'
# DataFrame.to_csv does not create missing directories, so make sure OUTDIR exists.
os.makedirs(OUTDIR, exist_ok=True)

In [5]:
# Load the annotated object from INDIR.
annotated_h5ad = INDIR + '/CABG_regressout_ANNOTATED_2022-02-23_inner.h5ad'
adata_orig = sc.read(annotated_h5ad)

In [6]:
# Display the summary of the loaded object (dimensions and annotation columns).
adata_orig

AnnData object with n_obs × n_vars = 64826 × 4324
    obs: 'n_genes', 'percent_mito', 'n_counts', 'log_counts', 'percent_ribo', 'scrublet_score', 'predicted_doublets', 'Sample', 'Patient', 'Source', 'Region', 'Group', 'Sample_type', 'Location', 'batch', 'NRP', 'age_group', 'cell_source', 'HCA_cell_type', 'gender', 'type', 'version', 'HCA_cell_states', 'Used', 'leiden_010', 'leiden_011', 'leiden_012', 'leiden_013', 'leiden_014', 'leiden_015', 'leiden_016', 'leiden_017', 'leiden_018', 'leiden_019', 'leiden_020', 'leiden_021', 'leiden_022', 'leiden_023', 'leiden_024', 'leiden_025', 'leiden_026', 'leiden_027', 'leiden_028', 'leiden_029', 'leiden_030', 'leiden_031', 'leiden_032', 'leiden_033', 'leiden_034', 'leiden_035', 'leiden_036', 'leiden_037', 'leiden_038', 'leiden_039', 'leiden_001', 'leiden_002', 'leiden_003', 'leiden_004', 'leiden_005', 'leiden_006', 'leiden_007', 'leiden_008', 'leiden_009', 'leiden_040', 'leiden_050', 'leiden_060', 'leiden_070', 'leiden_080', 'leiden_090', 'leiden'

In [7]:
# Per-cell composite label "<Patient>_<Location>".
patient_labels = adata_orig.obs['Patient'].astype(str)
location_labels = adata_orig.obs['Location'].astype(str)
adata_orig.obs['Patient_location'] = patient_labels + '_' + location_labels

In [8]:
# Per-cell composite label "<Location>_<Patient>" (same parts as Patient_location, reversed).
location_labels = adata_orig.obs['Location'].astype(str)
patient_labels = adata_orig.obs['Patient'].astype(str)
adata_orig.obs['Location_Patient'] = location_labels + '_' + patient_labels

In [10]:
# Rebuild an AnnData object from the `.raw` slot of the loaded object:
# matrix and variable table come from `.raw`, cell annotations from `.obs`.
# NOTE(review): presumably `.raw` holds unnormalised counts - confirm before using for edgeR.
raw_layer = adata_orig.raw
adata = anndata.AnnData(
    X=raw_layer.X,
    obs=adata_orig.obs,
    var=raw_layer.var,
)

In [20]:
# Inspect the Location column as a categorical (displayed only, nothing is assigned).
adata.obs['Location'].astype('category')

AAACCCAAGGTACAGC-1-0-0                  Ischaemic
AAACCCACACTTGAGT-1-0-0                  Ischaemic
AAACGAAAGGAACGAA-1-0-0                  Ischaemic
AAACGAATCCGAAATC-1-0-0                  Ischaemic
AAACGCTGTAGGGTAC-1-0-0                  Ischaemic
                                          ...    
TTTACCATCCGCAGTG-1-HCAHeart8287124-1      Control
TTTACGTGTAGAGTTA-1-HCAHeart8287124-1      Control
TTTACTGGTGTTTACG-1-HCAHeart8287124-1      Control
TTTATGCGTCTCAGAT-1-HCAHeart8287124-1      Control
TTTCAGTGTACCTTCC-1-HCAHeart8287124-1      Control
Name: Location, Length: 64826, dtype: category
Categories (3, object): ['Control', 'Ischaemic', 'Remote']

In [22]:
# Put the Location categories in the order Control, Remote, Ischaemic.
# NOTE(review): this modifies `adata_orig.obs`, while the cells below work on `adata`,
# which was built earlier - confirm the reordering is meant to reach `adata` as well.
location_order = ['Control','Remote','Ischaemic']
location_categorical = adata_orig.obs['Location'].astype('category')
adata_orig.obs['Location'] = location_categorical.cat.reorder_categories(location_order)

## Prepare tables for EdgeR

## Pseudobulk for cell_type

In [25]:
# Alias (same object, not a copy): changes made through `global_all` also affect `adata`.
global_all = adata

In [26]:
def paste0(i, j, k):
    """Return the three values, each converted with str(), joined by "__"."""
    return "__".join((str(i), str(j), str(k)))


# Pooling key per cell: "<Location>__<Patient>__<cell_type2>".
# NOTE(review): `cell_type2` is not created in any visible cell - confirm it exists in obs.
pool_parts = zip(
    global_all.obs['Location'],
    global_all.obs['Patient'],
    global_all.obs['cell_type2'],
)
global_all.obs['pool_key'] = [paste0(*parts) for parts in pool_parts]

In [28]:
# correlation_analysis reads `.raw.X`, so point `.raw` at the current matrix first.
global_all.raw = global_all
# Sum the matrix over all cells sharing a pool_key: one column per key.
pooled_sums = correlation_analysis(global_all, 'pool_key', 'sum')
x = pd.DataFrame(pooled_sums)
# Rows are labelled with the variable (gene) index.
x.index = global_all.var.index

In [30]:
# Write the summed pseudobulk table (rows: genes, columns: pool keys).
pseudobulk_csv = OUTDIR + "/ALL_BULK_PSEUDOBULK.csv"
x.to_csv(pseudobulk_csv)

### Gene filter

In [31]:
# Grouping key per cell: "<Location>__<Patient>__<cell_type2>".
key_parts = zip(
    global_all.obs['Location'],
    global_all.obs['Patient'],
    global_all.obs['cell_type2'],
)
global_all.obs['Group_Patient_celltype'] = ["__".join(map(str, parts)) for parts in key_parts]

In [32]:
# New AnnData built from the `.raw` matrix and variable table of `global_all`
# (selected through a view over all variables), carrying the same obs table.
all_vars_view = global_all[:, global_all.var.index]
subset_ = sc.AnnData(
    all_vars_view.raw.X,
    obs=global_all.obs,
    var=all_vars_view.raw.var,
)

In [33]:
# correlation_analysis reads `.raw.X`, so expose the current matrix through `.raw`.
subset_.raw = subset_
# Mean per gene within each Group_Patient_celltype key: one column per key.
group_means = correlation_analysis(subset_, 'Group_Patient_celltype', 'mean')
x = pd.DataFrame(group_means)
x.index = global_all.var.index

In [35]:
# Write the per-group mean table used for gene filtering.
filtering_csv = OUTDIR + "/ALL_BULK_PSEUDOBULK_FILTERING.csv"
x.to_csv(filtering_csv)

## Celltype Abundance

In [37]:
# Number of cells per cell_type2 (rows) and Patient_location (columns).
x = pd.crosstab(
    index=global_all.obs['cell_type2'],
    columns=global_all.obs['Patient_location'],
)

In [38]:
# Write the absolute cell counts per cell type.
abundance_csv = OUTDIR + "/ABSOLUTE_CELLTYPE_NUMBER.csv"
x.to_csv(abundance_csv)

In [39]:
import os

# From here on, tables are written to a separate directory.
OUTDIR='celltype_CABG_vs_control'
# DataFrame.to_csv does not create missing directories, so make sure OUTDIR exists.
os.makedirs(OUTDIR, exist_ok=True)

## Prepare tables for EdgeR (CABG vs control, grouped by `Group`)

## Pseudobulk for cell_type

In [28]:
# Alias (same object, not a copy): changes made through `global_all` also affect `adata`.
global_all = adata

In [29]:
def paste0(i, j, k):
    """Return the three values, each converted with str(), joined by "__"."""
    return "__".join((str(i), str(j), str(k)))


# Pooling key per cell: "<Group>__<Patient>__<cell_type2>".
# This overwrites any pool_key column already present in obs.
pool_parts = zip(
    global_all.obs['Group'],
    global_all.obs['Patient'],
    global_all.obs['cell_type2'],
)
global_all.obs['pool_key'] = [paste0(*parts) for parts in pool_parts]

In [31]:
# correlation_analysis reads `.raw.X`, so point `.raw` at the current matrix first.
global_all.raw = global_all
# Sum the matrix over all cells sharing a pool_key: one column per key.
pooled_sums = correlation_analysis(global_all, 'pool_key', 'sum')
x = pd.DataFrame(pooled_sums)
# Rows are labelled with the variable (gene) index.
x.index = global_all.var.index

In [45]:
# Write the summed pseudobulk table (rows: genes, columns: pool keys).
pseudobulk_csv = OUTDIR + "/ALL_CELLTYPE_PSEUDOBULK.csv"
x.to_csv(pseudobulk_csv)

### Gene filter

In [46]:
# Grouping key per cell: "<Group>__<Patient>__<cell_type2>".
key_parts = zip(
    global_all.obs['Group'],
    global_all.obs['Patient'],
    global_all.obs['cell_type2'],
)
global_all.obs['Group_Patient_celltype'] = ["__".join(map(str, parts)) for parts in key_parts]

In [47]:
# New AnnData built from the `.raw` matrix and variable table of `global_all`
# (selected through a view over all variables), carrying the same obs table.
all_vars_view = global_all[:, global_all.var.index]
subset_ = sc.AnnData(
    all_vars_view.raw.X,
    obs=global_all.obs,
    var=all_vars_view.raw.var,
)

In [48]:
# correlation_analysis reads `.raw.X`, so expose the current matrix through `.raw`.
subset_.raw = subset_
# Mean per gene within each Group_Patient_celltype key: one column per key.
group_means = correlation_analysis(subset_, 'Group_Patient_celltype', 'mean')
x = pd.DataFrame(group_means)
x.index = global_all.var.index

In [50]:
# Write the per-group mean table used for gene filtering.
filtering_csv = OUTDIR + "/ALL_CELLTYPE_PSEUDOBULK_FILTERING.csv"
x.to_csv(filtering_csv)

## Celltype Abundance

In [51]:
# Display the per-cell annotation table to inspect the key columns (pool_key, Group_Patient_celltype).
global_all.obs

Unnamed: 0,n_genes,percent_mito,n_counts,log_counts,percent_ribo,scrublet_score,predicted_doublets,Sample,Patient,Source,...,leiden_070,leiden_080,leiden_090,leiden,cell_type,cell_type2,Patient_location,Location_Patient,pool_key,Group_Patient_celltype
AAACCCAAGGTACAGC-1-0-0,1990,0.003428,3792.0,8.240649,0.003165,0.026052,False,22EG2,17MM00085,Nuclei,...,1,1,0,1,PC,PC,17MM00085_Ischaemic,Ischaemic_17MM00085,IHD__17MM00085__PC,IHD__17MM00085__PC
AAACCCACACTTGAGT-1-0-0,3523,0.000513,13644.0,9.521055,0.001026,0.142857,False,22EG2,17MM00085,Nuclei,...,8,9,9,10,vCM3,vCM,17MM00085_Ischaemic,Ischaemic_17MM00085,IHD__17MM00085__vCM,IHD__17MM00085__vCM
AAACGAAAGGAACGAA-1-0-0,3870,0.004059,13056.0,9.477003,0.001532,0.063584,False,22EG2,17MM00085,Nuclei,...,4,2,9,0,vCM1,vCM,17MM00085_Ischaemic,Ischaemic_17MM00085,IHD__17MM00085__vCM,IHD__17MM00085__vCM
AAACGAATCCGAAATC-1-0-0,1490,0.000337,2966.0,7.994969,0.001011,0.023256,False,22EG2,17MM00085,Nuclei,...,3,3,4,2,FB,FB,17MM00085_Ischaemic,Ischaemic_17MM00085,IHD__17MM00085__FB,IHD__17MM00085__FB
AAACGCTGTAGGGTAC-1-0-0,2475,0.003555,4782.0,8.472614,0.002719,0.066667,False,22EG2,17MM00085,Nuclei,...,3,3,4,2,FB,FB,17MM00085_Ischaemic,Ischaemic_17MM00085,IHD__17MM00085__FB,IHD__17MM00085__FB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTACCATCCGCAGTG-1-HCAHeart8287124-1,800,0.007812,1536.0,,0.003906,0.261745,,HCAHeart8287124,D11,Nuclei,...,7,7,8,6,Myeloid,Myeloid,D11_Control,Control_D11,control__D11__Myeloid,control__D11__Myeloid
TTTACGTGTAGAGTTA-1-HCAHeart8287124-1,1862,0.000884,4523.0,,0.000442,0.098229,,HCAHeart8287124,D11,Nuclei,...,15,18,19,0,vCM1,vCM,D11_Control,Control_D11,control__D11__vCM,control__D11__vCM
TTTACTGGTGTTTACG-1-HCAHeart8287124-1,931,0.013680,1462.0,,0.002052,0.113475,,HCAHeart8287124,D11,Nuclei,...,3,3,4,2,FB,FB,D11_Control,Control_D11,control__D11__FB,control__D11__FB
TTTATGCGTCTCAGAT-1-HCAHeart8287124-1,3173,0.000820,10980.0,,0.001821,0.125475,,HCAHeart8287124,D11,Nuclei,...,4,5,6,4,vCM2,vCM,D11_Control,Control_D11,control__D11__vCM,control__D11__vCM


In [52]:
# Number of cells per cell_type2 (rows) and Patient (columns).
x = pd.crosstab(
    index=global_all.obs['cell_type2'],
    columns=global_all.obs['Patient'],
)

In [53]:
# Write the absolute cell counts per cell type.
abundance_csv = OUTDIR + "/ABSOLUTE_CELLTYPE_NUMBER.csv"
x.to_csv(abundance_csv)

In [65]:
# Keep a copy of the object as it was before normalisation; the next call modifies `adata` in place.
adata2 = adata.copy()
# Scale each cell to a total of 10,000.
sc.pp.normalize_total(adata, target_sum=1e4)
# Log transform intentionally left disabled: the values stay on the linear scale.
#sc.pp.log1p(adata)
# Store the normalised matrix in `.raw`, which is where correlation_analysis reads from.
adata.raw = adata.copy()

In [66]:
# Alias (same object, not a copy): `global_all` now refers to the normalised `adata`.
global_all = adata

### Mean expression for plotting

In [67]:
# Grouping key per cell: "<Location>__<Patient>__<cell_type2>".
# NOTE(review): this key uses `Location`, whereas the pseudobulk tables written under
# OUTDIR='celltype_CABG_vs_control' were keyed by `Group` - confirm this is intended.
key_parts = zip(
    global_all.obs['Location'],
    global_all.obs['Patient'],
    global_all.obs['cell_type2'],
)
global_all.obs['Group_Patient_celltype'] = ["__".join(map(str, parts)) for parts in key_parts]

In [68]:
# New AnnData built from the `.raw` matrix and variable table of `global_all`
# (selected through a view over all variables), carrying the same obs table.
all_vars_view = global_all[:, global_all.var.index]
subset_ = sc.AnnData(
    all_vars_view.raw.X,
    obs=global_all.obs,
    var=all_vars_view.raw.var,
)

In [69]:
# correlation_analysis reads `.raw.X`, so expose the current matrix through `.raw`.
subset_.raw = subset_
# Mean per gene within each Group_Patient_celltype key: one column per key.
group_means = correlation_analysis(subset_, 'Group_Patient_celltype', 'mean')
x = pd.DataFrame(group_means)
x.index = global_all.var.index

In [73]:
# Write the per-group mean expression table (rows labelled by the var index).
mean_exp_csv = OUTDIR + "/ALL_CELLTYPE_NORMALISED_MEAN_EXP.csv"
x.to_csv(mean_exp_csv)

### Mean expression for plotting (rows indexed by `gene_ids`)

In [33]:
# Grouping key per cell: "<Location>__<Patient>__<cell_type2>".
key_parts = zip(
    global_all.obs['Location'],
    global_all.obs['Patient'],
    global_all.obs['cell_type2'],
)
global_all.obs['Group_Patient_celltype'] = ["__".join(map(str, parts)) for parts in key_parts]

In [34]:
# New AnnData built from the `.raw` matrix and variable table of `global_all`
# (selected through a view over all variables), carrying the same obs table.
all_vars_view = global_all[:, global_all.var.index]
subset_ = sc.AnnData(
    all_vars_view.raw.X,
    obs=global_all.obs,
    var=all_vars_view.raw.var,
)

In [35]:
# Move the current var index into a regular column and index the variables by `gene_ids`.
# Assumes the var table has a `gene_ids` column - TODO confirm.
var_table = subset_.var.reset_index()
subset_.var = var_table.set_index('gene_ids')

In [39]:
# correlation_analysis reads `.raw.X`, so expose the current matrix through `.raw`.
subset_.raw = subset_
# Mean per gene within each Group_Patient_celltype key: one column per key.
group_means = correlation_analysis(subset_, 'Group_Patient_celltype', 'mean')
x = pd.DataFrame(group_means)
# Rows are labelled with the var index of `subset_` (not of `global_all`).
x.index = subset_.var.index

In [41]:
# Write the per-group mean expression table with rows labelled by the `subset_` var index.
mean_exp_geneid_csv = OUTDIR + "/ALL_CELLTYPE_NORMALISED_MEAN_EXP_geneID.csv"
x.to_csv(mean_exp_geneid_csv)