In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

In [2]:
# Load the T-cell AnnData object that every later cell works on.
tcell_h5ad_path = './data/Tcell_PDL1.h5ad'
data = sc.read_h5ad(tcell_h5ad_path)

In [3]:
# Show the sorted, de-duplicated sub-cluster labels present in the dataset.
sub_cluster_labels = data.obs['Sub_Cluster']
np.unique(sub_cluster_labels)

array(['t_CD4-CXCL13', 't_CD4_Tcm-LMNA', 't_CD4_Tn-LEF1',
       't_CD4_Treg-FOXP3', 't_CD8-CXCL13', 't_CD8_MAIT-KLRB1',
       't_CD8_Teff-GNLY', 't_CD8_Tem-GZMK', 't_CD8_Trm-ZNF683',
       't_Tact-IFI6', 't_Tact-XIST', 't_Tprf-MKI67'], dtype=object)

In [4]:
# Read the per-cell cell-cycle phase table; the first CSV column (the cell
# name) becomes the index so it can later be joined onto `data.obs`.
phase = pd.read_csv(
    './data/cellcycle_phase.csv',
    index_col=0,
)
phase

Unnamed: 0,phase
AAACCTGAGAGTCTGG.Pre_P007_t,G1
AAACCTGCAAGTCATC.Pre_P007_t,G1
AAACCTGGTTGGGACA.Pre_P007_t,G1
AAACCTGTCAGCTCGG.Pre_P007_t,G1
AAACCTGTCCACGTGG.Pre_P007_t,G1
...,...
TTTGGTTTCAACACCA.Pre_P004_t,G1
TTTGTCATCATGTAGC.Pre_P004_t,G1
TTTGTCATCATTGCCC.Pre_P004_t,G1
TTTGTCATCCGAAGAG.Pre_P004_t,G1


In [5]:
# Attach each cell's phase to the observation table (left join on the index).
# Any column of `phase` already present in `data.obs` from an earlier run of
# this cell is dropped first: a plain `join` raises on overlapping column names,
# which made the cell impossible to re-run without reloading `data`.
data.obs = data.obs.drop(columns=phase.columns, errors='ignore').join(phase)
data

AnnData object with n_obs × n_vars = 46019 × 27085
    obs: 'CellName', 'ID', 'Sample', 'Patient', 'group', 'Tissue', 'Origin', 'Response', 'Timeline', 'Treatment', 'batch', 'n_genes', 'percent_mito', 'percent_hsp', 'percent_ig', 'percent_rp', 'n_counts', 'leiden', 'Louvain', 'myleiden', 'defcls', 'Global_Cluster', 'Sub_Cluster', 'Global_tSNE_1', 'Global_tSNE_2', 'Global_UMAP_1', 'Global_UMAP_2', 'Sub_tSNE_1', 'Sub_tSNE_2', 'Sub_UMAP_1', 'Sub_UMAP_2', 'phase'

In [6]:
# Load the exhaustion gene set. The file carries a .csv extension but is parsed
# with a tab separator, exactly as `pd.read_table` does by default.
zhang = pd.read_csv('./data/exhausted geneset.csv', sep='\t')
zhang

Unnamed: 0,Gene Symbol
0,HAVCR2
1,CXCL13
2,CCL3
3,SIRPG
4,IFNG
...,...
83,TNIP3
84,CD7
85,PSMD4
86,ATP6V1C2


In [7]:
# Collapse the gene-set table to a sorted array of distinct gene symbols.
# NOTE: `zhang` is rebound from a DataFrame to an array here, so this cell
# cannot be run twice without re-running the cell that loads the table.
gene_symbols = zhang['Gene Symbol']
zhang = np.unique(gene_symbols)

In [8]:
# Rescale every cell so that its total count equals one million; `data` is
# modified by this call.
sc.pp.normalize_total(data, target_sum=1_000_000.0)

In [9]:
# For every sub-cluster, build one table with a row per fine-grained cluster
# (cell count, per-phase counts and proportions, and the summed mean expression
# of the genes in `zhang`) and write it to
# './data/spearman_data/<sub-cluster>_spearman_data.csv'.
for sub_cluster in np.unique(data.obs['Sub_Cluster']):
    # Take a real copy instead of a view: `.obs` is reassigned just below, and
    # doing that on a view makes anndata copy implicitly and emit a warning.
    adata = data[data.obs['Sub_Cluster']==sub_cluster].copy()

    # Attach the fine-grained cluster id of each cell (left join on cell name).
    ann = pd.read_csv('./result/cluster/' + sub_cluster + '_cluster.csv',
                      usecols=['cell','cluster'],index_col=0,header=0)
    adata.obs = adata.obs.join(ann)

    # Dense cells x genes table restricted to the gene set, built once per
    # sub-cluster rather than once per cluster. (The original had a bare
    # subsetting expression here whose result was discarded.)
    signature = adata[:,adata.var_names.isin(zhang)].to_df()

    rows = []
    scores = []
    # Cells missing from the annotation file carry NaN after the left join;
    # dropping NaN keeps them from producing an empty, all-NaN table row.
    for cluster in np.unique(adata.obs['cluster'].dropna()):
        # Plain boolean array so it can index both `obs` and `signature`
        # positionally, independent of index alignment.
        in_cluster = (adata.obs['cluster']==cluster).to_numpy()
        phases = adata.obs.loc[in_cluster,'phase']

        # Counts are stored as floats so the CSV keeps its existing formatting.
        rows.append({
            'Cluster': cluster,
            'Count': float(in_cluster.sum()),
            'G1_Count': float((phases=='G1').sum()),
            'S_Count': float((phases=='S').sum()),
            'G2M_Count': float((phases=='G2M').sum()),
        })
        # Score = sum over gene-set genes of the mean expression in the cluster.
        scores.append(float(sum(signature.loc[in_cluster].mean())))

    # Explicit column list so the table is well-formed even with no clusters.
    df = pd.DataFrame(rows,columns=['Cluster','Count','G1_Count','S_Count','G2M_Count'])

    df['G1_Prop'] = df['G1_Count'] / df['Count']
    df['S_Prop'] = df['S_Count'] / df['Count']
    df['G2M_Prop'] = df['G2M_Count'] / df['Count']

    # Column name (including its spelling) is kept as-is: it is part of the
    # written CSV header that downstream code may read.
    df['exhuast_score'] = scores

    df.to_csv('./data/spearman_data/' + sub_cluster + '_spearman_data.csv',index=False,header=True)

In [9]:
# Select the cells of the 't_CD8-CXCL13' sub-cluster (a view onto `data`).
is_cd8_cxcl13 = data.obs['Sub_Cluster'] == 't_CD8-CXCL13'
adata = data[is_cd8_cxcl13]
adata

View of AnnData object with n_obs × n_vars = 5404 × 27085
    obs: 'CellName', 'ID', 'Sample', 'Patient', 'group', 'Tissue', 'Origin', 'Response', 'Timeline', 'Treatment', 'batch', 'n_genes', 'percent_mito', 'percent_hsp', 'percent_ig', 'percent_rp', 'n_counts', 'leiden', 'Louvain', 'myleiden', 'defcls', 'Global_Cluster', 'Sub_Cluster', 'Global_tSNE_1', 'Global_tSNE_2', 'Global_UMAP_1', 'Global_UMAP_2', 'Sub_tSNE_1', 'Sub_tSNE_2', 'Sub_UMAP_1', 'Sub_UMAP_2', 'phase'

In [10]:
# Read the per-cell cluster assignment for this sub-cluster; only the 'cell'
# and 'cluster' columns are kept, with 'cell' used as the index.
ann = pd.read_csv(
    './result/cluster/t_CD8-CXCL13_cluster.csv',
    header=0,
    index_col=0,
    usecols=['cell','cluster'],
)
ann

Unnamed: 0_level_0,cluster
cell,Unnamed: 1_level_1
AACTCAGAGCAGCGTA.Pre_P012_t,5
ACGGGCTTCTGCAGTA.Pre_P012_t,5
ACTTTCATCCGTCAAA.Pre_P012_t,5
AGTGTCAAGATGTCGG.Pre_P012_t,5
ATAGACCGTAAACACA.Pre_P012_t,5
...,...
CGATTGATCAGTCCCT.Post_P019_t,58
CTCAGAACAGCCAATT.Post_P019_t,58
GTAACGTAGAGCAATT.Post_P019_t,58
GTGAAGGAGCATCATC.Post_P019_t,58


In [11]:
# Attach the cluster assignment to each cell (left join on the cell name).
# Columns of `ann` left over from an earlier run of this cell are dropped first:
# a plain `join` raises on overlapping column names, so the cell could not be
# re-run without rebuilding `adata`.
adata.obs = adata.obs.drop(columns=ann.columns, errors='ignore').join(ann)

In [12]:
# Start the summary table with one row per distinct cluster id (sorted).
cluster_ids = np.unique(adata.obs['cluster'])
df = pd.DataFrame({'Cluster': cluster_ids})

In [13]:
# For each cluster, record how many cells it holds in total and how many of
# them are in each of the G1, S and G2M phases.
phase_to_column = {'G1': 'G1_Count', 'S': 'S_Count', 'G2M': 'G2M_Count'}
for i, cluster_id in zip(df.index, df['Cluster'].tolist()):
    a = adata[adata.obs['cluster'] == cluster_id]
    df.loc[i, 'Count'] = a.n_obs
    for phase_name, count_column in phase_to_column.items():
        df.loc[i, count_column] = int((a.obs['phase'] == phase_name).sum())



In [14]:
# Turn the per-phase counts into fractions of each cluster's total cell count.
cluster_sizes = df['Count']
df['G1_Prop'] = df['G1_Count'].div(cluster_sizes)
df['S_Prop'] = df['S_Count'].div(cluster_sizes)
df['G2M_Prop'] = df['G2M_Count'].div(cluster_sizes)

In [15]:
# Preview the sub-cluster restricted to the genes listed in `zhang`.
in_gene_set = adata.var_names.isin(zhang)
adata[:, in_gene_set]

View of AnnData object with n_obs × n_vars = 5404 × 82
    obs: 'CellName', 'ID', 'Sample', 'Patient', 'group', 'Tissue', 'Origin', 'Response', 'Timeline', 'Treatment', 'batch', 'n_genes', 'percent_mito', 'percent_hsp', 'percent_ig', 'percent_rp', 'n_counts', 'leiden', 'Louvain', 'myleiden', 'defcls', 'Global_Cluster', 'Sub_Cluster', 'Global_tSNE_1', 'Global_tSNE_2', 'Global_UMAP_1', 'Global_UMAP_2', 'Sub_tSNE_1', 'Sub_tSNE_2', 'Sub_UMAP_1', 'Sub_UMAP_2', 'phase', 'cluster'

In [17]:
# Keep the gene-set-restricted view of the sub-cluster for the next cell.
in_gene_set = adata.var_names.isin(zhang)
a = adata[:, in_gene_set]

In [19]:
# Score = mean expression of each gene across the cells, summed over genes.
per_gene_mean = a.to_df().mean()
sum(per_gene_mean)

22350.867310538888

In [21]:
# Same score computed straight from `adata`, as a check against the value
# obtained through the intermediate view `a`.
gene_set_expr = adata[:, adata.var_names.isin(zhang)].to_df()
sum(gene_set_expr.mean())

22350.867310538888

In [24]:
# Per cluster: average each gene-set gene over the cluster's cells, then sum
# those averages into a single score stored in the 'exhuast_score' column.
gene_set_mask = adata.var_names.isin(zhang)  # row subsetting leaves genes unchanged
for i, cluster_id in zip(df.index, df['Cluster'].tolist()):
    a = adata[adata.obs['cluster'] == cluster_id]
    per_gene_mean = a[:, gene_set_mask].to_df().mean()
    df.loc[i, 'exhuast_score'] = sum(per_gene_mean)

In [25]:
# Display the per-cluster summary table built above for the 't_CD8-CXCL13'
# sub-cluster: counts, per-phase counts/proportions and the gene-set score.
df

Unnamed: 0,Cluster,Count,G1_Count,S_Count,G2M_Count,G1_Prop,S_Prop,G2M_Prop,exhuast_score
0,5,193.0,178.0,12.0,3.0,0.92228,0.062176,0.015544,27997.462612
1,6,234.0,219.0,15.0,0.0,0.935897,0.064103,0.0,23846.53259
2,8,194.0,185.0,7.0,2.0,0.953608,0.036082,0.010309,21287.518646
3,9,210.0,199.0,9.0,2.0,0.947619,0.042857,0.009524,18316.858102
4,12,118.0,67.0,39.0,12.0,0.567797,0.330508,0.101695,28528.485188
5,13,108.0,53.0,30.0,25.0,0.490741,0.277778,0.231481,29113.12218
6,15,189.0,179.0,9.0,1.0,0.94709,0.047619,0.005291,23908.396244
7,16,181.0,179.0,2.0,0.0,0.98895,0.01105,0.0,25437.94457
8,20,233.0,208.0,20.0,5.0,0.892704,0.085837,0.021459,18913.351235
9,21,186.0,169.0,15.0,2.0,0.908602,0.080645,0.010753,19608.068613


In [59]:
# Save the summary table with a header row and without the row index.
df.to_csv(
    path_or_buf='./data/spearman_data.csv',
    header=True,
    index=False,
)