In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from collections import Counter, defaultdict
import os
from scipy.stats import spearmanr

In [2]:
def compute_correlation(drug_sig, disease_sig):
    """Spearman-correlate every drug signature with one disease signature.

    Parameters
    ----------
    drug_sig : array-like
        Collection of drug signatures; each element is compared with
        ``disease_sig`` in turn.
    disease_sig : array-like
        Disease signature that every entry of ``drug_sig`` is tested against.

    Returns
    -------
    corrs : numpy.ndarray
        Float array holding one Spearman coefficient per entry of ``drug_sig``.
    pvals : numpy.ndarray
        Float array holding the matching p-values, in the same order.
    """
    drug_sig = np.array(drug_sig)
    disease_sig = np.array(disease_sig)
    n_drugs = len(drug_sig)
    corrs = np.zeros(n_drugs, dtype=float)
    pvals = np.zeros(n_drugs, dtype=float)
    for i, d1 in enumerate(drug_sig):
        # Run the test once per signature and reuse both of its outputs
        # (coefficient at position 0, p-value at position 1).
        res = spearmanr(d1, disease_sig)
        corrs[i] = res[0]
        pvals[i] = res[1]
    return corrs, pvals

# BING results

In [58]:
# Load the tab-separated drug status table and index the 'Group' value of
# every drug by its lower-cased 'Name'.
drug_status = pd.read_csv('DeepCE_BING/other files/drug_status_drugbank.txt', sep='\t')
status_dict = {
    name: group
    for name, group in zip(drug_status['Name'].str.lower(), drug_status['Group'])
}
len(status_dict)

14594

In [59]:
# Cell lines whose prediction files are loaded and for which per-cell
# correlation columns are reported in the summary.
cell_id_set = ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC']

In [60]:
# Drug identifiers for the predicted signatures. They are later concatenated
# with the per-cell hq drug names, so presumably entry i labels row i of each
# predicted_<cell>.npy matrix — TODO confirm the row order.
predicted_drugs = np.load('DeepCE_BING/other files/predicted_drugs.npy')
predicted_drugs.shape

(7948,)

In [61]:
# Source label for each predicted drug; kept parallel to `predicted_drugs`
# (both arrays are extended in the same order in the correlation loop).
drug_sources = np.load('DeepCE_BING/other files/predicted_drug_sources.npy')
drug_sources.shape

(7948,)

In [62]:
# Read the gene vector file and keep the first comma-separated field of
# every line, with surrounding whitespace removed.
with open('DeepCE_BING/other files/gene_vector.csv', 'r') as f:
    genes = [row.strip().split(',')[0].strip() for row in f]
len(genes)

10174

## 146 mediator candidates

In [106]:
# Load the mediator annotation sheet; the 'Gene' column is what the
# following cells use to select target genes.
targets=pd.read_excel('DeepCE_BING/other files/Mediator_Annotations.xlsx')
targets.head(2)

Unnamed: 0,Gene,NCBI Gene ID,Gene Name,Original Symbol,Relevance to Fibrosis,Relevance to IPF,Known/Novel Candidate,Therapeutic Investigation,Notes on Therapeutic Potential,Druggable
0,CCL13,6357.0,C-C motif chemokine ligand 13,CCL13,Medium,Medium,Novel,Yes,"Chemokine, involved in immune response",Yes
1,CCL19,6363.0,C-C motif chemokine ligand 19,CCL19,Medium,Medium,Novel,Yes,"Chemokine, involved in immune response",Yes


In [107]:
# Reduce the annotation table to the list of unique gene symbols.
# The isinstance guard keeps this cell re-runnable: once `targets` has been
# replaced by a list, indexing it with 'Gene' again would raise a TypeError.
if isinstance(targets, pd.DataFrame):
    targets = targets['Gene'].unique().tolist()
len(targets)

151

In [108]:
# Load the tab-separated WGCNA module table; the 'Gene' and 'logFC' columns
# are the ones read by the cells that build the patient signature.
mod_info = pd.read_csv('WGCNA_Results/WGCNA_Modules_info.txt', sep='\t')
mod_info.head(2)

Unnamed: 0,Gene,Module,Module_membership,Membership_pval,logFC,adj_pval,pval,sex,sex_pval,sex_adj,...,race_asian_adj,rs35705950_genotype_GG,rs35705950_genotype_GG_pval,rs35705950_genotype_GG_adj,rs35705950_genotype_GT,rs35705950_genotype_GT_pval,rs35705950_genotype_GT_adj,rs35705950_genotype_TT,rs35705950_genotype_TT_pval,rs35705950_genotype_TT_adj
0,TSPAN6,brown,0.92,2.3900000000000002e-84,0.735,3.960773e-12,2.609257e-13,-0.0354,0.613,0.8816,...,0.9909,-0.111,0.1134,0.4203,0.0958,0.1706,0.4895,0.0435,0.5349,0.9961
1,TNMD,turquoise,0.099,0.157,0.122,0.7311232,0.6558297,-0.074,0.2904,0.705,...,0.99,-0.168,0.01578,0.2148,0.191,0.005855,0.1307,-0.0438,0.5321,0.9961


In [109]:
# Keep only the module-table rows whose gene symbol is one of the targets.
mod_info_targets = mod_info.loc[[gene in targets for gene in mod_info['Gene']]]
mod_info_targets.shape

(143, 70)

In [110]:
# Align the patient signature with `genes`: for every target gene that is
# present in `genes`, record its position (`idx`) and the patient logFC
# (`patient_sig`) in matching order.
gene_to_col = {}
for col, gene in enumerate(genes):
    # setdefault keeps the first position of a repeated symbol, which is
    # what list.index() would have returned.
    gene_to_col.setdefault(gene, col)

idx, patient_sig = [], []
for gene, logfc in zip(mod_info_targets['Gene'], mod_info_targets['logFC']):
    col = gene_to_col.get(gene)
    if col is not None:
        idx.append(col)
        patient_sig.append(logfc)
print(f'Number of genes in this signature: {len(idx)}')

Number of genes in this signature: 62


In [111]:
# Compute, for every cell line, the Spearman correlation between each drug
# signature and the patient signature. Results are stored per cell line in
# four containers that share the same drug order.
output_drugs = defaultdict(list)
output_correlations = defaultdict(list)
output_pvals = defaultdict(list)
output_drug_sources = defaultdict(list)
for cell in cell_id_set:
    sig_file = f'data/predictions/predicted_{cell}.npy'
    hq_sig_file = f'data/predictions/hq_{cell}.npy'
    # NOTE: `hq_drugs` holds the file path here and is rebound to the loaded
    # array three lines below.
    hq_drugs = f'data/predictions/hq_{cell}_comps.npy'
    sigs = np.load(sig_file)
    hq_sigs = np.load(hq_sig_file)
    hq_drugs = np.load(hq_drugs)
    print(f'Cell: {cell}, Sigs: {sigs.shape}, HQ sigs: {hq_sigs.shape}, HQ drugs: {hq_drugs.shape}')
    # Predicted drugs come first, followed by the hq drugs; the source labels
    # and the signature rows are concatenated in that same order.
    new_drug_list = np.concatenate((predicted_drugs, hq_drugs))
    # Every hq entry is labelled with the source 'L1000'.
    new_drug_sources = np.concatenate((drug_sources, np.array(['L1000']*hq_drugs.shape[0])))
    drug_sig = np.concatenate((sigs, hq_sigs))
    # Restrict each signature to the gene columns collected in `idx`, so the
    # columns line up with `patient_sig`.
    drug_sig = drug_sig[:, idx]
    print(f'{new_drug_list.shape}, {drug_sig.shape}, {new_drug_sources.shape}')
    correlations, pvals = compute_correlation(drug_sig, patient_sig)
    output_drugs[cell] = new_drug_list
    # Correlations are stored rounded to 4 decimals; p-values are stored unrounded.
    output_correlations[cell] = np.round(correlations, 4)
    output_pvals[cell] = pvals
    output_drug_sources[cell] = new_drug_sources

Cell: A375, Sigs: (7948, 10174), HQ sigs: (659, 10174), HQ drugs: (659,)
(8607,), (8607, 62), (8607,)
Cell: HA1E, Sigs: (7948, 10174), HQ sigs: (459, 10174), HQ drugs: (459,)
(8407,), (8407, 62), (8407,)
Cell: HELA, Sigs: (7948, 10174), HQ sigs: (276, 10174), HQ drugs: (276,)
(8224,), (8224, 62), (8224,)
Cell: HT29, Sigs: (7948, 10174), HQ sigs: (458, 10174), HQ drugs: (458,)
(8406,), (8406, 62), (8406,)
Cell: MCF7, Sigs: (7948, 10174), HQ sigs: (359, 10174), HQ drugs: (359,)
(8307,), (8307, 62), (8307,)
Cell: PC3, Sigs: (7948, 10174), HQ sigs: (432, 10174), HQ drugs: (432,)
(8380,), (8380, 62), (8380,)
Cell: YAPC, Sigs: (7948, 10174), HQ sigs: (259, 10174), HQ drugs: (259,)
(8207,), (8207, 62), (8207,)


In [112]:
# Open the workbook that receives one sheet per cell line; it is closed
# once all sheets have been written.
writer = pd.ExcelWriter("Results/candidate_drugs_DeepCE.xlsx")

In [113]:
# Write one sheet per cell line with the drug, its source, the correlation
# and p-value (both rounded to 4 decimals) and the drug status.
# The try/finally guarantees the workbook is closed even when writing a
# sheet fails, so the file handle is not left open.
try:
    for cell in output_drugs:
        drugs = output_drugs[cell]
        sources = output_drug_sources[cell]
        corrs = np.round(output_correlations[cell], 4)
        pvals = np.round(output_pvals[cell], 4)
        # Build the frame column-wise so numeric columns keep a numeric dtype.
        df = pd.DataFrame({
            'Drug': drugs,
            'Source': sources,
            'Correlation': corrs,
            'P-value': pvals,
        })
        # Drugs without an entry in `status_dict` are labelled "NA".
        df['Drug_Status'] = df['Drug'].map(lambda drug: status_dict.get(drug.lower(), "NA"))
        df.to_excel(writer, sheet_name=cell, index=False)
finally:
    writer.close()

In [114]:
# Per-cell lookup tables keyed by drug name: correlation, p-value and source.
output_correlations2 = {
    cell: dict(zip(output_drugs[cell], output_correlations[cell]))
    for cell in output_correlations
}
output_pvals2 = {
    cell: dict(zip(output_drugs[cell], output_pvals[cell]))
    for cell in output_correlations
}
output_sources2 = {
    cell: dict(zip(output_drugs[cell], output_drug_sources[cell]))
    for cell in output_correlations
}

In [115]:
# Union of the drugs seen in any cell line, mapped to their source label;
# when a drug appears in several cell lines the last one visited wins.
all_candidate_drugs = {}
for cell, drugs in output_drugs.items():
    all_candidate_drugs.update(
        (drug, output_sources2[cell][drug]) for drug in drugs
    )
len(all_candidate_drugs)

8367

In [116]:
# One row per candidate drug: name, source, then a (correlation, p-value)
# pair for every cell line; pd.NA marks cell lines without that drug.
summary_info = []
for drug, source in all_candidate_drugs.items():
    drug_info = [drug, source]
    for cell in cell_id_set:
        drug_info.extend(
            (output_correlations2[cell][drug], output_pvals2[cell][drug])
            if drug in output_correlations2[cell]
            else (pd.NA, pd.NA)
        )
    summary_info.append(drug_info)

In [117]:
# Column names of the summary table: one correlation column per cell line,
# each immediately followed by its p-value column.
corr_cols = ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC']
pval_cols = [f'{name}_Pval' for name in corr_cols]
columns = [label for pair in zip(corr_cols, pval_cols) for label in pair]

In [118]:
# Turn the collected rows into a table and label its columns.
summary_info = pd.DataFrame(summary_info).set_axis(['Drug', 'Source'] + columns, axis=1)
summary_info.head(2)

Unnamed: 0,Drug,Source,A375,A375_Pval,HA1E,HA1E_Pval,HELA,HELA_Pval,HT29,HT29_Pval,MCF7,MCF7_Pval,PC3,PC3_Pval,YAPC,YAPC_Pval
0,gibberellic-acid,L1000,0.1136,0.379228,0.232,0.069602,-0.1371,0.287813,0.274,0.031155,0.2905,0.022002,0.1842,0.151842,0.0477,0.712781
1,valproic-acid,L1000,0.19,0.139048,0.3465,0.005802,0.0725,0.575353,0.267,0.035913,0.154,0.231934,0.2594,0.041792,0.2092,0.102687


In [119]:
# Supplementary File 6
# NOTE(review): the same path is written again at the end of the notebook,
# after missing values are filled and the count columns are added, so this
# first export gets overwritten — confirm which version is intended.
summary_info.to_excel('Results/summary_info_DeepCE.xlsx', index=False)

In [120]:
# Replace missing entries with 1.0 so the threshold checks in the counting
# cells never compare against a missing value: a correlation of 1.0 is not
# < 0 and a p-value of 1.0 is below neither cutoff.
# NOTE(review): this fills the correlation columns as well, and the filled
# frame is the one exported later — confirm the 1.0 placeholders are wanted
# in the final file.
summary_info = summary_info.fillna(1.0)

In [121]:
# For every drug, count the cell lines with a negative correlation whose
# p-value is below 0.05.
cnts = []
# The row label is unused; binding it to `_` (instead of `idx`) keeps the
# global `idx` list of gene columns intact for the correlation cell.
for _, row in summary_info.iterrows():
    cnt = 0
    for cell in cell_id_set:
        # Plain boolean `and`: both operands are scalar comparisons.
        if (row[cell] < 0) and (row[f'{cell}_Pval'] < 0.05):
            cnt += 1
    cnts.append(cnt)

In [122]:
# Per-drug number of cell lines with correlation < 0 and p-value < 0.05.
summary_info['Count'] = cnts

In [123]:
# For every drug, count the cell lines with a negative correlation whose
# p-value is below the stricter 0.01 cutoff.
cnts = []
# The row label is unused; binding it to `_` (instead of `idx`) keeps the
# global `idx` list of gene columns intact for the correlation cell.
for _, row in summary_info.iterrows():
    cnt = 0
    for cell in cell_id_set:
        # Plain boolean `and`: both operands are scalar comparisons.
        if (row[cell] < 0) and (row[f'{cell}_Pval'] < 0.01):
            cnt += 1
    cnts.append(cnt)

In [124]:
# Per-drug number of cell lines with correlation < 0 and p-value < 0.01.
summary_info['Count2'] = cnts

In [125]:
# Export the summary including the Count and Count2 columns.
# NOTE(review): this overwrites the file written earlier to the same path,
# and the frame now holds 1.0 wherever a value was missing — confirm that
# is the intended content of the final file.
summary_info.to_excel('Results/summary_info_DeepCE.xlsx', index=False)