# differential-expression-analysis-dream
8.12.24

Running on standard DE analysis on DreamAI imputed quants, for each CPTAC cohort separately. 
We just need to make sure that we're comparing the same proteins when we compare Lupine, DreamAI and no imputation/naive imputation. 

In [1]:
import pandas as pd
import numpy as np 
from tqdm import tqdm
from Bio import SeqIO
from scipy import stats
from adjustText import adjust_text

import matplotlib.pyplot as plt
import seaborn as sns

# plotting templates
sns.set(context="talk", style="ticks") 
pal = sns.color_palette("tab10")

#### Configs

In [2]:
# The unimputed joint quants matrix
joint_fname="/net/noble/vol2/home/lincolnh/data/quant-data/UMich-normalized/joint-quants-normalized-shifted.csv"
min_pres=18

# The Ensemble (GENCODEv44) fasta
ensembl_path="/net/noble/vol2/home/lincolnh/code/2023_harris_deep_impute/results/2023-11-13_UMich_dataset/fastas/"
ensembl_df="gencode.v44.pc_translations.fa"

# The HGNC database file
hgnc_database_path="/net/noble/vol2/home/lincolnh/data/quant-data/HGNC_database.txt"

# The metadata dictionary, previously created
meta_path="/net/noble/vol2/home/lincolnh/code/2023_harris_deep_impute/results/2024-05-10_metadata_mapping/meta-dict.csv"

cohort_ids=["BRCA", "CCRCC", "COAD", "GBM", "HGSC", 
            "HNSCC", "LSCC", "LUAD", "PDAC", "UCEC"]

# Set the thresholds
adjusted_alpha=1e-2
fc_thresh=0.5
pres_frac_thresh=0.5 # Default here is 50%

curr_cohort = "CCRCC"

rng = np.random.default_rng(seed=18)

#### Functions

In [3]:
def bh_adjustment(pvals):
    """
    Performs the Benjamini-Hochberg procedure
    for p-value ADJUSTMENT. So this means we actually 
    return a list of corrected p-values, not just a 
    boolean specifying which p-values to keep per the
    FDR controlled at some threshold. 
    
    Parameters
    ----------
    pvals : np.ndarray, 
        The sorted list of uncorrected p-values. Sorted
        from smallest to largest

    Returns
    -----------
    pvals_adjusted : list, 
        A list of the BH adjusted p-values
    """
    pvals_adjusted = []

    for i in range(0, len(pvals)):
        rank = i + 1
        curr_pval = pvals[i]
        pval_adj = (curr_pval * len(pvals)) / rank
        pvals_adjusted.append(pval_adj)        

    return pvals_adjusted

#### Subset metadata to a single cohort, get the tumor and nontumor IDs

In [4]:
# Might need `index_col=0` here
meta_dict = pd.read_csv(meta_path)
meta_dict = meta_dict[meta_dict["cohort"] == curr_cohort]
meta_dict = meta_dict.reset_index(drop=True)

tumor_samples_meta = meta_dict[(meta_dict["sample_type"] == "Primary Tumor") | (meta_dict["sample_type"] == "Tumor")]
nontumor_samples_meta = meta_dict[(meta_dict["sample_type"] != "Primary Tumor") & (meta_dict["sample_type"] != "Tumor")]

tumor_IDs = list(tumor_samples_meta["aliquot_ID"])
nontumor_IDs = list(nontumor_samples_meta["aliquot_ID"])

print(len(tumor_IDs))
print(len(nontumor_IDs))

110
84


#### Pre-process the unimputed joint quants matrix

In [5]:
# Read in the joint quants matrix
joint_mat = pd.read_csv(joint_fname, index_col=0)

# Remove some of these extraneous runs
keywords = ["RefInt", "QC", "pool", "Tumor", "Pooled", 
            "Pool", "Reference", "NCI", "NX", "Ref"]
to_drop = []

for sample_id in list(joint_mat.columns):
    exclude=False
    for kw in keywords:
        if kw in sample_id:
            exclude=True
            break
    to_drop.append(exclude)

keep_cols = np.array(joint_mat.columns)[~np.array(to_drop)]
joint_mat = joint_mat[keep_cols]

joint = np.array(joint_mat)

# Remove proteins with too many missing values
num_present = np.sum(~np.isnan(joint), axis=1)
discard = num_present < min_pres
joint = np.delete(joint, discard, axis=0)
keep_prots = np.array(joint_mat.index)[~discard]

print(f"joint quants mat shape, post-filter: {joint.shape}")

joint_start = pd.DataFrame(joint, columns=keep_cols, index=keep_prots)

joint quants mat shape, post-filter: (18162, 1755)


#### Read in a DreamAI imputed quants matrix and attach row/column IDs

In [6]:
dream_imputed_cohort = pd.read_csv("imputed/" + curr_cohort + "_dreamAI_recon.csv", index_col=0)
unimputed_cohort = pd.read_csv("unimputed/" + curr_cohort + "_unimputed_annotated.csv", index_col=0)

dream_imputed_cohort.index = unimputed_cohort.index
dream_imputed_cohort.columns = unimputed_cohort.columns

print(dream_imputed_cohort.shape)

(11821, 194)


#### Remove proteins with an initial missingness fraction >50% from the current cohort matrix

In [7]:
cohort_quants_start = joint_start[tumor_IDs + nontumor_IDs]

num_present = np.sum(~np.isnan(cohort_quants_start), axis=1)
pres_fracs = num_present / cohort_quants_start.shape[1]

cohort_quants_start = cohort_quants_start[pres_fracs >= pres_frac_thresh]
print(cohort_quants_start.shape)

keep_prots_cohort = list(cohort_quants_start.index)

(9324, 194)


#### Subset the DreamAI imputed quants by the initial missingnesss fraction

In [8]:
dream_imputed_cohort = dream_imputed_cohort.loc[keep_prots_cohort]
print(dream_imputed_cohort.shape)

(9324, 194)


#### Exponentiate to get the original, untransformed intensities
These quants have previously been log2 transformed, so here we're inversing that transformation 

In [9]:
joint_start = np.power(2, joint_start)
dream_imputed_cohort = np.power(2, dream_imputed_cohort)

#### Get quants matrices for tumor and non-tumor samples

In [10]:
tumor_quants = dream_imputed_cohort[tumor_IDs]
nontumor_quants = dream_imputed_cohort[nontumor_IDs]

tumor_mat = np.array(tumor_quants)
nontumor_mat = np.array(nontumor_quants)

print(tumor_mat.shape)
print(nontumor_mat.shape)

(9324, 110)
(9324, 84)


#### Calculate the Wilcoxon rank sum statistics

In [11]:
pvals = []
rs_stats = []

for i in range(0, tumor_mat.shape[0]):
    stat, pval = stats.ranksums(tumor_mat[i], nontumor_mat[i], nan_policy="omit")
    pvals.append(pval)
    rs_stats.append(stat)

# Init a dataframe to hold the p-values and adjusted p-values
stats_df = pd.DataFrame(columns = ["ENSP", "pval", "adj_pval", "orig_idx"])
stats_df["ENSP"] = list(tumor_quants.index)
stats_df["pval"] = pvals

#### Do the Benjamini Hochberg correction 

In [12]:
# Sort by uncorrected p-values
stats_df = stats_df.sort_values(by="pval")
# Do the BH adjustment
pvals_corrected = bh_adjustment(np.array(stats_df["pval"]))
stats_df["adj_pval"] = pvals_corrected
stats_df["orig_idx"] = list(stats_df.index)

# Return to the initial order
stats_df = stats_df.sort_values(by="orig_idx")

#### Get the log2 fold changes

In [13]:
tumor_expr_means = np.nanmean(tumor_mat, axis=1)
nontumor_expr_means = np.nanmean(nontumor_mat, axis=1)

log_fold_changes = np.log2(tumor_expr_means / nontumor_expr_means)

fdr = -np.log10(np.array(stats_df["adj_pval"]))

#### Create an aggregated dataframe for analysis

In [14]:
aggregated = pd.DataFrame(columns=["ENSP", "HGNC", "p-value", "FC"])
aggregated["ENSP"] = list(tumor_quants.index)
aggregated["p-value"] = np.array(stats_df["adj_pval"])
aggregated["FC"] = log_fold_changes
aggregated["FDR"] = fdr
#aggregated.head(5)

#### Create a dictionary mapping ENSPs to HGNCs

In [15]:
# Read in the HGNC database file
hgnc_db = pd.read_csv(hgnc_database_path, sep="\t")

# Read in the ENSEMBL fasta
ensembl_fasta = ensembl_path + ensembl_df
fasta_seqs = SeqIO.parse(open(ensembl_fasta), "fasta")

# Init both dictionaries
gene_x_prot = {}
prot_x_gene = {}

# Fill in the dictionary 
for fasta in fasta_seqs:
    name, descript, sequence = \
        fasta.id, fasta.description, str(fasta.seq)
    # Get the ENSP and ENSG IDs
    ensp_id = name.split("|")[0]
    ensg_id = name.split("|")[2]
    # Strip the ".x" characters. Hope this is ok.
    ensp_id = ensp_id.split(".")[0]
    ensg_id = ensg_id.split(".")[0]
    
    # Update the first dictionary
    prot_x_gene[ensp_id] = ensg_id
    
    # Update the second
    if ensg_id in gene_x_prot:
        gene_x_prot[ensg_id].append(ensp_id)
    else:
        gene_x_prot[ensg_id] = [ensp_id]

  hgnc_db = pd.read_csv(hgnc_database_path, sep="\t")


#### Append the HGNC IDs to the aggregated df

In [16]:
for idx in range(0, aggregated.shape[0]):
    curr = aggregated.iloc[idx]
    curr_ensp = curr["ENSP"]
    try:
        curr_ensg = prot_x_gene[curr_ensp]
    except KeyError:
        curr_ensg = None

    # Add the ENSG ID
    aggregated.loc[idx, "ENSG"] = curr_ensg

    # Add in the HGNC gene ID as well 
    if curr_ensg is not None:
        try:
            hgnc_row = hgnc_db[hgnc_db["ensembl_gene_id"] == curr_ensg]
            hgnc_id = hgnc_row["symbol"].item()

            aggregated.loc[idx, "HGNC"] = hgnc_id
        except ValueError:
            pass

#### Define up- and down-regulated genes/proteins
According to our adjusted p-value threshold and log FC threshold. 

In [17]:
up_df = aggregated[(aggregated["p-value"] < adjusted_alpha) & (aggregated["FC"] >= fc_thresh)]
down_df = aggregated[(aggregated["p-value"] < adjusted_alpha) & (aggregated["FC"] <= -fc_thresh)]

print(f"num up-reg: {up_df.shape[0]}")
print(f"num down-reg: {down_df.shape[0]}")

num up-reg: 96
num down-reg: 425


---
## Fucking around
For CCRCC. 

#### Get lists of up-regulated proteins after Dream and Lupine imputation. 

In [18]:
up_reg_dream = list(up_df["HGNC"])

up_reg_lupine = ['PARVG', 'LOX', 'POSTN', 'ABCG1', 'NUPR1', 'POSTN', 'APBB1IP', 
                 'ADAMTS7', 'GPX8', 'LOXL2', 'TNFAIP6', 'CDH4', 'ARHGAP22', 'GBP2', 
                 'ANGPTL4', 'SLC2A1', 'NNMT', 'EPX', 'INTU', 'HK2', 'GYPA', 'SEMA5B', 
                 'HAVCR1', 'PLOD2', 'FCER1G', 'SLC2A3', 'LAIR1', 'IKBIP', 'PFKP', np.nan, 
                 'GBP5', 'SCARB1', 'FCGR3A', 'CTHRC1', 'IDO1', 'SCGN', 'TMSB10', 
                 'HLA-DQB2', 'POSTN', 'IL32', 'PNCK', 'HAPLN1', 'SLC4A1', 'SLC16A3', 
                 'FTH1', 'CAV1', 'ANGPT2', 'FHL1', 'P4HA2', 'P4HA1', 'TYMP', 'ALOX5', 
                 np.nan, 'ENO2', 'ENPP3', 'LPCAT1', 'EMILIN2', 'SPTA1', 'ITGB2', 'CYP2J2', 
                 'IKBIP', 'SLC43A3', 'BTN3A2', 'PLIN2', 'SDS', 'C4orf3', 'ALB', 'TYROBP', 
                 np.nan, 'ITGAX', 'SIRPA', 'ANXA4', 'TMEM243', 'CD70', 'PDK1', 'CYBB', 
                 'AHNAK2', 'ESM1', 'TGM2', 'NT5DC3', 'FABP6', 'CA9', 'COL23A1', 'MFSD13A', 
                 'THBS2', 'FABP7', 'SCD', 'NCK1', 'NDUFA4L2', 'HBE1', 'RAB42', 'SPATA18', 
                 'P2RX7', 'H1-5', 'CCND1', 'PYGL', 'POSTN', np.nan, 'SPATA18', 'PNMA2', 
                 'FTL', 'BIRC3']

#### Get rid of the NaNs
This is not the most elegant solution. 

In [19]:
up_reg_lupine = set(up_reg_lupine)
up_reg_dream = set(up_reg_dream)

up_reg_lupine = list(up_reg_lupine)[1:-1]
up_reg_dream = list(up_reg_dream)[1:-1]

print(len(up_reg_lupine))
print(len(up_reg_dream))

92
86


#### Get the intersection and Lupine exclusive sets

In [20]:
inter = np.intersect1d(up_reg_lupine, up_reg_dream)
print(len(inter))

lupine_exclusive = list(set(up_reg_lupine) - set(up_reg_dream))

lupine_exclusive

85


['INTU', 'HLA-DQB2', 'IL32', 'BTN3A2', 'CDH4', 'ARHGAP22', 'BIRC3']

#### What if we could attach the imputed/unimputed quants? 
So we could ask whether any of these proteins are low-abundance. 

In [21]:
tumor_quant_means = np.mean(tumor_quants, axis=1)
nontumor_quant_means = np.mean(nontumor_quants, axis=1)

aggregated["tumor_means"] = list(tumor_quant_means)
aggregated["nontumor_means"] = list(nontumor_quant_means)

agg_lupine_sub = aggregated[aggregated["HGNC"].isin(lupine_exclusive)]

#### Get some vital stats

In [22]:
quants_mean = np.mean(aggregated["tumor_means"])
quants_min = np.min(aggregated["tumor_means"])
quants_max = np.max(aggregated["tumor_means"])

print(quants_mean)
print(quants_min)
print(quants_max)

87.84673915766685
10.218958895941745
603.6274307679147


#### I wonder if we could make the case that these six DE proteins are "low-abundance"?

In [23]:
agg_lupine_sub

Unnamed: 0,ENSP,HGNC,p-value,FC,FDR,ENSG,tumor_means,nontumor_means
638,ENSP00000443301,CDH4,1.895278e-21,0.472038,20.722327,ENSG00000179242,24.774652,17.861177
672,ENSP00000249601,ARHGAP22,3.801962e-26,0.491907,25.419992,ENSG00000128805,26.457371,18.813425
1614,ENSP00000334003,INTU,8.497643e-16,0.496679,15.070702,ENSG00000164066,30.602475,21.689082
3209,ENSP00000396330,HLA-DQB2,7.312075e-26,0.460004,25.135959,ENSG00000232629,29.414726,21.38404
3487,ENSP00000324742,IL32,1.086267e-11,0.488842,10.964064,ENSG00000008517,81.163298,57.836692
5922,ENSP00000348751,BTN3A2,5.5225550000000005e-27,0.493148,26.25786,ENSG00000186470,41.460355,29.456461
9166,ENSP00000263464,BIRC3,1.208733e-27,0.453889,26.91767,ENSG00000023445,19.229287,14.03877
