# differential-expression-analysis-naive
8.12.24

Our standard differential expression analysis workflow, ensuring a 1-to-1 comparison of Lupine, Dream and naive imputation. Here we're getting DE imputed proteins after naive imputation slash no impute. 

Need to make sure that we're comparing the same proteins across all three of these notebooks. 

In [17]:
import pandas as pd
import numpy as np 
from tqdm import tqdm
from Bio import SeqIO
from scipy import stats

#### Configs

In [41]:
# The unimputed joint quants matrix
joint_fname="/net/noble/vol2/home/lincolnh/data/quant-data/UMich-normalized/joint-quants-normalized-shifted.csv"
min_pres=18

# The Ensemble (GENCODEv44) fasta
ensembl_path="/net/noble/vol2/home/lincolnh/code/2023_harris_deep_impute/results/2023-11-13_UMich_dataset/fastas/"
ensembl_df="gencode.v44.pc_translations.fa"

# The HGNC database file
hgnc_database_path="/net/noble/vol2/home/lincolnh/data/quant-data/HGNC_database.txt"

# The metadata dictionary, previously created
meta_path="/net/noble/vol2/home/lincolnh/code/2023_harris_deep_impute/results/2024-05-10_metadata_mapping/meta-dict.csv"

cohort_ids=["BRCA", "CCRCC", "COAD", "GBM", "HGSC", 
            "HNSCC", "LSCC", "LUAD", "PDAC", "UCEC"]

# Set the thresholds
adjusted_alpha=1e-3
fc_thresh=0.5 
pres_frac_thresh=0.5 # Default here is 50%

curr_cohort = "CCRCC"

# The random generator, for Gaussian sample impute
rng = np.random.default_rng(18)

#### Functions

In [42]:
def random_draw_impute(vec):
    """
    Imputes for a single *column* at a time using the Gaussian
    random draw procedure. This closely resembles the Perseus
    procedure described here: 
    https://cox-labs.github.io/coxdocs/replacemissingfromgaussian.html
    
    Parameters
    ----------
    vec : np.array, 
        A 1D vector, that is, column from the matrix
        to be imputed. 

    Returns
    ----------
    vec_recon : np.array, 
        The imputed vector
    """
    width_param=0.3
    downshift_param=1.8
    #print(len(vec))
    
    # Get the means and std
    v_mean = np.nanmean(vec)
    v_std = np.nanstd(vec)
    
    # Get the locations of the MVs
    nans = np.isnan(vec)
    vec_recon = vec.copy()
    
    # How many total MVs? 
    n_mv = np.count_nonzero(nans)
    
    center = v_mean - (v_std * downshift_param)

    # Replace missing values with random draws
    vec_recon[nans] = rng.normal(
                        loc=center,
                        scale=v_std*width_param,
                        size=n_mv
    )
    # Make sure we don't have any negative values
    vec_recon = np.abs(vec_recon)
    
    return vec_recon

def bh_adjustment(pvals):
    """
    Performs the Benjamini-Hochberg procedure
    for p-value ADJUSTMENT. So this means we actually 
    return a list of corrected p-values, not just a 
    boolean specifying which p-values to keep per the
    FDR controlled at some threshold. 
    
    Parameters
    ----------
    pvals : np.ndarray, 
        The sorted list of uncorrected p-values. Sorted
        from smallest to largest

    Returns
    -----------
    pvals_adjusted : list, 
        A list of the BH adjusted p-values
    """
    pvals_adjusted = []

    for i in range(0, len(pvals)):
        rank = i + 1
        curr_pval = pvals[i]
        pval_adj = (curr_pval * len(pvals)) / rank
        pvals_adjusted.append(pval_adj)        

    return pvals_adjusted

#### Subset the metadata to a single cohort, get the tumor and nontumor sample IDs

In [43]:
# Might need `index_col=0` here
meta_dict = pd.read_csv(meta_path)
meta_dict = meta_dict[meta_dict["cohort"] == curr_cohort]
meta_dict = meta_dict.reset_index(drop=True)

tumor_samples_meta = meta_dict[(meta_dict["sample_type"] == "Primary Tumor") | (meta_dict["sample_type"] == "Tumor")]
nontumor_samples_meta = meta_dict[(meta_dict["sample_type"] != "Primary Tumor") & (meta_dict["sample_type"] != "Tumor")]

tumor_IDs = list(tumor_samples_meta["aliquot_ID"])
nontumor_IDs = list(nontumor_samples_meta["aliquot_ID"])

print(len(tumor_IDs))
print(len(nontumor_IDs))

110
84


#### Pre-process the unimputed joint quants matrix
This should get us to the exact same starting point as the Lupine ensemble impute procedure. 

In [44]:
# Read in the joint quants matrix
joint_mat = pd.read_csv(joint_fname, index_col=0)

# Remove some of these extraneous runs
keywords = ["RefInt", "QC", "pool", "Tumor", "Pooled", 
            "Pool", "Reference", "NCI", "NX", "Ref"]
to_drop = []

for sample_id in list(joint_mat.columns):
    exclude=False
    for kw in keywords:
        if kw in sample_id:
            exclude=True
            break
    to_drop.append(exclude)

keep_cols = np.array(joint_mat.columns)[~np.array(to_drop)]
joint_mat = joint_mat[keep_cols]

joint = np.array(joint_mat)

# Remove proteins with too many missing values
num_present = np.sum(~np.isnan(joint), axis=1)
discard = num_present < min_pres
joint = np.delete(joint, discard, axis=0)
keep_prots = np.array(joint_mat.index)[~discard]

print(f"joint quants mat shape, post-filter: {joint.shape}")

joint_start = pd.DataFrame(joint, columns=keep_cols, index=keep_prots)

joint quants mat shape, post-filter: (18162, 1755)


#### Exponentiate to get the original, untransformed intensities
These quants had previously been log2 transformed (by the CPTAC project). So we're inversing that transformation.

In [45]:
joint_start = np.power(2, joint_start)

#### Remove the proteins with >50% missingness
From the unimputed runs corresponding to the current cohort. 

In [46]:
cohort_quants_start = joint_start[tumor_IDs + nontumor_IDs]

num_present = np.sum(~np.isnan(cohort_quants_start), axis=1)
pres_fracs = num_present / cohort_quants_start.shape[1]

cohort_quants = cohort_quants_start[pres_fracs >= pres_frac_thresh]
print(cohort_quants.shape)

keep_prots_cohort = list(cohort_quants.index)

(9324, 194)


#### Get quants matrices for tumor and non-tumor samples

In [47]:
tumor_quants = cohort_quants[tumor_IDs]
nontumor_quants = cohort_quants[nontumor_IDs]

tumor_mat = np.array(tumor_quants)
nontumor_mat = np.array(nontumor_quants)

print(tumor_mat.shape)
print(nontumor_mat.shape)

(9324, 110)
(9324, 84)


#### The optional naive impute step 

In [48]:
tumor_mat = np.apply_along_axis(random_draw_impute, 0, tumor_mat)
nontumor_mat = np.apply_along_axis(random_draw_impute, 0, nontumor_mat)

#### Calculate the Wilcoxon-rank sum statistics

In [49]:
pvals = []
rs_stats = []

for i in range(0, tumor_mat.shape[0]):
    stat, pval = stats.ranksums(tumor_mat[i], nontumor_mat[i], nan_policy="omit")
    pvals.append(pval)
    rs_stats.append(stat)

# Init a dataframe to hold the p-values and adjusted p-values
stats_df = pd.DataFrame(columns = ["ENSP", "pval", "adj_pval", "orig_idx"])
stats_df["ENSP"] = list(tumor_quants.index)
stats_df["pval"] = pvals

#### Do the Benjamini-Hochberg correction

In [50]:
# Sort by uncorrected p-values
stats_df = stats_df.sort_values(by="pval")
# Do the BH adjustment
pvals_corrected = bh_adjustment(np.array(stats_df["pval"]))
stats_df["adj_pval"] = pvals_corrected
stats_df["orig_idx"] = list(stats_df.index)

# Return to the initial order
stats_df = stats_df.sort_values(by="orig_idx")

#### Get the log fold changes

In [51]:
tumor_expr_means = np.nanmean(tumor_mat, axis=1)
nontumor_expr_means = np.nanmean(nontumor_mat, axis=1)

log_fold_changes = np.log2(tumor_expr_means / nontumor_expr_means)

fdr = -np.log10(pvals_corrected)

#### Create an aggregated dataframe for analysis

In [52]:
aggregated = pd.DataFrame(columns=["ENSP", "HGNC", "p-value", "FC"])
aggregated["ENSP"] = list(tumor_quants.index)
aggregated["p-value"] = pvals_corrected
aggregated["FC"] = log_fold_changes
aggregated["FDR"] = fdr
#aggregated.head(5)

#### Create a dictionary mapping ENSPs to HGNCs

In [53]:
# Read in the HGNC database file
hgnc_db = pd.read_csv(hgnc_database_path, sep="\t")

# Read in the ENSEMBL fasta
ensembl_fasta = ensembl_path + ensembl_df
fasta_seqs = SeqIO.parse(open(ensembl_fasta), "fasta")

# Init both dictionaries
gene_x_prot = {}
prot_x_gene = {}

# Fill in the dictionary 
for fasta in fasta_seqs:
    name, descript, sequence = \
        fasta.id, fasta.description, str(fasta.seq)
    # Get the ENSP and ENSG IDs
    ensp_id = name.split("|")[0]
    ensg_id = name.split("|")[2]
    # Strip the ".x" characters. Hope this is ok.
    ensp_id = ensp_id.split(".")[0]
    ensg_id = ensg_id.split(".")[0]
    
    # Update the first dictionary
    prot_x_gene[ensp_id] = ensg_id
    
    # Update the second
    if ensg_id in gene_x_prot:
        gene_x_prot[ensg_id].append(ensp_id)
    else:
        gene_x_prot[ensg_id] = [ensp_id]

  hgnc_db = pd.read_csv(hgnc_database_path, sep="\t")


#### Append the HGNC IDs

In [54]:
for idx in range(0, aggregated.shape[0]):
    curr = aggregated.iloc[idx]
    curr_ensp = curr["ENSP"]
    try:
        curr_ensg = prot_x_gene[curr_ensp]
    except KeyError:
        curr_ensg = None

    # Add the ENSG ID
    aggregated.loc[idx, "ENSG"] = curr_ensg

    # Add in the HGNC gene ID as well 
    if curr_ensg is not None:
        try:
            hgnc_row = hgnc_db[hgnc_db["ensembl_gene_id"] == curr_ensg]
            hgnc_id = hgnc_row["symbol"].item()

            aggregated.loc[idx, "HGNC"] = hgnc_id
        except ValueError:
            pass

#### Define up- and down-regulated genes/proteins 
According to our adjusted p-value threshold and log FC threshold. 

In [55]:
up_df = aggregated[(aggregated["p-value"] < adjusted_alpha) & (aggregated["FC"] >= fc_thresh)]
down_df = aggregated[(aggregated["p-value"] < adjusted_alpha) & (aggregated["FC"] <= -fc_thresh)]

print(f"num up-reg: {up_df.shape[0]}")
print(f"num down-reg: {down_df.shape[0]}")

num up-reg: 56
num down-reg: 296
