# Correlation between CNV events and mutation status of frequently mutated genes

In [1]:
import pandas as pd
import numpy as np
import os
import cptac
import cptac.utils as ut
import altair as alt
import scipy.stats
import statsmodels.stats.multitest

In [2]:
# Map each cancer-type key to the cptac dataset loader of the same name
# (e.g. "brca" -> cptac.Brca). The loaders are looked up here but not called.
# Left out of this mapping: ccrcc, endometrial, gbm.
dss = {
    name: getattr(cptac, name.capitalize())
    for name in ("brca", "colon", "hnscc", "lscc", "luad", "ovarian")
}

In [3]:
def _chi2_pval_if_valid(contingency_table):
    """Return the chi-squared p-value for a contingency table, or NaN if unusable.

    NaN is returned when the test is not meaningful for the table:
      * the test has zero degrees of freedom (one of the two variables has a
        single level), in which case scipy reports p = 1.0 by convention
        rather than as the outcome of a real test;
      * any expected cell frequency is below 1;
      * more than 20% of the cells have an expected frequency below 5.
    """
    _, p, dof, exp_freq = scipy.stats.chi2_contingency(contingency_table)

    # A degenerate table carries no information about association. Keeping its
    # conventional p = 1.0 would only add a spurious test to any downstream
    # multiple-testing correction.
    if dof == 0:
        return np.nan

    exp_freq = np.asarray(exp_freq)

    if (exp_freq < 1).any():
        return np.nan
    if (exp_freq < 5).sum() > 0.2 * exp_freq.size:
        return np.nan

    return p


def test_freq_mut_cnv_correlation(
    cancer_type,
    dss,
    mut_freq_cutoff=0.1
):
    """Chi-squared test of CNV event status against mutation status per gene.

    For one cancer type, every CNV event column in
    ``{cancer_type}_has_event.tsv`` is tested against the binarized mutation
    status of every frequently mutated gene.

    Parameters
    ----------
    cancer_type
        Key into ``dss``; also the prefix of the event file
        ``{cancer_type}_has_event.tsv``, which is read relative to the current
        working directory.
    dss
        Mapping from cancer type to a zero-argument callable that returns the
        dataset object for that cancer type.
    mut_freq_cutoff
        Passed as ``cutoff`` to ``ut.get_frequently_mutated`` to select the
        genes that are tested.

    Returns
    -------
    pandas.DataFrame
        One row per (CNV event, gene) pair with columns ``cancer_type``,
        ``cnv_event``, ``gene`` (the ``<gene>_Mutation_Status`` column name)
        and ``pval``. ``pval`` is NaN where the chi-squared test is not
        applicable (see ``_chi2_pval_if_valid``).

    Raises
    ------
    AssertionError
        If any selected mutation status value is null.
    """
    # Load the dataset
    ds = dss[cancer_type]()

    # Genes whose mutation frequency is above the cutoff
    freqs = ut.get_frequently_mutated(ds, cutoff=mut_freq_cutoff)
    freq_mut_genes = freqs["Gene"]

    # Mutations table for those genes, tumor samples only. The empty
    # metadata_cols list is deliberate: per the original author it yields the
    # table layout that is convenient for the steps below.
    mut = ds.join_metadata_to_mutations(
        metadata_df_name="clinical",
        mutations_genes=freq_mut_genes,
        metadata_cols=[],
        mutations_filter=[],
        how="right",
        quiet=True,
        tissue_type="tumor"
    )

    # Keep only the per-gene mutation status columns
    status_cols = [f"{gene}_Mutation_Status" for gene in freq_mut_genes]
    mut_status = mut.loc[:, mut.columns.isin(status_cols)]
    assert not mut_status.isnull().any().any(), "null mutation status found"

    # Binarize: anything other than "Wildtype_Tumor" counts as mutated
    mut_binary = mut_status.ne("Wildtype_Tumor")

    # Per-sample CNV event flags, with the columns renamed to the event names
    event = pd.read_csv(
        f"{cancer_type}_has_event.tsv", sep="\t", index_col=0
    ).rename(columns={"gain_event": "8q_gain", "loss_event": "8p_loss"})

    pvals = []
    gene_col_labels = []
    cnv_col_labels = []

    for cnv_col in event.columns:
        for mut_col in mut_binary.columns:
            # Cross-tabulate event status against mutation status and test it
            contingency_table = pd.crosstab(event[cnv_col], mut_binary[mut_col])

            gene_col_labels.append(mut_col)
            cnv_col_labels.append(cnv_col)
            pvals.append(_chi2_pval_if_valid(contingency_table))

    return pd.DataFrame({
        "cancer_type": cancer_type,
        "cnv_event": cnv_col_labels,
        "gene": gene_col_labels,
        "pval": pvals
    })

In [4]:
# Run the test for every configured cancer type and stack the per-cancer
# tables into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0, so the frames are collected in a
# list and concatenated once (this also avoids re-copying the accumulated
# frame on every iteration).
per_cancer_results = []

for cancer_type in dss:
    cancer_res = test_freq_mut_cnv_correlation(cancer_type=cancer_type, dss=dss)
    per_cancer_results.append(cancer_res)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
if per_cancer_results:
    all_results = pd.concat(per_cancer_results, ignore_index=True)
else:
    all_results = pd.DataFrame()

                                          



                                         



                                            

In [5]:
# Keep only the rows that have a p-value. reset_index() renumbers the rows and
# keeps the previous row labels in an "index" column.
all_results_filtered = all_results.dropna(subset=["pval"]).reset_index()

## Multiple testing correction

In [6]:
# Adjust the retained p-values with the "fdr_bh" method at alpha = 0.05 and
# store the adjusted values next to the raw ones in an "adj_p" column.
reject, pvals_corrected, alphacSidak, alphacBonf = (
    statsmodels.stats.multitest.multipletests(
        all_results_filtered["pval"],
        alpha=0.05,
        method="fdr_bh",
    )
)

all_results_filtered["adj_p"] = pvals_corrected

In [7]:
# Show the tests whose adjusted p-value is at or below 0.05
all_results_filtered.loc[all_results_filtered["adj_p"].le(0.05)]

Unnamed: 0,index,cancer_type,cnv_event,gene,pval,adj_p
827,1111,luad,8q_gain,RELN_Mutation_Status,3e-05,0.027455


In [8]:
# Histogram of the adjusted p-values: bars count the tests in each
# 0.05-wide bin of adj_p.
alt.Chart(all_results_filtered).mark_bar().encode(
    x=alt.X("adj_p", bin=alt.Bin(step=0.05)),
    y=alt.Y("count()"),
)