# Correlation between CNV event and mutation status of frequently mutated genes

In [1]:
import pandas as pd
import numpy as np
import os
import cptac
import cptac.utils as ut
import altair as alt
import scipy.stats
import statsmodels.stats.multitest

In [12]:
dss = {
    "brca": cptac.Brca,
#     "ccrcc": cptac.Ccrcc,
    "colon": cptac.Colon,
#     "endometrial": cptac.Endometrial,
#     "gbm": cptac.Gbm,
    "hnscc": cptac.Hnscc,
    "lscc": cptac.Lscc,
    "luad": cptac.Luad,
    "ovarian": cptac.Ovarian
}

In [36]:
def test_freq_mut_cnv_correlation(
    cancer_type,
    dss,
    mut_freq_cutoff=0.1
):
    # Load the dataset
    ds = dss[cancer_type]()
    
    # Run freq_mut, and get all genes with mutation frequency above cutoff
    freqs = ut.get_frequently_mutated(ds, cutoff=mut_freq_cutoff)
    freq_mut_genes = freqs["Gene"]
    
    # Get the mutations table, in a format that will be easy to join to
    mut = ds.join_metadata_to_mutations(
        metadata_df_name="clinical",
        mutations_genes=freq_mut_genes,
        metadata_cols=[], # This will format the table in the way we want
        mutations_filter=[],
        how="right",
        quiet=True,
        tissue_type="tumor"
    )
    
    # Get just the mutation status columns, and create a binarized version of them
    mut_status = mut.loc[:, mut.columns.isin([gene + "_Mutation_Status" for gene in freq_mut_genes])]
    mut_binary = pd.DataFrame(index=mut_status.index.copy())
    
    for col in mut_status:
        assert not pd.isnull(mut_status[col]).any()
        mut_binary = mut_binary.assign(**{col: np.where(mut_status[col] == "Wildtype_Tumor", False, True)})
        
    # Get the event table
    event = pd.\
    read_csv(f"{cancer_type}_has_event.tsv", sep="\t", index_col=0).\
    rename(columns={"gain_event": "8q_gain", "loss_event": "8p_loss"})
    
    pvals = []
    gene_col = []

    for cnv_col in event.columns:
        for mut_col in mut_binary.columns:

            # Create contingency table
            contingency_table = pd.crosstab(event[cnv_col], mut_binary[mut_col])

            # Run test
            chi2, p, dof, exp_freq = scipy.stats.chi2_contingency(contingency_table)
            
            gene_col.append(mut_col)

            # Check assumptions: No group has expected value < 1, and no more than
            # 20% of groups have expected frequency < 5.
            exp_freq = pd.DataFrame(exp_freq)

            if (exp_freq < 1).any().any():
                pvals.append("Not all expected frequencies were > 1.")
            elif (exp_freq < 5).sum().sum() > 0.2 * exp_freq.shape[0] * exp_freq.shape[1]:
                pvals.append("More than 20% of groups had expected frequency < 5.")
            else:
                pvals.append(p)
        
    pvals = pd.DataFrame({
        "cancer_type": cancer_type,
        "cnv_event": cnv_col,
        "gene": gene_col,
        "pval": pvals
    })
    
    return pvals

In [39]:
all_results = pd.DataFrame()

for cancer_type in dss.keys():
    cancer_res = test_freq_mut_cnv_correlation(cancer_type=cancer_type, dss=dss)
    all_results = all_results.append(cancer_res)

                                          



                                         



                                            

In [40]:
all_results

Unnamed: 0,cancer_type,cnv_event,gene,pval
0,brca,8p_loss,ABCA13_Mutation_Status,0.727568
1,brca,8p_loss,AHNAK_Mutation_Status,0.325759
2,brca,8p_loss,AHNAK2_Mutation_Status,More than 20% of groups had expected frequency...
3,brca,8p_loss,ANK2_Mutation_Status,More than 20% of groups had expected frequency...
4,brca,8p_loss,ARID1B_Mutation_Status,More than 20% of groups had expected frequency...
5,brca,8p_loss,ASH1L_Mutation_Status,More than 20% of groups had expected frequency...
6,brca,8p_loss,BOD1L1_Mutation_Status,More than 20% of groups had expected frequency...
7,brca,8p_loss,CDH1_Mutation_Status,More than 20% of groups had expected frequency...
8,brca,8p_loss,DNAH11_Mutation_Status,More than 20% of groups had expected frequency...
9,brca,8p_loss,DNAH7_Mutation_Status,More than 20% of groups had expected frequency...


## Multiple testing correction

In [None]:
reject, pvals_corrected, alphacSidak, alphacBonf = statsmodels.stats.multitest.multipletests(
    pvals=all_results["pval"], 
    alpha=0.05, 
    method="fdr_bh"
)

all_results = all_results.assign(adj_p=pvals_corrected)

In [None]:
all_results[all_results["adj_p"] <= 0.05]

In [None]:
all_results

In [None]:
alt.Chart(all_results).mark_bar().encode(
    x=alt.X(
        "adj_p",
        bin=alt.Bin(step=0.05)
    ),
    y=alt.Y(
        "count()"
    )
)