# Correlation between chromosome 8 CNV events (8q gain, 8p loss) and mutation status of frequently mutated genes

In [1]:
import pandas as pd
import numpy as np
import os
import cptac
import cptac.utils as ut
import altair as alt
import scipy.stats
import statsmodels.stats.multitest
import cnvutils

In [2]:
# Registry of the datasets to analyze: short cancer-type name -> cptac dataset
# class. The class is instantiated later, when a cancer type is processed.
# Entries that are commented out are excluded from the analysis.
dss = dict(
    brca=cptac.Brca,
    # ccrcc=cptac.Ccrcc,
    colon=cptac.Colon,
    # endometrial=cptac.Endometrial,
    # gbm=cptac.Gbm,
    hnscc=cptac.Hnscc,
    lscc=cptac.Lscc,
    luad=cptac.Luad,
    ovarian=cptac.Ovarian,
)

In [3]:
def _chi2_pval_or_nan(contingency_table):
    """Return the chi-squared p-value for a contingency table, or NaN if the test is not valid.

    NaN is returned when:
      * the table is empty or has only one row or one column (zero degrees of
        freedom, so there is nothing to test; scipy would report p = 1.0 or
        raise for an empty table),
      * any expected frequency is below 1, or
      * more than 20% of the cells have an expected frequency below 5.
    """
    observed = np.asarray(contingency_table)

    # Degenerate table: one of the two variables is constant across the shared
    # samples (or there are no shared samples at all).
    if observed.size == 0 or min(observed.shape) < 2:
        return np.nan

    _, p, _, exp_freq = scipy.stats.chi2_contingency(observed)

    # Check assumptions: No group has expected value < 1, and no more than
    # 20% of groups have expected frequency < 5.
    if (exp_freq < 1).any():
        return np.nan
    if (exp_freq < 5).sum() > 0.2 * exp_freq.size:
        return np.nan

    return p


def test_freq_mut_cnv_correlation(
    cancer_type,
    dss,
    mut_freq_cutoff=0.1
):
    """Test each frequently mutated gene for association with each CNV event.

    For every (CNV event, gene) pair a contingency table of event status versus
    binarized mutation status is built and a chi-squared test of independence
    is run on it.

    Parameters:
    cancer_type: Key into `dss`; also the prefix of the event file
        "<cancer_type>_has_event.tsv", which is read from the working directory.
    dss: Mapping from cancer type name to a callable that returns the loaded
        dataset object.
    mut_freq_cutoff: Cutoff passed to `ut.get_frequently_mutated` when selecting
        the genes to test.

    Returns:
    A DataFrame with one row per (CNV event, gene) pair and the columns
    "cancer_type", "cnv_event", "gene" (the "<gene>_Mutation_Status" column
    name) and "pval". "pval" is NaN when the chi-squared test is not valid for
    that pair (degenerate table or expected-frequency assumptions violated).
    """
    # Load the dataset
    ds = dss[cancer_type]()

    # Run freq_mut, and get all genes with mutation frequency above cutoff
    freqs = ut.get_frequently_mutated(ds, cutoff=mut_freq_cutoff)
    freq_mut_genes = freqs["Gene"]

    # Get the mutations table, in a format that will be easy to join to
    mut = ds.join_metadata_to_mutations(
        metadata_df_name="clinical",
        mutations_genes=freq_mut_genes,
        metadata_cols=[], # This will format the table in the way we want
        mutations_filter=[],
        how="right",
        quiet=True,
        tissue_type="tumor"
    )

    # Get just the mutation status columns, and create a binarized version of
    # them: True for anything other than "Wildtype_Tumor".
    status_cols = [gene + "_Mutation_Status" for gene in freq_mut_genes]
    mut_status = mut.loc[:, mut.columns.isin(status_cols)]
    assert not mut_status.isnull().any().any(), "Mutation status columns contain null values"
    mut_binary = mut_status != "Wildtype_Tumor"

    # Get the event table.
    # NOTE(review): presumably indexed by sample ID like the mutations table, so
    # that pd.crosstab can align the two on their index -- confirm.
    event = pd.read_csv(
        f"{cancer_type}_has_event.tsv", sep="\t", index_col=0
    ).rename(columns={"gain_event": "8q_gain", "loss_event": "8p_loss"})

    pvals = []
    gene_col_labels = []
    cnv_col_labels = []

    for cnv_col in event.columns:
        for mut_col in mut_binary.columns:
            contingency_table = pd.crosstab(event[cnv_col], mut_binary[mut_col])

            cnv_col_labels.append(cnv_col)
            gene_col_labels.append(mut_col)
            pvals.append(_chi2_pval_or_nan(contingency_table))

    return pd.DataFrame({
        "cancer_type": cancer_type,
        "cnv_event": cnv_col_labels,
        "gene": gene_col_labels,
        "pval": pvals
    })

In [4]:
# Run the test for every cancer type and stack the per-cancer tables into one.
# The frames are collected in a list and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and appending inside the
# loop copies the growing table on every iteration.
per_cancer_results = []

for cancer_type in dss:
    cancer_res = test_freq_mut_cnv_correlation(cancer_type=cancer_type, dss=dss)
    per_cancer_results.append(cancer_res)

# pd.concat raises on an empty list, so fall back to an empty frame if no
# cancer types were configured. ignore_index=True gives a fresh 0..n-1 index.
all_results = (
    pd.concat(per_cancer_results, ignore_index=True)
    if per_cancer_results
    else pd.DataFrame()
)

                                          



                                            

In [5]:
# Keep only the tests that produced a p-value (NaN marks tests whose
# assumptions were not met). reset_index() without drop=True keeps the
# original row labels in an "index" column.
all_results_filtered = all_results.dropna(subset=["pval"]).reset_index()

## Multiple testing correction

In [6]:
# Benjamini-Hochberg FDR correction over all retained p-values, pooled across
# cancer types and CNV events. Only the corrected p-values are used below.
(reject, pvals_corrected, alphacSidak, alphacBonf) = statsmodels.stats.multitest.multipletests(
    all_results_filtered["pval"],
    alpha=0.05,
    method="fdr_bh",
)

# Store the adjusted p-values next to the raw ones.
all_results_filtered = all_results_filtered.assign(**{"adj_p": pvals_corrected})

In [7]:
# Gene/event pairs that remain significant after correction (adjusted p <= 0.05).
all_results_filtered.loc[all_results_filtered["adj_p"].le(0.05)]

Unnamed: 0,index,cancer_type,cnv_event,gene,pval,adj_p
351,356,colon,8p_loss,ACVR2A_Mutation_Status,4.6e-05,0.020773
371,403,colon,8p_loss,CCDC168_Mutation_Status,4.4e-05,0.020773


In [8]:
# Histogram of the adjusted p-values, in bins of width 0.05.
alt.Chart(all_results_filtered).mark_bar().encode(
    alt.X("adj_p", bin=alt.Bin(step=0.05)),
    alt.Y("count()"),
)

In [9]:
# Display the full table of tested gene/event pairs, with both the raw p-value
# ("pval") and the Benjamini-Hochberg adjusted p-value ("adj_p").
all_results_filtered

Unnamed: 0,index,cancer_type,cnv_event,gene,pval,adj_p
0,0,brca,8q_gain,FRG1_Mutation_Status,0.857168,0.973623
1,1,brca,8q_gain,MT-ND5_Mutation_Status,0.941911,0.976201
2,2,brca,8q_gain,MUC16_Mutation_Status,0.563799,0.858987
3,3,brca,8q_gain,MUC4_Mutation_Status,0.468132,0.781659
4,4,brca,8q_gain,PIK3CA_Mutation_Status,0.069141,0.295921
5,5,brca,8q_gain,TP53_Mutation_Status,0.029993,0.203839
6,6,brca,8q_gain,TTN_Mutation_Status,0.914900,0.973623
7,7,brca,8p_loss,FRG1_Mutation_Status,0.366623,0.678516
8,8,brca,8p_loss,MT-ND5_Mutation_Status,0.873582,0.973623
9,9,brca,8p_loss,MUC16_Mutation_Status,0.372818,0.682997


In [57]:
def pval_plot(df, title, group_col, val_col, y=True, sig=0.05):
    """Build a point chart of -log10 p-values grouped by a column, with a cutoff line.

    Parameters:
    df: DataFrame holding the values to plot.
    title: Title given to the returned chart.
    group_col: Column used for both the x position and the color of the points.
    val_col: Column of p-values; plotted as -log10 of the value.
    y: If True, the y axis is titled "-log(p)" and the cutoff line gets a text
        label. If False, the y axis labels, ticks and title are hidden and the
        cutoff line is left unlabeled.
    sig: Significance threshold; a crimson horizontal rule is drawn at
        -log10(sig).

    Returns:
    A layered Altair chart: points plus cutoff rule, plus the rule's label
    when `y` is True.
    """
    neg_log_name = "neg_log_p"
    threshold = -np.log10(sig)
    plot_df = df.assign(**{neg_log_name: -np.log10(df[val_col])})

    # Either a titled axis, or a fully hidden one.
    if y:
        y_channel = alt.Y(neg_log_name, title="-log(p)")
    else:
        hidden_axis = alt.Axis(labels=False, ticks=False, title=None)
        y_channel = alt.Y(neg_log_name, axis=hidden_axis)

    points = alt.Chart(plot_df).mark_point().encode(
        x=group_col,
        y=y_channel,
        color=group_col,
    )

    # One-row table that carries the cutoff height and its label.
    cutoff_df = pd.DataFrame({
        "y": [threshold],
        "label": [f"-log({sig})"],
    })
    rule = alt.Chart(cutoff_df).mark_rule(color="crimson").encode(y="y")

    layered = points + rule

    if y:
        # The label is only drawn on the panel that also shows the y axis.
        rule_label = rule.mark_text(align="right", dx=-65).encode(text="label")
        layered = layered + rule_label

    return layered.properties(title=title)
    
# Side-by-side panels of adjusted p-values per cancer type: 8p loss on the left
# (with y axis and cutoff label), 8q gain on the right (y axis hidden). Both
# panels share one y scale.
alt.hconcat(
    pval_plot(
        all_results_filtered.loc[all_results_filtered["cnv_event"] == "8p_loss"],
        title="8p loss",
        group_col="cancer_type",
        val_col="adj_p",
    ),
    pval_plot(
        all_results_filtered.loc[all_results_filtered["cnv_event"] == "8q_gain"],
        title="8q gain",
        group_col="cancer_type",
        val_col="adj_p",
        y=False,
    ),
).resolve_scale(
    y="shared"
).properties(
    title=[
        "Chi-squared results for correlation of chr8 CNV events",
        "with frequently mutated genes (>10%)",
    ]
).configure_title(
    anchor="middle"
)