In [1]:
import pandas as pd
import json
import glob
import os
import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from statsmodels.sandbox.stats.multicomp import fdrcorrection0
import matplotlib.patches as mpatches
import umap
import matplotlib
import matplotlib.cm as cm

### Load metadata

In [2]:
# Sample metadata table (tab-separated file).
df_meta = pd.read_csv(
    "../../data/SuppTable1_sample_metadata - metadata.tsv",
    sep="\t",
)

### Read dNdScv output for the treated cohorts

In [39]:
def count_unique(grp):
    """Return the number of distinct values found when iterating over ``grp``."""
    distinct_values = frozenset(grp)
    return len(distinct_values)

In [40]:
# NOTE(review): absolute, machine-specific location of the dNdScv outputs;
# adjust it (or make it configurable) before running on another machine.
path_base="/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0025_PCAWG_HMF/drivers/processed/resistance/dndscv/"
list_info = []
# Sort the matches: glob returns paths in arbitrary order, which would
# otherwise make the row order of df_total differ between runs/machines.
result_files = sorted(glob.glob(os.path.join(path_base,"*","*.dndscv.results.tsv.gz")))
for file_output in tqdm.tqdm(result_files):
    # Tumor type is the parent directory name; treatment is the file-name prefix.
    cohort_dir = os.path.dirname(file_output)
    ttype = os.path.basename(cohort_dir)
    treatment = os.path.basename(file_output).split(".")[0]
    if treatment == "untreated":
        # Untreated cohorts are the controls and are loaded in a separate cell.
        continue

    # Keep only genes significant in this cohort (global q-value < 0.1).
    # The explicit copy makes the column assignment below operate on an
    # independent frame rather than on a slice of the raw table, which
    # would trigger pandas' SettingWithCopyWarning.
    df_drivers = pd.read_csv(file_output,sep="\t")
    df_drivers = df_drivers[df_drivers["qglobal_cv"]<0.1].copy()
    df_drivers["non_synonimous"] = df_drivers["n_mis"] + df_drivers["n_non"] +  df_drivers["n_spl"] + df_drivers["n_ind"]

    # Per-gene fraction of samples carrying a non-synonymous mutation.
    # NOTE(review): n_samples counts only sample IDs that appear in the
    # annotmuts file - confirm that this is the intended cohort size.
    df_samples = pd.read_csv(os.path.join(cohort_dir,f"{treatment}.dndscv.annotmuts.tsv.gz"),sep="\t",usecols=["sampleID","gene","impact"],engine="python")
    n_samples = len(df_samples["sampleID"].unique())
    # df_drivers is already restricted to q < 0.1, so no second filter is needed.
    in_driver_gene = df_samples["gene"].isin(df_drivers["gene_name"].unique())
    not_synonymous = df_samples["impact"]!="Synonymous"
    df_samples = df_samples[in_driver_gene & not_synonymous].rename(columns={"gene":"gene_name"})
    df_samples = df_samples.groupby("gene_name",as_index=False).agg(mutated_samples=("sampleID",count_unique))
    df_samples["freq"] = df_samples["mutated_samples"] / n_samples

    # Inner merge: drivers without any mutated sample are dropped here.
    df_drivers = df_drivers.merge(df_samples[["freq","gene_name","mutated_samples"]])
    df_drivers["drug_type"] = treatment
    df_drivers["ttype"] = ttype
    df_drivers["total_samples"] = n_samples
    list_info.append(df_drivers)

if not list_info:
    # pd.concat([]) fails with an opaque message; point at the likely cause.
    raise ValueError(f"No treated dNdScv result files found under {path_base!r}")
df_total = pd.concat(list_info)


100%|██████████| 120/120 [00:37<00:00,  3.21it/s]


In [41]:
# Distinct gene names present in df_total.
driver_genes = set(df_total["gene_name"].tolist())

### Control (untreated cohorts)

In [42]:
# NOTE(review): absolute, machine-specific location of the dNdScv outputs;
# adjust it (or make it configurable) before running on another machine.
path_base="/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0025_PCAWG_HMF/drivers/processed/resistance/dndscv/"
list_info = []
# Sorted so that the row order of df_control does not depend on the
# arbitrary order in which glob returns the files.
control_files = sorted(glob.glob(os.path.join(path_base,"*","untreated.dndscv.results.tsv.gz")))
for file_output in tqdm.tqdm(control_files):
    # Tumor type is the parent directory name.
    cohort_dir = os.path.dirname(file_output)
    ttype = os.path.basename(cohort_dir)
    treatment="untreated"

    # Keep every gene in driver_genes, whether or not it is significant
    # in the untreated cohort.
    df_drivers = pd.read_csv(file_output,sep="\t")
    df_drivers["non_synonimous"] = df_drivers["n_mis"] + df_drivers["n_non"] +  df_drivers["n_spl"] + df_drivers["n_ind"]
    df_drivers = df_drivers[df_drivers["gene_name"].isin(driver_genes)]

    # Per-gene fraction of untreated samples carrying a mutation.
    df_samples = pd.read_csv(os.path.join(cohort_dir,f"{treatment}.dndscv.annotmuts.tsv.gz"),sep="\t",usecols=["sampleID","gene","impact"],engine="python")
    n_samples = len(df_samples["sampleID"].unique())
    # Exclude synonymous mutations, exactly as is done for the treated
    # cohorts; otherwise the control counts are inflated relative to the
    # treated counts they are later compared against.
    in_driver_gene = df_samples["gene"].isin(driver_genes)
    not_synonymous = df_samples["impact"]!="Synonymous"
    df_samples = df_samples[in_driver_gene & not_synonymous].rename(columns={"gene":"gene_name"})
    df_samples = df_samples.groupby("gene_name",as_index=False).agg(mutated_samples=("sampleID",count_unique))
    df_samples["freq"] = df_samples["mutated_samples"] / n_samples

    # Inner merge: genes without any mutated control sample are dropped here.
    df_drivers = df_drivers.merge(df_samples[["freq","gene_name","mutated_samples"]])
    df_drivers["drug_type"] = treatment
    df_drivers["ttype"] = ttype
    df_drivers["total_samples"] = n_samples
    list_info.append(df_drivers)

if not list_info:
    # pd.concat([]) fails with an opaque message; point at the likely cause.
    raise ValueError(f"No untreated dNdScv result files found under {path_base!r}")
df_control = pd.concat(list_info)
# Rows of the control table that are themselves significant (q < 0.1).
control_genes = df_control[df_control["qglobal_cv"]<0.1]

100%|██████████| 28/28 [00:10<00:00,  2.68it/s]


### Perform ttype specific comparison

In [43]:
def concat(grp):
    """Join the string items of ``grp`` into one comma-separated string."""
    separator = ","
    return separator.join(tuple(grp))
# One row per gene, summarising frequency, mutation counts and the
# drug types in which the gene appears in df_total.
df_summary = (
    df_total
    .groupby(["gene_name"], as_index=False)
    .agg(
        frequency_max=("freq", np.nanmax),
        frequency_mean=("freq", np.nanmean),
        avg_muts=("non_synonimous", np.nanmean),
        max_muts=("non_synonimous", np.nanmax),
        max_mutated_samples=("mutated_samples", np.nanmax),
        mean_mutated_samples=("mutated_samples", np.nanmean),
        n_drugs=("drug_type", "count"),
        drugs=("drug_type", concat),
    )
)

### Make comparison

In [44]:
# Index both tables for the per-tumor-type comparison.
# set_index(..., inplace=True) consumes the key columns, so running this cell
# a second time used to raise KeyError; the guards make it safe to re-run.
if "ttype" in df_control.columns:
    df_control.set_index(["ttype"],inplace=True)
if "ttype" in df_total.columns:
    ttypes = list(df_total["ttype"].unique())
    df_total.set_index(["ttype","drug_type"],inplace=True)
else:
    # Already indexed by a previous run: recover the tumor types from the index.
    ttypes = list(df_total.index.get_level_values("ttype").unique())

In [45]:
# For every (tumor type, treatment, gene) compare the number of mutated
# samples in the treated cohort with the untreated cohort of the same tumor
# type: Fisher's exact test plus a G-test (log-likelihood ratio).
# qvalues collects FDR-corrected Fisher p-values, corrected per tumor type,
# in the same order as the rows appended to results.
results=[]
qvalues=[]
for ttype in tqdm.tqdm(ttypes):
    pvalues=[]

    # Control data depend only on the tumor type, so fetch them once here.
    # A boolean mask always yields a DataFrame, whereas df_control.loc[ttype]
    # collapses to a Series when exactly one row matches.
    q = df_control[df_control.index == ttype]
    if q.empty:
        raise KeyError(f"No untreated control rows found for tumor type {ttype!r}")
    total_control = int(q["total_samples"].iloc[0])
    # gene -> mutated control samples; setdefault keeps the first occurrence.
    control_mutated_by_gene = {}
    for control_gene, control_count in zip(q["gene_name"], q["mutated_samples"]):
        control_mutated_by_gene.setdefault(control_gene, int(control_count))

    # Mask + groupby always hand back DataFrames (a full-key .loc lookup can
    # return a Series), and sort=True gives a reproducible treatment order.
    treated = df_total[df_total.index.get_level_values("ttype") == ttype]
    for treatment, df_treatment in treated.groupby(level="drug_type", sort=True):
        for _, r in df_treatment.iterrows():
            gene,mutated_samples,total_samples = r["gene_name"],r["mutated_samples"],r["total_samples"]
            # Genes never mutated in the control cohort count as 0.
            control_mutated = control_mutated_by_gene.get(gene, 0)
            odds,pvalue=scipy.stats.fisher_exact([[mutated_samples,total_samples-mutated_samples],[control_mutated,total_control-control_mutated]])
            # Expected counts follow the control proportions (pseudo-count 0.1
            # when the control has no mutated sample) and are rescaled to the
            # treated cohort size: power_divergence requires the observed and
            # expected totals to agree and raises ValueError otherwise.
            expected = np.array([np.nanmax([control_mutated,0.1]),total_control-control_mutated],dtype=float)
            expected *= total_samples / expected.sum()
            stat,pvalue_g=scipy.stats.power_divergence([mutated_samples,total_samples-mutated_samples],expected,lambda_="log-likelihood")
            results.append([gene,ttype,treatment,odds,pvalue,pvalue_g,r["non_synonimous"],mutated_samples,total_samples,control_mutated,total_control,r["qglobal_cv"]])
            pvalues.append(pvalue)
    qvalues+=list(fdrcorrection0(pvalues)[1])
        

  
100%|██████████| 22/22 [00:04<00:00,  4.71it/s]


In [46]:
# Assemble the per-(gene, tumor type, drug type) comparison table.
df_stats = pd.DataFrame(
    results,
    columns=[
        "gene", "ttype", "drug_type", "odds_ratio", "pvalue", "pvalue_g",
        "n_muts_t", "n_samples_mutated_t", "n_samples_t",
        "n_samples_mutated_c", "n_samples_c", "qvalue_dndscv",
    ],
)
# Global FDR correction over all tests, plus the per-tumor-type correction
# collected in qvalues.
df_stats["qvalue"] = fdrcorrection0(df_stats["pvalue"].values)[1]
df_stats["qvalue_ttype"] = qvalues

# Largest log2 odds ratio among the finite odds ratios.
finite_odds = df_stats[np.isfinite(df_stats["odds_ratio"])]["odds_ratio"].values
max_odds = np.max([np.log2(x) for x in finite_odds])


def _log2_odds(row):
    """log2 odds ratio of one row, with a fallback for non-finite odds ratios."""
    if np.isfinite(row["odds_ratio"]):
        # The 0.001 offset keeps an odds ratio of exactly 0 finite in log space.
        return np.log2(row["odds_ratio"] + 0.001)
    # NOTE(review): this fallback divides mutations by mutated samples
    # (n_muts_t / n_samples_mutated_t), not mutated samples by cohort size;
    # confirm that this numerator is intended.
    return np.log2((row["n_muts_t"] / row["n_samples_mutated_t"]) / (0.1 / row["n_samples_c"]))


df_stats["log_odds_ratio"] = df_stats.apply(_log2_odds, axis=1)
df_stats["log_qvalue"] = -np.log10(df_stats["qvalue"])
df_stats["freq_mut"] = df_stats["n_samples_mutated_t"] / df_stats["n_samples_t"]

In [47]:
# Number of distinct drug types with at least one test at q < 0.1 and a
# positive log odds ratio.
len(
    df_stats[
        (df_stats["log_odds_ratio"] > 0) & (df_stats["qvalue"] < 0.1)
    ]["drug_type"].unique()
)

19

### Include excess

In [48]:
def get_excess(q):
    """Return ``(q - 1) / q``, or 0 when ``q`` is 0 (avoids dividing by zero).

    Presumably ``q`` is a dN/dS ratio taken from the ``w*_cv`` columns -
    confirm against the caller.
    """
    return 0 if q == 0 else (q - 1) / q
# Derive one excess column per dN/dS column, row by row.
for excess_col, ratio_col in {
    "e_mis": "wmis_cv",
    "e_non": "wnon_cv",
    "e_spl": "wspl_cv",
    "e_ind": "wind_cv",
}.items():
    df_total[excess_col] = df_total.apply(
        lambda row, source=ratio_col: get_excess(row[source]), axis=1
    )

### Only mechanisms

In [52]:
# Previously annotated (gene, tumor type, drug type) triples.
annotated_mc = (
    pd.read_csv("../data/SuppTable8_TEDs - Annotated TEDs.tsv", sep="\t")
    .rename(columns={"drug": "drug_type", "cancertype": "ttype"})
    .loc[:, ["gene", "ttype", "drug_type"]]
    .assign(annotated=True)
)

# Smallest q-value per (gene, tumor type) among tests with q < 0.1.
d_maxv_mc = (
    df_stats[(df_stats["qvalue"] < 0.1)]
    .groupby(["gene", "ttype"])
    .agg({"qvalue": np.nanmin})
    .to_dict()["qvalue"]
)


def _is_representative(row):
    """True when the row has q < 0.05 and holds the minimum q-value of its (gene, ttype)."""
    key = (row["gene"], row["ttype"])
    if key not in d_maxv_mc or not (row["qvalue"] < 0.05):
        return False
    return d_maxv_mc[key] == row["qvalue"]


def _make_label(row):
    """Build the display label 'gene - drug_type (ttype)' for one row."""
    return row["gene"] + " - " + row["drug_type"] + " (" + row["ttype"] + ")"


df_stats["representative_mech"] = df_stats.apply(_is_representative, axis=1)
# Left merge on the shared columns; rows without a match get annotated=False.
df_stats = df_stats.merge(annotated_mc, how="left").fillna({"annotated": False})
df_stats["label"] = df_stats.apply(_make_label, axis=1)
df_stats["type_alt"] = "coding mutations"

### Save data

In [53]:
# Write the selected result columns to a gzipped TSV.
# The "q.value_TT_fdr" and "log_odds_ratio" rename entries leave the output
# unchanged: the first column is not among those selected and the second
# maps to its own name.
(
    df_stats[[
        "gene", "drug_type", "ttype", "label", "type_alt", "qvalue",
        "log_qvalue", "log_odds_ratio", "n_samples_mutated_t",
        "n_samples_mutated_c", "annotated", "representative_mech",
    ]]
    .rename(columns={
        "ttype": "cancer_type_code",
        "drug_type": "drug",
        "q.value_TT_fdr": "qvalue",
        "log_odds_ratio": "log_odds_ratio",
        "n_samples_mutated_t": "mutated_t",
        "n_samples_mutated_c": "mutated_c",
    })
    .to_csv("../data/coding_results_resistance.tsv.gz", sep="\t", index=False)
)