In [1]:
import pandas as pd
import glob
import tqdm 
import os
from scipy import stats
import numpy as np
import scipy
import mne


### Metadata

In [8]:
df_meta = pd.read_csv("../metadata/dataset_metadata_supp_table3.tsv",sep="\t")
df_meta=df_meta[df_meta["is_selected"]==True]

### Extract Hartwig drivers per sample

In [10]:
l=[]
for sample in tqdm.tqdm(df_meta[df_meta["cohort"]=="Hartwig"]["sample_id"].unique()):
    filein=f"/home/fran/Documents/cuppen/HPC/tunnel//cuppen/shared_resources/HMF_data/DR-104-update4/somatics/{sample}/linx/{sample}.linx.driver.catalog.tsv"
    if os.path.exists(filein):
        drivers = pd.read_csv(filein,sep="\t")
        drivers["sample"] = sample
        l.append(drivers)
df_drivers_hmf = pd.concat(l)
df_drivers_hmf["is_driver"] =Tru


  0%|          | 0/4440 [00:00<?, ?it/s][A
  0%|          | 2/4440 [00:00<04:36, 16.07it/s][A
  0%|          | 3/4440 [00:00<08:36,  8.59it/s][A
  0%|          | 4/4440 [00:00<11:48,  6.26it/s][A
  0%|          | 5/4440 [00:00<13:59,  5.28it/s][A
  0%|          | 6/4440 [00:01<15:50,  4.67it/s][A
  0%|          | 7/4440 [00:01<16:33,  4.46it/s][A
  0%|          | 8/4440 [00:01<16:49,  4.39it/s][A
  0%|          | 9/4440 [00:01<17:07,  4.31it/s][A
  0%|          | 10/4440 [00:02<18:15,  4.04it/s][A
  0%|          | 11/4440 [00:02<18:57,  3.90it/s][A
  0%|          | 12/4440 [00:02<18:59,  3.89it/s][A
  0%|          | 13/4440 [00:02<19:06,  3.86it/s][A
  0%|          | 14/4440 [00:03<19:02,  3.87it/s][A
  0%|          | 15/4440 [00:03<19:06,  3.86it/s][A
  0%|          | 16/4440 [00:03<18:38,  3.96it/s][A
  0%|          | 17/4440 [00:03<18:36,  3.96it/s][A
  0%|          | 18/4440 [00:04<18:44,  3.93it/s][A
  0%|          | 19/4440 [00:04<18:53,  3.90it/s][A
  0%|    

### Extract PCAWG drivers per sample

In [30]:
l=[]
for sample in tqdm.tqdm(df_meta[df_meta["cohort"]=="PCAWG"]["sample_id"].unique()):
    filein=f"/home/fran/Documents/cuppen/HPC/tunnel//cuppen/shared_resources/PCAWG/pipeline5/per-donor/{sample}-from-jar/linxsoft1.17/{sample}T.linx.driver.catalog.tsv"
    if os.path.exists(filein):
        drivers = pd.read_csv(filein,sep="\t")
        drivers["sample"] = sample
        l.append(drivers)
df_drivers_pcawg = pd.concat(l)
df_drivers_pcawg["is_driver"] =True


  0%|          | 0/1880 [00:00<?, ?it/s][A
  0%|          | 1/1880 [00:00<07:22,  4.24it/s][A
  0%|          | 2/1880 [00:00<07:41,  4.07it/s][A
  0%|          | 3/1880 [00:00<07:58,  3.92it/s][A
  0%|          | 4/1880 [00:01<07:57,  3.93it/s][A
  0%|          | 5/1880 [00:01<08:10,  3.83it/s][A
  0%|          | 6/1880 [00:01<08:03,  3.88it/s][A
  0%|          | 7/1880 [00:01<08:29,  3.68it/s][A
  0%|          | 8/1880 [00:02<11:42,  2.66it/s][A
  0%|          | 9/1880 [00:02<10:46,  2.90it/s][A
  1%|          | 10/1880 [00:03<09:52,  3.16it/s][A
  1%|          | 11/1880 [00:03<09:22,  3.33it/s][A
  1%|          | 12/1880 [00:03<08:57,  3.48it/s][A
  1%|          | 13/1880 [00:03<08:37,  3.61it/s][A
  1%|          | 14/1880 [00:04<08:31,  3.65it/s][A
  1%|          | 15/1880 [00:04<08:32,  3.64it/s][A
  1%|          | 16/1880 [00:04<08:30,  3.65it/s][A
  1%|          | 17/1880 [00:04<08:38,  3.60it/s][A
  1%|          | 18/1880 [00:05<08:21,  3.71it/s][A
  1%|     

### Combined

In [45]:
df_drivers_total = pd.concat([df_drivers_pcawg,df_drivers_hmf]).rename(columns={"sample":"sample_id"})
df_drivers_total.to_csv("../results/data/features_correlation/drivers_per_sample.tsv.gz",sep="\t",index=False,compression="gzip")

### Perform analysis of assotiation between GIE and drivers in a tumor type specific manner

In [4]:
def perform_fisher_exact_test(df_drivers,columns_query,column_dependent,ttype,threshold_min=0):
    l=[]
    for driver in columns_query:
        v = df_drivers[(df_drivers[driver]==True)&(df_drivers[column_dependent]==True)]
        if v.shape[0] >= threshold_min:
            # enrichment
            a=df_drivers[(df_drivers[driver]==True)&(df_drivers[column_dependent])].shape[0]
            b=df_drivers[(df_drivers[driver]==True)&(~df_drivers[column_dependent])].shape[0]

            c=df_drivers[(df_drivers[driver]==False)&(df_drivers[column_dependent])].shape[0]
            d=df_drivers[(df_drivers[driver]==False)&(~df_drivers[column_dependent])].shape[0]
            o,p=scipy.stats.fisher_exact([[a,b],[c,d]],alternative="two-sided")
            if np.isfinite(p):
                l.append([ttype,driver,a,b,c,d,np.log2(o),p])
    df_t =pd.DataFrame(l,columns=["cancer_type_code","column","a","b","c","d","odds","pvalue"]).drop_duplicates()
    df_t["qvalue"]=mne.stats.fdr_correction(df_t["pvalue"])[1]
    df_t["variable_dependent"] = column_dependent
    return df_t


-------------------------------

# Perform analysis

### Load dependent variables

In [5]:
df_d = pd.read_csv("../results/data/features_correlation/randomized_escape_for_features.tsv.gz",sep="\t") # simulated GIE
df_d1 = pd.read_csv("../results/data/features_correlation/escape_for_features.tsv",sep="\t") # GIE event

### Load drivers per sample

In [2]:
df_drivers_total_raw = pd.read_csv("../results/data/features_correlation/drivers_per_sample.tsv.gz",sep="\t")
genes_gie = set(pd.read_csv("../metadata/immune_selected_genes.tsv",sep="\t")["Gene"].values)

In [37]:
len(df_drivers_total["gene"].unique())

367

### Exclude genes next to amplification targets (PD-1 and SETDB1) 

In [23]:
pd1=("9","p2")
setdb1=("1","q2")

In [24]:
#important to include all samples, even those with no drivers
df_drivers_total=df_drivers_total_raw[~((df_drivers_total_raw["chromosome"].str.contains(pd1[0]))&(df_drivers_total_raw["chromosomeBand"].str.contains(pd1[1])))&~((df_drivers_total_raw["chromosome"].str.contains(setdb1[0]))&(df_drivers_total_raw["chromosomeBand"].str.contains(setdb1[1])))&(df_drivers_total_raw["driverLikelihood"]>0.5)][["sample_id","gene"]].drop_duplicates().merge(df_meta[["cancer_type_code","sample_id","cancer_type"]].drop_duplicates(),how="right").merge(df_d).merge(df_d1)

### Association with GIE events

In [25]:
l=[]
ct=df_drivers_total["cancer_type_code"].value_counts()>=15 # tumor types with more than 14 samples
for ttype in ct[ct].index:
    q=df_drivers_total[(df_drivers_total["cancer_type_code"]==ttype)]["gene"].value_counts()
    selected_genes=set(q[q>=10].index) - genes_gie # at least 10 samples mutated and not GIE genes
    q=df_drivers_total[(df_drivers_total["cancer_type_code"]==ttype)&(df_drivers_total["gene"].isin(selected_genes))]
    q["driver"]=True
    if q.shape[0] ==0:
        continue
    query_df=q.pivot_table(columns=["gene"],index=["sample_id"],fill_value=False,values="driver").reset_index().merge(df_meta[df_meta["cancer_type_code"]==ttype][["sample_id"]],how="right").merge(df_d1).merge(df_d).fillna(False)
    for ct in ["selected_genetic_immune_escape","excluding_loh_hla"]:
        l.append(perform_fisher_exact_test(query_df,columns_query=selected_genes,column_dependent=ct,threshold_min=5,ttype=ttype))
df_drivers_stats = pd.concat(l)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

### Control, association with background genes (simulated GIE)

In [26]:
l=[]
ct=df_drivers_total["cancer_type_code"].value_counts()>=15 # tumor types with more than 14 samples
for ttype in ct[ct].index:
    q=df_drivers_total[(df_drivers_total["cancer_type_code"]==ttype)]["gene"].value_counts()
    selected_genes=set(q[q>=10].index) - genes_gie
    q=df_drivers_total[(df_drivers_total["cancer_type_code"]==ttype)&(df_drivers_total["gene"].isin(selected_genes))]
    q["driver"]=True
    if q.shape[0] ==0:
        continue
    query_df=q.pivot_table(columns=["gene"],index=["sample_id"],fill_value=False,values="driver").reset_index().merge(df_meta[df_meta["cancer_type_code"]==ttype][["sample_id"]],how="right").merge(df_d1).merge(df_d).fillna(False)
    for ct in list(df_d.columns.values[1:101])+list(df_d.columns.values[201:301]):
        l.append(perform_fisher_exact_test(query_df,columns_query=selected_genes,column_dependent=ct,threshold_min=5,ttype=ttype))
df_drivers_stats_control = pd.concat(l)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

### Save raw output

In [27]:
df_drivers_stats[(df_drivers_stats["qvalue"]<0.05)&(df_drivers_stats["variable_dependent"]=="selected_genetic_immune_escape")].to_csv("../results/data/results_feature_correlation/drivers_selected_gie.tsv",sep="\t",index=False)
df_drivers_stats_control[df_drivers_stats_control["variable_dependent"].str.contains("background_genetic_immune_escape_")].to_csv("../results/data/results_feature_correlation/drivers_selected_gie_background.tsv",sep="\t",index=False)

### Now, for every significantly associated gene with GIE, check the association with the control set. Annotate those with >=2% of randomizations displaying the association between GIE and the driver event

Is important to perform this test for both enrichment and depletion

In [49]:
query=df_drivers_stats_control[(df_drivers_stats_control["variable_dependent"].str.contains("background_genetic_immune_escape"))]
query["significant"] = query["qvalue"] < 0.05
query.set_index(["column","cancer_type_code"],inplace=True)
pairs=query.index
l=[]
for label,cancer_type_code in set(pairs):
    tmp=query.loc[(label,cancer_type_code)]
    freq_positive=tmp[(tmp["significant"])&((tmp["odds"]>0)|(np.isposinf(tmp["odds"])))].shape[0] / tmp.shape[0]
    freq_negative=tmp[(tmp["significant"])&((tmp["odds"]<0)|(np.isneginf(tmp["odds"])))].shape[0] / tmp.shape[0]
    l.append([cancer_type_code,label,freq_positive,freq_negative])
df_control_data = pd.DataFrame(l,columns=["cancer_type_code","column","freq_significant_pos","freq_significant_neg"])
df_control_data["background_significant_pos"] = df_control_data["freq_significant_pos"] >0.01
df_control_data["background_significant_neg"] = df_control_data["freq_significant_neg"] >0.01

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  impo

### Merge with GIE driver assotiation and select those that are not significant in 99% of randomizations

In [50]:
v=df_drivers_stats[(df_drivers_stats["qvalue"]<0.05)&(df_drivers_stats["variable_dependent"]=="selected_genetic_immune_escape")].merge(df_control_data[["cancer_type_code","column","background_significant_pos","freq_significant_pos","background_significant_neg","freq_significant_neg"]],how="left")
v[(v["background_significant_pos"]==False)&(v["background_significant_neg"]==False)].to_csv("../results/data/features_correlation/drivers_selected_gie.tsv",sep="\t",index=False)
v[(v["background_significant_pos"]==False)&(v["background_significant_neg"]==False)]

Unnamed: 0,cancer_type_code,column,a,b,c,d,odds,pvalue,qvalue,variable_dependent,background_significant_pos,freq_significant_pos,background_significant_neg,freq_significant_neg
2,COREAD,RPL22,10,3,168,481,3.254541,0.00021,0.002004,selected_genetic_immune_escape,False,0.0,False,0.0
5,COREAD,CASP8,7,3,171,481,2.714433,0.005147,0.020072,selected_genetic_immune_escape,False,0.0,False,0.0
9,COREAD,KMT2D,11,5,167,479,2.657681,0.000483,0.002691,selected_genetic_immune_escape,False,0.0,False,0.0
10,COREAD,TGFBR2,20,17,158,467,1.797963,0.000383,0.00249,selected_genetic_immune_escape,False,0.0,False,0.0
23,CUP,ARID1A,5,8,11,84,2.254814,0.023579,0.047157,selected_genetic_immune_escape,False,0.0,False,0.0


### Calculate fraction of mutated samples that are MSI for each driver gene associated with GIE

In [45]:
for gene in ["RPL22","CASP8","KMT2D","TGFBR2","TP53"]:
    x=df_drivers_total[(df_drivers_total["cancer_type_code"]=="COREAD")&(df_drivers_total["gene"]==gene)].merge(df_d1[["sample_id","selected_genetic_immune_escape"]]).merge(df_meta[["sample_id","msi_status"]])
    total=x.shape[0]
    msi=x[x["msi_status"]=="MSI"].shape[0]
    print (gene,msi/total)
    

RPL22 0.8461538461538461
CASP8 0.5
KMT2D 0.375
TGFBR2 0.5135135135135135
TP53 0.015122873345935728


In [30]:
#KMDT2 10.1158/2159-8290.CD-19-1448, partially target of MSI, ~50% MSI
#RPL22 all are MSI
#CASP8 50% MSI, 
#TGFBR2 heavily assotiated with MSI

# Repeat the analysis but excluding LOH of HLA-I alterations in the GIE events as well as in the background GIE

### Obtain frequency of significantly assotiated drivers in control

In [31]:
query=df_drivers_stats_control[(df_drivers_stats_control["variable_dependent"].str.contains("excluding_lohhla_"))]
query["significant"] = query["qvalue"] < 0.05
query.set_index(["column","cancer_type_code"],inplace=True)
pairs=query.index
l=[]
for label,cancer_type_code in set(pairs):
    tmp=query.loc[(label,cancer_type_code)]
    freq_positive=tmp[(tmp["significant"])&((tmp["odds"]>0)|(np.isposinf(tmp["odds"])))].shape[0] / tmp.shape[0]
    freq_negative=tmp[(tmp["significant"])&((tmp["odds"]<0)|(np.isneginf(tmp["odds"])))].shape[0] / tmp.shape[0]
    l.append([cancer_type_code,label,freq_positive,freq_negative])
df_control_data = pd.DataFrame(l,columns=["cancer_type_code","column","freq_significant_pos","freq_significant_neg"])
df_control_data["background_significant_pos"] = df_control_data["freq_significant_pos"] >0.01
df_control_data["background_significant_neg"] = df_control_data["freq_significant_neg"] >0.01

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  impo

In [32]:
df_drivers_stats[(df_drivers_stats["qvalue"]<0.05)&(df_drivers_stats["variable_dependent"]=="excluding_loh_hla")].to_csv("../results/data/results_feature_correlation/drivers_selected_excluding_lohhla.tsv",sep="\t",index=False)
df_drivers_stats_control[df_drivers_stats_control["variable_dependent"].str.contains("excluding_lohhla_")].to_csv("../results/data/results_feature_correlation/drivers_selected_excluding_lohhla_background.tsv",sep="\t",index=False)

### Merge with GIE driver assotiation and select those that are not significant in 99% of randomizations

In [33]:
v=df_drivers_stats[(df_drivers_stats["qvalue"]<0.05)&(df_drivers_stats["variable_dependent"]=="excluding_loh_hla")].merge(df_control_data[["cancer_type_code","column","background_significant_pos","freq_significant_pos","background_significant_neg","freq_significant_neg"]],how="left")
v[(v["background_significant_pos"]==False)&(v["background_significant_neg"]==False)].to_csv("../results/data/features_correlation/drivers_excluding_lohhla_gie.tsv",sep="\t",index=False)
v[(v["background_significant_pos"]==False)&(v["background_significant_neg"]==False)]

Unnamed: 0,cancer_type_code,column,a,b,c,d,odds,pvalue,qvalue,variable_dependent,background_significant_pos,freq_significant_pos,background_significant_neg,freq_significant_neg
4,COREAD,PIK3R1,8,15,70,569,2.116111,0.002929,0.006569,excluding_loh_hla,False,0.0,False,0.0
6,COREAD,ZNRF3,7,7,71,577,3.02268,0.000422,0.001269,excluding_loh_hla,False,0.0,False,0.0
12,COREAD,FAT1,5,10,73,574,1.975082,0.023186,0.035552,excluding_loh_hla,False,0.0,False,0.0
13,COREAD,KMT2D,6,10,72,574,2.258016,0.006733,0.012905,excluding_loh_hla,False,0.0,False,0.0
17,NSCLC,TP53,89,328,12,158,1.837,1.2e-05,0.000376,excluding_loh_hla,False,0.01,False,0.0
24,SKCM,NRAS,26,84,37,257,1.104293,0.008661,0.020209,excluding_loh_hla,False,0.0,False,0.0
25,DLBCL,PIM1,12,7,25,70,2.263034,0.002935,0.020547,excluding_loh_hla,False,0.0,False,0.0
26,CUP,ARID1A,5,8,2,93,4.861087,0.000212,0.000212,excluding_loh_hla,False,0.0,False,0.0
