In [1]:
import pandas as pd
import os
import glob


### Important information, read below!


For each cancer type, as well as for the pan-cancer cohort, we run an LOH, amplification and deep-deletion randomization script to detect genes under positive selection in both Hartwig and PCAWG samples. This analysis is performed at different genomic scales: highly-focal (<3 Mb), focal (<75% of a chromosome arm) and non-focal.

This analysis is performed in two steps:

1. Randomization per cancer type, cohort, CNV-type and scale. The script that performs the randomizations can be found at scripts/positive_selection/cnv/positive_selection_CNV_shuffle.py, whereas the script to submit all randomizations across cancer types is scripts/positive_selection/cnv/pipeline_pos_selection_cnv.py. Further details can be found in the source code of the scripts. 

2. Mapping of the randomization onto the genes of interest. The output of the randomization scripts is further parsed by CNV type (LOH, amp, deepdel) using scripts/positive_selection/cnv/process_output_LOH.py (or process_output_AMP.py and process_output_DEL.py for amplifications and deep deletions, respectively). This parsing maps the genes of interest (HLA-I genes for LOH, and the rest of the immune-related genes for AMP & DEL) onto the randomizations performed in the first step.

The output of step 2 provides, for the genes of interest, the significance of the performed analysis as well as the randomization figures used in the manuscript.

### Read output for CNV in Hartwig

In [2]:
# Load every Hartwig result table (one gzipped TSV per file matched below),
# tag each with the CNV type and scale parsed from its file name, and export
# the combined raw (unfiltered) results.
hartwig_glob = "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/hmf/positive_selection/LOH/ignore_*/plots/*.tsv.gz"
tables = []
# sorted(): glob returns files in filesystem order; sorting keeps the
# concatenation order (and therefore tie ordering downstream) reproducible.
for filein in sorted(glob.glob(hartwig_glob)):
    df = pd.read_csv(filein, sep="\t")
    # The 2nd and 3rd "_"-separated tokens of the file stem become the
    # alteration type (e.g. loh/amp/deepdel) and the scale (e.g. hfocal/focal/nonfocal).
    type_alt, focal = os.path.basename(filein).split(".")[0].split("_")[1:3]
    df["type_alteration"] = type_alt
    df["focal"] = focal
    # Tables without rows are skipped.
    if df.shape[0] > 0:
        tables.append(df)

if not tables:
    # pd.concat([]) would fail with an uninformative message; say what was searched.
    raise ValueError(f"No non-empty result tables matched {hartwig_glob!r}")

df_total = pd.concat(tables)
# Human-readable cancer type: "__" in `ttype` is replaced by a space
# (vectorized literal replacement instead of a row-wise apply).
df_total["cancer_type"] = df_total["ttype"].str.replace("__", " ", regex=False)

# Raw export: all non-LOH rows, plus LOH rows whose gene name contains "HLA".
(
    df_total.loc[
        lambda d: (d["type_alteration"] != "loh")
        | ((d["type_alteration"] == "loh") & d["gene"].str.contains("HLA"))
    ]
    .drop(["ttype", "pvalue_emp_locla"], axis=1)
    .sort_values("q_value_ana_global")
    .to_csv("../results/data/raw_results_cnv_hartwig.tsv.gz", sep="\t", index=False, compression="gzip")
)

### Only significant

In [3]:
# Re-read the Hartwig result tables, keeping only rows with q < 0.1 per file.
# NOTE(review): the resulting list `l` is not read again by the visible cells
# before it is reset — confirm whether this cell is still needed.
l = []
for filein in glob.glob("/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/hmf/positive_selection/LOH/ignore_*/plots/*.tsv.gz"):
    df = pd.read_csv(filein, sep="\t")
    # .copy(): the column assignments below must write to an independent frame,
    # not to a slice of the table just read (avoids SettingWithCopyWarning).
    df = df[df["q_value_ana_global"] < .1].copy()
    # 2nd and 3rd "_"-separated tokens of the file stem: alteration type and scale.
    type_alt, focal = os.path.basename(filein).split(".")[0].split("_")[1:3]
    df["type_alteration"] = type_alt
    df["focal"] = focal
    # Files without any significant row are skipped.
    if df.shape[0] > 0:
        l.append(df)
    

In [4]:
# Rows with q < 0.1 for LOH of HLA-A, ordered by ascending q-value.
df_total.loc[
    lambda d: (d["q_value_ana_global"] < 0.1)
    & (d["type_alteration"] == "loh")
    & (d["gene"] == "HLA-A")
].sort_values("q_value_ana_global")

Unnamed: 0,ttype,gene,region,p_value_ana_global,q_value_ana_global,odds_ratio_global,n_obs,n_sim,mean_global,type_alteration,focal,pvalue_emp_locla,cancer_type
51,Cervix__carcinoma,HLA-A,chr6_29900001-30000000,0.001671,0.036239,2.650135,14,6.3,5.28275,loh,focal,,Cervix carcinoma
36,Pancreas__neuroendocrine,HLA-A,chr6_29900001-30000000,0.005864,0.054733,2.048647,18,9.0,8.786289,loh,nonfocal,,Pancreas neuroendocrine
60,Colorectum__carcinoma,HLA-A,chr6_29900001-30000000,0.004563,0.076115,2.17408,17,6.7,7.819398,loh,hfocal,,Colorectum carcinoma


In [5]:
# numpy is imported here in the original notebook; later cells rely on `np` being defined.
import numpy as np
def concat(grp):
    """Join the distinct values of ``grp`` into one comma-separated string.

    Duplicates are removed and the remaining values are sorted, so the result
    does not depend on set iteration order (which varies between Python
    processes for strings) and exported tables are reproducible.

    Parameters
    ----------
    grp : iterable of str
        Values to collapse (e.g. one group's column values in ``groupby().agg``).

    Returns
    -------
    str
        The unique values in ascending order, separated by ",".
    """
    return ",".join(sorted(set(grp)))
# Significant Hartwig hits, collapsed to one row per (cancer type, gene, CNV type).
# Aggregations are given as the strings "min"/"max": this is the groupby
# reduction pandas already dispatched np.nanmin/np.nanmax to, and it avoids the
# FutureWarning newer pandas versions emit for those numpy callables.
result_hartwig = (
    df_total.loc[
        lambda d: (d["q_value_ana_global"] < 0.1)
        & (d["n_obs"] > 2)
        # LOH rows are kept only when the gene name contains "HLA";
        # rows of any other alteration type are kept for every gene.
        & (
            ((d["type_alteration"] == "loh") & d["gene"].str.contains("HLA"))
            | (d["type_alteration"] != "loh")
        )
        # Keep only enrichment over the randomization (odds ratio above 1).
        & (d["odds_ratio_global"] > 1.)
        # Manual inspection of HLA-A deepdeletions in esophagus reveal a potential artifact caused by the high number of LINE insertions in this region. See methods of manuscript for more details
        & ~(
            d["ttype"].isin(["pancancer", "Esophagus__cancer"])
            & (d["gene"] == "HLA-A")
            & (d["type_alteration"] == "deepdel")
        )
    ]
    .groupby(["ttype", "gene", "type_alteration"])
    .agg(
        {
            "region": concat,
            "q_value_ana_global": "min",
            "odds_ratio_global": "max",
            "n_obs": "max",
            "n_sim": "max",
            "mean_global": "max",
            "focal": concat,
        }
    )
)
result_hartwig.to_csv("../results/data/CNV_positive_selection_HMF.tsv", sep="\t")

In [6]:
# Display the aggregated Hartwig table built in the previous cell.
result_hartwig

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,region,q_value_ana_global,odds_ratio_global,n_obs,n_sim,mean_global,focal
ttype,gene,type_alteration,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Breast__cancer,JAK2,deepdel,chr9_5000001-5100000,0.001638807,12.835132,6,0.6,0.645068,"hfocal,focal"
Cervix__carcinoma,HLA-A,loh,chr6_29900001-30000000,0.03623937,2.650135,14,6.3,5.28275,focal
Cervix__carcinoma,HLA-B,loh,chr6_31300001-31400000,0.01278601,2.83943,15,6.2,5.28275,focal
Cervix__carcinoma,HLA-C,loh,chr6_31200001-31300000,0.01278601,2.83943,15,6.3,5.28275,focal
Colorectum__carcinoma,HLA-A,loh,chr6_29900001-30000000,0.07611535,2.17408,17,6.7,7.819398,hfocal
Colorectum__carcinoma,IRF2,deepdel,chr4_185300001-185400000,9.155631e-09,31.076011,9,0.4,0.333633,"hfocal,focal"
Diffuse__large__B-cell__lymphoma,B2M,deepdel,chr15_45000001-45100000,0.0002228786,255.070796,3,0.0,0.018634,"hfocal,focal"
Non__small__cell__lung__cancer,SETDB1,amp,"chr1_150800001-150900000,chr1_150900001-151000000",4.46813e-07,6.396106,17,2.7,2.657867,"hfocal,focal"
Ovarian__cancer,B2M,deepdel,chr15_45000001-45100000,0.006481895,32.917999,4,0.1,0.157215,"hfocal,focal"
Ovarian__cancer,JAK2,deepdel,chr9_4900001-5000000,0.09498565,24.688499,3,0.1,0.121514,hfocal


In [11]:
# log2 of 2.17408, the odds ratio listed for Colorectum carcinoma HLA-A LOH
# in the Hartwig result table displayed above (value typed in by hand).
np.log2(2.174080)

1.1204050284423892

### Read output for PCAWG

In [7]:
# Load every PCAWG result table (one gzipped TSV per file matched below),
# tag each with the CNV type and scale parsed from its file name, and export
# the combined raw (unfiltered) results.
# NOTE: this rebinds `df_total`, which held the Hartwig results until here.
pcawg_glob = "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/pcawg/positive_selection/LOH/ignore_*/plots/*.tsv.gz"
tables = []
# sorted(): glob returns files in filesystem order; sorting keeps the
# concatenation order (and therefore tie ordering downstream) reproducible.
for filein in sorted(glob.glob(pcawg_glob)):
    df = pd.read_csv(filein, sep="\t")
    # The 2nd and 3rd "_"-separated tokens of the file stem become the
    # alteration type (e.g. loh/amp/deepdel) and the scale (e.g. hfocal/focal/nonfocal).
    type_alt, focal = os.path.basename(filein).split(".")[0].split("_")[1:3]
    df["type_alteration"] = type_alt
    df["focal"] = focal
    # Tables without rows are skipped.
    if df.shape[0] > 0:
        tables.append(df)

if not tables:
    # pd.concat([]) would fail with an uninformative message; say what was searched.
    raise ValueError(f"No non-empty result tables matched {pcawg_glob!r}")

df_total = pd.concat(tables)
# Human-readable cancer type: "__" in `ttype` is replaced by a space
# (vectorized literal replacement instead of a row-wise apply).
df_total["cancer_type"] = df_total["ttype"].str.replace("__", " ", regex=False)

# Raw export: all non-LOH rows, plus LOH rows whose gene name contains "HLA".
(
    df_total.loc[
        lambda d: (d["type_alteration"] != "loh")
        | ((d["type_alteration"] == "loh") & d["gene"].str.contains("HLA"))
    ]
    .drop(["ttype", "pvalue_emp_locla"], axis=1)
    .sort_values("q_value_ana_global")
    .to_csv("../results/data/raw_results_cnv_pcawg.tsv.gz", sep="\t", index=False, compression="gzip")
)

In [8]:
# Significant PCAWG hits, collapsed to one row per (cancer type, gene, CNV type).
# Relies on `concat` defined in an earlier cell of this notebook.
# Aggregations are given as the strings "min"/"max": this is the groupby
# reduction pandas already dispatched np.nanmin/np.nanmax to, and it avoids the
# FutureWarning newer pandas versions emit for those numpy callables.
result_pcawg = (
    df_total.loc[
        lambda d: (d["q_value_ana_global"] < 0.1)
        & (d["n_obs"] > 2)
        # LOH rows are kept only when the gene name contains "HLA";
        # rows of any other alteration type are kept for every gene.
        & (
            ((d["type_alteration"] == "loh") & d["gene"].str.contains("HLA"))
            | (d["type_alteration"] != "loh")
        )
        # Keep only enrichment over the randomization (odds ratio above 1).
        & (d["odds_ratio_global"] > 1.)
        # Manual inspection of HLA-A deepdeletions in esophagus reveal a potential artifact caused by the high number of LINE insertions in this region. See methods of manuscript for more details
        & ~(
            d["ttype"].isin(["pancancer", "Esophagus__cancer"])
            & (d["gene"] == "HLA-A")
            & (d["type_alteration"] == "deepdel")
        )
    ]
    .groupby(["ttype", "gene", "type_alteration"])
    .agg(
        {
            "region": concat,
            "q_value_ana_global": "min",
            "odds_ratio_global": "max",
            "n_obs": "max",
            "n_sim": "max",
            "mean_global": "max",
            "focal": concat,
        }
    )
)
result_pcawg.to_csv("../results/data/CNV_positive_selection_PCAWG.tsv", sep="\t")

In [9]:
# Display the aggregated PCAWG table built in the previous cell.
result_pcawg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,region,q_value_ana_global,odds_ratio_global,n_obs,n_sim,mean_global,focal
ttype,gene,type_alteration,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Breast__cancer,SETDB1,amp,"chr1_150800001-150900000,chr1_150900001-151000000",0.000794,7.758616,8,1.6,1.108309,"hfocal,focal"
Diffuse__large__B-cell__lymphoma,CD58,deepdel,chr1_117000001-117100000,0.004178,81.666981,3,0.0,0.055577,"hfocal,focal"
Diffuse__large__B-cell__lymphoma,HLA-A,loh,chr6_29900001-30000000,0.013238,4.267569,12,3.8,3.48478,"focal,nonfocal"
Diffuse__large__B-cell__lymphoma,HLA-B,loh,chr6_31300001-31400000,0.005445,4.267569,13,4.1,3.48478,"focal,nonfocal"
Diffuse__large__B-cell__lymphoma,HLA-C,loh,chr6_31200001-31300000,0.005445,4.267569,13,4.1,3.48478,"focal,nonfocal"
Hepatocellular__carcinoma,IRF2,deepdel,chr4_185300001-185400000,0.000174,155.317257,4,0.3,0.10264,"hfocal,focal"
Hepatocellular__carcinoma,SETDB1,amp,"chr1_150800001-150900000,chr1_150900001-151000000",0.045483,33.778273,3,0.4,0.231437,"hfocal,focal"
Kidney__chromophobe__cancer,HLA-A,loh,chr6_29900001-30000000,0.0001,2.596173,29,12.7,11.170288,nonfocal
Kidney__chromophobe__cancer,HLA-B,loh,chr6_31300001-31400000,0.0001,2.596173,29,13.0,11.170288,nonfocal
Kidney__chromophobe__cancer,HLA-C,loh,chr6_31200001-31300000,0.0001,2.596173,29,13.0,11.170288,nonfocal


In [12]:
# log2 of 2.596173, the odds ratio listed for the Kidney chromophobe cancer
# HLA-A/-B/-C LOH rows in the PCAWG result table displayed above (value typed in by hand).
np.log2(2.596173)

1.3763865227538876

In [13]:
# NOTE(review): `df_meta` is not defined in any visible cell of this notebook and
# the recorded output of this cell is a NameError, so a fresh "Run All" stops here.
# Define `df_meta` before this point or remove the cell.
df_meta

NameError: name 'df_meta' is not defined