In [1]:
import glob
import os
import pandas as pd
import numpy as np
import tqdm as tqdm
import matplotlib.pyplot as plt
import seaborn as sns

----

### Important

For each cancer type as well as the pan-cancer cohort we run dndscv with default options in both Hartwig and PCAWG samples:

1. To estimate global MHC-I (combining HLA-A, HLA-B and HLA-C) dN/dS ratios. Only in a pan-cancer manner.
2. To estimate the gene-specific and pan-cancer dN/dS ratios in a pan-cancer manner for MHC-I genes. 
3. To identify positively selected genes by recurrent mutation among the list of 21 genes selected for analysis in this publication (see Supp. Table 1). This analysis is performed in a pan-cancer and cancer-type-specific manner.

The script scripts/positive_selection/mutations/run_driver_pipeline_dndscv.py was used to run the Hartwig and PCAWG pan-cancer and cancer-type-specific analyses.
This script uses scripts/positive_selection/mutations/run_dndscv.R to compute the dndscv driver genes and dN/dS ratios.

### Genes

In [2]:
# Gene groups analysed in this notebook. Each list is also registered in
# `d_data` below under a group label.

# HLA locus (MHC class I genes)
mhc_I = [
    "HLA-A",
    "HLA-B",
    "HLA-C",
]

# APP-related proteins, split into a transport group and a scaffold group
transport_mhc = [
    "TAP1",
    "TAP2",
    "TAPBP",
]
scaffold_mhc = [
    "B2M",
    "CALR",
]

# Interferon-related genes
interferon = [
    "JAK1",
    "JAK2",
    "STAT1",
    "IRF2",
    "APLNR",
    "IFNGR1",
    "IFNGR2",
]

# Transcription factors of the HLA locus
tfs = [
    "NLRC5",
    "RFX5",
    "CIITA",
]

# CD58
cd58_nk = [
    "CD58",
]

# Group label -> gene list (the values are the very list objects defined above).
d_data = {
    "MHC-I": mhc_I,
    "transport_mhc": transport_mhc,
    "scaffold_mhc": scaffold_mhc,
    "IFN-gamma": interferon,
    "Activators-MHC": tfs,
    "cd58_nk": cd58_nk,
}

# Hex colours keyed by alteration status.
pallete = {
    "wt": "#ef8a62",
    "alteration": "#67a9cf",
    "alteration_primary": "#7570b3",
}

In [3]:
# Flatten every gene group of `d_data` into a single list, following the
# dictionary's insertion order.
total = [gene for group in d_data.values() for gene in group]

### Metastasis

In [4]:
# Load the dndscv outputs found in the hmf directory. For every
# "<cohort>.ci.tsv.gz" file the sibling "<cohort>.results.tsv.gz" file is read
# as well; both tables are restricted to the genes in `total` and tagged with
# the cohort label before being stacked.
hmf_ci_pattern = "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/hmf/positive_selection/dndscv/*.ci.tsv.gz"
l, l1 = [], []
for filein in glob.glob(hmf_ci_pattern):
    # Cohort label = file name up to its first dot.
    ttype = os.path.basename(filein).split(".")[0]
    df = pd.read_csv(filein, sep="\t")
    df_d = pd.read_csv(filein.replace(".ci.", ".results."), sep="\t")
    # .assign() builds a new frame, so the label is never written into a
    # filtered slice of the parsed table (avoids SettingWithCopyWarning).
    df = df[df["gene"].isin(total)].assign(ttype=ttype)
    df_d = df_d[df_d["gene_name"].isin(total)].assign(ttype=ttype)
    l.append(df)
    l1.append(df_d)
# pd.concat() on an empty list only reports "No objects to concatenate";
# fail with a message that points at the actual cause instead.
if not l:
    raise FileNotFoundError(f"No dndscv files matched: {hmf_ci_pattern}")
df_total = pd.concat(l)
df_total_d = pd.concat(l1)

In [5]:
# Complete table ordered by ascending qglobal_cv, written gzip-compressed and
# without the index.
(
    df_total_d
    .sort_values("qglobal_cv")
    .to_csv(
        "../results/data/dndscv_output_immune_genes_hmf.tsv.gz",
        sep="\t",
        index=False,
        compression="gzip",
    )
)
# Rows with qglobal_cv below 0.1; this file keeps the index as its first column.
df_total_d.loc[df_total_d["qglobal_cv"] < 0.1].to_csv(
    "../results/data/MUT_positive_selection_HMF.tsv",
    sep="\t",
)

In [6]:
# Display the rows whose qglobal_cv is below 0.1.
df_total_d.loc[df_total_d["qglobal_cv"] < 0.1]

Unnamed: 0,gene_name,n_syn,n_mis,n_non,n_spl,n_ind,wmis_cv,wnon_cv,wspl_cv,wind_cv,pmis_cv,ptrunc_cv,pallsubs_cv,pind_cv,qmis_cv,qtrunc_cv,qallsubs_cv,pglobal_cv,qglobal_cv,ttype
13,CD58,0,3,0,0,1,336.748852,0.0,0.0,212.66107,7e-06,0.9684713,4.064594e-05,0.004644831,0.011633,0.9955272,0.04891302,3.11181e-06,0.00446567,Diffuse__large__B-cell__lymphoma
16,HLA-B,0,0,1,0,2,0.0,1371.287781,1371.287781,161.04475,0.883371,0.0009770165,0.004190507,0.0003767278,0.971273,0.5711779,0.9999474,2.266815e-05,0.02678975,Diffuse__large__B-cell__lymphoma
12,B2M,1,1,0,0,3,11.91104,0.0,0.0,86.230721,0.079149,0.8973482,0.2122826,1.070398e-05,0.791533,0.9637774,0.9860799,3.179978e-05,0.04914534,Kidney__clear__cell__carcinoma
17,B2M,0,3,2,1,7,7.545842,82.950659,82.950659,35.350032,0.014809,1.141644e-05,7.829318e-06,1.606587e-07,0.875552,0.01638341,0.007864941,3.572487e-11,3.987491e-08,Colorectum__carcinoma
29,HLA-B,1,8,1,0,6,5.160554,11.138516,11.138516,13.132236,0.002332,0.0895086,0.003687634,0.0001474961,0.690477,0.8527777,0.7717527,8.389552e-06,0.005618483,Colorectum__carcinoma
47,HLA-A,1,3,2,0,5,1.844576,20.150373,20.150373,10.454854,0.399184,0.005453681,0.018298,0.0009488283,0.907935,0.8527777,0.947387,0.0002076671,0.08692167,Colorectum__carcinoma
14,B2M,0,3,0,1,3,10.066629,47.65457,47.65457,45.445549,0.008991,0.01889372,0.003372658,6.893604e-05,0.845466,0.8710051,0.836544,3.783756e-06,0.005067963,Skin__melanoma
28,B2M,2,15,9,4,16,3.73171,33.69819,33.69819,18.354051,0.000652,4.540812e-14,6.661338e-14,1.07365e-11,0.147281,4.561473e-11,4.317192e-11,0.0,0.0,pancancer
48,HLA-A,11,24,15,2,14,1.462092,10.899306,10.899306,5.92052,0.198323,9.631451e-11,7.114442e-10,2.431934e-05,0.883074,7.154733e-08,3.324099e-07,5.655476e-13,2.318861e-10,pancancer
64,HLA-B,12,26,7,2,19,1.548579,5.937452,5.937452,8.133033,0.126463,0.0001069171,0.0004471238,7.374858e-08,0.840779,0.03521427,0.07583199,8.288294e-10,2.561848e-07,pancancer


In [7]:
# Smallest qglobal_cv observed for each gene across all cohorts (NaNs are
# skipped). The string "min" is used instead of np.nanmin: passing the NumPy
# callable to groupby.agg is deprecated in pandas >= 2.1 and already resolves
# to the same groupby min.
df_total_d.groupby(["gene_name"]).agg({"qglobal_cv": "min"})

Unnamed: 0_level_0,qglobal_cv
gene_name,Unnamed: 1_level_1
APLNR,0.9999999
B2M,0.0
CALR,0.9999999
CD58,0.00446567
CIITA,0.9999999
HLA-A,2.318861e-10
HLA-B,2.561848e-07
HLA-C,0.8043624
IFNGR1,0.9999999
IFNGR2,0.9999999


In [8]:
# Display the confidence-interval rows of the "pancancer" cohort only.
df_total.loc[df_total["ttype"] == "pancancer"]

Unnamed: 0,gene,mis_mle,tru_mle,mis_low,tru_low,mis_high,tru_high,ttype
1311,APLNR,0.776854,0.797481,0.510975,0.130395,1.175064,2.569402,pancancer
1821,B2M,3.73171,33.69819,1.771751,15.470161,7.966608,73.417059,pancancer
2920,CALR,1.661569,1.355356,0.897876,0.214942,3.144119,4.727889,pancancer
3383,CD58,2.318056,5.389002,1.12718,1.714391,4.94309,14.534938,pancancer
3819,CIITA,0.878985,0.564659,0.597035,0.137159,1.299806,1.540059,pancancer
7947,HLA-A,1.462092,10.899306,0.818425,5.707021,2.624957,20.460065,pancancer
7948,HLA-B,1.548579,5.937452,0.883128,2.602499,2.733253,12.407409,pancancer
7949,HLA-C,1.876695,0.842009,1.029243,0.046822,3.491726,4.076144,pancancer
8326,IFNGR1,1.197972,1.976353,0.59754,0.455646,2.452043,6.05688,pancancer
8327,IFNGR2,0.813689,1.604058,0.329886,0.253658,1.850276,5.637941,pancancer


### PCAWG

In [9]:
# Load the dndscv outputs found in the pcawg directory. For every
# "<cohort>.ci.tsv.gz" file the sibling "<cohort>.results.tsv.gz" file is read
# as well; both tables are restricted to the genes in `total` and tagged with
# the cohort label before being stacked. Note that `df_total` and
# `df_total_d` are rebound here.
pcawg_ci_pattern = "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/pcawg/positive_selection/dndscv/*.ci.tsv.gz"
l, l1 = [], []
for filein in glob.glob(pcawg_ci_pattern):
    # Cohort label = file name up to its first dot.
    ttype = os.path.basename(filein).split(".")[0]
    df = pd.read_csv(filein, sep="\t")
    df_d = pd.read_csv(filein.replace(".ci.", ".results."), sep="\t")
    # .assign() builds a new frame, so the label is never written into a
    # filtered slice of the parsed table (avoids SettingWithCopyWarning).
    df = df[df["gene"].isin(total)].assign(ttype=ttype)
    df_d = df_d[df_d["gene_name"].isin(total)].assign(ttype=ttype)
    l.append(df)
    l1.append(df_d)
# pd.concat() on an empty list only reports "No objects to concatenate";
# fail with a message that points at the actual cause instead.
if not l:
    raise FileNotFoundError(f"No dndscv files matched: {pcawg_ci_pattern}")
df_total = pd.concat(l)
df_total_d = pd.concat(l1)

In [10]:
# Complete table ordered by ascending qglobal_cv, written gzip-compressed and
# without the index.
(
    df_total_d
    .sort_values("qglobal_cv")
    .to_csv(
        "../results/data/dndscv_output_immune_genes_pcawg.tsv.gz",
        sep="\t",
        index=False,
        compression="gzip",
    )
)
# Rows with qglobal_cv below 0.1; this file keeps the index as its first column.
df_total_d.loc[df_total_d["qglobal_cv"] < 0.1].to_csv(
    "../results/data/MUT_positive_selection_PCAWG.tsv",
    sep="\t",
)

In [11]:
# Display the rows whose qglobal_cv is below 0.1.
df_total_d.loc[df_total_d["qglobal_cv"] < 0.1]

Unnamed: 0,gene_name,n_syn,n_mis,n_non,n_spl,n_ind,wmis_cv,wnon_cv,wspl_cv,wind_cv,pmis_cv,ptrunc_cv,pallsubs_cv,pind_cv,qmis_cv,qtrunc_cv,qallsubs_cv,pglobal_cv,qglobal_cv,ttype
3,B2M,0,11,5,1,5,61.618363,371.50036,371.50036,108.028419,1.71153e-09,1.143907e-11,1.509903e-14,1.029211e-05,4.912334e-06,7.660746e-08,7.583867e-11,0.0,0.0,Diffuse__large__B-cell__lymphoma
13,CD58,0,0,2,2,1,0.0,258.96158,258.96158,65.682088,0.5936073,3.161678e-08,1.188904e-07,0.01463931,0.8036403,9.074468e-05,0.0001492892,3.684426e-08,5.287414e-05,Diffuse__large__B-cell__lymphoma
18,HLA-B,0,5,2,1,1,22.475429,177.715,177.715,31.913489,0.0001337234,2.174982e-06,4.686941e-07,0.02896927,0.07462878,0.003972507,0.0004708266,2.59536e-07,0.0002744388,Diffuse__large__B-cell__lymphoma
9,B2M,1,17,6,1,7,14.978107,65.820672,65.820672,33.929853,2.6527e-11,1.520341e-10,0.0,2.535773e-08,4.099646e-08,2.349628e-07,0.0,0.0,0.0,pancancer
28,HLA-A,3,5,7,2,5,1.403459,25.353874,25.353874,9.378269,0.5336295,1.989837e-09,1.429466e-08,0.0005923218,0.9736949,2.498613e-06,1.025693e-05,2.243328e-10,1.554162e-07,pancancer
35,HLA-B,1,13,2,2,7,4.697558,15.264558,15.264558,13.51312,0.0002746188,0.000290679,1.10588e-05,9.405607e-06,0.1103473,0.1358147,0.004534335,2.494949e-09,1.392389e-06,pancancer
68,CD58,0,3,3,2,3,1.301947,18.968152,18.968152,9.086144,0.6981241,2.321816e-05,0.0001261295,0.006334899,0.9839546,0.01504761,0.04445735,1.201714e-05,0.003499077,pancancer


In [12]:
# Display the confidence-interval rows of the "pancancer" cohort only.
df_total.loc[df_total["ttype"] == "pancancer"]

Unnamed: 0,gene,mis_mle,tru_mle,mis_low,tru_low,mis_high,tru_high,ttype
1311,APLNR,1.928298,0.0,0.877057,0.0,4.267048,6.169914,pancancer
1821,B2M,14.978107,65.820672,7.000784,23.972262,33.716361,170.398574,pancancer
2920,CALR,0.559939,0.0,0.129846,0.0,1.69337,3.685393,pancancer
3383,CD58,1.301947,18.968152,0.289591,5.846961,4.338929,54.551348,pancancer
3819,CIITA,0.764091,3.527758,0.373166,1.178649,1.505339,8.643785,pancancer
7947,HLA-A,1.403459,25.353874,0.447406,10.374602,3.769648,59.644766,pancancer
7948,HLA-B,4.697558,15.264558,2.074182,4.197229,10.933958,45.407432,pancancer
7949,HLA-C,3.122262,3.758362,1.24451,0.205062,7.699601,19.559046,pancancer
8326,IFNGR1,1.28679,0.0,0.439064,0.0,3.445995,4.630625,pancancer
8327,IFNGR2,0.376331,2.988823,0.020613,0.163709,1.929414,15.323415,pancancer
