In [1]:
import glob
import os
import pandas as pd
import numpy as np
import tqdm as tqdm
import matplotlib.pyplot as plt
import seaborn as sns

----

### Important

For each cancer type as well as the pan-cancer cohort we run dndscv with default options in both Hartwig and PCAWG samples:

1. To estimate global MHC-I dN/dS ratios (combining HLA-A, HLA-B and HLA-C). This is done in a pan-cancer manner only.
2. To estimate the gene-specific dN/dS ratios of the MHC-I genes in a pan-cancer manner.
3. To identify positively selected genes, based on recurrent mutation, among the list of 21 genes selected for analysis in this publication (see Supp. Table 1). This analysis is performed in a pan-cancer and cancer-type-specific manner.

The script scripts/positive_selection/mutations/run_driver_pipeline_dndscv.py was used to run the pan-cancer and cancer-type-specific analyses for both Hartwig and PCAWG.
This script uses scripts/positive_selection/mutations/run_dndscv.R to compute the dndscv driver genes and dN/dS ratios.

### Genes

In [2]:
# Gene sets used throughout this notebook, one list of gene symbols per group.

# HLA class I locus
mhc_I = ["HLA-A", "HLA-B", "HLA-C"]

# APP related proteins
transport_mhc = ["TAP1", "TAP2", "TAPBP"]
scaffold_mhc = ["B2M", "CALR"]

# Interferon
interferon = [
    "JAK1",
    "JAK2",
    "STAT1",
    "IRF2",
    "APLNR",
    "IFNGR1",
    "IFNGR2",
]

# Transcription factors of the HLA locus
tfs = ["NLRC5", "RFX5", "CIITA"]

# CD58
cd58_nk = ["CD58"]

# Group label -> gene list. The values are the list objects defined above.
d_data = {
    "MHC-I": mhc_I,
    "transport_mhc": transport_mhc,
    "scaffold_mhc": scaffold_mhc,
    "IFN-gamma": interferon,
    "Activators-MHC": tfs,
    "cd58_nk": cd58_nk,
}

# Hex colours keyed by status label (presumably for plots; not used in the
# cells shown here).
pallete = {
    "wt": "#ef8a62",
    "alteration": "#67a9cf",
    "alteration_primary": "#7570b3",
}

In [3]:
# Flatten every group's gene list into a single list, keeping the group order
# of d_data and the gene order within each group (duplicates are not removed).
total = [gene for group_genes in d_data.values() for gene in group_genes]

### Metastasis

In [None]:
# Load the Hartwig (HMF) dndscv outputs, restricted to the genes in `total`.
#
# Every "<cohort>.ci.tsv.gz" file is read together with its sibling
# "<cohort>.results.tsv.gz"; the cohort label (file name up to the first dot)
# is stored in a "ttype" column. The per-cohort tables are then stacked into
#   df_total   - rows of the *.ci.* files      (gene column: "gene")
#   df_total_d - rows of the *.results.* files (gene column: "gene_name")
HMF_DNDSCV_DIR = "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/hmf/positive_selection/dndscv"

# glob returns files in arbitrary order; sorting keeps the row order of the
# concatenated tables (and of anything written from them) reproducible.
hmf_ci_files = sorted(glob.glob(os.path.join(HMF_DNDSCV_DIR, "*.ci.tsv.gz")))
if not hmf_ci_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No dndscv '*.ci.tsv.gz' files found in {HMF_DNDSCV_DIR}")

hmf_ci_frames, hmf_results_frames = [], []
for ci_path in hmf_ci_files:
    ci_name = os.path.basename(ci_path)
    cohort = ci_name.split(".")[0]
    # Swap the marker in the file name only, never in the directory part.
    results_path = os.path.join(os.path.dirname(ci_path), ci_name.replace(".ci.", ".results."))

    ci_table = pd.read_csv(ci_path, sep="\t")
    results_table = pd.read_csv(results_path, sep="\t")

    # .assign() returns a new frame, so the label is never written onto a
    # filtered slice (which is what triggers SettingWithCopyWarning).
    hmf_ci_frames.append(ci_table[ci_table["gene"].isin(total)].assign(ttype=cohort))
    hmf_results_frames.append(
        results_table[results_table["gene_name"].isin(total)].assign(ttype=cohort)
    )

df_total = pd.concat(hmf_ci_frames)
df_total_d = pd.concat(hmf_results_frames)

In [None]:
# Export 1: every retained gene/cohort row, ordered by ascending qglobal_cv,
# written gzip-compressed and without the index.
hmf_ranked = df_total_d.sort_values("qglobal_cv")
hmf_ranked.to_csv(
    "../results/data/dndscv_output_immune_genes_hmf.tsv.gz",
    sep="\t",
    index=False,
    compression="gzip",
)

# Export 2: only rows with qglobal_cv below 0.1, in their current order.
# The index is written as the first column here (to_csv default).
hmf_significant = df_total_d.loc[df_total_d["qglobal_cv"] < .1]
hmf_significant.to_csv("../results/data/MUT_positive_selection_HMF.tsv", sep="\t")

In [None]:
# Show the gene/cohort rows whose qglobal_cv is below 0.1.
df_total_d.loc[df_total_d["qglobal_cv"].lt(0.1)]

In [None]:
# Smallest qglobal_cv observed for each gene across all rows of df_total_d
# (missing values are skipped).
# The string alias "min" is used instead of np.nanmin: pandas already maps
# that callable to the built-in groupby min, but pandas >= 2.1 emits a
# FutureWarning for it and will call it directly in a future version.
df_total_d.groupby(["gene_name"]).agg({"qglobal_cv": "min"})

In [None]:
# Show the rows of the *.ci.* table that come from the "pancancer" file.
df_total.loc[df_total["ttype"].eq("pancancer")]

### PCAWG

In [None]:
# Load the PCAWG dndscv outputs, restricted to the genes in `total`.
#
# Every "<cohort>.ci.tsv.gz" file is read together with its sibling
# "<cohort>.results.tsv.gz"; the cohort label (file name up to the first dot)
# is stored in a "ttype" column. The per-cohort tables are then stacked into
#   df_total   - rows of the *.ci.* files      (gene column: "gene")
#   df_total_d - rows of the *.results.* files (gene column: "gene_name")
# NOTE: this rebinds df_total / df_total_d, which held the Hartwig tables in
# the previous section of the notebook.
PCAWG_DNDSCV_DIR = "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/processed/pcawg/positive_selection/dndscv"

# glob returns files in arbitrary order; sorting keeps the row order of the
# concatenated tables (and of anything written from them) reproducible.
pcawg_ci_files = sorted(glob.glob(os.path.join(PCAWG_DNDSCV_DIR, "*.ci.tsv.gz")))
if not pcawg_ci_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No dndscv '*.ci.tsv.gz' files found in {PCAWG_DNDSCV_DIR}")

pcawg_ci_frames, pcawg_results_frames = [], []
for ci_path in pcawg_ci_files:
    ci_name = os.path.basename(ci_path)
    cohort = ci_name.split(".")[0]
    # Swap the marker in the file name only, never in the directory part.
    results_path = os.path.join(os.path.dirname(ci_path), ci_name.replace(".ci.", ".results."))

    ci_table = pd.read_csv(ci_path, sep="\t")
    results_table = pd.read_csv(results_path, sep="\t")

    # .assign() returns a new frame, so the label is never written onto a
    # filtered slice (which is what triggers SettingWithCopyWarning).
    pcawg_ci_frames.append(ci_table[ci_table["gene"].isin(total)].assign(ttype=cohort))
    pcawg_results_frames.append(
        results_table[results_table["gene_name"].isin(total)].assign(ttype=cohort)
    )

df_total = pd.concat(pcawg_ci_frames)
df_total_d = pd.concat(pcawg_results_frames)

In [None]:
# Export 1: every retained gene/cohort row, ordered by ascending qglobal_cv,
# written gzip-compressed and without the index.
pcawg_ranked = df_total_d.sort_values("qglobal_cv")
pcawg_ranked.to_csv(
    "../results/data/dndscv_output_immune_genes_pcawg.tsv.gz",
    sep="\t",
    index=False,
    compression="gzip",
)

# Export 2: only rows with qglobal_cv below 0.1, in their current order.
# The index is written as the first column here (to_csv default).
pcawg_significant = df_total_d.loc[df_total_d["qglobal_cv"] < .1]
pcawg_significant.to_csv("../results/data/MUT_positive_selection_PCAWG.tsv", sep="\t")

In [None]:
# Show the gene/cohort rows whose qglobal_cv is below 0.1.
df_total_d.loc[df_total_d["qglobal_cv"].lt(0.1)]

In [None]:
# Show the rows of the *.ci.* table that come from the "pancancer" file.
df_total.loc[df_total["ttype"].eq("pancancer")]