In [1]:
import os 
import pandas as pd
    
import json
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import glob
import allel
import itertools
from matplotlib import gridspec
import matplotlib.pyplot as plt
import pickle
from statannot import add_stat_annotation
from scipy import stats
import matplotlib.cm as cm
import matplotlib
import random

from math import pi
import scipy

# Notebook display settings: show up to 100 rows and 200 columns of a DataFrame.
pd.options.display.max_rows=100
import warnings
# SettingWithCopyWarning is exposed under pandas.errors in newer pandas releases and
# under pandas.core.common in older ones; try both so this cell runs on either.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # The installed pandas does not provide the warning class: nothing to silence.
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pd.options.display.max_columns=200

# Important: 

The file with the processed mutation and CNV rates of 100 randomly selected genes was generated by the scripts in 0_process_data/GIE_events/background_genes/random_genes/. Please check them carefully.

The output of the scripts is a hmf_reports_background_genes.tsv.gz file, which harbours the somatic alterations in the 100 randomly selected genes across Hartwig samples.



In [2]:
# Location of hmf_reports_background_genes.tsv.gz (somatic alterations in the randomly
# selected background genes across Hartwig samples).
# The default is the original author's local mount; set the HMF_BACKGROUND_GENES_REPORT
# environment variable to run the notebook on another machine without editing this cell.
path_escape=os.environ.get(
    "HMF_BACKGROUND_GENES_REPORT",
    "/home/fran/Documents/cuppen/HPC/tunnel/cuppen/projects/P0020_genetics_immune_escape/large_scale_primary_met/analysis/results/immune_escape/hmf_reports_background_genes.tsv.gz",
)

### Load metadata

In [6]:
def prepare_metadata():
    """Load the dataset metadata table and keep the selected Hartwig samples.

    Reads ``../metadata/dataset_metadata_supp_table3.tsv`` (tab-separated) and
    returns only the rows whose ``is_selected`` equals True and whose ``cohort``
    is ``"Hartwig"``. The original row index is preserved.
    """
    metadata = pd.read_csv("../metadata/dataset_metadata_supp_table3.tsv", sep="\t")
    selected = metadata["is_selected"] == True
    hartwig = metadata["cohort"] == "Hartwig"
    return metadata[selected & hartwig]

# Background-gene report, with the "sample" column renamed so it matches the metadata key.
df = (
    pd.read_csv(path_escape, sep="\t")
    .rename(columns={"sample": "sample_id"})
)

# Keep only samples present in the selected Hartwig metadata.
# NOTE(review): no `on=` is given, so the merge joins on every column the two tables share.
df_metadata = prepare_metadata()
df = df.merge(df_metadata)

summary_cohort = pd.read_csv("../metadata/table_cancer_types_info.tsv", sep="\t")

In [7]:
# Sanity check: (rows, columns) of the report after merging with the Hartwig metadata.
df.shape

(4439, 455)

In [8]:
# Pool of background genes to sample from: the "gene" column of background_genes.tsv.
genes_selected=list(pd.read_csv("../external_data/background_genes.tsv",sep="\t")["gene"].values)

### Define alterations to be considered

In [10]:
# Table describing the immune-escape genes; only rows flagged in the "Selected" column
# are kept (the column is used directly as a boolean mask).
alterations = pd.read_csv("../metadata/immune_selected_genes.tsv",sep="\t")
alterations=alterations[alterations["Selected"]]

In [11]:
# Consequence terms that count on their own (monoallelic).
allowed_csq = {
    "frameshift_variant",
    "stop_gained",
    "stop_lost",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "splice_region_variant",
    "start_lost",
}
# Consequence terms that only count when biallelic (or, in clonal_truncating,
# when several monoallelic hits affect the same gene).
allowed_csq_biallelic = {
    "missense_variant",
    "structural_interaction_variant",
    "conservative_inframe_deletion",
    "conservative_inframe_insertion",
    "disruptive_inframe_deletion",
    "disruptive_inframe_insertion",
}

def non_synonymous(data,genes):
    """Flag entries that carry at least one impactful mutation in any of `genes`.

    Parameters
    ----------
    data : iterable
        One entry per sample. A string entry holds mutation records joined by
        "__"; each record is ";"-separated with the gene in the first field and
        the consequence in the second. Non-string entries (e.g. NaN) are
        treated as "no mutation".
    genes : container of str
        Gene names to look for.

    Returns
    -------
    list of bool
        One flag per entry, True when a record of one of `genes` has a
        consequence term in `allowed_csq` or `allowed_csq_biallelic`.
    """
    impactful = set(allowed_csq).union(allowed_csq_biallelic)
    flags = []
    for entry in data:
        found = False
        if isinstance(entry, str) and ";" in entry:
            for mut in entry.split("__"):
                fields = mut.split(";")
                if len(fields) < 2:
                    # Record without a consequence field: nothing to evaluate.
                    continue
                # Compound consequences are "&"-joined (clonal_truncating splits them
                # the same way); match on any individual term rather than on the
                # whole string, otherwise e.g. "missense_variant&splice_region_variant"
                # would be missed.
                terms = fields[1].split("&")
                if fields[0] in genes and not impactful.isdisjoint(terms):
                    found = True
                    break
        flags.append(found)
    return flags

def is_loh(df,gene):
    """Return a boolean Series that is True where the ``loh_<gene>`` column is positive.

    Only a single gene is evaluated, because of the high proximity of HLA-I genes.
    """
    loh_column = f"loh_{gene}"
    return df[loh_column].gt(0)
    
def is_loh_focal(df,gene):
    """Flag samples with LOH of `gene` whose ``loh_focal`` entry mentions the gene.

    Only a single gene is evaluated, because of the high proximity of HLA-I genes.

    Returns a boolean Series: ``loh_<gene>`` is positive AND the gene name occurs
    in the ``loh_focal`` string of the same row.
    """
    has_loh = df["loh_"+gene]>0
    # regex=False: the gene symbol is a literal name, not a pattern (symbols may contain
    # characters such as "." that a regular expression would interpret).
    # na=False: a missing loh_focal value means the gene is not listed.
    # NOTE(review): this is still a substring match; confirm the column's delimiter if
    # gene names can be prefixes of one another.
    listed = df["loh_focal"].str.contains(gene, regex=False, na=False)
    return has_loh & listed

def is_loh_nonfocal(df,gene):
    """Flag samples with LOH of `gene` whose ``loh_nonfocal`` entry mentions the gene.

    Only a single gene is evaluated, because of the high proximity of HLA-I genes.

    Returns a boolean Series: ``loh_<gene>`` is positive AND the gene name occurs
    in the ``loh_nonfocal`` string of the same row.
    """
    has_loh = df["loh_"+gene]>0
    # regex=False: match the gene symbol literally instead of as a regular expression.
    # na=False: a missing loh_nonfocal value means the gene is not listed.
    # NOTE(review): this is still a substring match; confirm the column's delimiter if
    # gene names can be prefixes of one another.
    listed = df["loh_nonfocal"].str.contains(gene, regex=False, na=False)
    return has_loh & listed

def is_loh_hfocal(df,gene):
    """Flag samples with LOH of `gene` whose ``loh_hfocal`` entry mentions the gene.

    Only a single gene is evaluated, because of the high proximity of HLA-I genes.

    Returns a boolean Series: ``loh_<gene>`` is positive AND the gene name occurs
    in the ``loh_hfocal`` string of the same row.
    """
    has_loh = df["loh_"+gene]>0
    # regex=False: match the gene symbol literally instead of as a regular expression.
    # na=False: a missing loh_hfocal value means the gene is not listed.
    # NOTE(review): this is still a substring match; confirm the column's delimiter if
    # gene names can be prefixes of one another.
    listed = df["loh_hfocal"].str.contains(gene, regex=False, na=False)
    return has_loh & listed

def is_deepdel(df,genes):
    """Return a boolean Series, True where the ``del_<gene>`` columns of `genes` sum to more than zero."""
    per_gene = df[[f"del_{g}" for g in genes]]
    return per_gene.sum(axis=1).gt(0)

def get_cn_neutral(df,gene):
    """Flag rows where `gene` has LOH and a major-allele value above one.

    ``ploidy_minor_major_<gene>`` is read as a "minor,major" string; a row is
    flagged when the integer major value is greater than 1 and ``loh_<gene>``
    equals True. Returns a list with one flag per row.
    """
    ploidies = df[f"ploidy_minor_major_{gene}"].values
    loh_flags = list(df[f"loh_{gene}"].values)
    flags = []
    for row, ploidy in enumerate(ploidies):
        _minor, major = ploidy.split(",")
        # Without more than one copy of the major allele the row is never flagged.
        flags.append(loh_flags[row] == True if int(major) > 1 else False)
    return flags

def is_amp(df,genes):
    """Return a boolean Series, True where the ``amp_<gene>`` columns of `genes` sum to more than zero."""
    per_gene = df[[f"amp_{g}" for g in genes]]
    return per_gene.sum(axis=1).gt(0)
    
def clonal_truncating(data,genes):
    """Flag entries with a clonal, likely inactivating alteration in any of `genes`.

    Each string entry holds "__"-joined mutation records with seven ";"-separated
    fields: gene, consequence ("&"-joined terms), allelic status, clonality,
    and three further values that are not used here. Entries without ";" (e.g.
    NaN) are reported as False.

    A gene counts as hit when one of its clonal mutations
      * has a consequence term in `allowed_csq`, or
      * is biallelic with a term in `allowed_csq_biallelic`,
    or when it carries more than one clonal monoallelic mutation with a term in
    `allowed_csq_biallelic`.

    Returns a list of bool, one per entry.
    """
    flags = []
    for entry in data:
        hit = False
        if ";" in str(entry):
            for gene in genes:
                monoallelic_hits = 0
                for record in entry.split("__"):
                    gene_mut, csq, allelic_status, clonality_purple, af, cn, macn = record.split(";")
                    if gene_mut != gene or not clonality_purple.startswith("clonal"):
                        continue
                    terms = set(csq.split("&"))
                    if not terms.isdisjoint(allowed_csq):
                        # At least one clonal deleterious mutation.
                        hit = True
                        break
                    if terms.isdisjoint(allowed_csq_biallelic):
                        continue
                    if allelic_status == "biallelic":
                        # Biallelic non-synonymous, likely impactful mutation (e.g. missense).
                        hit = True
                        break
                    if allelic_status == "monoallelic":
                        # Several of these in the same gene also count.
                        monoallelic_hits += 1
                if hit or monoallelic_hits > 1:
                    # No need to look at the remaining genes.
                    hit = True
                    break
        flags.append(hit)
    return flags

def evaluate_type_muts(df,category,genes,is_hla):
    """Compute the per-sample alteration flags for one alteration category.

    Parameters
    ----------
    df : DataFrame
        Per-sample report passed on to the helper functions.
    category : str
        One of "Non-synonymous, LOH, deep deletion", "Truncating, deep deletion"
        or "Amplification".
    genes : list of str
        Genes to evaluate. For the first category, LOH and deep deletion are
        evaluated on ``genes[0]`` only (a single gene, due to high proximity).
    is_hla :
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple (values, suffixes)
        ``values`` is a list of per-sample flag collections and ``suffixes``
        the matching list of column-name suffixes (same length and order).

    Raises
    ------
    ValueError
        If `category` is not one of the supported categories. Without this,
        the function would return None and the caller would fail later with
        an unrelated unpacking error.
    """
    if category == "Non-synonymous, LOH, deep deletion":
        first_gene = genes[0]  # LOH / deep deletion use one gene only, due to high proximity
        muts_genes = non_synonymous(df["muts_genes"].values,genes)
        loh = is_loh(df,first_gene)
        loh_focal = is_loh_focal(df,first_gene)
        loh_nonfocal = is_loh_nonfocal(df,first_gene)
        loh_hfocal = is_loh_hfocal(df,first_gene)
        loh_is_copy_number_neutral = get_cn_neutral(df,first_gene)
        deep_dels = is_deepdel(df,[first_gene])
        return ([muts_genes,loh,deep_dels,loh_focal,loh_nonfocal,loh_hfocal,loh_is_copy_number_neutral],["_mut","_loh","_deepdel","_loh_focal","_loh_nonfocal","_loh_hfocal","_loh_cn_neutral"])
    elif category == "Truncating, deep deletion":
        muts_genes = clonal_truncating(df["muts_genes"].values,genes)
        deep_dels = is_deepdel(df,genes)
        return ([muts_genes,deep_dels],["_mut","_deepdel"])
    elif category == "Amplification":
        amps = is_amp(df,genes)
        return ([amps],["_amp"])
    raise ValueError(f"Unsupported alteration category: {category!r}")
    
        
        
        
    

### Annotate raw alterations in randomized genes

In [12]:
# For every pathway, draw as many random background genes as the pathway has rows in
# `alterations`, evaluate the pathway's alteration categories on those genes, and store
# the resulting per-sample flags in df as "<pathway without spaces><suffix>_<iteration>".
n_randomizations = 100  # random gene sets drawn per pathway
for k,pathway in enumerate(alterations["Pathway_general"].unique()):
    pt = alterations[alterations["Pathway_general"]==pathway]
    # Compare the pathway name, not the DataFrame `pt` (which produced a boolean frame).
    is_hla = pathway == "HLA-I"
    ngenes = pt.shape[0]
    column_prefix = pathway.replace(' ','')
    for i in range(n_randomizations):
        # One distinct seed per (pathway, iteration). The previous `i+k` gave the same
        # seed to different pairs (e.g. pathway 0 / iteration 1 and pathway 1 /
        # iteration 0), so the draws were not independent.
        # NOTE(review): this changes the random draws relative to earlier runs.
        random.seed(k*n_randomizations + i)
        it_genes = random.sample(genes_selected,ngenes)
        for type_mut in pt["Type mutations"].unique():
            # For LOH and CNV only one gene is evaluated, due to high proximity.
            values,names = evaluate_type_muts(df,type_mut,it_genes,is_hla)
            for suffix,flags in zip(names,values):
                df[f"{column_prefix}{suffix}_{i}"] = flags

### Group them by "pathway" (mimicking the GIE pathways); this is performed across the 100 randomizations

In [13]:
# Collapse the per-pathway flags of each randomization into pathway-level and
# overall escape indicators (logical OR of the contributing columns).
for i in range(100):
    tag = f"_{i}"

    # Targeted escape: any alteration of the simulated HLA-I gene set.
    df["targeted_escape" + tag] = (
        df["HLA-I_deepdel" + tag] | df["HLA-I_loh" + tag] | df["HLA-I_mut" + tag]
    )

    # Individual non-targeted pathways.
    df["systemic_app_pathway" + tag] = (
        df["AntigenPresentationPathway_deepdel" + tag] | df["AntigenPresentationPathway_mut" + tag]
    )
    df["cd58_pathway" + tag] = (
        df["CD58immunereceptor_deepdel" + tag] | df["CD58immunereceptor_mut" + tag]
    )
    df["ifn_gamma_pathway" + tag] = (
        df["IFN-gammapathway_deepdel" + tag] | df["IFN-gammapathway_mut" + tag]
    )
    df["checkpoint_pathway" + tag] = df["Immunecheckpointinhibition_amp" + tag]
    df["epigenetic_regulators_pathway" + tag] = df["Epigeneticimmuneregulation_amp" + tag]

    # Any non-targeted pathway altered.
    df["non_targeted_escape" + tag] = (
        df["systemic_app_pathway" + tag]
        | df["ifn_gamma_pathway" + tag]
        | df["checkpoint_pathway" + tag]
        | df["cd58_pathway" + tag]
        | df["epigenetic_regulators_pathway" + tag]
    )

    # Overall simulated genetic immune escape.
    df["genetic_immune_escape" + tag] = (
        df["targeted_escape" + tag] | df["non_targeted_escape" + tag]
    )
    
    

### Store the output; this will be used as the simulated GIE

In [14]:
# Write the annotated table (original columns plus all randomized columns added above)
# as a gzip-compressed, tab-separated file without the index.
df.to_csv("../results/data/hmf_randomized_genes_background_processed.tsv.gz",sep="\t",index=False,compression="gzip")