# Part 1: Create database set
By inputting a target gene, a nested dictionary will be created, with each key being the name of a cancer and each value being a dictionary containing the genomic, proteomic, transcriptomic, and phosphoproteomic data. In other words, the dictionary will be:

{
    cancer : {   
    
            "gen" : <cancer's genomic data>,
            "prot" : <cancer's proteomic data>,
            "trans": <cancer's transcriptomic data>,
            "phos": <cancer's phosphoproteomic data>
        }
    ... for each cancer
}

In [1]:
import cptac
import cptac.pancan as pc
import numpy as np
import cptac.utils as ut
import plot_utils as plt
import pandas as pd

In [2]:
dataset_names = pc.list_datasets(False)
dataset_names  = dataset_names.split('\n')
dataset_names

['PancanBrca',
 'PancanCcrcc',
 'PancanCoad',
 'PancanGbm',
 'PancanHnscc',
 'PancanLscc',
 'PancanLuad',
 'PancanOv',
 'PancanUcec',
 'PancanPdac']

In [3]:
target_gene = "TP53"

In [4]:
datasets = {}
for name in dataset_names:
    exec(f"cancer = pc.{name}(no_internet = True)")  
    genotype_data = cancer.get_genotype_all_vars(mutations_genes = target_gene, omics_source = "washu")
    
    proteomic_data = cancer.get_proteomics(source = 'umich')
    proteomic_data = proteomic_data.droplevel("Database_ID", axis = 1)
    
    phospho_data = cancer.get_phosphoproteomics()
    phospho_data.loc[:, phospho_data.columns.get_level_values(0) == "TP53"].droplevel(["Database_ID", "Peptide"], axis = 1)
    
    transcript_data = cancer.get_transcriptomics(source = "washu")
    transcript_data = transcript_data.droplevel("Database_ID", axis = 1)
    
    datasets[name[6:]] = {
                        "gen" : genotype_data,
                        "prot": proteomic_data,
                        "trans": transcript_data,
                        "phos" : phospho_data
                         }    

Loading broadbrca v1.0...       



  result = parse_gtf(


  result = parse_gtf(


                                    



                                         



                                    



                                      



                                         



                                        



                                        



                                   



                                        



                                        



# Part 2: Select mutations
Determine which cancers have mutation types with rates above 20%.

To facilitate that, the mutation types from the get_genotype_all_vars must be grouped into functional types.


In [52]:
mutation_types = ["Deletion", "Indel", "Missense", "Wildtyp]

In [6]:
# Grouping the mutation types for each cancer
for name, dataset in datasets.items():
    genotypes = dataset["gen"]
    genotypes.replace(["Nonsense_Mutation", "Frame_Shift_Ins", "Frame_Shift_Del"], "Truncation", inplace = True)
    genotypes.replace(["In_Frame_Del", "In_Frame_Ins"], "Indel", inplace = True)
    genotypes.replace(["Missense_Mutation", "Wildtype_Tumor"], ["Missense", "Wildtype"], inplace = True)
    genotypes = genotypes.loc[genotypes["Mutation"] != "Silent"]
    datasets[name]["gen"] = genotypes


In [7]:
# Create a bargraph displaying the mutation types

In [8]:
# Determine which cancers have mutation rates over 20%
variants_to_analyze = {}
for name, dataset in datasets.items():
    mut_type_freq = dataset["gen"]["Mutation"].value_counts() / len(dataset["prot"])
    mut_type_freq = mut_type_freq.loc[mut_type_freq >= 0.2]
    variants_to_analyze[name] = set(mut_type_freq.index.tolist())
    variants_to_analyze[name].discard("Wildtype")
print(str(variants_to_analyze))

{'Brca': {'Deletion'}, 'Ccrcc': set(), 'Coad': {'Deletion'}, 'Gbm': set(), 'Hnscc': {'Missense'}, 'Lscc': set(), 'Luad': set(), 'Ov': {'Deletion', 'Missense'}, 'Ucec': set(), 'Pdac': set()}


# Part 3: Cis effects
Determine which cancers alter their protein expression level of the target gene due to a mutation in that gene (with Wildtype as a control).
Writes out a file containing the infomation for each mutation and cancer, and creates boxplot summarizing the data

In [24]:
from scipy.stats import ttest_ind

In [43]:
# Create a function that will analyze the protein cis effects for a given mutation type
def analyze_cis_mutation(mutation_type, omics_type, write_out = True, pval_only = True):
    """Analyzes the cis effects for a given mutation type across all cancers.
    
    Parameters:
    mutation_type (str): The mutation type to analyze, as a string. Possible values given in mutation_types.
    omics_type (str): The omics to analyze. Possible values are in list ["prot", "trans", "phos"]
    write_out (bool, optional): Whether to write out the data to a separate file.
    pval_only (bool, optional): If write_out, only writes out the p_value for each analysis. If not write_out, has no effect.
    
    Returns the output of scipy.ttest_ind() between "Wildtype" and mutation_type tumors. If analyzing phosphoproteomics, returns a list of such outputs
    """
    if write_out:
        # Clears the target file
        file_name = omics_type.capitalize() + '_' + mutation_type.capitalize() + 's.txt'
        with open("Analysis_output/" + file_name, 'w') as out_file: pass
    print()
    print(omics_type)
    print(mutation_type)
    for name, dataset in datasets.items():
        print(name)
        if mutation_type not in variants_to_analyze[name]: continue
        
        # Separate the wildtype and mutated samples
        genotype_and_omic = pd.merge(dataset["gen"], dataset[omics_type], left_index = True, right_index = True)
        mutation = genotype_and_omic.loc[genotype_and_omic["Mutation"] == mutation_type][target_gene].dropna()
        wildtype = genotype_and_omic.loc[genotype_and_omic["Mutation"] == "Wildtype"][target_gene].dropna()
        
        # Phosphoproteomics must be analyzed at each site in the gene
        if omics_type != "phos":
            result = ttest_ind(mutation, wildtype)[1]
        else:
            result = [(site, ttest_ind(mutation[site], wildtype[site])[1]) for site in mutation.columns if site in wildtype.columns]
                
        if write_out:
            output_string = ""
            if not pval_only:
                output_string += f"{name}\n"
                output_string += f"Number of {mutation_type}s = {len(mutation)}\n"
                output_string += f"Number of Wildtypes = {len(wildtype)}\n"
            if omics_type != "phos":
                output_string += f"{name} {mutation_type} P-value = {result}\n\n"
            else:
                output_string += f"{name} {mutation_type} P-values:\n"
                for site in result:
                    output_string += f"   {site[0]}: {site[1]}\n"
                ouput_string += '\n'
            with open(file_name, 'a') as out_file:
                out_file.write(output_string)
                
    try:
        return result
    except UnboundLocalError:
        return



In [31]:
# analyze_cis_mutation("Deletion", "prot", True, False)

0.36078400842934055

In [53]:
#Now, run the analysis for every mutation_type, for every omics_type
for omics_type in ["prot"]:#, "trans", "phos"]:
    for mutation_type in mutation_types.remove("Wildtype"):
        analyze_cis_mutation(mutation_type, omics_type, write_out = True, pval_only = False)

TypeError: 'NoneType' object is not iterable

In [None]:
from scipy.stats import ttest_ind
with open("deletion_protein_data.txt", 'w') as outFile:
    for name, dataset in datasets.items():
        if name not in deletion_sets: continue
        outFile.write(name + '\n')
        genotype_and_prot = dataset[0].merge(dataset[1], left_index = True, right_index = True)
        deletions = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Deletion"][target_gene].dropna()
        wildtypes = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Wildtype_Tumor"][target_gene].dropna()
        outFile.write("len(deletions) = " + str(len(deletions)) + '\n')
        outFile.write("len(wildtypes) = " + str(len(wildtypes)) + '\n')
        result = ttest_ind(deletions, wildtypes)
        outFile.write(name + " deletion P-value: " + str(result[1]) + '\n')
        outFile.write("\n")
        

In [None]:
#Cancers with more than 20% having Missense and Wildtype
missense_sets = ["Luad", "Pdac"]
for name, dataset in datasets.items():
    if name not in missense_sets: continue
    print(name)
    genotype_and_prot = dataset[0].merge(dataset[1], left_index = True, right_index = True)
    missenses = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Missense_Mutation"][target_gene].dropna()
    wildtypes = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Wildtype_Tumor"][target_gene].dropna()
    print("len(missenses) = " + str(len(missenses)))
    print("len(wildtypes) = " + str(len(wildtypes)))
    result = ttest_ind(deletions, wildtypes)
    print(name + " missense P-value: " + str(result[1]))
    print("\n")
    
# genotype_and_prot = datasets["Luad"][1].merge(datasets["Luad"][2], left_index = True, right_index = True)
# genotype_and_prot["Mutation"]
# genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Missense_Mutation"]

Trans effects

In [None]:
paths = ut.get_pathways_with_proteins(proteins = target_gene, database = 'reactome')
paths

In [None]:
interacting_proteins = ut.get_proteins_in_pathways(paths["pathway_id"], database = 'reactome')
interacting_proteins = list(interacting_proteins["member"])

genes_measured = list(datasets["Brca"][1].columns)
new_genes = sorted(list(set([protein for protein in interacting_proteins if protein in genes_measured])))

In [None]:
interacting_proteins = set(interacting_proteins)
print(len(interacting_proteins))
genes_measured = set(genes_measured)
print(len(genes_measured))
print(len(new_genes))
new_genes

In [None]:
for dataset in datasets.values():
    print(dataset[1].iloc[:,0:5])

In [None]:
with open("deletion_trans_data.txt", 'w') as outFile:
    all_significant = {}
    for name, dataset in datasets.items():
        if name not in deletion_sets: continue
        print(name + '\n')
        outFile.write(name + '\n')
        significant_changes = {}
        for trans_gene in interacting_proteins:
            genotype_and_prot = dataset[0].merge(dataset[1], left_index = True, right_index = True)
            if trans_gene not in genotype_and_prot: continue
            deletions = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Deletion"][trans_gene].dropna()
            wildtypes = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Wildtype_Tumor"][trans_gene].dropna()
            result = ttest_ind(deletions, wildtypes)
            print(trans_gene + " P-value with " + name + " deletion: " + str(result[1]))
            outFile.write(name + " deletion P-value: " + str(result[1]))
        print('\n')
        outFile.write('\n')
