# Step 1:
Load data

In [1]:
import cptac
import cptac.pancan as pc
import numpy as np
import cptac.utils as ut
import plot_utils as plt
import pandas as pd
from scipy.stats import ttest_ind

In [2]:
dataset_names = pc.list_datasets(False)
dataset_names  = dataset_names.split('\n')
dataset_names

['PancanBrca',
 'PancanCcrcc',
 'PancanCoad',
 'PancanGbm',
 'PancanHnscc',
 'PancanLscc',
 'PancanLuad',
 'PancanOv',
 'PancanUcec',
 'PancanPdac']

In [3]:
target_gene = "PIK3CA"

In [4]:
# pbr = pc.PancanBrca()

In [5]:
datasets = {}
for name in dataset_names:
    exec(f"cancer = pc.{name}(no_internet = True)")  
    genotype_data = cancer.get_genotype_all_vars(mutations_genes = target_gene, omics_source = "washu")
    
    proteomic_data = cancer.get_proteomics(source = 'umich')
    proteomic_data = proteomic_data.droplevel("Database_ID", axis = 1)
    proteomic_data = proteomic_data.loc[:, [target_gene]]
    
    phospho_data = cancer.get_phosphoproteomics(source = "umich")
    phospho_data.loc[:, phospho_data.columns.get_level_values(0) == target_gene].droplevel(["Database_ID", "Peptide"], axis = 1)
    
    transcript_data = cancer.get_transcriptomics(source = "washu")
    transcript_data = transcript_data.droplevel("Database_ID", axis = 1)
    
    datasets[name[6:]] = [genotype_data, proteomic_data, transcript_data, phospho_data]    

Loading broadbrca v1.0...       



  result = parse_gtf(


  result = parse_gtf(


                                    



                                         



                                    



                                      



                                         



                                        



                                        



                                   



                                        



                                        



The dataset dictionary is structured as {cancer name : [genotype data, proteomic data, transcript data, phosphoproteomic data]}.

This dictionary will be used throughout the notebook to access data from all cancers simultaneously

# Step 2


Determine which cancers have mutation types with rates above 20%

In [6]:
# Rename the Mutations in the [genotype] dataframe for a standard output
for name, dataset in datasets.items():
    genotypes = dataset[0]
    genotypes.replace(["Nonsense_Mutation", "Frame_Shift_Ins", "Frame_Shift_Del"], "Truncation", inplace = True)
    genotypes.replace(["In_Frame_Del", "In_Frame_Ins"], "Indel", inplace = True)
    genotypes.replace(["Missense_Mutation", "Wildtype_Tumor"], ["Missense", "Wildtype"], inplace = True)
    genotypes = genotypes.loc[genotypes["Mutation"] != "Silent"]
    datasets[name][0] = genotypes

In [10]:
# Create graph showing the frequency of each mutation per cancer
# plt.figure1_plot_mutations(dflist = [dataset[0] for dataset in datasets.values()], names_of_df = list(datasets.keys()), save_to_path = "Mutation_Frequency.png")

AttributeError: module 'plot_utils' has no attribute 'figure1_plot_mutations'

In [11]:
# Create a dictionary with the cancer name as the key, and a list of all
#    mutation types that have a frequency >=20% as the value.
# Used at the beginning of each step to determine which cancers will be included in the analysis
variants_to_analyze = {}
for name, dataset in datasets.items():
    mut_type_freq = dataset[0]["Mutation"].value_counts() / len(dataset[0])
    mut_type_freq = mut_type_freq.loc[mut_type_freq >= 0.2]
    variants_to_analyze[name] = set(mut_type_freq.index.tolist())
    variants_to_analyze[name].discard("Wildtype")
print(str(variants_to_analyze))


{'Brca': {'Missense'}, 'Ccrcc': set(), 'Coad': {'Missense'}, 'Gbm': set(), 'Hnscc': {'Amplification'}, 'Lscc': {'Amplification'}, 'Luad': set(), 'Ov': {'Amplification'}, 'Ucec': {'Missense'}, 'Pdac': set()}


#Step 3
Look at how proteomic expression differs between the mutated tumors and wildtype tumors

In [12]:
# Create a function that will analyze the protein cis effects for a given mutation type
def analyze_mutation(mutation_type, pval_only = True):
    output_string = ""
    for name, dataset in datasets.items():
        if mutation_type not in variants_to_analyze[name]: continue
        genotype_and_prot = pd.merge(dataset[0], dataset[1], left_index = True, right_index = True)
        mutation = genotype_and_prot.loc[genotype_and_prot["Mutation"] == mutation_type][target_gene].dropna()
        wildtype = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Wildtype"][target_gene].dropna()
        result = ttest_ind(mutation, wildtype)
        
        if not pval_only:
            output_string += f"""{name}\nNumber of {mutation_type}s = {len(mutation)}\nNumber of Wildtypes = {len(wildtype)}\n"""
        output_string += f"{name} {mutation_type} P-value = {result[1]}\n\n"
        
    return output_string    


In [13]:
print(str(variants_to_analyze))
# print(analyze_mutation("Missense", pval_only = False))

{'Brca': {'Missense'}, 'Ccrcc': set(), 'Coad': {'Missense'}, 'Gbm': set(), 'Hnscc': {'Amplification'}, 'Lscc': {'Amplification'}, 'Luad': set(), 'Ov': {'Amplification'}, 'Ucec': {'Missense'}, 'Pdac': set()}


In [14]:
for variant in ["Deletion", "Missense", "Amplification", "Truncation"]:
    print("----------" + variant.upper() + "-----------")
    print(analyze_mutation(variant, pval_only = False))
    print()

----------DELETION-----------


----------MISSENSE-----------
Brca
Number of Missenses = 34
Number of Wildtypes = 62
Brca Missense P-value = 0.4136071304038146

Coad
Number of Missenses = 19
Number of Wildtypes = 53
Coad Missense P-value = 0.3451708361382616

Ucec
Number of Missenses = 43
Number of Wildtypes = 41
Ucec Missense P-value = 0.1394571498900522



----------AMPLIFICATION-----------
Hnscc
Number of Amplifications = 51
Number of Wildtypes = 48
Hnscc Amplification P-value = 8.779608913150495e-08

Lscc
Number of Amplifications = 77
Number of Wildtypes = 21
Lscc Amplification P-value = 6.777352933247902e-06

Ov
Number of Amplifications = 46
Number of Wildtypes = 27
Ov Amplification P-value = 0.05106221295747373



----------TRUNCATION-----------




In [15]:
#Cancers with more than 20% having deletions
deletion_sets = ["Brca","Ccrcc","Coad","Luad","Ov","Pdac"]

In [16]:
target_dataset = datasets["Ccrcc"]
target_dataset[2]
# target_dataset[2].loc[target_dataset["Mutation"] == "Deletion"]

Name,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,...,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,yR211F11.2
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,0.0,0.0,0.0,3.354233,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.718764,1.796911,0.811508,5.616391,0.000000,0.0,0.370406,0.0,0.000000,0.104097
C3L-00010,0.0,0.0,0.0,2.350325,0.000000,0.192569,0.0,0.0,0.000000,0.0,...,0.545611,0.136403,0.739215,5.462455,0.000000,0.0,1.687043,0.0,0.000000,0.316078
C3L-00011,0.0,0.0,0.0,5.843757,0.412252,0.000000,0.0,0.0,0.000000,0.0,...,0.262565,0.000000,0.000000,6.155015,0.129738,0.0,0.135310,0.0,0.000000,0.190133
C3L-00026,0.0,0.0,0.0,1.445766,0.451681,0.203067,0.0,0.0,0.000000,0.0,...,0.863034,0.000000,0.389757,5.619753,0.000000,0.0,0.444753,0.0,0.129919,0.499964
C3L-00079,0.0,0.0,0.0,1.111766,2.674472,0.000000,0.0,0.0,0.000000,0.0,...,2.167943,0.000000,0.419602,1.815022,0.000000,0.0,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646.N,0.0,0.0,0.0,2.043907,0.957825,0.287079,0.0,0.0,0.000000,0.0,...,0.813391,0.203348,2.571366,4.568233,0.000000,0.0,2.724612,0.0,0.000000,0.294504
C3N-01648.N,0.0,0.0,0.0,0.570147,1.870295,0.000000,0.0,0.0,0.000000,0.0,...,2.580933,0.000000,0.896602,6.011403,0.000000,0.0,3.887842,0.0,1.613883,0.632568
C3N-01649.N,0.0,0.0,0.0,0.704438,1.797306,0.000000,0.0,0.0,0.256758,0.0,...,0.981182,0.000000,1.329344,7.347457,0.000000,0.0,2.865293,0.0,0.443115,0.426307
C3N-01651.N,0.0,0.0,0.0,1.584157,3.117976,0.623013,0.0,0.0,0.000000,0.0,...,0.661951,0.000000,2.590863,3.448304,0.000000,0.0,5.458052,0.0,0.000000,0.447388


In [17]:
genotype_and_prot = target_dataset[1].merge(target_dataset[2], left_index = True, right_index = True)
genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Deletion"]

KeyError: 'Mutation'

In [18]:
with open("deletion_protein_data.txt", 'w') as outFile:
    for name, dataset in datasets.items():
        if name not in deletion_sets: continue
        outFile.write(name + '\n')
        genotype_and_prot = dataset[1].merge(dataset[2], left_index = True, right_index = True)
        deletions = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Deletion"][target_gene].dropna()
        wildtypes = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Wildtype_Tumor"][target_gene].dropna()
        outFile.write("len(deletions) = " + str(len(deletions)) + '\n')
        outFile.write("len(wildtypes) = " + str(len(wildtypes)) + '\n')
        result = ttest_ind(deletions, wildtypes)
        outFile.write(name + " deletion P-value: " + str(result[1]) + '\n')
        outFile.write("\n")
        

KeyError: 'Mutation'

In [19]:
#Cancers with more than 20% having Missense and Wildtype
missense_sets = ["Luad", "Pdac"]
for name, dataset in datasets.items():
    if name not in missense_sets: continue
    print(name)
    genotype_and_prot = dataset[1].merge(dataset[2], left_index = True, right_index = True)
    missenses = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Missense_Mutation"][target_gene].dropna()
    wildtypes = genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Wildtype_Tumor"][target_gene].dropna()
    print("len(missenses) = " + str(len(missenses)))
    print("len(wildtypes) = " + str(len(wildtypes)))
    result = ttest_ind(deletions, wildtypes)
    print(name + " missense P-value: " + str(result[1]))
    print("\n")
    
# genotype_and_prot = datasets["Luad"][1].merge(datasets["Luad"][2], left_index = True, right_index = True)
# genotype_and_prot["Mutation"]
# genotype_and_prot.loc[genotype_and_prot["Mutation"] == "Missense_Mutation"]

Luad


KeyError: 'Mutation'