In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


In [2]:
def format_pval_annotation(pval_symbol, x1, x2, line_start = .05, line_height=.05):
    # for manual adjustment to pval annotations
    
    y, h = line_start, line_height
    plt.plot([x1, x1, x2, x2], #draw horizontal line
             [y, y+h, y+h, y], #vertical line
             lw=1.5, color= '.3')
    plt.text((x1+x2)*.5, # half between x coord
             y+h, pval_symbol, horizontalalignment='center', verticalalignment='bottom', color = "black")



In [44]:
def get_dataframe(gene, cancer_object):
    prot_and_mutations = cancer_object.get_genotype_all_vars(gene)

    keep = ['Missense_Mutation','Wildtype_Tumor']
    in_keep = prot_and_mutations['Mutation'].isin(keep)
    miss_mutations = prot_and_mutations[in_keep]

    #Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
    columns_to_drop = ["Location", "Mutation_Status"]
    mut_status = miss_mutations.drop(columns_to_drop, axis = 1)
    mut_status = mut_status.dropna(axis=1,how='all')

    #join proteomics
    proteomics = cancer_object.get_proteomics()
    proteomics = proteomics[["PIK3CA"]]#get just PIK3CA
    proteomics = proteomics[~proteomics.index.str.contains(".N")] #get rid of normal samples

    proteomics["Mutation"] = mut_status['Mutation']
    proteomics = proteomics.dropna()

    mut_status = proteomics
    mut_status[gene+"_proteomics"] = proteomics["PIK3CA"]
    mut_status['binary_mutations'] = proteomics["Mutation"]
    mut_status = mut_status.drop(columns=["PIK3CA", "Mutation"])

    return mut_status



Name,Mutation,Location,Mutation_Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C3L-00006,Missense_Mutation,p.E545K,Single_mutation
C3L-00008,Wildtype_Tumor,No_mutation,Wildtype_Tumor
C3L-00032,Missense_Mutation,p.E545K,Single_mutation
C3L-00090,Wildtype_Tumor,No_mutation,Wildtype_Tumor
C3L-00136,Wildtype_Tumor,No_mutation,Wildtype_Tumor
...,...,...,...
C3N-01510,Missense_Mutation,p.E545K,Single_mutation
C3N-01520,Missense_Mutation,p.E726K,Single_mutation
C3N-01537,Wildtype_Tumor,No_mutation,Wildtype_Tumor
C3N-01802,Wildtype_Tumor,No_mutation,Wildtype_Tumor


In [4]:
en = cptac.Endometrial()

                                    

In [39]:
gene = 'PIK3CA'
test_gene = "PIK3CA"

In [40]:
endo_prot = get_dataframe(gene, en) #normal=True)
endo_prot.head()



Name,PIK3CA_proteomics,binary_mutations
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00006,-0.45,Missense_Mutation
C3L-00008,-0.215,Wildtype_Tumor
C3L-00032,-0.178,Missense_Mutation
C3L-00090,-0.392,Wildtype_Tumor
C3L-00136,-0.349,Wildtype_Tumor


In [24]:
u.wrap_ttest(endo_prot, 'binary_mutations', [gene+'_proteomics'], return_all=True)

Unnamed: 0,Comparison,P_Value
0,PIK3R2_proteomics,0.237246


# For interacting proteins

In [47]:
# trans
gene = 'PIK3CA'
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = ip, 
    mutations_filter = ['Nonsense_Mutation','Frame_Shift_Del','Frame_Shift_Ins']) 

keep = ['Nonsense_Mutation','Frame_Shift_Del','Frame_Shift_Ins','Wildtype_Tumor']
in_keep = prot_and_mutations[gene+'_Mutation'].isin(keep)
trunc_mutations = prot_and_mutations[in_keep]

# Step 2 - Create binary column 
trunc_mutations['binary_mutations'] = np.where(
            trunc_mutations[gene+'_Mutation_Status'] == 'Wildtype_Tumor', 'Wildtype', 'Truncation')

# Step 3 - Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
tumors = trunc_mutations[trunc_mutations.Sample_Status == 'Tumor'] #drop Normal samples
columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
mut_status = tumors.drop(columns_to_drop, axis = 1)
mut_status = mut_status.dropna(axis=1,how='all')
mut_status['binary_mutations'].value_counts()
prot_col = list(mut_status.columns[:-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [36]:
p_values = []
for gene in ip:
    endo_prot = get_dataframe(gene, en) #normal=True)
    pval = u.wrap_ttest(endo_prot, 'binary_mutations', [gene+'_proteomics'], return_all=True)
    p_values.append(pval)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.




Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.




Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.


In [38]:
p_values

[          Comparison   P_Value
 0  PIK3R2_proteomics  0.237246,         Comparison  P_Value
 0  TNS1_proteomics      NaN,            Comparison  P_Value
 0  RPS6KB1_proteomics      NaN,         Comparison  P_Value
 0  KRAS_proteomics  0.90991,           Comparison   P_Value
 0  PDGFRA_proteomics  0.073113,         Comparison   P_Value
 0  AKT3_proteomics  0.083449,           Comparison   P_Value
 0  PIK3CA_proteomics  0.109686,          Comparison   P_Value
 0  ERBB3_proteomics  0.090815,          Comparison   P_Value
 0  IGF1R_proteomics  0.042319,          Comparison  P_Value
 0  ERBB2_proteomics      NaN,         Comparison  P_Value
 0  EGFR_proteomics  0.42044,         Comparison  P_Value
 0  GNAQ_proteomics      NaN,        Comparison   P_Value
 0  KIT_proteomics  0.222542,         Comparison  P_Value
 0  MRAS_proteomics      NaN, None,           Comparison   P_Value
 0  CTNNB1_proteomics  0.033919, None,         Comparison  P_Value
 0  PTEN_proteomics  0.18491,         Compariso