# RB1 and EGFR Fisher test


## Step 1: Library Imports

Run this cell to import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
#sys.path.append('C:\\Users\\brittany henderson\\GitHub\\GBM_for_CPTAC\\')
#import cis_functions as f

import cptac
import cptac.utils as u

In [2]:
# note: pass reduced omics # CHECK would include non coding, but none in fm genes

def format_mutated_wt_comparison(reduced_omics_df):
    # Step 2 - Create binary column 
    reduced_omics_df['binary_mutations'] = np.where(
                reduced_omics_df[gene+'_Mutation_Status'] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

    # Step 3 - Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
    tumors = reduced_omics_df[reduced_omics_df.Sample_Status == 'Tumor'] #drop Normal samples
    columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
    mut_status = tumors.drop(columns_to_drop, axis = 1)

    return mut_status

In [3]:
def get_missense_truncation_comparison(cancer_object, reduced_omics_df):
    #get omics data and tumors
    tumors = reduced_omics_df.loc[reduced_omics_df['Sample_Status'] == 'Tumor'] #drop Normal samples


    somatic_mutations = cancer_object.get_somatic_mutation().reset_index()

    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {'frameshift substitution': 'Truncation', 
            'frameshift deletion': 'Truncation', 'frameshift insertion': 'Truncation', 
            'stopgain': 'Truncation', 'stoploss': 'Truncation', 'nonsynonymous SNV': 'Missense',
            'nonframeshift insertion': 'Missense','nonframeshift deletion': 'Missense', 
            'nonframeshift substitution': 'Missense'}
    else: 
        missence_truncation_groups = {'In_Frame_Del': 'Missense', 'In_Frame_Ins': 'Missense',
            'Missense_Mutation': 'Missense', 'Frame_Shift_Del': 'Truncation','Nonsense_Mutation': 'Truncation', 
            'Splice_Site': 'Truncation', 'Frame_Shift_Ins': 'Truncation','Nonstop_Mutation':'Truncation'}

    mutations_replaced_M_T = somatic_mutations.replace(missence_truncation_groups)
    mutations_replaced_M_T = mutations_replaced_M_T.loc[mutations_replaced_M_T['Gene'] == gene]

    # group mutation categories
    miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'Missense']
    trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'Truncation']

    #get lists of unique samples for missence and trucation categories
    miss_unique_samples = list(miss['Sample_ID'].unique())
    trunc_unique_samples = list(trunc['Sample_ID'].unique())
    
    #check if there is only one type of mutation for the specific gene
    if miss_unique_samples == []:
        print('Only truncation type mutations found for', gene+'.', 
             'Not possible to compare missense with wildtype.')
        truncation_omics = tumors.loc[tumors.index.isin(trunc_unique_samples)]
        truncation_omics = truncation_omics.assign(binary_mutations = 'Truncation')
        columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
        binary_mut_omics = truncation_omics.drop(columns_to_drop, axis = 1)
        return binary_mut_omics
    elif trunc_unique_samples == []:
        print('Only missence type mutations found for', gene+'.', 
             'Not possible to compare truncation with wildtype.')
        missence_omics = tumors.loc[tumors.index.isin(miss_unique_samples)]
        missence_omics = missence_omics.assign(binary_mutations = 'Missense')
        columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
        binary_mut_omics = missence_omics.drop(columns_to_drop, axis = 1)
        return binary_mut_omics

    ### Step 2 - Create the binary column needed to do the comparison
    # Get mutation catagories with omics data
    missence_omics = tumors.loc[tumors.index.isin(miss_unique_samples)]
    missence_omics = missence_omics.assign(binary_mutations = 'Missense')
    truncation_omics = tumors.loc[tumors.index.isin(trunc_unique_samples)]
    truncation_omics = truncation_omics.assign(binary_mutations = 'Truncation')
    binary_mut_omics = missence_omics.append(truncation_omics)

    # Step 3 - Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
    columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
    binary_mut_omics = binary_mut_omics.drop(columns_to_drop, axis = 1)

    return binary_mut_omics


In [5]:
#cptac.download(dataset='ccrcc', version='0.0')
brain = cptac.Gbm()
desired_cutoff = 0.05
gene = 'EGFR'

                                    

In [6]:
freq_mut = u.get_frequently_mutated(brain, cutoff = desired_cutoff)
freq_mut.loc[freq_mut['Gene'] == gene]



Name,Gene,Unique_Samples_Mut,Missense_Mut,Truncation_Mut,Non-Coding
4,EGFR,0.16,0.16,0.0,0.0


In [66]:
#EGFR
gene = 'EGFR'
# Step 1 - Create dataframe in order to do comparisons with wrap_ttest - drop 
transrcpt_mutations = brain.join_omics_to_mutations(
    mutations_genes = gene, omics_df_name = 'transcriptomics', omics_genes = gene)
reduced_transcript_mutations = brain.reduce_multiindex(transrcpt_mutations, levels_to_drop=1) #single col labels
EGFR_reduced_t_mutations = brain.reduce_multiindex(transrcpt_mutations, levels_to_drop=1) #single col labels

# Step 2 & 3 - Get binary column and format
transcript_mut_wt = format_mutated_wt_comparison(EGFR_tumors) # function drops Normal samples
EGFR_df = transcript_mut_wt['binary_mutations'].value_counts().to_frame() #value counts drops na
EGFR_df = EGFR_df.rename({'binary_mutations': 'EGFR'}, axis=1)
EGFR_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,EGFR
Wildtype,84
Mutated,16


In [65]:
#RB1
gene = 'RB1'
# Step 1 - Create dataframe in order to do comparisons with wrap_ttest - drop 
transrcpt_mutations = brain.join_omics_to_mutations(
    mutations_genes = gene, omics_df_name = 'transcriptomics', omics_genes = gene)
RB1_reduced_t_mutations = brain.reduce_multiindex(transrcpt_mutations, levels_to_drop=1) #single col labels
RB1_t_and_mutations = RB1_reduced_t_mutations.Sample_Status == "Tumor" # drop Normal samples
RB1_tumors = RB1_reduced_t_mutations[RB1_t_and_mutations]

# Step 2 & 3 - Get binary column and format
transcript_mut_wt = format_mutated_wt_comparison(RB1_tumors)
RB1_df = transcript_mut_wt['binary_mutations'].value_counts().to_frame() #value counts drops na
RB1_df = RB1_df.rename({'binary_mutations': 'RB1'}, axis=1)
RB1_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,RB1
Wildtype,90
Mutated,10


In [36]:
# Make contingency table
RB1_EFGR = RB1_df.join(EGFR_df, how='left')
RB1_EFGR

Unnamed: 0,RB1,EGFR
Wildtype,90,84
Mutated,10,16


In [44]:
oddsratio, pvalue = scipy.stats.fisher_exact([[90,10],[84,16]])
pvalue

0.2930739836430058

In [45]:
# uses two-sided as default
oddsratio, pvalue = scipy.stats.fisher_exact(RB1_EFGR,)
pvalue

0.2930739836430052

# Proteomics

Proteomics: Mutated and Wildtype

In [69]:
gene= 'EGFR'
# Step 1 - Create dataframe in order to do comparisons with wrap_ttest  
EGFR_prot_and_mutations = brain.join_omics_to_mutations(
    mutations_genes = gene, omics_df_name = 'proteomics', omics_genes = gene) # single col level
EGFR_prot_and_mutations = EGFR_prot_and_mutations.loc[EGFR_prot_and_mutations['Sample_Status'] == "Tumor"] # drop Normal samples

# Step 2 & 3 - Get binary column and format
EGFR_prot_mut_wt = format_mutated_wt_comparison(EGFR_prot_and_mutations)
prot_EGFR_df = EGFR_prot_mut_wt['binary_mutations'].value_counts().to_frame() #value counts drops na
prot_EGFR_df = prot_EGFR_df.rename({'binary_mutations': 'EGFR'}, axis=1)
prot_EGFR_df



Unnamed: 0,EGFR
Wildtype,84
Mutated,16


In [68]:
gene= 'RB1'
# Step 1 - Create dataframe in order to do comparisons with wrap_ttest  
RB1_prot_and_mutations = brain.join_omics_to_mutations(
    mutations_genes = gene, omics_df_name = 'proteomics', omics_genes = gene) # single col level
RB1_prot_and_mutations = RB1_prot_and_mutations.loc[RB1_prot_and_mutations['Sample_Status'] == "Tumor"] # drop Normal samples

# Step 2 & 3 - Get binary column and format
prot_mut_wt = format_mutated_wt_comparison(RB1_prot_and_mutations)
prot_RB1_df = prot_mut_wt['binary_mutations'].value_counts().to_frame()
prot_RB1_df = prot_RB1_df.rename({'binary_mutations': 'RB1'}, axis=1)
prot_RB1_df

Unnamed: 0,RB1
Wildtype,90
Mutated,10
