In [1]:
import pandas as pd
import cptac

In [2]:
def get_frequently_mutated(cancer_object, cutoff = 0.1):  
    """
    Takes a cancer object and find the frequently 
    mutated genes (in the tumor samples) compared to the cutoff.

    Parameters:
    cancer_object (object): cancer type from cptac module 
    cutoff (float): used as a comparison to determine the 
                    status of gene mutation frequency

    Returns:
    freq_mutated_df (pd.DataFrame): DataFrame of frequently 
        mutated genes passing the cutoff. Columns contain the 
        fractions of total unique mutations,missence type 
        mutations, and truncation type mutations per gene.
    
    The Missence_Mut column includes: 
        In_Frame_Del, In_Frame_Ins, Missense_Mutation
   
    The Truncation_Mut column includes: 
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, 
        Nonsense_Mutation, Nonstop_Mutation
        
    These columns count multiple mutations of one gene in the 
    same sample, so fractions in the last two columns may 
    exceed the Unique_Samples_Mut column which only counts if 
    the gene was mutated once per sample."""    
    
    # Get total tumors/patients
    omics_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
    tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor'] 
    total_tumor_samples = len(tumors)
    
    # Get mutations data frame
    somatic_mutations = cancer_object.get_mutations() 

    # Drop silent mutations for Ovarian and RenalCcrcc dataset 
    if 'Silent' in somatic_mutations['Mutation'].unique():
        origin_df = somatic_mutations.loc[somatic_mutations['Mutation'] != 'Silent'].reset_index()
    else:
        origin_df = somatic_mutations.reset_index() #prepare to count unique samples
        
    # Create two categories in Mutation column - 'M': Missence, 'T': Truncation
    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {'frameshift substitution': 'T', 
            'frameshift deletion': 'T', 'frameshift insertion': 'T', 
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
            'nonframeshift substitution': 'M'}
    else: 
        missence_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
    mutations_replaced_M_T = origin_df.replace(missence_truncation_groups)
    # Check that all mutation names are catagorized
    if len(mutations_replaced_M_T['Mutation'].unique()) != 2:
        print('Warning: New mutation name not classified. Counts will be affected.')
    
    # Find frequently mutated genes (total fraction > cutoff)
    # Step 1 - group by gene and count unique samples
    count_mutations = origin_df.groupby(['Gene']).nunique()
    # Step 2 - format
    count_mutations = count_mutations.rename(columns={"Sample_ID": "Unique_Samples_Mut"})
    count_mutations = count_mutations.drop(['Gene', 'Mutation','Location'], axis = 1)
    # Step 3 - filter using the cutoff and create fraction
    fraction_mutated = count_mutations.apply(lambda x: x / total_tumor_samples)
    fraction_greater_than_cutoff = fraction_mutated.where(lambda x: x > cutoff) #na used when false
    filtered_gene_df = fraction_greater_than_cutoff.dropna()
    
    # Create and join Missence column (following similar steps as seen above)
    miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
    count_miss = miss.groupby(['Gene']).nunique()
    missence_df = count_miss.rename(columns={"Sample_ID": "Missence_Mut"})
    missence_df = missence_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_missence = missence_df.apply(lambda x: x / total_tumor_samples)
    freq_mutated_df = filtered_gene_df.join(fraction_missence, how='left').fillna(0)
    
    # Create and join Truncation column (following similar steps as seen above)
    trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']
    count_trunc = trunc.groupby(['Gene']).nunique()
    truncation_df = count_trunc.rename(columns={"Sample_ID": "Truncation_Mut"})
    truncation_df = truncation_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_truncation = truncation_df.apply(lambda x: x / total_tumor_samples)
    freq_mutated_df = freq_mutated_df.join(fraction_truncation, how='left').fillna(0)
    freq_mutated_df = freq_mutated_df.reset_index() #move genes to their own columns

    return freq_mutated_df

In [10]:
def check_getter(df, exp_dim, exp_headers, coordinates, values): 
    """Test a dataframe's dimensions and headers, and three test values, then print whether it passed the test.
    Parameters
    df: the dataframe gotten by the getter we are testing
    exp_dim: a tuple containing the expected dimensions of the dataframe, in the format (rows, columns)
    exp_headers: if the dataframe has up to 20 columns, all of the headers for the dataframe, in order. If it has more than 20 columns, then a list containing the first ten and last ten headers, in order.
    coordinates: a tuple with three elements, each element being a tuple with two elements, the first element being the int index of the row of a test value, and the second element being the int index of the column of a test value
    values: a tuple with three elements, each element being the expected value of the test value corresponding to the coordinates at the same index in the coordinates parameter 
    Returns
    bool indicating if the dataframe had the correct data.
    """
    PASS = True

    # Check that df is a dataframe, not None or something else.
    if not check_returned_is_df(df):
        return False # End test, because other tests will be useless.

    # Check dimensions
    if not check_df_shape(df, exp_dim):
        PASS = False

    # Check headers
    act_headers_all = list(df.columns.values)
    if len(df.columns.values) <= 20:
        act_headers = act_headers_all
    else:
        act_headers = act_headers_all[:10] + act_headers_all[-10:]

    if len(exp_headers) != len(act_headers):
        print("Unexpected number of test headers in dataframe. Expected number of headers: {}. You passed {} headers.\n".format(len(act_headers), len(exp_headers)))
        PASS = False
    else:
        for i, header in enumerate(exp_headers):
            if header != act_headers[i]:
                print("Dataframe header did not match expected value.\n\tExpected: {}\n\tActual: {}\n".format(header, act_headers[i]))
                PASS = False

    # Check test values
    act_values = [
        df.iloc[coordinates[0][0], coordinates[0][1]],
        df.iloc[coordinates[1][0], coordinates[1][1]],
        df.iloc[coordinates[2][0], coordinates[2][1]]]

    for i, value in enumerate(values):
        if act_values[i] != value:
            print("Dataframe value did not match expected value.\n\tColumn: {}\n\tIndex: {}\n\tExpected: {}\n\tActual: {}\n".format(df.columns.values[coordinates[i][1]], df.index.values[coordinates[i][0]], value, act_values[i]))
            PASS = False

    # Return whether the dataframe passed the test
    return PASS

In [13]:
co = cptac.Colon()

                                    

In [16]:
df = get_frequently_mutated(co, 0.15)

In [49]:
df.loc[df['Gene'] == 'RYR2']

Unnamed: 0,Gene,Unique_Samples_Mut,Missence_Mut,Truncation_Mut
102,RYR2,0.216495,0.195876,0.072165


In [56]:
import pandas as pd
def test_get_frequently_mutated_co():
    """Test get_frequently_mutated."""
    #co = cptac.Colon()
    print('Running get_frequently_mutated...')

    df = get_frequently_mutated(co, 0.15)
    name = "frequently_mutated"
    dimensions = (138, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missence_Mut', 'Truncation_Mut']
    
    # test when there are no missence type mutatations
    test_coord_CASP5 = ((15, 1), (15, 2), (15, 3)) #(8, 1)]) # (29, 1), (40, 1), (8, 2), (29, 2), (40, 2))
    test_vals_CASP5 = (19/97, 0/97, 19/97) #, 0/97]) #,35/97, 23/97, 19/97, 0/97, 1/97) 
    
    # test when there are no truncation type mutatations
    test_coord_KRAS = ((66, 1),(66, 2),(66, 3))
    test_vals_KRAS = (35/97, 35/97, 0/97)

    test_coord_PIK3CA = ((92, 1),(92, 2),(92, 3))
    test_vals_PIK3CA = (24/97, 23/97, 1/97)
    
    # test when missence and trucation don't add up to equal the fraction mutated (miss and trunc in same sample)
    test_coord_RYR2 = ((102, 1),(102, 2),(102, 3))
    test_vals_RYR2 = (21/97, 19/97, 7/97)

    test_coord_vals = [(test_coord_CASP5, test_vals_CASP5), (test_coord_KRAS, test_vals_KRAS),
                      (test_coord_PIK3CA, test_vals_PIK3CA), (test_coord_RYR2, test_vals_RYR2)]

    for coord, val in test_coord_vals:
        #print('coord', coord, 'val', val)
        PASS = check_getter(df, dimensions, headers, coord, val)
    
    print_test_result(PASS)

In [57]:
test_get_frequently_mutated_co()

Running get_frequently_mutated...
	PASS


In [69]:
import pandas as pd
def test_get_frequently_mutated_ov_default_cutoff():
    """Test get_frequently_mutated."""
    ov = cptac.Ovarian()
    print('Running get_frequently_mutated...')

    df = get_frequently_mutated(ov)
    name = "frequently_mutated"
    dimensions = (16, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missence_Mut', 'Truncation_Mut']
    
    # test genes names
    test_coord_names = ((15, 0), (13, 0), (2, 0))
    test_vals_names = ('WDFY4', 'TP53', 'MT-CO1')
    
    #test when missence and trucation don't add up to equal the fraction mutated (miss and trunc in same sample)
    test_coord_WDFY4 = ((15, 1), (15, 2), (15, 3)) 
    test_vals_WDFY4 = (10/83, 8/83, 3/83) 
    
    # test highest count
    test_coord_TP53 = ((13, 1),(13, 2),(13, 3))
    test_vals_TP53 = (77/83, 50/83, 27/83)
    
    # test when there are no truncation mutations
    test_coord_MTCO1 = ((2, 1),(2, 2),(2, 3))
    test_vals_MTCO1 = (10/83, 10/83, 0/83)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_WDFY4, test_vals_WDFY4),
                      (test_coord_TP53, test_vals_TP53), (test_coord_MTCO1, test_vals_MTCO1)]

    for coord, val in test_coord_vals:
        #print('coord', coord, 'val', val)
        PASS = check_getter(df, dimensions, headers, coord, val)
    
    print_test_result(PASS)

In [70]:
test_get_frequently_mutated_ov_default_cutoff()

Running get_frequently_mutated...
	PASS


In [6]:
import pandas as pd
def test_get_frequently_mutated_en_default_cutoff():
    """Test get_frequently_mutated."""

    print('Running get_frequently_mutated...')

    df = get_frequently_mutated('endometrial')
    name = "frequently_mutated"
    dimensions = (232, 4)
    headers = ['Gene', 'Fraction_Mutated', 'Truncation', 'Missence']
    
    # test genes: ARID1A, DICER1, FAT3
    # first three = total_mutation, next three = truncation, etc.
    test_coord = ((2, 1), (129, 1), (220, 1), (2, 2), (129, 2), (220, 2), (2, 3), (129, 3), (220, 3))
    test_vals = (43/95, 10/95, 12/95, 38/95, 1/95, 6/95, 13/95, 10/95, 9/95) 
    """
    test_coord_trunc = ((2,2),(129,2),(220,2))
    test_vals_trunc = (38/95, 1/95,6/95)
    
    test_coord_miss = ((2,3),(129,3),(220,3))
    test_vals_miss = (13/95, 10/95, 9/95)
    """
    
    PASS = check_getter(df, name, dimensions, headers, test_coord, test_vals)
    print_test_result(PASS)

In [7]:
test_get_frequently_mutated_en_default_cutoff()

Running get_frequently_mutated...
Welcome to the cptac data service package. Available datasets may be
viewed using cptac.list_data(). In order to access a specific data
set, import a cptac subfolder using either 'import cptac.dataset' or
'from cptac import dataset'.
******
Version: 0.4.1
******
You have loaded the cptac endometrial dataset. To view available
dataframes, use cptac.endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
cptac.endometrial.list_api().
endometrial data version: 2.1

Loading Dictionary...
Loading cptac endometrial data:
Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

 ******PLEASE READ******
CPTAC is a community resource p

In [33]:
import pandas as pd
def test_get_frequently_mutated_en_higher_cutoff():
    """Test get_frequently_mutated."""

    print('Running get_frequently_mutated...')

    df = get_frequently_mutated('endometrial', .25)
    name = "frequently_mutated"
    dimensions = (7,4)
    headers = ['Gene', 'Fraction_Mutated', 'Truncation', 'Missence']
    test_coord = ((0,1), (2, 1), (3,1))
    test_vals = (43/95, 75/95, 27/95)

    PASS = check_getter(df, name, dimensions, headers, test_coord, test_vals)
    print_test_result(PASS)

In [32]:
test_get_frequently_mutated_en()

Running get_frequently_mutated...
Dataframe dimensions did not match expected values.
	Expected: (7, 4)
	Actual: (232, 4)



IndexError: tuple index out of range