# Test get_frequently_mutated

In [1]:
import pandas as pd
import cptac
import cptac.utils as ut

In [2]:
def print_test_result(PASS):
    """Prints the result of a test, based on a bool.
    Parameters:
    PASS (bool): Whether or not the test passed.
    """
    if PASS:
        print('\tPASS')
    else:
        print('\tFAIL\n')

def check_returned_is_df(returned):
    """Checks that an object is a dataframe. Prints a specific message if it's actually None, or a general message if it's something else.
    Parameters:
    returned: The object to test
    Returns:
    bool: Indicates whether the object was a dataframe.
    """
    if returned is None:
        print("Function under test returned None.")
        return False
    
    if not isinstance(returned, pd.core.frame.DataFrame):
        print("Returned object was not a dataframe. Type of object: {}".format(type(returned)))
        return False
    return True

def check_df_shape(df, exp_shape):
    """Checks that a dataframe has the proper shape.
    Parameters:
    df (pandas.core.frame.DataFrame): The dataframe to test.
    exp_shape (tuple): A tuple with two elements. First element is expected number of rows, second is expected number of columns.
    Returns:
    bool: Indicates whether the dataframe had the proper shape.
    """
    act_shape = df.shape
    if exp_shape != act_shape:
        print("Dataframe dimensions did not match expected values.\n\tExpected: {}\n\tActual: {}\n".format(exp_shape, act_shape))
        return False
    return True

In [3]:
def check_getter(df, exp_dim, exp_headers, coordinates, values): 
    """Test a dataframe's dimensions and headers, and three test values, then print whether it passed the test.
    Parameters
    df: the dataframe gotten by the getter we are testing
    exp_dim: a tuple containing the expected dimensions of the dataframe, in the format (rows, columns)
    exp_headers: if the dataframe has up to 20 columns, all of the headers for the dataframe, in order. If it has more than 20 columns, then a list containing the first ten and last ten headers, in order.
    coordinates: a tuple with three elements, each element being a tuple with two elements, the first element being the int index of the row of a test value, and the second element being the int index of the column of a test value
    values: a tuple with three elements, each element being the expected value of the test value corresponding to the coordinates at the same index in the coordinates parameter 
    Returns
    bool indicating if the dataframe had the correct data.
    """
    PASS = True

    # Check that df is a dataframe, not None or something else.
    if not check_returned_is_df(df):
        return False # End test, because other tests will be useless.

    # Check dimensions
    if not check_df_shape(df, exp_dim):
        PASS = False

    # Check headers
    act_headers_all = list(df.columns.values)
    if len(df.columns.values) <= 20:
        act_headers = act_headers_all
    else:
        act_headers = act_headers_all[:10] + act_headers_all[-10:]

    if len(exp_headers) != len(act_headers):
        print("Unexpected number of test headers in dataframe. Expected number of headers: {}. You passed {} headers.\n".format(len(act_headers), len(exp_headers)))
        PASS = False
    else:
        for i, header in enumerate(exp_headers):
            if header != act_headers[i]:
                print("Dataframe header did not match expected value.\n\tExpected: {}\n\tActual: {}\n".format(header, act_headers[i]))
                PASS = False

    # Check test values
    act_values = [
        df.iloc[coordinates[0][0], coordinates[0][1]],
        df.iloc[coordinates[1][0], coordinates[1][1]],
        df.iloc[coordinates[2][0], coordinates[2][1]]]

    for i, value in enumerate(values):
        if act_values[i] != value:
            print("Dataframe value did not match expected value.\n\tColumn: {}\n\tIndex: {}\n\tExpected: {}\n\tActual: {}\n".format(df.columns.values[coordinates[i][1]], df.index.values[coordinates[i][0]], value, act_values[i]))
            PASS = False

    # Return whether the dataframe passed the test
    return PASS

In [144]:
def get_frequently_mutated(cancer_object, cutoff = 0.1):  
    # Get total tumors/patients
    omics_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
    tumors = omics_and_mutations.Sample_Status

    if isinstance(tumors, pd.DataFrame): # This would happen if our proteomics dataframe has a column multiindex, which leads to a joined df with a column multiindex, and causes our selection to be a dataframe instead of a series.
        tumors = tumors.iloc[:, 0]
        tumors.name = "Sample_Status"

    v = tumors.value_counts()
    total_tumors = v['Tumor']
    total_tumor_count = float(total_tumors)
    print(total_tumor_count)
    
    # Get mutations data frame
    somatic_mutations = cancer_object.get_somatic_mutation() 

    # Drop silent mutations for Hnscc, Ovarian, and Ccrcc dataset, and synonymous SNV (i.e. silent) mutations in HNSCC
    if 'Silent' in somatic_mutations['Mutation'].unique():
        origin_df = somatic_mutations.loc[somatic_mutations['Mutation'] != 'Silent'].reset_index()
    elif 'synonymous SNV' in somatic_mutations['Mutation'].unique():
        origin_df = somatic_mutations.loc[somatic_mutations['Mutation'] != 'synonymous SNV'].reset_index()
    else:
        origin_df = somatic_mutations.reset_index() #prepare to count unique samples
        
    # Create two categories in Mutation column - 'M': Missense, 'T': Truncation
    if cancer_object.get_cancer_type() in ('hnscc') and cancer_object.version() == '0.1':
        dif_mut_names = True
    elif cancer_object.get_cancer_type() in ('colon'):
        dif_mut_names = True
    else: 
        dif_mut_names = False
        
    if dif_mut_names == True:
        missense_truncation_groups = {'frameshift substitution': 'T', 
            'frameshift deletion': 'T', 'frameshift insertion': 'T', 
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
            'nonframeshift substitution': 'M'}
    else: 
        missense_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
    
    mutations_replaced_M_T = origin_df.replace(missense_truncation_groups)
    unique_mutations = len(mutations_replaced_M_T['Mutation'].unique())
    
    # replace non_coding mutations for Gbm
    gbm = False
    if cancer_object.get_cancer_type() == 'gbm':
        gbm = True
        non_coding = {'Intron': 'NC', 'RNA': 'NC', "5'Flank": 'NC', "3'Flank": 'NC', 
            "5'UTR": 'NC', "3'UTR": 'NC', 'Splice_Region' : 'NC'}
        mutations_replaced_M_T = mutations_replaced_M_T.replace(non_coding)
        unique_mutations_2 = len(mutations_replaced_M_T['Mutation'].unique())
        
    elif unique_mutations != 2: # Check that all mutation names are catagorized
        print('Warning: New mutation name not classified. Counts will be affected.')
    
    # Find frequently mutated genes (total fraction > cutoff)
    # Same steps will be repeated for finding the missense and truncation mutation frequencies
    # Step 1 - group by gene and count unique samples
    # Step 2 - format
    # Step 3 - filter using the cutoff and create fraction 
    count_mutations = origin_df.groupby(['Gene']).nunique()
    count_mutations = count_mutations.rename(columns={"Patient_ID": "Unique_Samples_Mut"}) # Step 2 
    count_mutations = count_mutations.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_mutated = count_mutations.apply(lambda x: x / total_tumor_count) # Step 3 
    fraction_greater_than_cutoff = fraction_mutated.where(lambda x: x > cutoff) #na used when not > cutoff
    filtered_gene_df = fraction_greater_than_cutoff.dropna() # drop genes below cutoff
    
    # Create and join Missense column (following similar steps as seen above) *Counts missense once in sample
    miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
    count_miss = miss.groupby(['Gene']).nunique()
    missense_df = count_miss.rename(columns={"Patient_ID": "Missense_Mut"})
    missense_df = missense_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_missense = missense_df.apply(lambda x: x / total_tumor_count)
    freq_mutated_df = filtered_gene_df.join(fraction_missense, how='left').fillna(0)
    
    # Create and join Truncation column (following similar steps as seen above)
    trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']
    count_trunc = trunc.groupby(['Gene']).nunique()
    truncation_df = count_trunc.rename(columns={"Patient_ID": "Truncation_Mut"})
    truncation_df = truncation_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_truncation = truncation_df.apply(lambda x: x / total_tumor_count)
    freq_mutated_df = freq_mutated_df.join(fraction_truncation, how='left').fillna(0)
    
    if gbm == True:
        # Create and join non-coding column (following similar steps as seen above)
        nc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'NC']
        count_nc = nc.groupby(['Gene']).nunique()
        nc_df = count_nc.rename(columns={"Patient_ID": "Non-Coding"})
        nc_df = nc_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
        fraction_nc = nc_df.apply(lambda x: x / total_tumor_count)
        freq_mutated_df = freq_mutated_df.join(fraction_nc, how='left').fillna(0)
        
    freq_mutated_df = freq_mutated_df.reset_index() #move genes to their own column
    
    return freq_mutated_df



In [194]:
g = cptac.Colon()

                                          

In [203]:
len(g.get_proteomics(tissue_type='tumor'))


97

In [207]:
d= get_frequently_mutated(g)



107.0


In [208]:
d

Name,Gene,Unique_Samples_Mut,Missense_Mut,Truncation_Mut
0,ABCA13,0.177570,0.149533,0.093458
1,ABCA2,0.158879,0.149533,0.028037
2,ABCA4,0.130841,0.074766,0.056075
3,ABCB4,0.121495,0.056075,0.065421
4,ABCC5,0.102804,0.046729,0.084112
...,...,...,...,...
443,ZNF462,0.121495,0.112150,0.018692
444,ZNF469,0.233645,0.168224,0.093458
445,ZNF536,0.121495,0.112150,0.018692
446,ZNF540,0.102804,0.037383,0.065421


In [231]:
gene = 'TP53'
d.loc[d['Gene'] == gene]

Name,Gene,Unique_Samples_Mut,Missense_Mut,Truncation_Mut
403,TP53,0.523364,0.35514,0.196262


In [230]:
m = g.get_somatic_mutation()
t = m.loc[m['Gene']== gene]
t = t.loc[t['Mutation'] != 'Silent']
t.index.value_counts()
#t.Mutation.value_counts()

trunc = ['stopgain','frameshift deletion', 'frameshift insertion', 'frameshift substitution']
s = t['Mutation'].isin(trunc)
tr = t[s]
mis = t[-s]
i = t.index.value_counts()
t.index.value_counts()
len(t)
t.Mutation.value_counts()
len(t.index.value_counts())

10

In [204]:
# update 5/18/20 total_tumors increased (97 to 107) - 10 samples added to cptac

def test_get_frequently_mutated_co_default_cutoff():
    co = cptac.Colon()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(co)
    
    dimensions = (448, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((68, 0), (212, 0), (499, 0))
    test_vals_names = ('CASP5', 'KRAS', 'SPINK5')
    
    total_tumors = float(107)
    # test when there are no missense type mutatations
    test_coord_CASP5 = ((68, 1), (68, 2), (68, 3))
    test_vals_CASP5 = (19/total_tumors, 0/total_tumors, 19/total_tumors) 
    # test when there are no truncation type mutatations
    test_coord_KRAS = ((212, 1),(212, 2),(212, 3))
    test_vals_KRAS = (35/total_tumors, 35/total_tumors, 0/total_tumors)
    # test when missense and trucation don't add up to equal the fraction mutated
    #(miss and trunc in same sample)
    test_coord_ANK2 = ((23, 1),(23, 2),(23, 3)) 
    test_vals_ANK2 = (15/total_tumors, 13/total_tumors, 4/total_tumors) 
    

    # test when miss and trunc count are the same
    test_coord_ATM = ((56, 1),(56, 2),(56, 3)) 
    test_vals_ATM = (10/total_tumors, 7/total_tumors, 7/total_tumors) 
    # test close to the cutoff
    test_coord_SPINK5 = ((499, 1),(499, 2),(499,3))
    test_vals_SPINK5 = (10/total_tumors, 5/total_tumors, 7/total_tumors)
    
    
    # common test
    test_coord_TP53 = ((403, 1),(403, 2),(403, 3))
    test_vals_TP53 = (56/total_tumors, 38/total_tumors, 21/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_CASP5, test_vals_CASP5),
                        (test_coord_KRAS, test_vals_KRAS), (test_coord_ANK2, test_vals_ANK2),
                        (test_coord_ATM, test_vals_ATM), (test_coord_SPINK5, test_vals_SPINK5), 
                        (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [None]:
test_get_frequently_mutated_co_default_cutoff()

# Luad

In [192]:
def test_get_frequently_mutated_luad_default_cutoff():
    l = cptac.Luad()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(l)
    
    name = "frequently_mutated"
    dimensions = (106, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((15, 0), (40, 0), (104, 0))
    test_vals_names = ('CFAP47', 'GRM5', 'ZNF536')
    
    total_tumors = float(110)
    # test missense and trucation don't equal the unique_sample_mutated 
    #(miss and trunc in same sample), also test high missense
    test_coord_TTN = ((96, 1), (96, 2), (96, 3)) 
    test_vals_TTN = (35/total_tumors, 31/total_tumors, 9/total_tumors) 
    # test low missense, test top third index position
    test_coord_CFAP47 = ((15, 1),(15, 2),(15, 3))
    test_vals_CFAP47 = (12/total_tumors, 10/total_tumors, 2/total_tumors)
    # test no truncation mutatations
    test_coord_ZNF536 = ((104, 1),(104, 2),(104, 3))
    test_vals_ZNF536 = (13/total_tumors, 13/total_tumors, 0/total_tumors)
    # test close to cutoff
    test_coord_GRM5 = ((40, 1),(40, 2),(40, 3))
    test_vals_GRM5 = (12/total_tumors, 9/total_tumors, 3/total_tumors)
    # common test
    test_coord_TP53 = ((94, 1),(94, 2),(94, 3))
    test_vals_TP53 = (59/total_tumors, 44/total_tumors, 16/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_TTN, test_vals_TTN), 
                       (test_coord_CFAP47, test_vals_CFAP47), (test_coord_ZNF536, test_vals_ZNF536),
                       (test_coord_GRM5, test_vals_GRM5), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [193]:
test_get_frequently_mutated_luad_default_cutoff()

Running get_frequently_mutated...        




	PASS


# Hnscc version = 0.1

In [137]:
def test_get_frequently_mutated_hnscc_first_version():
    h = cptac.Hnscc(version = '0.1')
    # NOTE: many synonomous snv mutations = silent and not counted (0 common for missense)
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(h, 0.01)
    
    name = "frequently_mutated"
    dimensions = (78, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((0, 0), (47, 0), (77, 0))
    test_vals_names = ('ABCA13', 'NOTCH3', 'ZNF749')
    
    total_tumors = float(110) # 110 tumor samples for v0.1
    # test position 0
    test_coord_ABCA13 = ((0, 1), (0, 2), (0, 3)) 
    test_vals_ABCA13 = (2/total_tumors, 0/total_tumors, 2/total_tumors)
    # test has a missense value
    test_coord_NOTCH3 = ((47, 1),(47, 2),(47, 3))
    test_vals_NOTCH3 = (2/total_tumors, 1/total_tumors, 1/total_tumors)
    # test close to cutoff
    test_coord_NOTCH1 = ((46, 1),(46, 2),(46, 3))
    test_vals_NOTCH1 = (7/total_tumors, 0/total_tumors, 7/total_tumors)
    # common test
    test_coord_TP53 = ((67, 1),(67, 2),(67, 3))
    test_vals_TP53 = (21/total_tumors, 0/total_tumors, 21/total_tumors)
    # test last position
    test_coord_ZNF749 = ((77, 1),(77, 2),(77, 3))
    test_vals_ZNF749 = (2/total_tumors, 0/total_tumors, 2/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_ABCA13, test_vals_ABCA13), 
                       (test_coord_NOTCH3, test_vals_NOTCH3), (test_coord_ZNF749, test_vals_ZNF749),
                       (test_coord_NOTCH1, test_vals_NOTCH1), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [138]:
test_get_frequently_mutated_hnscc_first_version()

Loading hnscc v0.1..                      



Running get_frequently_mutated...
	PASS




# Hnscc version = 2.0

In [52]:
def test_get_frequently_mutated_hnscc_12_cutoff():
    h = cptac.Hnscc(version = '2.0')
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(h, 0.12)
    
    name = "frequently_mutated"
    dimensions = (20, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((3, 0), (15, 0), (19, 0))
    test_vals_names = ('CSMD3', 'PCLO', 'TTN')
    
    total_tumors = float(109)
    # test missense and trucation don't equal the unique_sample_mutated 
    #(miss and trunc in same sample), also test high missense
    test_coord_CSMD3 = ((3, 1), (3, 2), (3, 3)) 
    test_vals_CSMD3 = (26/total_tumors, 22/total_tumors, 5/total_tumors) 
    # test low missense
    test_coord_FMN2 = ((7, 1),(7, 2),(7, 3))
    test_vals_FMN2 = (15/total_tumors, 14/total_tumors, 1/total_tumors)
    # test no truncation mutatations
    test_coord_PCLO = ((15, 1),(15, 2),(15, 3))
    test_vals_PCLO = (15/total_tumors, 15/total_tumors, 0/total_tumors)
    # test close to cutoff
    test_coord_RYR1 = ((16, 1),(16, 2),(16, 3))
    test_vals_RYR1 = (14/total_tumors, 14/total_tumors, 0/total_tumors)
    # common test
    test_coord_TP53 = ((18, 1),(18, 2),(18, 3))
    test_vals_TP53 = (96/total_tumors, 59/total_tumors, 46/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_CSMD3, test_vals_CSMD3), 
                       (test_coord_FMN2, test_vals_FMN2), (test_coord_PCLO, test_vals_PCLO),
                       (test_coord_RYR1, test_vals_RYR1), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [53]:
test_get_frequently_mutated_hnscc_12_cutoff()

Running get_frequently_mutated...         
	PASS






In [23]:
def test_get_frequently_mutated_hnscc_default_cutoff():
    h = cptac.Hnscc(version = '2.0')
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(h)
    
    name = "frequently_mutated"
    dimensions = (44, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((3, 0), (32, 0), (41, 0))
    test_vals_names = ('AJUBA', 'PCLO', 'TP53')
    
    total_tumors = float(109)
    # test missense and trucation don't equal the unique_sample_mutated 
    #(miss and trunc in same sample)
    test_coord_FAT1 = ((17, 1), (17, 2), (17, 3)) 
    test_vals_FAT1 = (26/total_tumors, 9/total_tumors, 21/total_tumors) 
    # test low missense
    test_coord_AJUBA = ((3, 1),(3, 2),(3, 3))
    test_vals_AJUBA = (12/total_tumors, 3/total_tumors, 10/total_tumors)
    # test no truncation type mutatations
    test_coord_PCLO = ((32, 1),(32, 2),(32, 3))
    test_vals_PCLO = (15/total_tumors, 15/total_tumors, 0/total_tumors)
    # test close to cutoff
    test_coord_AHNAK = ((2, 1),(2, 2),(2, 3))
    test_vals_AHNAK = (11/total_tumors, 8/total_tumors, 3/total_tumors)
    # common test
    test_coord_TP53 = ((41, 1),(41, 2),(41, 3))
    test_vals_TP53 = (96/total_tumors, 59/total_tumors, 46/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_FAT1, test_vals_FAT1), 
                       (test_coord_AJUBA, test_vals_AJUBA), (test_coord_PCLO, test_vals_PCLO),
                       (test_coord_AHNAK, test_vals_AHNAK), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [24]:
test_get_frequently_mutated_hnscc_default_cutoff()

Running get_frequently_mutated...         




	PASS


# Gbm

In [64]:
def test_get_frequently_mutated_gbm_default_cutoff():
    g = cptac.Gbm()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(g)
    
    dimensions = (6, 5)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut', 'Non-Coding']
    # test gene names
    test_coord_names = ((5, 0), (2, 0), (0, 0))
    test_vals_names = ('TP53', 'PIK3CA', 'EGFR')
    
    total_tumors = float(99)
    # test missense and trucation don't equal the unique_sample_mutated 
    test_coord_NF1 = ((1, 1), (1, 2), (1, 4)) 
    test_vals_NF1 = (15/total_tumors, 2/total_tumors, 0/total_tumors) 
    # test missense and trucation values add to unique_samples_mut
    test_coord_PIK3CA = ((2, 1),(2, 2),(2, 3))
    test_vals_PIK3CA = (10/total_tumors, 9/total_tumors, 1/total_tumors)
    # test high missense
    test_coord_PTEN = ((3, 1),(3, 2),(3, 3))
    test_vals_PTEN = (27/total_tumors, 16/total_tumors, 11/total_tumors)
    # test  non-coding
    test_coord_RB1 = ((4, 1),(4, 3),(4, 4))
    test_vals_RB1 = (10/total_tumors, 10/total_tumors, 1/total_tumors)
    # common test
    test_coord_TP53 = ((5, 1),(5, 2),(5, 3))
    test_vals_TP53 = (32/total_tumors, 27/total_tumors, 5/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_NF1, test_vals_NF1), 
                       (test_coord_PIK3CA, test_vals_PIK3CA), (test_coord_PTEN, test_vals_PTEN),
                       (test_coord_RB1, test_vals_RB1), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [43]:
def test_get_frequently_mutated_gbm_05_cutoff():
    g = cptac.Gbm()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(g,0.05)
    
    dimensions = (34, 5)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut', 'Non-Coding']
    # test gene names
    test_coord_names = ((32, 0), (22, 0), (0, 0))
    test_vals_names = ('TTC6', 'PIK3CA', 'ARHGAP5')
    
    total_tumors = float(99)
    # test missense and trucation don't equal the unique_sample_mutated 
    #(miss and nc = unique_sample_mut)
    test_coord_TTC6 = ((32, 1), (32, 2), (32, 4)) 
    test_vals_TTC6 = (6/total_tumors, 3/total_tumors, 3/total_tumors) 
    # test missense and trucation values add to unique_samples_mut
    test_coord_PIK3CA = ((22, 1),(22, 2),(22, 3))
    test_vals_PIK3CA = (10/total_tumors, 9/total_tumors, 1/total_tumors)
    # test high non-conding and test unique_samples_mut close to cutoff
    test_coord_GLT1D1 = ((11, 1),(11, 2),(11, 4))
    test_vals_GLT1D1 = (5/total_tumors, 1/total_tumors, 4/total_tumors)
    # test  high truncation
    test_coord_RB1 = ((26, 1),(26, 3),(26, 4))
    test_vals_RB1 = (10/total_tumors, 10/total_tumors, 1/total_tumors)
    # common test
    test_coord_TP53 = ((31, 1),(31, 2),(31, 3))
    test_vals_TP53 = (32/total_tumors, 27/total_tumors, 5/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_TTC6, test_vals_TTC6), 
                       (test_coord_PIK3CA, test_vals_PIK3CA), (test_coord_GLT1D1, test_vals_GLT1D1),
                       (test_coord_RB1, test_vals_RB1), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

# Endo

In [66]:
def test_get_frequently_mutated_en_default_cutoff():
    en = cptac.Endometrial()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(en)
    
    name = "frequently_mutated"
    dimensions = (232, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((53, 0), (32, 0), (227, 0))
    test_vals_names = ('CTCF', 'CCDC168', 'ZNF536')
    
    total_tumors = float(95)
    # test missense and trucation don't equal the unique_sample_mutated 
    #(miss and trunc in same sample)
    test_coord_CTCF = ((53, 1), (53, 2), (53, 3)) 
    test_vals_CTCF = (27/total_tumors, 9/total_tumors, 23/total_tumors) 
    # testmissense and trucation values are equal
    test_coord_CCDC168 = ((32, 1),(32, 2),(32, 3))
    test_vals_CCDC168 = (16/total_tumors, 11/total_tumors, 11/total_tumors)
    # test no truncation type mutatations
    test_coord_ZNF536 = ((227, 1),(227, 2),(227, 3))
    test_vals_ZNF536 = (12/total_tumors, 12/total_tumors, 0/total_tumors)
    # test close to cutoff
    test_coord_DICER1 = ((61, 1),(61, 2),(61, 3))
    test_vals_DICER1 = (10/total_tumors, 10/total_tumors, 1/total_tumors)
    # common test
    test_coord_TP53 = ((205, 1),(205, 2),(205, 3))
    test_vals_TP53 = (21/total_tumors, 15/total_tumors, 7/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_CTCF, test_vals_CTCF), 
                       (test_coord_CCDC168, test_vals_CCDC168), (test_coord_ZNF536, test_vals_ZNF536),
                       (test_coord_DICER1, test_vals_DICER1), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [67]:
test_get_frequently_mutated_en_default_cutoff()

Running get_frequently_mutated...               




	PASS


In [78]:
def test_get_frequently_mutated_en_20_cutoff():
    en = cptac.Endometrial()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(en, cutoff=0.2)
    
    dimensions = (10, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((0, 0), (2, 0), (8, 0))
    test_vals_names = ('ARID1A', 'CTNNB1', 'TP53')
    
    total_tumors = float(95)
    # test missense and trucation don't equal the unique_samples_mutated 
    #(miss and trunc in same sample and counted in each category)
    test_coord_ARID1A = ((0, 1), (0, 2), (0, 3)) 
    test_vals_ARID1A = (43/total_tumors, 13/total_tumors, 38/total_tumors) 
    # test no truncation type mutatations
    test_coord_CTNNB1 = ((2, 1),(2, 2),(2, 3))
    test_vals_CTNNB1 = (29/total_tumors, 29/total_tumors, 0/total_tumors)
    # test close to the cutoff 
    test_coord_ZFHX3 = ((9, 1), (9, 2), (9, 3))
    test_vals_ZFHX3 = (21/total_tumors , 8/total_tumors , 16/total_tumors)
    # test miss and trunc almost equal
    test_coord_KMT2B = ((3, 1), (3, 2), (3, 3))
    test_vals_KMT2B = (23/total_tumors , 11/total_tumors , 12/total_tumors)
    # common test
    test_coord_TP53 = ((8, 1),(8, 2),(8, 3))
    test_vals_TP53 = (21/total_tumors, 15/total_tumors, 7/total_tumors)
    
    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_ARID1A, test_vals_ARID1A),
                        (test_coord_CTNNB1, test_vals_CTNNB1), (test_coord_ZFHX3, test_vals_ZFHX3),
                        (test_coord_KMT2B, test_vals_KMT2B), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
        
    print_test_result(PASS)

In [79]:
test_get_frequently_mutated_en_20_cutoff()

Running get_frequently_mutated...               




	PASS


# Colon

In [75]:
def test_get_frequently_mutated_co_default_cutoff():
    co = cptac.Colon()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(co)
    
    dimensions = (612, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((90, 0), (284, 0), (499, 0))
    test_vals_names = ('CASP5', 'KRAS', 'SPINK5')
    
    total_tumors = 97
    # test when there are no missense type mutatations
    test_coord_CASP5 = ((90, 1), (90, 2), (90, 3))
    test_vals_CASP5 = (19/total_tumors, 0/total_tumors, 19/total_tumors) 
    # test when there are no truncation type mutatations
    test_coord_KRAS = ((284, 1),(284, 2),(284, 3))
    test_vals_KRAS = (35/total_tumors, 35/total_tumors, 0/total_tumors)
    # test when missense and trucation don't add up to equal the fraction mutated
    #(miss and trunc in same sample)
    test_coord_ANK2 = ((34, 1),(34, 2),(34, 3)) 
    test_vals_ANK2 = (15/total_tumors, 13/total_tumors, 4/total_tumors) 
    # test when miss and trunc count are the same
    test_coord_ATM = ((56, 1),(56, 2),(56, 3)) 
    test_vals_ATM = (10/total_tumors, 7/total_tumors, 7/total_tumors) 
    # test close to the cutoff
    test_coord_SPINK5 = ((499, 1),(499, 2),(499,3))
    test_vals_SPINK5 = (10/total_tumors, 5/total_tumors, 7/total_tumors)
    # common test
    test_coord_TP53 = ((554, 1),(554, 2),(554, 3))
    test_vals_TP53 = (56/total_tumors, 38/total_tumors, 21/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_CASP5, test_vals_CASP5),
                        (test_coord_KRAS, test_vals_KRAS), (test_coord_ANK2, test_vals_ANK2),
                        (test_coord_ATM, test_vals_ATM), (test_coord_SPINK5, test_vals_SPINK5), 
                        (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [76]:
test_get_frequently_mutated_co_default_cutoff()

Running get_frequently_mutated...         




Dataframe dimensions did not match expected values.
	Expected: (612, 4)
	Actual: (448, 4)



IndexError: single positional indexer is out-of-bounds

In [None]:
def test_get_frequently_mutated_co_15_cutoff():
    co = cptac.Colon()
    print('Running get_frequently_mutated...')
    df = cptac.algorithms.get_frequently_mutated(co, 0.15)
    
    dimensions = (138, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((15, 0), (66, 0), (102, 0))
    test_vals_names = ('CASP5', 'KRAS', 'RYR2')
    
    total_tumors = 97
    # test no missense type mutatations 
    test_coord_CASP5 = ((15, 1), (15, 2), (15, 3))
    test_vals_CASP5 = (19/total_tumors, 0/total_tumors, 19/total_tumors) 
    # test no truncation type mutatations
    test_coord_KRAS = ((66, 1),(66, 2),(66, 3))
    test_vals_KRAS = (35/total_tumors, 35/total_tumors, 0/total_tumors)
    # test missense and truncation equal fraction mutated
    test_coord_PIK3CA = ((92, 1),(92, 2),(92, 3))
    test_vals_PIK3CA = (24/total_tumors, 23/total_tumors, 1/total_tumors)
    # test missense and trucation don't equal unique_samples_mutated (miss and trunc in same sample)
    test_coord_RYR2 = ((102, 1),(102, 2),(102, 3))
    test_vals_RYR2 = (21/total_tumors, 19/total_tumors, 7/total_tumors)
    # test close to the cutoff
    test_coord_ANK2 = ((6, 1),(6, 2),(6, 3)) 
    test_vals_ANK2 = (15/total_tumors, 13/total_tumors, 4/total_tumors) 
    # common test
    test_coord_TP53 = ((123, 1),(123, 2),(123, 3))
    test_vals_TP53 = (56/total_tumors, 38/total_tumors, 21/total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_CASP5, test_vals_CASP5), 
                       (test_coord_KRAS, test_vals_KRAS), (test_coord_PIK3CA, test_vals_PIK3CA), 
                       (test_coord_RYR2, test_vals_RYR2), (test_coord_ANK2, test_vals_ANK2), 
                       (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [None]:
test_get_frequently_mutated_co_15_cutoff()

# Ov

In [None]:
def test_get_frequently_mutated_ov_default_cutoff():
    ov = cptac.Ovarian()
    print('Running get_frequently_mutated...')
    df = cptac.algorithms.get_frequently_mutated(ov)
    
    dimensions = (16, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test genes names
    test_coord_names = ((15, 0), (13, 0), (2, 0))
    test_vals_names = ('WDFY4', 'TP53', 'MT-CO1')
    
    total_tumors = 83
    #test missense and trucation not equal to unique_samples_mutated 
    #(miss and trunc in same sample)
    test_coord_WDFY4 = ((15, 1), (15, 2), (15, 3)) 
    test_vals_WDFY4 = (10/total_tumors, 8/total_tumors, 3/total_tumors) 
    # test miss and trunc equal to unique_samples_mutated
    test_coord_MUC4 = ((8, 1),(8, 2),(8, 3))
    test_vals_MUC4 = (27/total_tumors, 26/total_tumors, 1/total_tumors)
    # test no truncation mutations
    test_coord_MTCO1 = ((2, 1),(2, 2),(2, 3))
    test_vals_MTCO1 = (10/total_tumors, 10/total_tumors, 0/total_tumors)
    # test close to cutoff
    test_coord_FSIP2 = ((1, 1),(1, 2),(1, 3))
    test_vals_FSIP2 = (9/total_tumors, 8/total_tumors, 2/total_tumors)
    # common test and highest count
    test_coord_TP53 = ((13, 1),(13, 2),(13, 3))
    test_vals_TP53 = (77/total_tumors, 50/total_tumors, 27/total_tumors)
    
    
    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_WDFY4, test_vals_WDFY4),
                       (test_coord_MUC4, test_vals_MUC4), (test_coord_MTCO1, test_vals_MTCO1), 
                       (test_coord_FSIP2, test_vals_FSIP2), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [None]:
test_get_frequently_mutated_ov_default_cutoff()

In [None]:
def test_get_frequently_mutated_ov_05_cutoff():
    ov = cptac.Ovarian()
    print('Running get_frequently_mutated...')
    df = cptac.algorithms.get_frequently_mutated(ov, 0.05)
    
    dimensions = (142, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test genes names
    test_coord_names = ((133, 0), (127, 0), (141, 0))
    test_vals_names = ('WDFY4', 'TP53', 'ZNF865')
    
    total_tumors = 83
    #test missense and trucation not equal to unique_samples_mutated 
    #(miss and trunc in same sample)
    test_coord_WDFY4 = ((133, 1), (133, 2), (133, 3)) 
    test_vals_WDFY4 = (10/total_tumors, 8/total_tumors, 3/total_tumors) 
    # test miss and trunc almost equal
    test_coord_CDK12 = ((11, 1),(11, 2),(11, 3))
    test_vals_CDK12 = (6/total_tumors, 4/total_tumors, 3/total_tumors)
    # test no truncation mutations
    test_coord_ZNF865 = ((141, 1),(141, 2),(141, 3))
    test_vals_ZNF865 = (5/total_tumors, 5/total_tumors, 0/total_tumors)
    # test close to cutoff 
    test_coord_SYNE1 = ((122, 1),(122, 2),(122, 3))
    test_vals_SYNE1 = (5/total_tumors, 5/total_tumors, 1/total_tumors)
    # common test and highest count
    test_coord_TP53 = ((127, 1),(127, 2),(127, 3))
    test_vals_TP53 = (77/total_tumors, 50/total_tumors, 27/total_tumors)
    
    #CHECK silent mut not counted
    
    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_WDFY4, test_vals_WDFY4),
                        (test_coord_CDK12, test_vals_CDK12), (test_coord_ZNF865, test_vals_ZNF865),  
                        (test_coord_SYNE1, test_vals_SYNE1), (test_coord_TP53, test_vals_TP53)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [None]:
test_get_frequently_mutated_ov_05()

# Ccrcc

In [None]:
def test_get_frequently_mutated_ccrcc_default_cutoff():
    rc = cptac.Ccrcc()
    print('Running get_frequently_mutated...')
    df = cptac.algorithms.get_frequently_mutated(rc)
   
    dimensions = (6, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test genes names
    test_coord_names = ((0, 0), (2, 0), (4, 0))
    test_vals_names = ('BAP1', 'PBRM1', 'TTN')
    
    total_tumors = 110
    # test miss and trunc equal to unique_samples_mutated
    test_coord_BAP1 = ((0, 1), (0, 2), (0, 3)) 
    test_vals_BAP1 = (17/total_tumors, 7/total_tumors, 10/total_tumors) 
    # test high truncation, low missense count
    test_coord_PBRM1 = ((2, 1),(2, 2),(2, 3))
    test_vals_PBRM1 = (44/total_tumors, 8/total_tumors, 37/total_tumors)
    # check that silent mutations are not counted (TTN has many silent mutations)
    # and missense and trucation not equal to unique_samples_mutated 
    test_coord_TTN = ((4, 1),(4, 2),(4, 3))
    test_vals_TTN = (13/total_tumors, 10/total_tumors, 4/total_tumors)
    # test close to cutoff
    test_coord_SETD2 = ((3, 1), (3, 2), (3, 3)) 
    test_vals_SETD2 = (15/total_tumors, 2/total_tumors, 13/total_tumors)               
    # common test and highest count
    test_coord_VHL = ((5, 1),(5, 2),(5, 3))
    test_vals_VHL = (82/total_tumors, 33/total_tumors, 49/total_tumors)
    
    
    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_BAP1, test_vals_BAP1),
                        (test_coord_PBRM1, test_vals_PBRM1), (test_coord_TTN, test_vals_TTN),
                        (test_coord_SETD2, test_vals_SETD2), (test_coord_VHL, test_vals_VHL)]

    for coord, vals in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, vals)
    
    print_test_result(PASS)

In [None]:
test_get_frequently_mutated_ccrcc_default_cutoff()

In [None]:
def test_get_frequently_mutated_ccrcc_01_cutoff():
    rc = cptac.Ccrcc()
    print('Running get_frequently_mutated...')
    df = cptac.algorithms.get_frequently_mutated(rc, cutoff=0.01)
    
    dimensions = (1106, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test genes names
    test_coord_names = ((11, 0), (992, 0), (1080, 0))
    test_vals_names = ('ABCC3', 'TTN', 'ZNF532')
    
    total_tumors = 110
    # test no missense 
    test_coord_ABCC3 = ((11, 1),(11, 2),(11, 3))
    test_vals_ABCC3 = (2/total_tumors, 0/total_tumors, 2/total_tumors)
    # test no truncation and close to cutoff
    test_coord_ZNF532 = ((1080, 1), (1080, 2), (1080, 3)) 
    test_vals_ZNF532 = (2/total_tumors, 2/total_tumors, 0/total_tumors)
    # test miss and trunc equal to unique_samples_mutated
    test_coord_NAV3 = ((611, 1), (611, 2), (611, 3)) 
    test_vals_NAV3 = (7/total_tumors, 5/total_tumors, 2/total_tumors) 
    # check that silent mutations are not counted (TTN has many silent mutations)
    # and missense and trucation not equal to unique_samples_mutated 
    test_coord_TTN = ((992, 1),(992, 2),(992, 3))
    test_vals_TTN = (13/total_tumors, 10/total_tumors, 4/total_tumors)
    # common test and highest count
    test_coord_VHL = ((1019, 1),(1019, 2),(1019, 3))
    test_vals_VHL = (82/total_tumors, 33/total_tumors, 49/total_tumors)
    
    
    test_coord_vals = [(test_coord_names, test_vals_names), (test_coord_ABCC3, test_vals_ABCC3),
                        (test_coord_ZNF532, test_vals_ZNF532), (test_coord_NAV3, test_vals_NAV3),
                        (test_coord_TTN, test_vals_TTN), (test_coord_VHL, test_vals_VHL)]

    for coord, val in test_coord_vals:
        PASS = check_getter(df, dimensions, headers, coord, val)
    
    print_test_result(PASS)

In [None]:
test_get_frequently_mutated_ccrcc_01_cutoff()

In [None]:
print("\nTesting get_frequently_mutated from utilities...")
test_get_frequently_mutated_en_default_cutoff()
test_get_frequently_mutated_co_default_cutoff()
test_get_frequently_mutated_ov_default_cutoff()
test_get_frequently_mutated_ccrcc_default_cutoff()


test_get_frequently_mutated_en_cutoff_20_cutoff()
test_get_frequently_mutated_co_cutoff_15_cutoff()
test_get_frequently_mutated_ov_05_cutoff()
test_get_frequently_mutated_ccrcc_01_cutoff()