# MUC5B Mutation Trans Effect on Transcriptomics

This notebook analyzes the trans effect of MUC5B mutation on the transcriptomics of its interacting proteins, and of all other proteins, in Endometrial, Colon, and Ovarian cancer.

### Library Imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import collections

import warnings
warnings.filterwarnings("ignore")

import cptac
import cptac.algorithms as al

# Instantiate one cptac dataset object per cancer type analyzed below
# (en = Endometrial, co = Colon, ov = Ovarian).
en = cptac.Endometrial()
co = cptac.Colon()
ov = cptac.Ovarian()

                                    

### Select Gene

In [2]:
# Gene whose mutation status splits samples into the Mutated / Wildtype groups
# compared in every analysis cell of this notebook.
gene = "MUC5B"

### Investigate Proteomics, Phosphoproteomics, Acetylproteomics, or Transcriptomics

In [3]:
# Name of the omics dataframe passed as omics_df_name to join_omics_to_mutations.
# Exactly one option is left uncommented; the others are kept as alternatives.
#omics = "proteomics"
omics = "transcriptomics"
#omics = "phosphoproteomics"
#omics = "acetylproteomics"

### Track all significant comparisons in a DataFrame

In [4]:
# Empty accumulator table; the analysis cells append their significant
# comparisons to it through add_to_all_significant_comparisons.
all_significant_comparisons = pd.DataFrame(
    columns=[
        'Cancer_Type',
        'Gene',
        'Comparison',
        'Interacting_Protein',
        'P_Value',
    ]
)

In [5]:
def add_to_all_significant_comparisons(df, cancer, interacting, all_sig_comp, gene_name=None):
    """Append one batch of significant comparisons to the running results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Results of one comparison run. It is not modified; the annotation
        columns are added to a copy.
    cancer : str
        Value written to the 'Cancer_Type' column of every appended row.
    interacting : bool
        Value written to the 'Interacting_Protein' column of every appended row.
    all_sig_comp : pandas.DataFrame
        Results accumulated so far.
    gene_name : str, optional
        Value written to the 'Gene' column. Defaults to the notebook-level
        ``gene`` variable, which is what this function always used.

    Returns
    -------
    pandas.DataFrame
        ``all_sig_comp`` followed by the annotated rows of ``df``. Row indices
        are kept as they are, so index values may repeat across batches.
    """
    if gene_name is None:
        # Fall back to the gene selected at the top of the notebook.
        gene_name = gene

    # assign() returns a new frame, so the caller's results table is left
    # untouched (a plain `expanded = df` would add the columns to it in place).
    expanded = df.assign(
        Gene=gene_name,
        Cancer_Type=cancer,
        Interacting_Protein=interacting,
    )

    return pd.concat([all_sig_comp, expanded], sort=False)

# Interacting Proteins: Transcriptomics

### Generate interacting protein list

Call the `cptac.algorithms.get_interacting_proteins` method, which queries the UniProt and STRING databases to generate a list of known interacting partners of the given gene.

In [6]:
# Ask cptac.algorithms for the interaction partners of the selected gene
interacting_proteins = al.get_interacting_proteins(gene)

# Show the interacting protein list, one name per line
print("Interacting Proteins:")
for partner in interacting_proteins:
    print(partner)

Interacting Proteins:
GALNT8
ST6GAL1
ST6GALNAC2
ST3GAL3
MUC17
GALNT16
GALNT12
MUC21
MUC3A
ST3GAL2
GCNT3
MUC16
C1GALT1
MUC6
MUC7
MUC20
MUC15
MUC4
GALNTL6
ST3GAL4
MUC5B
MUC12
B3GNT6
MUC1
MUC13
MUC5AC
UBC
KIR2DS2
C7orf25
ICE2
FRMD1
PCK2
HBM
ARHGAP12
TIMM50
DDX31
ESR2
AGR2


## Endometrial

### Test for significant comparisons in any of the interacting proteins

In [7]:
# Join the mutation columns of `gene` with the selected omics columns of the
# interacting proteins and keep tumor samples only. The explicit copy makes the
# filtered frame independent, so the new column below is not written to a slice.
protdf = en.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-tests: every status other than exactly
# 'Wildtype_Tumor' counts as mutated. Done for the whole column at once, which
# also guarantees 'Label' exists when there are no tumor rows.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Columns to be compared using t-tests: everything except the grouping label
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the comparisons on the formatted dataframe
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A None result is treated as "nothing significant"; otherwise show the hits
# and add them to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Endometrial", True, all_significant_comparisons)
        

Doing t-test comparisons

No significant comparisons.


## Colon

### Test for significant comparisons in any of the interacting proteins

In [8]:
# Join the mutation columns of `gene` with the selected omics columns of the
# interacting proteins and keep tumor samples only. The explicit copy makes the
# filtered frame independent, so the new column below is not written to a slice.
protdf = co.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-tests: every status other than exactly
# 'Wildtype_Tumor' counts as mutated. Done for the whole column at once, which
# also guarantees 'Label' exists when there are no tumor rows.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Columns to be compared using t-tests: everything except the grouping label
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the comparisons on the formatted dataframe
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A None result is treated as "nothing significant"; otherwise show the hits
# and add them to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Colon", True, all_significant_comparisons)


MUC17 did not match any columns in transcriptomics dataframe. MUC17_transcriptomics column inserted, but filled with NaN.
GALNT16 did not match any columns in transcriptomics dataframe. GALNT16_transcriptomics column inserted, but filled with NaN.
MUC21 did not match any columns in transcriptomics dataframe. MUC21_transcriptomics column inserted, but filled with NaN.
MUC16 did not match any columns in transcriptomics dataframe. MUC16_transcriptomics column inserted, but filled with NaN.
MUC6 did not match any columns in transcriptomics dataframe. MUC6_transcriptomics column inserted, but filled with NaN.
MUC7 did not match any columns in transcriptomics dataframe. MUC7_transcriptomics column inserted, but filled with NaN.
MUC15 did not match any columns in transcriptomics dataframe. MUC15_transcriptomics column inserted, but filled with NaN.
GALNTL6 did not match any columns in transcriptomics dataframe. GALNTL6_transcriptomics column inserted, but filled with NaN.
MUC5B did not match 

## Ovarian

### Test for significant comparisons in any of the interacting proteins

In [9]:
# Join the mutation columns of `gene` with the selected omics columns of the
# interacting proteins and keep tumor samples only. The explicit copy makes the
# filtered frame independent, so the new column below is not written to a slice.
protdf = ov.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-tests: every status other than exactly
# 'Wildtype_Tumor' counts as mutated. Done for the whole column at once, which
# also guarantees 'Label' exists when there are no tumor rows.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Columns to be compared using t-tests: everything except the grouping label
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the comparisons on the formatted dataframe
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A None result is treated as "nothing significant"; otherwise show the hits
# and add them to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Ovarian", True, all_significant_comparisons)


GALNT8 did not match any columns in transcriptomics dataframe. GALNT8_transcriptomics column inserted, but filled with NaN.
C1GALT1 did not match any columns in transcriptomics dataframe. C1GALT1_transcriptomics column inserted, but filled with NaN.
MUC20 did not match any columns in transcriptomics dataframe. MUC20_transcriptomics column inserted, but filled with NaN.
MUC1 did not match any columns in transcriptomics dataframe. MUC1_transcriptomics column inserted, but filled with NaN.
MUC5AC did not match any columns in transcriptomics dataframe. MUC5AC_transcriptomics column inserted, but filled with NaN.
ICE2 did not match any columns in transcriptomics dataframe. ICE2_transcriptomics column inserted, but filled with NaN.
Doing t-test comparisons

No significant comparisons.


# All Proteins: Transcriptomics

Expand our search for significant comparisons to all proteins in our dataset.

## Endometrial

In [10]:
# The whole comparison is best-effort: any failure is reported and the notebook
# moves on to the next cancer type.
try:
    print("\nGene: ", gene)

    # No omics_genes argument here: join the mutation columns of `gene` with
    # every column of the selected omics dataframe, then keep tumor samples
    # only. The explicit copy makes the filtered frame independent, so the new
    # column below is not written to a slice.
    protdf = en.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Binary grouping column for the t-tests: every status other than exactly
    # 'Wildtype_Tumor' counts as mutated. Done for the whole column at once,
    # which also guarantees 'Label' exists when there are no tumor rows.
    protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

    # Drop the mutation / sample metadata so only omics columns and 'Label' remain
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

    # Columns to be compared using t-tests: everything except the grouping label
    col_list = list(protdf.columns)
    col_list.remove('Label')

    print("Doing t-test comparisons\n")

    # Run the comparisons on the formatted dataframe
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # A None result is treated as "nothing significant"; otherwise show the
    # hits and add them to the notebook-wide results table.
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

        all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Endometrial", False, all_significant_comparisons)


except Exception as e:
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Doing t-test comparisons

                Comparison   P_Value
0  FAM86HP_transcriptomics  0.000001





## Colon

In [11]:
# The whole comparison is best-effort: any failure is reported and the notebook
# moves on to the next cancer type.
try:
    print("\nGene: ", gene)

    # No omics_genes argument here: join the mutation columns of `gene` with
    # every column of the selected omics dataframe, then keep tumor samples
    # only. The explicit copy makes the filtered frame independent, so the new
    # column below is not written to a slice.
    protdf = co.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Binary grouping column for the t-tests: every status other than exactly
    # 'Wildtype_Tumor' counts as mutated. Done for the whole column at once,
    # which also guarantees 'Label' exists when there are no tumor rows.
    protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

    # Drop the mutation / sample metadata so only omics columns and 'Label' remain
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

    # Columns to be compared using t-tests: everything except the grouping label
    col_list = list(protdf.columns)
    col_list.remove('Label')

    print("Doing t-test comparisons\n")

    # Run the comparisons on the formatted dataframe
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # A None result is treated as "nothing significant"; otherwise show the
    # hits and add them to the notebook-wide results table.
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

        all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Colon", False, all_significant_comparisons)


except Exception as e:
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Doing t-test comparisons

                     Comparison       P_Value
0          GPX2_transcriptomics  1.758001e-10
1       POU5F1B_transcriptomics  4.688794e-10
2        RUBCNL_transcriptomics  1.541604e-09
3        ATP10B_transcriptomics  3.068562e-09
4         NUBPL_transcriptomics  3.884908e-09
5       CEACAM5_transcriptomics  6.870982e-09
6       CEACAM6_transcriptomics  9.942373e-09
7         ADNP2_transcriptomics  1.155737e-08
8    DLGAP1-AS2_transcriptomics  1.221313e-08
9      TOR1AIP2_transcriptomics  1.275600e-08
10         GRM8_transcriptomics  1.851409e-08
11       LY6G6D_transcriptomics  2.788723e-08
12        CCSAP_transcriptomics  3.180643e-08
13        EPDR1_transcriptomics  3.808045e-08
14         ACE2_transcriptomics  4.376884e-08
15        NR1I2_transcriptomics  4.771506e-08
16       MAPK12_transcriptomics  5.517810e-08
17       CASC21_transcriptomics  5.951666e-08
18    LINC00513_transcriptomics  9.176024e-08
19       GPR143_transcriptomics  1.07190

## Ovarian

In [12]:

print("\nGene: ", gene)

# No omics_genes argument here: join the mutation columns of `gene` with every
# column of the selected omics dataframe, then keep tumor samples only.
protdf = ov.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor']
# Keep only the first occurrence of each column name. The explicit copy makes
# the filtered frame independent, so the new column below is not written to a
# slice.
protdf = protdf.loc[:, ~protdf.columns.duplicated()].copy()

# Binary grouping column for the t-tests: every status other than exactly
# 'Wildtype_Tumor' counts as mutated. Done for the whole column at once, which
# also guarantees 'Label' exists when there are no tumor rows.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Columns to be compared using t-tests: everything except the grouping label
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the comparisons on the formatted dataframe
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A None result is treated as "nothing significant"; otherwise show the hits
# and add them to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Ovarian", False, all_significant_comparisons)


Gene:  MUC5B
Doing t-test comparisons

No significant comparisons.


### Print All Significant Comparisons

In [13]:
# Render the collected table when it has rows; otherwise report that nothing was found
if len(all_significant_comparisons.index) > 0:
    display(all_significant_comparisons)
else:
    print('No Significant Comparisons!')

Unnamed: 0,Cancer_Type,Gene,Comparison,Interacting_Protein,P_Value
0,Colon,MUC5B,MUC20_transcriptomics,True,4.319530e-06
1,Colon,MUC5B,FRMD1_transcriptomics,True,1.340157e-05
2,Colon,MUC5B,ST6GAL1_transcriptomics,True,6.725471e-05
3,Colon,MUC5B,MUC12_transcriptomics,True,1.108167e-04
4,Colon,MUC5B,ST6GALNAC2_transcriptomics,True,6.152779e-04
0,Endometrial,MUC5B,FAM86HP_transcriptomics,False,1.094390e-06
0,Colon,MUC5B,GPX2_transcriptomics,False,1.758001e-10
1,Colon,MUC5B,POU5F1B_transcriptomics,False,4.688794e-10
2,Colon,MUC5B,RUBCNL_transcriptomics,False,1.541604e-09
3,Colon,MUC5B,ATP10B_transcriptomics,False,3.068562e-09


### Write Significant Comparisons (if any) to Shared CSV file

In [14]:
# Shared per-gene results file that this notebook adds its findings to.
results_path = gene + '_Trans_Results.csv'

# On the first run the file does not exist yet (or may be empty); start from an
# empty table with the same columns instead of failing.
try:
    existing_results = pd.read_csv(results_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    existing_results = pd.DataFrame(columns=all_significant_comparisons.columns)

updated_results = pd.concat([existing_results, all_significant_comparisons], sort=False)

# Re-running the notebook would otherwise append the same comparisons again.
# Rows are identified by these columns; the most recently computed row wins.
key_columns = [
    column
    for column in ['Cancer_Type', 'Gene', 'Comparison', 'Interacting_Protein']
    if column in updated_results.columns
]
if key_columns:
    updated_results = updated_results.drop_duplicates(subset=key_columns, keep='last')

updated_results.to_csv(path_or_buf=results_path, index=False)