# MUC5B Mutation Trans Effect on Proteomics

This notebook analyzes the trans effect of MUC5B mutation, first on its known interacting proteins and then on all other proteins, in Endometrial, Colon, and Ovarian cancer.

### Library Imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats

import warnings
# Silence every warning for the rest of the session. NOTE: this is a blanket
# filter, so warnings raised by the analysis cells below are hidden as well.
warnings.filterwarnings("ignore")

import cptac
import cptac.algorithms as al

# One dataset object per cancer type; the comparison cells below call
# join_omics_to_mutations on each of them (en = Endometrial, co = Colon, ov = Ovarian).
en = cptac.Endometrial()
co = cptac.Colon()
ov = cptac.Ovarian()

                                    

### Specify Gene

In [2]:
# Gene whose mutation status defines the Mutated / Wildtype groups in every
# comparison below; also used to name the results CSV at the end of the notebook.
gene = "MUC5B"

### Investigate Proteomics, Phosphoproteomics, Acetylproteomics, or Transcriptomics

In [3]:
# Name of the omics table to test; it is passed as omics_df_name to
# join_omics_to_mutations in every comparison cell. Leave exactly one line active.
omics = "proteomics"
#omics = "transcriptomics"
#omics = "phosphoproteomics"
#omics = "acetylproteomics"

### Track all significant comparisons in dataframe

In [4]:
# Empty accumulator table; each comparison cell that finds significant results
# replaces it with the frame returned by add_to_all_significant_comparisons.
all_significant_comparisons = pd.DataFrame(columns=['Cancer_Type', 'Gene', 'Comparison','Interacting_Protein','P_Value'])

In [5]:
def add_to_all_significant_comparisons(df, cancer, interacting, all_sig_comp, gene_name=None):
    """Append one batch of significant comparisons to the running results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Results of one comparison run (the notebook passes the frame returned
        by wrap_ttest). It is NOT modified; the annotation columns are added
        to a new frame.
    cancer : str
        Value written to the 'Cancer_Type' column of every appended row.
    interacting : bool
        Value written to the 'Interacting_Protein' column of every appended row.
    all_sig_comp : pandas.DataFrame
        The table accumulated so far.
    gene_name : str, optional
        Value written to the 'Gene' column. When omitted, the notebook-level
        ``gene`` variable is used, which is what the original function did.

    Returns
    -------
    pandas.DataFrame
        ``all_sig_comp`` followed by the annotated rows of ``df``. Row labels
        are kept as they are, so they may repeat across batches.
    """
    if gene_name is None:
        # Fall back to the gene selected at the top of the notebook.
        gene_name = gene

    # assign() returns a new frame, so the caller's results are left untouched
    # (the original aliased `df` and added these columns to it in place).
    expanded = df.assign(
        Gene=gene_name,
        Cancer_Type=cancer,
        Interacting_Protein=interacting,
    )

    return pd.concat([all_sig_comp, expanded], sort=False)

# Interacting Proteins: Proteomics

### Generate interacting protein list

Call the `cptac.algorithms` `get_interacting_proteins` method, which queries the UniProt and STRING databases to generate a list of known interacting partners of the given gene.

In [6]:
# Ask cptac.algorithms for the interaction partners of the selected gene
interacting_proteins = al.get_interacting_proteins(gene)

# List the partners one per line under a header
print("Interacting Proteins:")
for partner in interacting_proteins:
    print(partner)

Interacting Proteins:
GALNT8
ST6GAL1
ST6GALNAC2
ST3GAL3
MUC17
GALNT16
GALNT12
MUC21
MUC3A
ST3GAL2
GCNT3
MUC16
C1GALT1
MUC6
MUC7
MUC20
MUC15
MUC4
GALNTL6
ST3GAL4
MUC5B
MUC12
B3GNT6
MUC1
MUC13
MUC5AC
UBC
KIR2DS2
C7orf25
ICE2
FRMD1
PCK2
HBM
ARHGAP12
TIMM50
DDX31
ESR2
AGR2


## Endometrial

### Test for significant comparisons in any of interacting proteins

In [7]:
# Join the mutation calls for `gene` with the selected omics columns of the
# interacting proteins and keep tumor samples only. The explicit copy makes the
# filtered frame independent, so adding a column below never writes into a slice.
protdf = en.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the comparison: every sample whose mutation status
# is not 'Wildtype_Tumor' is labelled 'Mutated'. Computed for the whole column at
# once instead of row by row.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Remove the mutation / sample metadata so only omics columns and Label remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Every remaining column except the grouping label is compared
col_list = [col for col in protdf.columns if col != 'Label']

print("Doing t-test comparisons\n")

# Compare each column in col_list between the two Label groups
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# The notebook treats a None result as "nothing significant was found"
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Endometrial", True, all_significant_comparisons)


GALNT8 did not match any columns in proteomics dataframe. GALNT8_proteomics column inserted, but filled with NaN.
ST3GAL3 did not match any columns in proteomics dataframe. ST3GAL3_proteomics column inserted, but filled with NaN.
MUC17 did not match any columns in proteomics dataframe. MUC17_proteomics column inserted, but filled with NaN.
MUC21 did not match any columns in proteomics dataframe. MUC21_proteomics column inserted, but filled with NaN.
MUC3A did not match any columns in proteomics dataframe. MUC3A_proteomics column inserted, but filled with NaN.
MUC7 did not match any columns in proteomics dataframe. MUC7_proteomics column inserted, but filled with NaN.
MUC20 did not match any columns in proteomics dataframe. MUC20_proteomics column inserted, but filled with NaN.
MUC15 did not match any columns in proteomics dataframe. MUC15_proteomics column inserted, but filled with NaN.
MUC4 did not match any columns in proteomics dataframe. MUC4_proteomics column inserted, but filled 

## Colon

### Test for significant comparisons in any of interacting proteins

In [8]:
# Join the mutation calls for `gene` with the selected omics columns of the
# interacting proteins and keep tumor samples only. The explicit copy makes the
# filtered frame independent, so adding a column below never writes into a slice.
protdf = co.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the comparison: every sample whose mutation status
# is not 'Wildtype_Tumor' is labelled 'Mutated'. Computed for the whole column at
# once instead of row by row.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Remove the mutation / sample metadata so only omics columns and Label remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Every remaining column except the grouping label is compared
col_list = [col for col in protdf.columns if col != 'Label']

print("Doing t-test comparisons\n")

# Compare each column in col_list between the two Label groups
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# The notebook treats a None result as "nothing significant was found"
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Colon", True, all_significant_comparisons)


GALNT8 did not match any columns in proteomics dataframe. GALNT8_proteomics column inserted, but filled with NaN.
ST6GALNAC2 did not match any columns in proteomics dataframe. ST6GALNAC2_proteomics column inserted, but filled with NaN.
ST3GAL3 did not match any columns in proteomics dataframe. ST3GAL3_proteomics column inserted, but filled with NaN.
MUC17 did not match any columns in proteomics dataframe. MUC17_proteomics column inserted, but filled with NaN.
GALNT16 did not match any columns in proteomics dataframe. GALNT16_proteomics column inserted, but filled with NaN.
MUC21 did not match any columns in proteomics dataframe. MUC21_proteomics column inserted, but filled with NaN.
MUC3A did not match any columns in proteomics dataframe. MUC3A_proteomics column inserted, but filled with NaN.
ST3GAL2 did not match any columns in proteomics dataframe. ST3GAL2_proteomics column inserted, but filled with NaN.
MUC16 did not match any columns in proteomics dataframe. MUC16_proteomics column

## Ovarian

### Test for significant comparisons in any of interacting proteins

In [9]:
# Join the mutation calls for `gene` with the selected omics columns of the
# interacting proteins and keep tumor samples only. The explicit copy makes the
# filtered frame independent, so adding a column below never writes into a slice.
protdf = ov.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the comparison: every sample whose mutation status
# is not 'Wildtype_Tumor' is labelled 'Mutated'. Computed for the whole column at
# once instead of row by row.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Remove the mutation / sample metadata so only omics columns and Label remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Keep only the first occurrence of any repeated column name
protdf = protdf.loc[:, ~protdf.columns.duplicated()]

# Every remaining column except the grouping label is compared
col_list = [col for col in protdf.columns if col != 'Label']

print("Doing t-test comparisons\n")

# Compare each column in col_list between the two Label groups
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# The notebook treats a None result as "nothing significant was found"
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Ovarian", True, all_significant_comparisons)



GALNT8 did not match any columns in proteomics dataframe. GALNT8_proteomics column inserted, but filled with NaN.
ST6GALNAC2 did not match any columns in proteomics dataframe. ST6GALNAC2_proteomics column inserted, but filled with NaN.
ST3GAL3 did not match any columns in proteomics dataframe. ST3GAL3_proteomics column inserted, but filled with NaN.
MUC17 did not match any columns in proteomics dataframe. MUC17_proteomics column inserted, but filled with NaN.
MUC21 did not match any columns in proteomics dataframe. MUC21_proteomics column inserted, but filled with NaN.
MUC3A did not match any columns in proteomics dataframe. MUC3A_proteomics column inserted, but filled with NaN.
ST3GAL2 did not match any columns in proteomics dataframe. ST3GAL2_proteomics column inserted, but filled with NaN.
GCNT3 did not match any columns in proteomics dataframe. GCNT3_proteomics column inserted, but filled with NaN.
MUC6 did not match any columns in proteomics dataframe. MUC6_proteomics column inser

# All Proteins: Proteomics

## Endometrial

In [10]:
try:
    print("\nGene: ", gene)

    # Join the mutation calls for `gene` with every column of the selected omics
    # table (no omics_genes filter) and keep tumor samples only. The explicit copy
    # makes the filtered frame independent, so adding a column below never writes
    # into a slice.
    protdf = en.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Binary grouping column for the comparison: every sample whose mutation
    # status is not 'Wildtype_Tumor' is labelled 'Mutated'. Computed for the whole
    # column at once instead of row by row.
    protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

    # Remove the mutation / sample metadata so only omics columns and Label remain
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

    # Every remaining column except the grouping label is compared
    col_list = [col for col in protdf.columns if col != 'Label']

    print("Doing t-test comparisons\n")

    # Compare each column in col_list between the two Label groups
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # The notebook treats a None result as "nothing significant was found"
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

        all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Endometrial", False, all_significant_comparisons)

except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Doing t-test comparisons

No significant comparisons.


## Colon

In [11]:
try:
    print("\nGene: ", gene)

    # Join the mutation calls for `gene` with every column of the selected omics
    # table (no omics_genes filter) and keep tumor samples only. The explicit copy
    # makes the filtered frame independent, so adding a column below never writes
    # into a slice.
    protdf = co.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Binary grouping column for the comparison: every sample whose mutation
    # status is not 'Wildtype_Tumor' is labelled 'Mutated'. Computed for the whole
    # column at once instead of row by row.
    protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

    # Remove the mutation / sample metadata so only omics columns and Label remain
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

    # Every remaining column except the grouping label is compared
    col_list = [col for col in protdf.columns if col != 'Label']

    print("Doing t-test comparisons\n")

    # Compare each column in col_list between the two Label groups
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # The notebook treats a None result as "nothing significant was found"
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

        all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Colon", False, all_significant_comparisons)

except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Doing t-test comparisons

            Comparison       P_Value
0     MRE11_proteomics  1.144045e-10
1      GPX2_proteomics  5.988006e-10
2    ANP32E_proteomics  3.042503e-09
3     KRT20_proteomics  3.969811e-09
4     RAD50_proteomics  9.817991e-09
5     PRDX5_proteomics  5.077905e-08
6   ARHGEF2_proteomics  7.919124e-08
7     NFKB2_proteomics  1.471187e-07
8    MRPS31_proteomics  3.790579e-07
9     PCSK9_proteomics  4.405533e-07
10    HNF4A_proteomics  6.761123e-07
11      VTN_proteomics  8.848484e-07
12   MYO15B_proteomics  9.027894e-07
13    NUBPL_proteomics  9.960365e-07
14   FUNDC1_proteomics  1.025502e-06
15    ACSL5_proteomics  1.289272e-06
16     ACE2_proteomics  1.332648e-06
17   GALNT1_proteomics  1.374055e-06
18     A1CF_proteomics  1.503580e-06
19  TFCP2L1_proteomics  1.666901e-06
20  SLC16A3_proteomics  2.262895e-06
21  PLA2G4A_proteomics  2.320262e-06
22    AIFM1_proteomics  2.482703e-06
23     GYS1_proteomics  2.763100e-06
24   ENGASE_proteomics  2.794402e-0

## Ovarian

In [12]:
print("\nGene: ", gene)

# Join the mutation calls for `gene` with every column of the selected omics
# table (no omics_genes filter) and keep tumor samples only. The explicit copy
# makes the filtered frame independent, so adding a column below never writes
# into a slice.
protdf = ov.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the comparison: every sample whose mutation status
# is not 'Wildtype_Tumor' is labelled 'Mutated'. Computed for the whole column at
# once instead of row by row.
protdf['Label'] = np.where(protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Remove the mutation / sample metadata so only omics columns and Label remain
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])

# Keep only the first occurrence of any repeated column name
protdf = protdf.loc[:, ~protdf.columns.duplicated()]

# Every remaining column except the grouping label is compared
col_list = [col for col in protdf.columns if col != 'Label']

print("Doing t-test comparisons\n")

# Compare each column in col_list between the two Label groups
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# The notebook treats a None result as "nothing significant was found"
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Ovarian", False, all_significant_comparisons)



Gene:  MUC5B
Doing t-test comparisons

No significant comparisons.


### Print all significant comparisons

In [13]:
# Nothing was accumulated -> say so; otherwise render the table with rich display
if len(all_significant_comparisons) == 0:
    print('No Significant Comparisons!')
else:
    display(all_significant_comparisons)

Unnamed: 0,Cancer_Type,Gene,Comparison,Interacting_Protein,P_Value
0,Colon,MUC5B,PCK2_proteomics,True,1.13685e-05
0,Colon,MUC5B,MRE11_proteomics,False,1.144045e-10
1,Colon,MUC5B,GPX2_proteomics,False,5.988006e-10
2,Colon,MUC5B,ANP32E_proteomics,False,3.042503e-09
3,Colon,MUC5B,KRT20_proteomics,False,3.969811e-09
4,Colon,MUC5B,RAD50_proteomics,False,9.817991e-09
5,Colon,MUC5B,PRDX5_proteomics,False,5.077905e-08
6,Colon,MUC5B,ARHGEF2_proteomics,False,7.919124e-08
7,Colon,MUC5B,NFKB2_proteomics,False,1.471187e-07
8,Colon,MUC5B,MRPS31_proteomics,False,3.790579e-07


### Write significant comparisons (if any) to shared CSV file

In [14]:
# Shared results file for this gene; it is read, extended and written back
results_path = gene + '_Trans_Results.csv'

# On a first run the shared file does not exist yet. Start from an empty table
# with the same columns instead of stopping with FileNotFoundError.
try:
    existing_results = pd.read_csv(results_path)
except FileNotFoundError:
    existing_results = pd.DataFrame(columns=all_significant_comparisons.columns)

updated_results = pd.concat([existing_results, all_significant_comparisons], sort=False)

# Re-running the notebook would otherwise append the same comparisons again.
# Keep only the most recent row for each cancer / gene / comparison / interacting
# combination. P_Value is deliberately left out of the key because a float that
# went through the CSV may not compare equal to the freshly computed one.
key_columns = ['Cancer_Type', 'Gene', 'Comparison', 'Interacting_Protein']
updated_results = updated_results.drop_duplicates(subset=key_columns, keep='last')

updated_results.to_csv(path_or_buf=results_path, index=False)