# MUC5B Mutation Trans Effect on Phosphoproteomics

This notebook analyzes the trans effect of MUC5B mutation on the phosphoproteomics of its interacting proteins, and of all other proteins, in Endometrial, Colon, and Ovarian cancer.

### Library Imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas and library deprecation warnings. Consider narrowing the
# filter to specific categories or messages.
warnings.filterwarnings("ignore")

import cptac
import cptac.algorithms as al

# One dataset object per cancer type analyzed in this notebook.
en = cptac.Endometrial()
co = cptac.Colon()
ov = cptac.Ovarian()

                                    

### Select Gene

In [2]:
# Gene whose mutation status defines the Mutated / Wildtype groups; also used
# to build the mutation column names and the results CSV file name.
gene = "MUC5B"

### Investigate Proteomics, Phosphoproteomics, Acetylproteomics, or Transcriptomics

In [3]:
# Name of the omics table to test. It is passed as `omics_df_name` to
# `join_omics_to_mutations` in every comparison cell of this notebook.
# Leave exactly one of the assignments uncommented.
#omics = "proteomics"
#omics = "transcriptomics"
omics = "phosphoproteomics"
#omics = "acetylproteomics"

### Track all significant comparisons in a DataFrame

In [4]:
# Running table of every significant comparison found by this notebook.
# It starts empty and is extended after each cancer-type test.
all_significant_comparisons = pd.DataFrame(
    columns=[
        'Cancer_Type',
        'Gene',
        'Comparison',
        'Interacting_Protein',
        'P_Value',
    ]
)

In [5]:
def add_to_all_significant_comparisons(df, cancer, interacting, all_sig_comp, gene_name=None):
    """Append one batch of significant comparisons to the running results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Significant comparisons for a single test (the notebook passes the
        non-None result of ``al.wrap_ttest``). It is NOT modified.
    cancer : str
        Value written to the ``Cancer_Type`` column of every row of ``df``.
    interacting : bool
        Value written to the ``Interacting_Protein`` column of every row.
    all_sig_comp : pandas.DataFrame
        Results accumulated so far. It is NOT modified.
    gene_name : str, optional
        Value written to the ``Gene`` column. When omitted, the notebook-level
        ``gene`` variable is used, which matches the original behavior.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``all_sig_comp`` followed by the
        annotated rows of ``df``. Row indices are carried over unchanged.
    """
    if gene_name is None:
        gene_name = gene

    # `assign` returns a new frame, so the caller's `df` does not silently
    # gain the three annotation columns (the original aliased `df` and
    # mutated it in place).
    expanded = df.assign(
        Gene=gene_name,
        Cancer_Type=cancer,
        Interacting_Protein=interacting,
    )

    return pd.concat([all_sig_comp, expanded], sort=False)

# Interacting Proteins: Phosphoproteomics

### Generate interacting protein list

In [6]:
# Look up the proteins reported as interacting with the selected gene.
interacting_proteins = al.get_interacting_proteins(gene)

# Print a header followed by one protein name per line.
print("\n".join(["Interacting Proteins:", *map(str, interacting_proteins)]))

Interacting Proteins:
GALNT8
ST6GAL1
ST6GALNAC2
ST3GAL3
MUC17
GALNT16
GALNT12
MUC21
MUC3A
ST3GAL2
GCNT3
MUC16
C1GALT1
MUC6
MUC7
MUC20
MUC15
MUC4
GALNTL6
ST3GAL4
MUC5B
MUC12
B3GNT6
MUC1
MUC13
MUC5AC
UBC
KIR2DS2
C7orf25
ICE2
FRMD1
PCK2
HBM
ARHGAP12
TIMM50
DDX31
ESR2
AGR2


## Endometrial

### Test for significant comparisons in any of the interacting proteins

In [7]:
# Endometrial: compare the selected omics of the interacting proteins between
# tumors with and without a mutation in `gene`.

# Join mutation calls for `gene` with the omics columns of the interacting
# proteins and keep tumor samples only. `.copy()` makes the subset an
# independent frame so the column assignment below does not write into a slice.
protdf = en.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-tests: every status other than
# 'Wildtype_Tumor' counts as 'Mutated'. Vectorized replacement for the
# row-by-row iterrows loop; each row is labelled from its own status value.
status_col = gene + "_Mutation_Status"
protdf['Label'] = np.where(protdf[status_col] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain.
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])

# Every remaining column except the grouping column is tested.
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the t-tests on the formatted frame, grouped by 'Label'.
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A non-None result is treated as "something significant was found":
# print it and add it to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Endometrial", True, all_significant_comparisons)


GALNT8 did not match any columns in phosphoproteomics dataframe. GALNT8_phosphoproteomics column inserted, but filled with NaN.
ST6GAL1 did not match any columns in phosphoproteomics dataframe. ST6GAL1_phosphoproteomics column inserted, but filled with NaN.
ST6GALNAC2 did not match any columns in phosphoproteomics dataframe. ST6GALNAC2_phosphoproteomics column inserted, but filled with NaN.
MUC17 did not match any columns in phosphoproteomics dataframe. MUC17_phosphoproteomics column inserted, but filled with NaN.
GALNT16 did not match any columns in phosphoproteomics dataframe. GALNT16_phosphoproteomics column inserted, but filled with NaN.
MUC21 did not match any columns in phosphoproteomics dataframe. MUC21_phosphoproteomics column inserted, but filled with NaN.
MUC3A did not match any columns in phosphoproteomics dataframe. MUC3A_phosphoproteomics column inserted, but filled with NaN.
ST3GAL2 did not match any columns in phosphoproteomics dataframe. ST3GAL2_phosphoproteomics column

## Colon

### Test for significant comparisons in any of the interacting proteins

In [8]:
# Colon: compare the selected omics of the interacting proteins between
# tumors with and without a mutation in `gene`.

# Join mutation calls for `gene` with the omics columns of the interacting
# proteins and keep tumor samples only. `.copy()` makes the subset an
# independent frame so the column assignment below does not write into a slice.
protdf = co.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-tests: every status other than
# 'Wildtype_Tumor' counts as 'Mutated'. Vectorized replacement for the
# row-by-row iterrows loop; each row is labelled from its own status value.
status_col = gene + "_Mutation_Status"
protdf['Label'] = np.where(protdf[status_col] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain.
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])

# Every remaining column except the grouping column is tested.
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the t-tests on the formatted frame, grouped by 'Label'.
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A non-None result is treated as "something significant was found":
# print it and add it to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Colon", True, all_significant_comparisons)


GALNT8 did not match any columns in phosphoproteomics dataframe. GALNT8_phosphoproteomics column inserted, but filled with NaN.
ST6GAL1 did not match any columns in phosphoproteomics dataframe. ST6GAL1_phosphoproteomics column inserted, but filled with NaN.
ST6GALNAC2 did not match any columns in phosphoproteomics dataframe. ST6GALNAC2_phosphoproteomics column inserted, but filled with NaN.
ST3GAL3 did not match any columns in phosphoproteomics dataframe. ST3GAL3_phosphoproteomics column inserted, but filled with NaN.
MUC17 did not match any columns in phosphoproteomics dataframe. MUC17_phosphoproteomics column inserted, but filled with NaN.
GALNT16 did not match any columns in phosphoproteomics dataframe. GALNT16_phosphoproteomics column inserted, but filled with NaN.
GALNT12 did not match any columns in phosphoproteomics dataframe. GALNT12_phosphoproteomics column inserted, but filled with NaN.
MUC21 did not match any columns in phosphoproteomics dataframe. MUC21_phosphoproteomics co

## Ovarian

In [9]:
# Ovarian: compare the selected omics of the interacting proteins between
# tumors with and without a mutation in `gene`.

# Join mutation calls for `gene` with the omics columns of the interacting
# proteins and keep tumor samples only. `.copy()` makes the subset an
# independent frame so the column assignment below does not write into a slice.
protdf = ov.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics, omics_genes=interacting_proteins)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-tests: every status other than
# 'Wildtype_Tumor' counts as 'Mutated'. Vectorized replacement for the
# row-by-row iterrows loop; each row is labelled from its own status value.
status_col = gene + "_Mutation_Status"
protdf['Label'] = np.where(protdf[status_col] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain.
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])

# Keep only the first occurrence of any repeated column name.
protdf = protdf.loc[:, ~protdf.columns.duplicated()]

# Every remaining column except the grouping column is tested.
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the t-tests on the formatted frame, grouped by 'Label'.
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A non-None result is treated as "something significant was found":
# print it and add it to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Ovarian", True, all_significant_comparisons)


GALNT8 did not match any columns in phosphoproteomics dataframe. GALNT8_phosphoproteomics column inserted, but filled with NaN.
ST6GAL1 did not match any columns in phosphoproteomics dataframe. ST6GAL1_phosphoproteomics column inserted, but filled with NaN.
ST6GALNAC2 did not match any columns in phosphoproteomics dataframe. ST6GALNAC2_phosphoproteomics column inserted, but filled with NaN.
ST3GAL3 did not match any columns in phosphoproteomics dataframe. ST3GAL3_phosphoproteomics column inserted, but filled with NaN.
MUC17 did not match any columns in phosphoproteomics dataframe. MUC17_phosphoproteomics column inserted, but filled with NaN.
GALNT16 did not match any columns in phosphoproteomics dataframe. GALNT16_phosphoproteomics column inserted, but filled with NaN.
ST3GAL2 did not match any columns in phosphoproteomics dataframe. ST3GAL2_phosphoproteomics column inserted, but filled with NaN.
GCNT3 did not match any columns in phosphoproteomics dataframe. GCNT3_phosphoproteomics co

# All Proteins: Phosphoproteomics

## Endometrial

In [10]:
# Endometrial: compare every column of the selected omics table between
# tumors with and without a mutation in `gene`.
# NOTE(review): the broad `except Exception` below reports only the message,
# not the traceback, and lets the notebook continue; the Ovarian all-proteins
# cell has no such guard. Confirm this best-effort behavior is intended.
try:
    print("\nGene: ", gene)

    # No `omics_genes` argument is passed, so the join is not restricted to a
    # gene list. Keep tumor samples only; `.copy()` makes the subset an
    # independent frame so the column assignment below does not write into a slice.
    protdf = en.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Binary grouping column for the t-tests: every status other than
    # 'Wildtype_Tumor' counts as 'Mutated'. Vectorized replacement for the
    # row-by-row iterrows loop.
    status_col = gene + "_Mutation_Status"
    protdf['Label'] = np.where(protdf[status_col] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

    # Drop the mutation / sample metadata so only omics columns and 'Label' remain.
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])

    # Every remaining column except the grouping column is tested.
    col_list = list(protdf.columns)
    col_list.remove('Label')

    print("Doing t-test comparisons\n")

    # Run the t-tests on the formatted frame, grouped by 'Label'.
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # A non-None result is treated as "something significant was found":
    # print it and add it to the notebook-wide results table.
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

        all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Endometrial", False, all_significant_comparisons)

except Exception as e:
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Doing t-test comparisons

No significant comparisons.


## Colon

In [11]:
# Colon: compare every column of the selected omics table between
# tumors with and without a mutation in `gene`.
# NOTE(review): the broad `except Exception` below reports only the message,
# not the traceback, and lets the notebook continue; the Ovarian all-proteins
# cell has no such guard. Confirm this best-effort behavior is intended.
try:
    print("\nGene: ", gene)

    # No `omics_genes` argument is passed, so the join is not restricted to a
    # gene list. Keep tumor samples only; `.copy()` makes the subset an
    # independent frame so the column assignment below does not write into a slice.
    protdf = co.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Binary grouping column for the t-tests: every status other than
    # 'Wildtype_Tumor' counts as 'Mutated'. Vectorized replacement for the
    # row-by-row iterrows loop.
    status_col = gene + "_Mutation_Status"
    protdf['Label'] = np.where(protdf[status_col] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

    # Drop the mutation / sample metadata so only omics columns and 'Label' remain.
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])

    # Every remaining column except the grouping column is tested.
    col_list = list(protdf.columns)
    col_list.remove('Label')

    print("Doing t-test comparisons\n")

    # Run the t-tests on the formatted frame, grouped by 'Label'.
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # A non-None result is treated as "something significant was found":
    # print it and add it to the notebook-wide results table.
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

        all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Colon", False, all_significant_comparisons)

except Exception as e:
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Doing t-test comparisons

                                Comparison       P_Value
0   MYO15B_S1025__Q96JP2_phosphoproteomics  3.506874e-08
1     FOXK2_S428__Q01167_phosphoproteomics  1.553143e-07
2    HUWE1_S1395__Q7Z6Z7_phosphoproteomics  2.311105e-07
3    SETD1B_S211__Q9UPS6_phosphoproteomics  3.421412e-07
4      SATB2_S39__Q9UPW6_phosphoproteomics  6.640062e-07
5        PKM_S37__P14618_phosphoproteomics  7.506440e-07
6     C5AR1_S327__P21730_phosphoproteomics  1.043313e-06
7      BCL9_S917__O00512_phosphoproteomics  1.087465e-06
8     HNF4A_S167__P41235_phosphoproteomics  1.233225e-06
9    TGFBR2_S553__P37173_phosphoproteomics  1.270564e-06
10    CEP68_S478__Q76N32_phosphoproteomics  1.474940e-06





## Ovarian

In [12]:

# Ovarian: compare every column of the selected omics table between
# tumors with and without a mutation in `gene`.
print("\nGene: ", gene)

# No `omics_genes` argument is passed, so the join is not restricted to a
# gene list. Keep tumor samples only, then keep only the first occurrence of
# any repeated column name. `.copy()` makes the result an independent frame
# so the column assignment below does not write into a slice.
protdf = ov.join_omics_to_mutations(mutations_genes=[gene], omics_df_name=omics)
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor']
protdf = protdf.loc[:, ~protdf.columns.duplicated()].copy()

# Binary grouping column for the t-tests: every status other than
# 'Wildtype_Tumor' counts as 'Mutated'. Vectorized replacement for the
# row-by-row iterrows loop.
status_col = gene + "_Mutation_Status"
protdf['Label'] = np.where(protdf[status_col] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

# Drop the mutation / sample metadata so only omics columns and 'Label' remain.
protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])

# Every remaining column except the grouping column is tested.
col_list = list(protdf.columns)
col_list.remove('Label')

print("Doing t-test comparisons\n")

# Run the t-tests on the formatted frame, grouped by 'Label'.
wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

# A non-None result is treated as "something significant was found":
# print it and add it to the notebook-wide results table.
if wrap_results is not None:
    print(wrap_results)
    print("\n\n")

    all_significant_comparisons = add_to_all_significant_comparisons(wrap_results, "Ovarian", False, all_significant_comparisons)


Gene:  MUC5B
Doing t-test comparisons

No significant comparisons.


### Print all significant comparisons

In [13]:
# Show the accumulated results table, or a short message when it has no rows.
if len(all_significant_comparisons) == 0:
    print('No Significant Comparisons!')
else:
    display(all_significant_comparisons)

Unnamed: 0,Cancer_Type,Gene,Comparison,Interacting_Protein,P_Value
0,Colon,MUC5B,MYO15B_S1025__Q96JP2_phosphoproteomics,False,3.506874e-08
1,Colon,MUC5B,FOXK2_S428__Q01167_phosphoproteomics,False,1.553143e-07
2,Colon,MUC5B,HUWE1_S1395__Q7Z6Z7_phosphoproteomics,False,2.311105e-07
3,Colon,MUC5B,SETD1B_S211__Q9UPS6_phosphoproteomics,False,3.421412e-07
4,Colon,MUC5B,SATB2_S39__Q9UPW6_phosphoproteomics,False,6.640062e-07
5,Colon,MUC5B,PKM_S37__P14618_phosphoproteomics,False,7.50644e-07
6,Colon,MUC5B,C5AR1_S327__P21730_phosphoproteomics,False,1.043313e-06
7,Colon,MUC5B,BCL9_S917__O00512_phosphoproteomics,False,1.087465e-06
8,Colon,MUC5B,HNF4A_S167__P41235_phosphoproteomics,False,1.233225e-06
9,Colon,MUC5B,TGFBR2_S553__P37173_phosphoproteomics,False,1.270564e-06


### Write significant comparisons (if any) to shared CSV file

In [14]:
# Merge this notebook's significant comparisons into the per-gene results CSV
# (described as a shared file in the section heading above).
results_path = gene + '_Trans_Results.csv'

try:
    existing_results = pd.read_csv(results_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run for this gene, or an empty placeholder file: start from an
    # empty table with the expected columns instead of raising.
    existing_results = pd.DataFrame(columns=all_significant_comparisons.columns)

updated_results = pd.concat([existing_results, all_significant_comparisons], sort=False)

# Re-running the notebook would otherwise append the same comparisons again.
# A comparison is identified by these four columns; the most recent row wins.
# P_Value is deliberately left out of the key so float formatting in the CSV
# cannot defeat the match.
updated_results = updated_results.drop_duplicates(
    subset=['Cancer_Type', 'Gene', 'Comparison', 'Interacting_Protein'],
    keep='last',
)

updated_results.to_csv(path_or_buf=results_path, index=False)