In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator




def wrap_ttest(df, label_column, comparison_columns=None, alpha=.05, return_all=False, correction_method='bonferroni', mincount=3, pval_return_corrected=True):
    try:
        '''Verify precondition that label column exists and has exactly 2 unique values'''
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None
        
        '''Partition dataframe into two sets, one for each of the two unique values from the label column'''
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]

        '''If no comparison columns specified, use all columns except the specified labed column'''
        if not comparison_columns:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)

        '''Determine the number of real valued columns on which we will do t-tests'''
        number_of_comparisons = len(comparison_columns)

        '''Store comparisons and p-values in two arrays'''
        comparisons = []
        pvals = []
        
        '''Loop through each comparison column, perform the t-test, and record the p-val'''
        
        for column in comparison_columns:
            if len(partition1[column].dropna(axis=0)) <= mincount:
                continue
            elif len(partition2[column].dropna(axis=0)) <= mincount:
                continue
            else:
                stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
                comparisons.append(column)
                pvals.append(pval)
                
        '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]

        '''Format results in a pandas dataframe'''
        results_df = pd.DataFrame(columns=['Comparison','P_Value'])

        '''If return all, add all comparisons and p-values to dataframe'''
        if return_all:
            if pval_return_corrected:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = results[1]

            else:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = pvals

            '''Else only add significant comparisons'''
        else:
            for i in range(0, len(reject)):
                if reject[i]:
                    if pval_return_corrected:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':results[1][i]}, ignore_index=True)
                    else:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':pvals[i]}, ignore_index=True)


        '''Sort dataframe by ascending p-value'''
        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)

        '''If results df is not empty, return it, else return None'''
        if len(results_df) > 0:
            return results_df
        else:
            print("Empty")
            return results[1]


    except:
        print("Incorrectly Formatted Dataframe!")
        return results[1]

  import pandas.util.testing as tm


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u



In [3]:
en = cptac.Endometrial()
br = cptac.Brca()
cl = cptac.Colon()

                                                

In [4]:
gene = 'PIK3CA'

# Endometrial

In [5]:
mut = en.get_genotype_all_vars(gene, mutation_hotspot=['E542K', 'E545K', 'H1047R'])
phos = en.get_phosphoproteomics(tissue_type="tumor")
phos  = cptac.utils.reduce_multiindex(phos, flatten=True)
phos



Name,AAAS_S495,AAAS_S541,AAAS_Y485,AACS_S618,AAED1_S12,AAGAB_S310,AAGAB_S311,AAK1_S14,AAK1_S18,AAK1_S20,...,ZZZ3_S397,ZZZ3_S411,ZZZ3_S420,ZZZ3_S424,ZZZ3_S426,ZZZ3_S468,ZZZ3_S89,ZZZ3_T415,ZZZ3_T418,ZZZ3_Y399
C3L-00006,,,,-0.881,-1.810,,,,-0.2420,-0.2420,...,0.18400,,,,-0.20500,,,,,
C3L-00008,,,,,0.084,,,-1.1100,-0.3830,-1.0900,...,-0.17100,,,-0.393,-0.17100,,0.29,,0.16050,-0.06350
C3L-00032,-0.202,,,,-1.880,,,,0.3820,-0.0416,...,,,,,,,,,,
C3L-00090,-0.002,,-0.4070,,,,,,,-0.5550,...,0.13970,,,,-0.55900,,,,,0.29800
C3L-00098,0.556,-0.0461,,,0.941,,0.429,0.3620,0.6970,-0.0529,...,-0.15875,,,0.196,0.06175,,,,,-0.29000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520,0.398,,-0.0901,,,,,,,-0.0845,...,-0.14750,,,,-0.07520,,,,,-0.13800
C3N-01521,0.750,0.7040,,,0.209,,-0.229,0.2215,-0.1200,-0.5100,...,0.33600,,,0.442,0.42200,,,,,0.27000
C3N-01537,0.526,,,,-0.967,,,0.4700,-0.3570,-0.3140,...,-0.05860,,-0.559,,0.30900,,,,,
C3N-01802,,,,,0.180,,,-0.2250,0.7010,0.1400,...,-0.13200,,,-0.920,-0.13200,,0.00,,-0.04685,0.20165


In [6]:
joined = mut.join(phos)
joined['Mutation'].unique()

#select samples containing hotspot mutations
en_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

endo_hotspot = en_hotspot_df.append(wildtype)
endo_hotspot = endo_hotspot.drop(columns = ["Mutation_Status", "Location"])


# Keep two values to compare
compare = ['Wildtype_Tumor','Missense_Mutation']
get = endo_hotspot['Mutation'].isin(compare)
missense_wt = endo_hotspot[get]
missense_wt['Mutation'].value_counts()


# Step 2 get the difference in medians
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


In [7]:
missense_wt

Name,Mutation,AAAS_S495,AAAS_S541,AAAS_Y485,AACS_S618,AAED1_S12,AAGAB_S310,AAGAB_S311,AAK1_S14,AAK1_S18,...,ZZZ3_S397,ZZZ3_S411,ZZZ3_S420,ZZZ3_S424,ZZZ3_S426,ZZZ3_S468,ZZZ3_S89,ZZZ3_T415,ZZZ3_T418,ZZZ3_Y399
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,,,,-0.881,-1.81,,,,-0.242,...,0.184,,,,-0.205,,,,,
C3L-00032,Missense_Mutation,-0.202,,,,-1.88,,,,0.382,...,,,,,,,,,,
C3L-00362,Missense_Mutation,,,,0.844,0.279,,,,-0.355,...,0.0805,,,,-0.109,,,,,
C3L-00601,Missense_Mutation,0.594,,,0.0,-0.982,-0.276,,,-0.109,...,0.134,,,0.32,0.32,,,,,
C3L-00605,Missense_Mutation,-0.162,,0.14,,,,,,,...,0.189,,,,0.386,,,,,0.596
C3L-00921,Missense_Mutation,-0.525,,,0.894,-1.4,,-0.255,0.271,0.761,...,-0.144,,,,-0.22,,,0.336,,-0.131
C3L-00947,Missense_Mutation,0.46,,,,-0.265,,,,0.608,...,0.483,,,,0.483,,,,,
C3N-00323,Missense_Mutation,0.532,,,,-0.0342,,,,-0.447,...,0.628,,0.2,-0.806,-0.806,,,,0.635,0.2
C3N-00324,Missense_Mutation,-0.791,,,,,,,,0.281,...,-0.3435,,,,-0.0994,,,,,
C3N-00383,Missense_Mutation,-0.0591,,,,-1.77,,,,-0.533,...,,,,,,,,,,


In [8]:
wrap_ttest(missense_wt, "Mutation", return_all=True, mincount=7)

Unnamed: 0,Comparison,P_Value
0,CAMK2D_S506,0.485209
1,AAAS_S495,1.000000
2,RAB12_S106,1.000000
3,RAB11FIP5_T287,1.000000
4,RAB11FIP5_S538,1.000000
...,...,...
22111,HDGFL2_S676,1.000000
22112,HDGFL2_S649,1.000000
22113,HDGFL2_S637,1.000000
22114,HDGFL3_S122,1.000000


# Brca

In [9]:
phosphoproteomics = br.get_phosphoproteomics(tissue_type="tumor")
fil = phosphoproteomics.columns.get_level_values('Name').duplicated(False)
duplicates = phosphoproteomics[phosphoproteomics.columns[fil]]
no_duplicates = phosphoproteomics[phosphoproteomics.columns[~fil]]
no_duplicates
duplicates

Name,AAAS,AAAS,AAGAB,AAGAB,AAK1,AAK1,AAK1,AAK1,AAK1,AAK1,...,ZZEF1,ZZZ3,ZZZ3,ZZZ3,ZZZ3,ZZZ3,ZZZ3,ZZZ3,ZZZ3,ZZZ3
Site,S495,S541,S310S311,S311,S14,S18,S21,S618T620S623,S623S624,S624,...,S2526,S113,S314,S381,S391,S397,S397T428N429,S606,S82,S89
Peptide,IAHIPLYFVNAQFPRFsPVLGR,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,AFWMAIGGDRDEIEGLssDEEH,AFWMAIGGDRDEIEGLSsDEEH,EQGGsGLGSGSSGGGGSTSGLGSGYIGR,REQGGSGLGsGSSGGGGSTSGLGSGYIGR,REQGGSGLGSGSsGGGGSTSGLGSGYIGR,VGsLtPPsSPK,VGSLTPPssPK,VGSLTPPSsPK,...,sLRLEEQSAK,RQTEPVsPVLK,IVTACLPVEHVNQLTTEPATGPFSETQSSLRDsEEEVDVVGDSSASK,YTLRTsPR,AAPTRGsPTK,NSsPYRENGQFEENNLSPNETNATVSDNVSQSPTNPGEISQNEK,NSsPYRENGQFEENNLSPNETNATVSDNVSQSPtnPGEISQNEK,VGLPARPKsPLDPK,ESWVsPR,GLsSSEK
Database_ID,NP_056480.1|NP_001166937.1,NP_056480.1|NP_001166937.1,NP_078942.3|NP_001258815.1,NP_078942.3|NP_001258815.1,NP_055726.3,NP_055726.3,NP_055726.3,NP_055726.3,NP_055726.3,NP_055726.3,...,NP_055928.3,NP_056349.1,NP_056349.1,NP_056349.1,NP_056349.1,NP_056349.1,NP_056349.1,NP_056349.1|NP_001295166.1,NP_056349.1,NP_056349.1
CPT000814,1.9431,,0.0127,-0.4495,,-1.1852,-0.8333,0.0863,,-1.8617,...,-1.7098,0.0735,0.2238,-0.6702,-8.8556,,-0.8493,-0.1744,-0.0273,-6.8916
CPT001846,0.2274,,0.6895,-0.7680,,0.6895,0.3903,,,-1.4208,...,,-0.8199,,-0.8790,-4.0222,,1.1806,-1.2700,-0.8983,-3.6228
X01BR001,-2.2853,-0.8967,0.4842,-1.2458,-0.0310,0.8805,0.7448,1.0649,,0.2678,...,,0.1092,,,-1.6299,-0.2017,-0.3953,-2.1328,-0.8815,-2.1191
X01BR008,1.3714,,-0.6224,-1.2732,-1.1351,-0.8652,0.0031,0.2173,-0.4208,-0.5652,...,0.7110,0.3824,,,1.3333,0.7840,0.1078,0.4824,,-0.9748
X01BR009,0.2682,,-0.6207,-2.3561,-0.3078,0.3785,0.6990,-0.1082,0.2908,-0.8399,...,-1.4189,-0.7779,,,0.5267,-0.6661,0.5539,-0.3728,,-0.8354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,0.9424,,0.0477,-0.3365,-0.4697,-0.4213,0.1117,-0.5217,0.4458,-1.8577,...,0.9147,0.2035,,,0.9078,0.7330,1.5429,0.6015,,-2.2315
X21BR002,0.2955,,0.1169,0.5654,-0.7218,-0.2368,0.2974,-0.0491,0.2737,-0.9643,...,1.0358,-1.2871,,,-0.4611,-0.3007,0.4122,-0.6325,,-0.4046
X21BR010,-0.1201,-0.1477,0.8338,0.2280,-0.0877,-0.4698,-0.2266,-0.0214,0.7840,-0.2750,...,0.1697,-0.4731,0.1700,-1.5727,-2.2380,0.9095,0.3844,0.1160,-3.4806,0.0170
X22BR005,-0.3378,3.2472,1.0000,1.8410,,-0.6455,0.6330,0.2228,,,...,,1.6131,0.9179,,1.3874,,0.0792,1.0615,,0.3436


In [10]:
#DEALING WITH DUPLICATES
#get the pval and min for duplicates

duplicates = br.reduce_multiindex(duplicates, flatten=True)

# We need to figure which of databaseID we want to use, for each protein


#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest

mut_type = br.get_genotype_all_vars(gene)
joined = mut_type.join(duplicates)
joined

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

brace_hotspot = br_hotspot_df.append(wildtype)
brace_hotspot = brace_hotspot.drop(columns = ["Mutation_Status", "Location"])
brace_hotspot

# Keep two values to compare
compare = ['Wildtype_Tumor','Missense_Mutation']
get = brace_hotspot['Mutation'].isin(compare)
missense_wt = brace_hotspot[get]
missense_wt['Mutation'].value_counts()



#step 3 do a t test for every gene and report the p-value
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 join median and p value together
dup_df = median_diff.join(t_test)


duplicate_gene_names = list(set(duplicates.columns.get_level_values('Name'))) #get a list of genes names that are duplicated
# duplicate_gene_names

for name in duplicate_gene_names:
    gene_duplicate = dup_df[dup_df.index.str.contains(name)]
    min_row = gene_duplicate[gene_duplicate.P_Value == gene_duplicate.P_Value.min()]
    new_no_dup_df=no_dup_df.append(min_row)
new_no_dup_df

#reformat the table by getting rid of databaseIDs so we can join it to Endo and BRca
split = no_dup_df.index.str.split('_').str[0:2]
no_dup_df['new_index'] = split
# '_'.join(split)
no_dup_df['new_index'] = no_dup_df['new_index'].str.join('_')


brca_df = no_dup_df.set_index("new_index")


#are there duplicates in the index
brca_df


AttributeError: 'Brca' object has no attribute 'reduce_multiindex'

In [None]:
phosphoproteomics.columns.get_level_values('Name')

In [None]:
mut = br.get_genotype_all_vars(gene, mutation_hotspot=['E542K', 'E545K', 'H1047R'])
phos = br.get_phosphoproteomics(tissue_type="tumor")


In [None]:
dropped = cptac.utils.reduce_multiindex(phos, ["Database_ID", "Peptide"])
dropped

dropped = br.reduce_multiindex(dropped, flatten=True)
dropped

fil = dropped.columns.get_level_values('Name').duplicated(False)
duplicates = phos[phos.columns[fil]]
no_duplicates = dropped[dropped.columns[~fil]]

len(duplicates.columns.get_level_values('Name').unique())
duplicates

In [None]:
#FOR NO DUPLICATES
#get the pval and min for no duplicates

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
mut_type = br.get_genotype_all_vars(gene)
# mut_type = mut_type.drop(columns = ["Mutation_Status", "Location"])

joined = mut_type.join(no_duplicates)
joined

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

braca_hotspot = br_hotspot_df.append(wildtype)
braca_hotspot = braca_hotspot.drop(columns = ["Mutation_Status", "Location"])

braca_hotspot


# Keep two values to compare
compare = ['Wildtype_Tumor','Missense_Mutation']
get = braca_hotspot['Mutation'].isin(compare)
missense_wt = braca_hotspot[get]
missense_wt['Mutation'].value_counts()

# Step 2 get the difference in medians
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


no_dup_d = {}

for prot in no_duplicates:
    dif_brca = wt_med[prot] - missense_med[prot]
    no_dup_d[prot] = dif_brca
#     print(wt_med[prot], '-', missense_med[prot], '=',dif_endo)

median_diff = pd.DataFrame.from_dict(no_dup_d, orient='index', columns=['Difference_In_Median'])

#step 3 do a t test for every gene and report the p-value
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 join median and p value together
no_dup_df = median_diff.join(t_test)
no_dup_df

# Colon

In [None]:
mut = cl.get_genotype_all_vars(gene, mutation_hotspot=['E542K', 'E545K', 'H1047R'])
phos = cl.get_phosphoproteomics(tissue_type="tumor")
# phos  = cl.reduce_multiindex(phos, flatten=True)
phos

In [None]:
dropped = cl.reduce_multiindex(phos, 'Database_ID')
dropped = cl.reduce_multiindex(dropped, flatten=True)

fil = dropped.columns.get_level_values('Name').duplicated(False)
duplicates = phos[phos.columns[fil]] #for duplicates we want to keep all levels, so grab from phos instead of dropped
no_duplicates = dropped[dropped.columns[~fil]]

len(duplicates.columns.get_level_values('Name').unique())
# cl.reduce_multiindex(duplicates, flatten=True)
duplicates

In [None]:
#FOR NO DUPLICATES
#get the pval and min for no duplicates

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
mut_type = cl.get_genotype_all_vars(gene)
# mut_type = mut_type.drop(columns = ["Mutation_Status", "Location"])

joined = mut_type.join(no_duplicates)
joined

#select samples containing hotspot mutations
cl_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

colon_hotspot = cl_hotspot_df.append(wildtype)
colon_hotspot = colon_hotspot.drop(columns = ["Mutation_Status", "Location"])

colon_hotspot

# Keep two values to compare
compare = ['Wildtype_Tumor','nonsynonymous SNV']
get = colon_hotspot['Mutation'].isin(compare)
missense_wt = colon_hotspot[get]
missense_wt['Mutation'].value_counts()

# Step 2 get the difference in medians
missense = missense_wt[missense_wt.Mutation == "nonsynonymous SNV"] #get all missense_mutations
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


no_dup_d = {}

for prot in no_duplicates:
    dif_colon = wt_med[prot] - missense_med[prot]
    no_dup_d[prot] = dif_colon
#     print(wt_med[prot], '-', missense_med[prot], '=',dif_endo)

median_diff = pd.DataFrame.from_dict(no_dup_d, orient='index', columns=['Difference_In_Median'])

#step 3 do a t test for every gene and report the p-value
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 join median and p value together
no_dup_df = median_diff.join(t_test)
no_dup_df

In [None]:
#DEALING WITH DUPLICATES
#get the pval and min for duplicates

duplicates = cl.reduce_multiindex(duplicates, flatten=True)

# We need to figure which of databaseID we want to use, for each protein


#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest

mut_type = cl.get_genotype_all_vars(gene)
joined = mut_type.join(duplicates)
joined

#select samples containing hotspot mutations
cl_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

colon_hotspot = cl_hotspot_df.append(wildtype)
colon_hotspot = colon_hotspot.drop(columns = ["Mutation_Status", "Location"])
colon_hotspot

# Keep two values to compare
compare = ['Wildtype_Tumor','nonsynonymous SNV']
get = colon_hotspot['Mutation'].isin(compare)
missense_wt = colon_hotspot[get]
missense_wt['Mutation'].value_counts()

# Step 2 get the difference in medians
missense = missense_wt[missense_wt.Mutation == "nonsynonymous SNV"] #get all missense_mutations
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype

duplicates

duplicates_d = {}

for prot in duplicates:
    dif_brca = wt_med[prot] - missense_med[prot]
    duplicates_d[prot] = dif_brca
#     print(wt_med[prot], '-', missense_med[prot], '=',dif_endo)

median_diff = pd.DataFrame.from_dict(duplicates_d, orient='index', columns=['Difference_In_Median'])
median_diff

#step 3 do a t test for every gene and report the p-value
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 join median and p value together
dup_df = median_diff.join(t_test)


duplicate_gene_names = list(set(duplicates.columns.get_level_values('Name'))) #get a list of genes names that are duplicated
# duplicate_gene_names

for name in duplicate_gene_names:
    gene_duplicate = dup_df[dup_df.index.str.contains(name)]
    min_row = gene_duplicate[gene_duplicate.P_Value == gene_duplicate.P_Value.min()]
    new_no_dup_df=no_dup_df.append(min_row)
new_no_dup_df

#reformat the table by getting rid of databaseIDs so we can join it to Endo and BRca
split = no_dup_df.index.str.split('_').str[0:2]
no_dup_df['new_index'] = split
# '_'.join(split)
no_dup_df['new_index'] = no_dup_df['new_index'].str.join('_')


colon_df = no_dup_df.set_index("new_index")


#are there duplicates in the index
colon_df



# Join Endo, Brca, and Colon together


In [None]:
brca_df=brca_df.rename_axis(None)

In [None]:
brca_endo = brca_df.join(endo_df, lsuffix='_Brca', rsuffix='_Endo')
brca_endo

In [None]:
final_table = brca_endo.join(colon_df, rsuffix='_colon')
final_table

In [None]:
#only keep significant pvals
significant = final_table.loc[(final_table['P_Value_Brca'] <= .05) |
                (final_table['P_Value_Endo'] <= .05) |
                (final_table['P_Value'] <= .05)]
significant

In [None]:
#significant.to_csv("phosphoproteomics_trans.csv")