In [37]:
import pandas as pd
import numpy as np
import binarization_functions as bf

## Step 1a: Load the data

In [38]:
#Read the endometrial table first, then the colon table, from the Data directory
endo, colon = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/proteomic_sig_enrichments_endo_ALL.csv',
        'Data/proteomic_sig_enrichments_colon_ALL.csv',
    )
)

In [39]:
#Format DataFrames

def _index_by_first_column(table):
    """Return `table` with its first column relabelled 'Index' and used as the row index.

    The input frame is left untouched; a relabelled copy is built and indexed.
    If the frame's index is already named 'Index' it is returned unchanged, so
    re-running this cell does not promote the first data column to the index.
    """
    if table.index.name == 'Index':
        return table
    labels = list(table.columns)
    labels[0] = 'Index'
    relabelled = table.copy()
    relabelled.columns = labels
    return relabelled.set_index('Index')

#The earlier reset_index(drop=True) calls were removed: their return value was
#discarded, so they never changed either frame.

#Endometrial
endo = _index_by_first_column(endo)

#Colon
colon = _index_by_first_column(colon)

## Step 1b: Find Significant enrichments

In [40]:
#Endometrial

#Get rid of rows in which every value is NaN
endo = endo.dropna(axis=0, how='all')

print("Testing for significant enrichments in Endometrial data:")
endo_sig_cols = []
#Iterate over a snapshot of the column labels, because columns are dropped from endo inside the loop
for col in list(endo.columns):
    #0.05 is the third argument to the helper; presumably the significance cutoff (confirm in binarization_functions)
    endo_sig_col = bf.significantEnrichments(endo, col, 0.05)
    if endo_sig_col is not None:
        #Keep this column's own result (the undefined name sig_col was appended here before)
        endo_sig_cols.append(endo_sig_col)
    else:
        #Get rid of columns that have no significant enrichments
        endo = endo.drop(col, axis=1)

Testing for significant enrichments in Endometrial data:
14 significant protein enrichments in Proteomics_Tumor_Normal_Other_tumor

4 significant protein enrichments in Myometrial_invasion_Specify_50%_or_more

586 significant protein enrichments in Histologic_type_Serous

118 significant protein enrichments in Path_Stage_Reg_Lymph_Nodes-pN_FIGO_III

11 significant protein enrichments in LVSI_1.0

5 significant protein enrichments in Age_Young

5 significant protein enrichments in Tumor_Site_Anterior

3 significant protein enrichments in Tumor_Focality_Multifocal

9 significant protein enrichments in MSI_status_MSI-H

9 significant protein enrichments in Genomics_subtype_MSI-H



In [41]:
#Colon

#Get rid of rows in which every value is NaN
colon = colon.dropna(axis=0, how='all')

print("Testing for significant enrichments in Colon data:")
colon_sig_cols = []
#Iterate over a snapshot of the column labels, because columns are dropped from colon inside the loop
for col in list(colon.columns):
    #0.05 is the third argument to the helper; presumably the significance cutoff (confirm in binarization_functions)
    colon_sig_col = bf.significantEnrichments(colon, col, 0.05)
    if colon_sig_col is not None:
        #Keep this column's own result (the undefined name sig_col was appended here before)
        colon_sig_cols.append(colon_sig_col)
    else:
        #Get rid of columns that have no significant enrichments
        colon = colon.drop(col, axis=1)

Testing for significant enrichments in Colon data:
12 significant protein enrichments in Proteomic_subtype_DorE

173 significant protein enrichments in mutation_rate_High_Mutation_Rate

162 significant protein enrichments in Mutation_Phenotype_MSI-H



## Step 2a: Find a specific category to compare

Here we will compare MSI-H enrichments between colon and endometrial cancer to see if there are any enrichments in common.

In [42]:
#Get data from csv files and compare

def _load_enrichment_table(csv_path, p_value_label):
    """Read a two-column enrichment CSV and index it by its first column.

    The two columns are relabelled 'Index' and `p_value_label`; assigning two
    labels fails if the file does not hold exactly two columns. Returns the
    DataFrame indexed by 'Index'.
    """
    table = pd.read_csv(csv_path)
    table.columns = ['Index', p_value_label]
    return table.set_index('Index')

#Create dataframe and list of significant enrichments from endometrial MSI-H samples
#(a reset_index(drop=True) call whose result was discarded used to sit here; it had no effect)
endo_MSI_H = _load_enrichment_table('Data/proteomic_endo_MSI_H_enrichments.csv',
                                    'Genomics_subtype_MSI-H_P_values')
MSI_sig_enrichments_endo = list(endo_MSI_H.index)
print("Significant enrichments for endometrial cancer patients:")
print(MSI_sig_enrichments_endo)
print('\n')

#Read the same data from colon, and create a list to compare them
colon_MSI_H = _load_enrichment_table('Data/proteomic_colon_MSI_H_enrichments.csv',
                                     'Mutation_Phenotype_MSI-H_P_values')
MSI_sig_enrichments_colon = list(colon_MSI_H.index)
print("Significant enrichments for colon cancer patients:")
print(MSI_sig_enrichments_colon)
print('\n')

#Compare enrichments between colon and endometrial for MSI_H:
#report, for every endometrial entry, whether it also appears in the colon list
for item in MSI_sig_enrichments_endo:
    verdict = "does" if item in MSI_sig_enrichments_colon else "does not"
    print(item+" enrichment "+verdict+" overlap between colon and endometrial cancers.\n")

#Conclusion: in the recorded run, no significant MSI_H enrichment overlaps between colon and endometrial

Significant enrichments for endometrial cancer patients:
['CCL20', 'PEG10', 'RPL22L1']


Significant enrichments for colon cancer patients:
['ADGRG6', 'AIF1L', 'APOL1', 'APOL2', 'CPOX', 'EIF2D', 'EXO1', 'HPSE', 'MAPK12', 'NDUFA4L2', 'PIK3AP1', 'PMF1', 'QSOX1', 'RNF19B', 'S100A16', 'SSFA2', 'USF1', 'WARS', 'WDR25']


CCL20 enrichment does not overlap between colon and endometrial cancers.

PEG10 enrichment does not overlap between colon and endometrial cancers.

RPL22L1 enrichment does not overlap between colon and endometrial cancers.



## Step 2b: Do a general comparison for all enrichment columns

Because MSI-H is the only clinical attribute that the proteomic outlier data for endometrial and colon cancers have in common, this step is not needed here. With other datasets there may be more columns in common, in which case you may opt to automate a comparison that applies the concepts from the cell above.