# Functions and Imports for Ovarian Cancer Ratio Analysis

### Imports 
Besides the Python libraries, this cell loads the transcriptional regulatory interactions from TRRUST v2 (https://www.grnpedia.org/trrust/downloadnetwork.php) into `tf_data`.

In [35]:
import pandas as pd
import numpy as np
import scipy.stats
import re
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import cptac
# Load the TRRUST table as a headerless tab-separated file, then name its
# four columns.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# only run where this file exists at exactly this location.
tf_data = pd.read_csv('/Users/amandaoliphant/Documents/GitHub/ProteinComplexes/Data_Files/trrust_rawdata.human.tsv', sep='\t', header=None)
tf_data.columns = ['TF', 'Regulated_Gene', 'Regulation_Type', 'ID']

## Initialize datasets

In [22]:
# Create a workable dictionary of protein complex information (all cancer types)
# Only rows whose 'Organism' column equals 'Human' are kept
# NOTE(review): the path is absolute and machine-specific
complexData = pd.read_csv('/Users/amandaoliphant/Documents/GitHub/ProteinComplexes/Data_Files/allComplexes.txt', sep='\t')
complexData = complexData.loc[complexData['Organism'] == 'Human']

# Split the ';'-separated subunit gene names of each complex into individual
# columns of a new df, indexed by complex name
subunitNames = complexData['subunits(Gene name)'].str.split(';', expand = True)
subunitNames.index = complexData.ComplexName

# Keep only the first row for any complex name that appears more than once
subunitNames = subunitNames.loc[~subunitNames.index.duplicated()]

# Create a dictionary (key = complex name, value = list of proteins in complex).
# Each row is as wide as the largest complex, so the None placeholders are
# removed. A new dict is built rather than rewriting entries while iterating.
subunitNames = {
    complex_name: [gene for gene in genes if gene is not None]
    for complex_name, genes in subunitNames.transpose().to_dict('list').items()
}

# Ovarian Data

# Download (if needed) and load the dataset, then pull its clinical and
# mutation tables
cptac.download(dataset='ovarian', version='latest')
ov = cptac.Ovarian()
ov_clinical = ov.get_clinical()
ov_mutations = ov.get_mutations()
# Attach Patient_ID and the tumor/normal label to the mutation table (join on
# the shared index), drop rows with any missing value, keep tumor samples only
ov_mutation_data = ov_clinical[['Patient_ID', 'Sample_Tumor_Normal']].join(ov_mutations).dropna(axis = 0)
ov_mutation_data = ov_mutation_data.loc[ov_mutation_data['Sample_Tumor_Normal'] == 'Tumor']

#Only for ovarian: Import non-normalized proteomics data for ratio analysis
# NOTE(review): the path is absolute and machine-specific
data = pd.read_csv('/Users/amandaoliphant/Documents/GitHub/ProteinComplexes/Data_Files/proteinGroups_cleaned.txt', sep='\t')

# Index by gene name and keep only the first row for each duplicated gene
data = data.set_index('Gene_Name')
data = data.loc[~data.index.duplicated()]

# Endometrial Data
# Same steps as ovarian; here the tumor/normal label column is named
# 'Proteomics_Tumor_Normal'
cptac.download(dataset='endometrial', version='latest')
en = cptac.Endometrial()
en_clinical = en.get_clinical()
en_mutations = en.get_mutations()
en_mutation_data = en_clinical[['Patient_ID', 'Proteomics_Tumor_Normal']].join(en_mutations).dropna(axis = 0)
en_mutation_data = en_mutation_data.loc[en_mutation_data['Proteomics_Tumor_Normal'] == 'Tumor']

# Colon Data
cptac.download(dataset='colon', version='latest')
colon = cptac.Colon()
colon_clinical = colon.get_clinical()
colon_mutations = colon.get_mutations()
colon_mutation_data = colon_clinical[['Patient_ID', 'Sample_Tumor_Normal']].join(colon_mutations).dropna(axis = 0)
colon_mutation_data = colon_mutation_data.loc[colon_mutation_data['Sample_Tumor_Normal'] == 'Tumor']

# Renal Data
cptac.download(dataset='renalccrcc', version='latest')
# Instantiate the dataset like the other cancer types; without the call
# parentheses `renal` would be bound to the class itself, not a loaded dataset
renal = cptac.RenalCcrcc()

All files already downloaded and correct.
All files already downloaded and correct.
All files already downloaded and correct.
All files already downloaded and correct.


## Define basic analysis and plotting functions

### get_ratio_df

In [8]:
"""
get_ratio_df
-----------
All samples, one pair of proteins, tumor vs normal
Returns two dataframes (tumor and normal) containing the prot1 / prot2 ratio for each sample

Parameters:
    prot1, prot2 = (gene) names of the two proteins to be compared (ratio of prot1 / prot2)
    
Returns:
    Two dataframes: tumor and normal containing ratios for the proteins, or (None, None) if either
    protein is missing from the dataset or either sample type has no finite ratio
  
"""
def get_ratio_df(prot1, prot2):
    """Build per-sample prot1 / prot2 ratio tables for tumor and normal samples.

    Reads the module-level `data` table (rows indexed by gene name, one column
    per sample). A column whose name contains '_NM' is treated as a normal
    sample; every other column is treated as a tumor sample.

    Parameters:
        prot1, prot2 = (gene) names of the numerator and denominator proteins

    Returns:
        (tumor_ratio_df, normal_ratio_df), each with columns 'Ratio',
        'Sample_Type' and 'Patient_ID' and with NaN ratios dropped, or
        (None, None) when either protein is missing from `data` or when
        either sample type has no finite ratio.
    """
    # Index.contains() was removed in pandas 1.0; `in` is the supported
    # membership test and works on old and new versions alike
    if prot1 not in data.index or prot2 not in data.index:
        return None, None

    tumor_rows_list = []
    normal_rows_list = []
    for patient in data.columns:

        # Determine the sample type from the column name
        sample_type = 'Normal' if '_NM' in patient else 'Tumor'

        # A zero denominator has no defined ratio, so record NaN instead
        denominator = data.at[prot2, patient]
        if denominator == 0:
            ratio = np.nan
        else:
            ratio = data.at[prot1, patient] / denominator

        row_dict = {'Ratio': ratio, 'Sample_Type': sample_type, 'Patient_ID': patient}
        if sample_type == 'Tumor':
            tumor_rows_list.append(row_dict)
        else:
            normal_rows_list.append(row_dict)

    # With no samples of one type there is nothing to compare; an empty frame
    # would also lack the 'Ratio' column used below
    if not tumor_rows_list or not normal_rows_list:
        return None, None

    # Convert the row lists into dataframes
    tumor_ratio_df = pd.DataFrame(tumor_rows_list)
    normal_ratio_df = pd.DataFrame(normal_rows_list)

    # If there were no valid ratios for either the tumor or normal samples, return None
    if (not np.isfinite(tumor_ratio_df['Ratio']).any()) or (not np.isfinite(normal_ratio_df['Ratio']).any()):
        return None, None

    # Drop na values
    tumor_ratio_df = tumor_ratio_df.dropna(axis = 0)
    normal_ratio_df = normal_ratio_df.dropna(axis = 0)

    return tumor_ratio_df, normal_ratio_df

### test_complex

In [9]:
"""
test_complex
------------
Perform statistical tests on every combination of proteins in a given complex, printing those with significant p-values

Parameters:
    complex_name = name of the complex
    test_type = type of test to perform, levene or ttest
    
Returns:
    Classification of the complex: whether it has proteins missing in the normal, missing proteins in cancer,
    or other (proteins present in both)
    Prints tests with statistically significant values

"""
def test_complex(complex_name, test_type = 'ttest'):
    """Test the tumor-vs-normal ratio of every ordered pair of subunits in a complex.

    Prints each pair whose p-value is below the cutoff, then the complex name.
    Returns "NS" when no pair is significant; otherwise "Other",
    "Normal Missing Proteins" or "Tumor Missing Proteins" depending on how
    many significant pairs had only zero ratios in normal or tumor samples.
    Any test_type other than 'ttest' runs the Levene test.
    """
    subunits = subunitNames[complex_name]

    # Counters for significant pairs whose ratios are all 0 in one sample type
    normal_all_zero = 0
    tumor_all_zero = 0
    any_significant = False

    # Cutoff for significance (Bonferroni correction)
    # NOTE(review): the denominator is built from len(subunitNames), i.e. the
    # number of complexes, not the number of pairs in this complex -- confirm
    n_complexes = len(subunitNames)
    alpha = 0.05 / (n_complexes*(n_complexes - 1))

    for i, numerator in enumerate(subunits):
        for j, denominator in enumerate(subunits):
            if i == j:
                continue

            tumor_ratio_df, normal_ratio_df = get_ratio_df(numerator, denominator)
            if tumor_ratio_df is None or normal_ratio_df is None:
                continue

            tumor_ratios = tumor_ratio_df['Ratio']
            normal_ratios = normal_ratio_df['Ratio']

            # The tests don't work when every ratio on both sides is 0 ...
            if sum(tumor_ratios) == 0 and sum(normal_ratios) == 0:
                continue
            # ... or when either side has fewer than two samples
            if len(tumor_ratio_df) < 2 or len(normal_ratio_df) < 2:
                continue

            if test_type == 'ttest':
                p_value = scipy.stats.ttest_ind(tumor_ratios, normal_ratios)[1]
            else:
                # Levene warns when both variances are 0, so skip that case
                if np.var(tumor_ratios) == 0 and np.var(normal_ratios) == 0:
                    continue
                p_value = scipy.stats.levene(tumor_ratios, normal_ratios)[1]

            # Written as a negation so a NaN p-value is skipped as well
            if not (p_value < alpha):
                continue

            any_significant = True
            pair_label = numerator + ' / ' + denominator
            if (normal_ratios == 0).all():
                normal_all_zero += 1
                print('Normal missing ' + pair_label)
            elif (tumor_ratios == 0).all():
                tumor_all_zero += 1
                print('Tumor missing ' + pair_label)

            print(pair_label + ': ' + str(p_value))

    if not any_significant:
        return "NS"

    print(complex_name)
    print('---------------------------')
    print('---------------------------')

    # Classify the complex by whichever all-zero category is more common;
    # ties go to "Normal Missing Proteins"
    if normal_all_zero == 0 and tumor_all_zero == 0:
        return "Other"
    if normal_all_zero >= tumor_all_zero:
        return "Normal Missing Proteins"
    return "Tumor Missing Proteins"

### test_all_complexes

In [10]:
"""
test_all_complexes
------------------
Test all complexes in the dataset for significance

Parameters:
    test_type (optional) = ttest or levene
    
Returns:
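    A dictionary: {"Normal Missing Proteins": [List of complexes], 
    "Tumor Missing Proteins": [List of complexes], 
    "Other": [List of complexes],
    "NS": [List of complexes with no significant result]}
    Prints an error and returns None if test_type is not 'ttest' or 'levene'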
    
"""

def test_all_complexes(test_type = 'ttest'):
    """Classify every complex in subunitNames with test_complex.

    Returns a dict mapping each classification label ("Normal Missing
    Proteins", "Tumor Missing Proteins", "Other", "NS") to the list of
    complex names that received it. Prints an error and returns None when
    test_type is neither 'ttest' nor 'levene'.
    """
    if test_type not in ('ttest', 'levene'):
        print("Error: test_type must equal 'ttest' or 'levene'")
        return

    labels = ("Normal Missing Proteins", "Tumor Missing Proteins", "Other", "NS")
    classified_complexes = {label: [] for label in labels}

    for complex_name in subunitNames:
        label = test_complex(complex_name, test_type = test_type)
        classified_complexes[label].append(complex_name)

    return classified_complexes

### find_mutations

In [11]:
"""
find_mutations
--------------
Search mutational data for mutations in proteins that change in a complex or transcription factors
known to regulate said proteins

Parameters:
    complex_dict = dictionary of protein complexes (output of test_all_complexes)
    transcription_factors = if True, will test for mutations in transcription factors known to regulate
        proteins in the complex
        
Returns:
    Prints results (how many patients have mutations in the protein or transcription factor)

"""

def find_mutations(complex_dict, transcription_factors = False, cancer_type = 'ov'):
    """Print mutation counts for proteins whose ratios change within each complex.

    For every complex listed under a key other than 'NS', each ordered pair
    of distinct subunits is tested (t-test and Levene test on the ratios from
    get_ratio_df). The numerator of any significant pair counts as "changed".
    For each changed protein, or for each transcription factor listed in
    tf_data as regulating it, the number of distinct patients with a mutation
    in that gene is looked up and printed per complex.

    Parameters:
        complex_dict = dictionary of protein complexes (output of test_all_complexes)
        transcription_factors = if True, look up mutations in the regulating
            transcription factors instead of in the changed proteins
        cancer_type (optional) = 'ov' (default), 'colon', or 'en'; selects the
            mutation table to search. The ratios always come from get_ratio_df.

    Returns:
        None; results are printed.

    Raises:
        ValueError if cancer_type is not one of the supported values.
    """
    # The original body read an undefined name `mutation_data`; choose the
    # table explicitly, the same way find_unique_mutations does
    if cancer_type == 'ov':
        mutation_data = ov_mutation_data
    elif cancer_type == 'colon':
        mutation_data = colon_mutation_data
    elif cancer_type == 'en':
        mutation_data = en_mutation_data
    else:
        raise ValueError('cancer_type must be "ov", "colon", or "en"')

    # Cutoff for significance (Bonferroni correction); it does not depend on
    # the complex, so it is computed once
    alpha = 0.05 / (len(subunitNames)*(len(subunitNames) - 1))

    for key, val in complex_dict.items():
        if key == 'NS': continue
        print(key)
        print('--------------')
        print('--------------')
        for complex_name in val:
            changed_proteins = []
            mutation_rate = {}

            # Perform tests to find statistically significant differences in ratios
            protein_list = subunitNames[complex_name]
            for prot1 in protein_list:
                for prot2 in protein_list:
                    if prot1 == prot2: continue

                    # Test the two proteins (ratio of prot1 / prot2)
                    tumor_ratio_df, normal_ratio_df = get_ratio_df(prot1, prot2)
                    if tumor_ratio_df is None or normal_ratio_df is None: continue

                    # If both tumor and normal only have 0 values for the ratio, the tests don't work
                    if (sum(tumor_ratio_df['Ratio']) == 0) and (sum(normal_ratio_df['Ratio']) == 0): continue
                    # If there are not enough samples, the tests also don't work
                    if (len(tumor_ratio_df) < 2) or (len(normal_ratio_df) < 2): continue

                    ttest_result = scipy.stats.ttest_ind(tumor_ratio_df['Ratio'], normal_ratio_df['Ratio'])[1]
                    # The levene test throws a warning if a variance is 0, so it is
                    # only run when both variances are non-zero
                    if (np.var(tumor_ratio_df['Ratio']) != 0) and (np.var(normal_ratio_df['Ratio']) != 0):
                        levene_result = scipy.stats.levene(tumor_ratio_df['Ratio'], normal_ratio_df['Ratio'])[1]
                    else:
                        levene_result = 1

                    # One significant ratio is enough to flag the numerator,
                    # so the remaining denominators need not be tested
                    if ttest_result < alpha or levene_result < alpha:
                        changed_proteins.append(prot1)
                        break

            # De-duplicate while keeping first-seen order so output is reproducible
            for protein in dict.fromkeys(changed_proteins):
                if transcription_factors:
                    tfs = tf_data.loc[tf_data['Regulated_Gene'] == protein]['TF']
                    for tf in tfs:
                        num_mutations = _count_mutated_patients(mutation_data, tf)
                        if num_mutations > 0:
                            mutation_rate[tf + ' (regulates ' + protein + ')'] = num_mutations
                else:
                    num_mutations = _count_mutated_patients(mutation_data, protein)
                    if num_mutations > 0:
                        mutation_rate[protein] = num_mutations
            if len(mutation_rate) > 0:
                print(complex_name)
                print(mutation_rate)
                print('--------------')


def _count_mutated_patients(mutation_data, gene):
    # Number of distinct Patient_ID values among the rows whose 'Gene' is `gene`
    return len(set(mutation_data.loc[mutation_data['Gene'] == gene]['Patient_ID']))

### find_unique_mutations

In [12]:
"""
find_unique_mutations
---------------------
Find the number of patients with a mutation in any one of the proteins in a list

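Parameters:
    prot_list = list of proteins to look for mutations in
    cancer_type (optional) = 'ov' (default), 'colon', or 'en'; which cancer's mutation data to search
    
Returns:
    prints the number of patients with a mutation in a protein in prot_list and the number of
    mutation records found for each mutated protein
    returns a list of those patients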
"""

def find_unique_mutations(prot_list, cancer_type = 'ov'):
    """Find the patients with a mutation in any of the given proteins.

    Parameters:
        prot_list = list of proteins (gene names) to look for mutations in
        cancer_type (optional) = 'ov' (default), 'colon', or 'en'; selects the
            mutation table to search

    Returns:
        A list of the distinct Patient_ID values with a mutation in at least
        one protein of prot_list. Also prints how many such patients there are
        and a dict of {protein: number of mutation records}.

    Raises:
        ValueError if cancer_type is not one of the supported values.
    """
    # Set the cancer type. An unknown value used to print a message and then
    # fail later with UnboundLocalError; fail immediately and clearly instead.
    if cancer_type == 'ov':
        mutation_data = ov_mutation_data
    elif cancer_type == 'colon':
        mutation_data = colon_mutation_data
    elif cancer_type == 'en':
        mutation_data = en_mutation_data
    else:
        raise ValueError('cancer_type must be "ov", "colon", or "en"')

    patients = set()
    mutated_proteins = {}
    for protein in prot_list:
        mutations = list(mutation_data.loc[mutation_data['Gene'] == protein]['Patient_ID'])
        if mutations:
            # Counts mutation rows, so a patient with several mutations in the
            # same gene is counted more than once here
            mutated_proteins[protein] = len(mutations)
            patients.update(mutations)
    patients = list(patients)
    print('Patients with a mutation in any of the ' + str(len(prot_list)) + ' given proteins: ' + str(len(patients)) + '\n')
    print(mutated_proteins)
    return patients

### plot_ratios

In [13]:
"""
plot_ratios
-----------
Create a seaborn plot for the ratios in a dataframe

Parameters:
    prot1, prot2 = the two proteins to plot (ratio of prot1 / prot2)
    by_patient (optional) = T/F whether or not to visualize matched samples
    mutation_list (optional) = plot will distinguish patients with a mutation in any of the proteins in this list
    
Returns:
    Displays a plot of the data
    
"""
def plot_ratios(prot1, prot2, by_patient = False, mutation_list = None):
    """Plot the tumor and normal ratios of prot1 / prot2 as a box plot with points.

    Parameters:
        prot1, prot2 = the two proteins to plot (ratio of prot1 / prot2)
        by_patient (optional) = T/F whether or not to visualize matched samples
        mutation_list (optional) = plot will distinguish patients with a
            mutation in any of the proteins in this list

    Returns:
        None. Prints the t-test and Levene p-values and draws the plot. If
        get_ratio_df has no ratios for this pair, prints a message and
        returns without plotting.
    """
    # Set up dataframe for plotting
    tumor_ratio_df, normal_ratio_df = get_ratio_df(prot1, prot2)
    if tumor_ratio_df is None or normal_ratio_df is None:
        print('No ratio data available for ' + prot1 + ' / ' + prot2)
        return

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # frames the same way
    plot_data = pd.concat([tumor_ratio_df, normal_ratio_df])

    # Defined up front so the checks below work when no mutation_list is given
    # (it was previously unbound on the default call path)
    patient_list = []

    # Reformat for visualizing matched samples if necessary
    if by_patient or mutation_list:
        # Create a new column for matched status
        tumor_ratio_df['Matched_Status'] = 'Unmatched'
        normal_ratio_df['Matched_Status'] = 'Unmatched'
        # A normal sample is matched when its ID without '_NM' is a tumor ID,
        # and a tumor sample when its ID plus '_NM' is a normal ID
        normal_ratio_df.loc[((normal_ratio_df['Patient_ID']).str.replace('_NM', '')).isin(tumor_ratio_df['Patient_ID']), 'Matched_Status'] = 'Matched'
        tumor_ratio_df.loc[((tumor_ratio_df['Patient_ID']) + '_NM').isin(normal_ratio_df['Patient_ID']), 'Matched_Status'] = 'Matched'
        # Label samples accordingly in plot_data
        plot_data = pd.concat([tumor_ratio_df, normal_ratio_df])
        if mutation_list:
            # Find patients with mutations in the given proteins
            patient_list = find_unique_mutations(mutation_list)
            plot_data.loc[plot_data['Patient_ID'].isin(patient_list), 'Matched_Status'] = 'Has_Mutation'
        plot_data.loc[plot_data['Matched_Status'] == 'Unmatched', 'Patient_ID'] = 'Unmatched_Sample'
        plot_data['Patient_ID'] = plot_data['Patient_ID'].str.replace('_NM','')

    # Reformat for visualizing a certain list of patients if necessary
    if mutation_list:
        has_mutation = plot_data['Patient_ID'].isin(patient_list)
        if not by_patient:
            plot_data.loc[~has_mutation, 'Patient_ID'] = 'Other Mutation'
        plot_data.loc[has_mutation, 'Patient_ID'] = 'Given Mutation'

    # Print results of statistical tests
    print('T-test p-value: ' + str(scipy.stats.ttest_ind(tumor_ratio_df['Ratio'], normal_ratio_df['Ratio'])[1]))
    print('Levene p-value: ' + str(scipy.stats.levene(tumor_ratio_df['Ratio'], normal_ratio_df['Ratio'])[1]))

    a4_dims = (10, 10)
    fig, ax = plt.subplots(figsize=a4_dims)

    # Create the plot; points are coloured by Patient_ID only when there is a
    # grouping to show
    if by_patient or patient_list:
        boxplt = sns.boxplot(data=plot_data, x='Sample_Type', y='Ratio', color='w', showfliers=False)
        boxplt = sns.stripplot(data=plot_data, x='Sample_Type', y='Ratio', hue='Patient_ID', size=10, dodge=True, jitter=True)
        boxplt.get_legend().set_bbox_to_anchor((1, 1, 0, 0))
    else:
        boxplt = sns.boxplot(data=plot_data, x='Sample_Type', y='Ratio', showfliers=False)
        boxplt = sns.stripplot(data=plot_data, x='Sample_Type', y='Ratio', dodge=True, jitter=True, color='.3')

    # Add styling
    boxplt.set_title('Ratio of ' + prot1 + ' / ' + prot2, fontsize='25')
    boxplt.set_xlabel('')
    boxplt.set_ylabel('Protein Expression Ratio', fontsize='20')
    boxplt.tick_params(labelsize='15')

### ttest_proteomics

In [14]:
"""
ttest_proteomics
----------------
Performs and prints a basic t-test on the proteomics of a given protein (tumor vs normal)

Parameters:
    protein = protein to test
    use_cptac (optional) = T/F whether to use the CPTAC proteomics data (default) or the non-normalized data
    cancer_type (optional) = 'ov' (default), 'colon', or 'en'; only used when use_cptac is True
    
Returns:
    p-value of the t-test
"""

def ttest_proteomics(protein, use_cptac = True, cancer_type = 'ov'):
    """Run a tumor-vs-normal t-test on the proteomics of one protein.

    Parameters:
        protein = protein to test
        use_cptac (optional) = True (default) to use the CPTAC proteomics
            data, False to use the non-normalized `data` table
        cancer_type (optional) = 'ov' (default), 'colon', or 'en'; only used
            when use_cptac is True

    Returns:
        p-value of the t-test

    Raises:
        ValueError if use_cptac is True and cancer_type is not supported.
    """
    # This uses the normalized proteomics data from the CPTAC package
    if use_cptac:
        # Pick the dataset and the name of its tumor/normal clinical column.
        # An unknown cancer_type used to print a message and then fail with
        # UnboundLocalError; fail immediately instead.
        if cancer_type == 'ov':
            dataset, status_col = ov, 'Sample_Tumor_Normal'
        elif cancer_type == 'colon':
            dataset, status_col = colon, 'Sample_Tumor_Normal'
        elif cancer_type == 'en':
            dataset, status_col = en, 'Proteomics_Tumor_Normal'
        else:
            raise ValueError('cancer_type must be "ov", "colon", or "en"')

        test_data = dataset.join_metadata_to_omics('clinical', 'proteomics',
                                                   metadata_cols = ['Patient_ID', status_col],
                                                   omics_genes = protein)

        # Keep the first of any duplicated columns and use common column
        # names. rename() returns a new frame, so the assignment below does
        # not write into a slice.
        test_data = test_data.loc[:, ~test_data.columns.duplicated()]
        test_data = test_data.rename(columns={protein+'_proteomics': protein, status_col: 'Sample_Type'})
        if cancer_type == 'en':
            # For endometrial data every label other than 'Tumor' is treated as normal
            test_data.loc[test_data['Sample_Type'] != 'Tumor', 'Sample_Type'] = 'Normal'

        tumor_df = test_data.loc[test_data['Sample_Type'] == 'Tumor'].dropna(axis = 0)
        normal_df = test_data.loc[test_data['Sample_Type'] == 'Normal'].dropna(axis = 0)

    # Otherwise use the non-normalized data
    else:
        test_data = pd.DataFrame(data.loc[data.index == protein].transpose())
        # Samples whose name contains '_NM' are normal; the rest are tumor
        test_data['Sample_Type'] = 'Tumor'
        test_data.loc[test_data.index.str.contains('_NM'), 'Sample_Type'] = 'Normal'

        tumor_df = test_data.loc[test_data['Sample_Type'] == 'Tumor']
        normal_df = test_data.loc[test_data['Sample_Type'] == 'Normal']

    # Perform the t-test
    return scipy.stats.ttest_ind(tumor_df[protein], normal_df[protein])[1]

### find_differential_expression

In [15]:
"""
find_differential_expression
----------------------------
Searches through all the proteomic data and finds proteins with differential expression in tumor cells using t-tests.
Organizes results into categories: tumor lower and normal lower

Parameters:
    cancer_type = 'ov', 'en', or 'colon'; the cancer to search for
    
Returns:
    A dictionary of the form {tumor_lower: [list_of_proteins], normal_lower: [list_of_proteins]}
    'tumor_lower' means that there are lower expression levels of that protein in tumor samples
    
"""

def find_differential_expression(cancer_type = 'ov'):
    """Find proteins whose CPTAC proteomics differ between tumor and normal samples.

    Every column of the dataset's proteomics table is t-tested (tumor vs
    normal) against a cutoff of 0.05 divided by the number of columns. Each
    significant protein is printed and filed by which group has the lower mean.

    Parameters:
        cancer_type = 'ov', 'en', or 'colon'; the cancer to search

    Returns:
        A dictionary of the form {'tumor_lower': [list_of_proteins],
        'normal_lower': [list_of_proteins]}. For an unsupported cancer_type
        an error is printed and both lists are returned empty.
    """
    results = {'tumor_lower': [], 'normal_lower': []}

    # Pick the dataset and the name of its tumor/normal clinical column; the
    # three cancer types are otherwise processed identically
    if cancer_type == 'ov':
        dataset, status_col = ov, 'Sample_Tumor_Normal'
    elif cancer_type == 'colon':
        dataset, status_col = colon, 'Sample_Tumor_Normal'
    elif cancer_type == 'en':
        dataset, status_col = en, 'Proteomics_Tumor_Normal'
    else:
        print('Error: cancer_type must be "ov", "colon", or "en"')
        return results

    # Fetch the protein list once; it gives both the loop and the cutoff
    proteins = dataset.get_proteomics().columns
    alpha = 0.05 / len(proteins)

    for protein in proteins:
        test_data = dataset.join_metadata_to_omics('clinical', 'proteomics',
                                                   metadata_cols = ['Patient_ID', status_col],
                                                   omics_genes = protein)
        test_data = test_data.loc[:, ~test_data.columns.duplicated()]
        test_data = test_data.rename(columns={protein+'_proteomics': protein, status_col: 'Sample_Type'})
        if cancer_type == 'en':
            # For endometrial data every label other than 'Tumor' is treated as normal
            test_data.loc[test_data['Sample_Type'] != 'Tumor', 'Sample_Type'] = 'Normal'

        tumor_df = test_data.loc[test_data['Sample_Type'] == 'Tumor'].dropna(axis = 0)
        normal_df = test_data.loc[test_data['Sample_Type'] == 'Normal'].dropna(axis = 0)

        pval = scipy.stats.ttest_ind(tumor_df[protein], normal_df[protein])[1]
        if pval < alpha:
            if tumor_df[protein].mean() < normal_df[protein].mean():
                results['tumor_lower'].append(protein)
                print(protein + ': ' + str(pval) + ' (tumor lower)')
            else:
                results['normal_lower'].append(protein)
                print(protein + ': ' + str(pval) + ' (normal lower)')

    return results
    

# Plotting functions

### plot_phosphoproteomics

In [16]:
# Adapted from
# https://stackoverflow.com/questions/24685012/pandas-dataframe-renaming-multiple-identically-named-columns
# Give every column of a dataframe a unique name. The first occurrence of a
# name is kept; later occurrences get the first free suffix "_2", "_3", ...
# The dataframe is modified in place and also returned.
def df_column_uniquify(df):
    unique_names = []
    for base_name in df.columns:
        candidate = base_name
        suffix = 2
        # Try increasing suffixes until the name is not already taken
        while candidate in unique_names:
            candidate = "{}_{}".format(base_name, suffix)
            suffix += 1
        unique_names.append(candidate)
    df.columns = unique_names
    return df

def plot_phosphoproteomics(protein, by_patient = False, print_pvals = True, remove_duplicates = False, cancer_type = 'ov'):
    """Plot tumor vs normal phosphoproteomics readings for each site of a protein.

    Parameters:
        protein = the protein whose phosphorylation sites are plotted
        by_patient (optional) = accepted but not used by this function
        print_pvals (optional) = T/F whether to print a tumor-vs-normal t-test
            p-value for each site column
        remove_duplicates (optional) = if True, keep only the first of any
            identically named columns; otherwise rename duplicates so every
            column is unique
        cancer_type (optional) = 'ov' (default), 'colon', or 'en'

    Returns:
        None; displays a plot of the data.

    Raises:
        ValueError if cancer_type is not one of the supported values.
    """
    # Pick the dataset and the name of its tumor/normal clinical column.
    # Endometrial data uses 'Proteomics_Tumor_Normal', as in the other
    # endometrial calls in this file.
    if cancer_type == 'ov':
        dataset, status_col = ov, 'Sample_Tumor_Normal'
    elif cancer_type == 'colon':
        dataset, status_col = colon, 'Sample_Tumor_Normal'
    elif cancer_type == 'en':
        dataset, status_col = en, 'Proteomics_Tumor_Normal'
    else:
        raise ValueError('cancer_type must be "ov", "colon", or "en"')

    phos_data = dataset.join_metadata_to_omics('clinical', 'phosphoproteomics',
                        metadata_cols = ['Patient_ID', status_col],
                        omics_genes = protein)

    if cancer_type == 'en':
        # Bring the endometrial label column in line with the other datasets;
        # every label other than 'Tumor' is treated as normal
        phos_data = phos_data.rename(columns={status_col: 'Sample_Tumor_Normal'})
        phos_data.loc[phos_data['Sample_Tumor_Normal'] != 'Tumor', 'Sample_Tumor_Normal'] = 'Normal'

    if remove_duplicates: phos_data = phos_data.loc[:, ~phos_data.columns.duplicated()]
    else: phos_data = df_column_uniquify(phos_data)

    # Long format: one row per (sample, site) reading, missing readings dropped
    plot_data = pd.melt(phos_data, id_vars = ['Patient_ID', 'Sample_Tumor_Normal'], var_name = 'Location', value_name = 'Reading').dropna(axis = 0)
    plot_data['Location'] = plot_data['Location'].str.replace('_phosphoproteomics', '')

    # Perform t-tests on tumor vs normal phosphoproteomics for each site
    if print_pvals:
        for column in phos_data.columns:
            if column != 'Patient_ID' and column != 'Sample_Tumor_Normal':
                sitedf = phos_data[['Patient_ID', 'Sample_Tumor_Normal', column]]
                tumordf = sitedf.loc[sitedf['Sample_Tumor_Normal'] == 'Tumor'].dropna(axis = 0)
                normaldf = sitedf.loc[sitedf['Sample_Tumor_Normal'] == 'Normal'].dropna(axis = 0)
                # Only test sites with more than two readings on each side
                if len(tumordf) > 2 and len(normaldf) > 2:
                    pval = scipy.stats.ttest_ind(tumordf[column], normaldf[column])[1]
                    print(column + ' t-test: ' + str(pval))

    # Plot the data
    a4_dims = (20, 20)
    fig, ax = plt.subplots(figsize=a4_dims)

    boxplt = sns.boxplot(data=plot_data, x='Location', y='Reading', hue='Sample_Tumor_Normal', showfliers=False, palette = 'Blues')
    boxplt = sns.stripplot(data=plot_data, x='Location', y='Reading', hue='Sample_Tumor_Normal', dodge=True, jitter=True, color='.3')

    # Add styling
    boxplt.set_title(protein + ' Phosphorylation', fontsize='25')
    boxplt.set_xlabel('')
    boxplt.set_ylabel('Phosphorylation Level', fontsize='20')
    boxplt.tick_params(labelsize='10')


### plot_proteomics

In [17]:
"""
plot_proteomics
---------------
Create a seaborn plot of the proteomics data for a single protein

Parameters:
    protein = the protein to plot
    use_cptac (optional) = T/F whether to use the CPTAC proteomics data (default) or the non-normalized data
    by_patient (optional) = T/F whether or not to visualize matched samples
    print_pvals (optional) = T/F whether or not to print the p-values of the statistical tests
    
Returns:
    Displays a plot of the data

"""

def plot_proteomics(protein, use_cptac = True, by_patient = False, print_pvals = True, cancer_type = 'ov'):
    """
    Draw a box plot with overlaid points comparing tumor and normal proteomics for one protein.

    Parameters:
        protein = the protein to plot
        use_cptac = if True (default) use the proteomics data from the CPTAC dataset object;
            otherwise use the module-level non-normalized `data` dataframe
        by_patient = if True, color the points by patient so matched tumor/normal samples can be compared
        print_pvals = if True, print the t-test and Levene test p-values (tumor vs normal)
        cancer_type = 'ov', 'colon', or 'en'; picks the CPTAC dataset (only used when use_cptac is True)

    Returns:
        None; displays the plot. If cancer_type is not recognized, an error message is
        printed and the function returns without plotting.
    """

    # This uses the normalized proteomics data from the CPTAC package
    if use_cptac:
        # Pick the dataset and the name of its tumor/normal column.
        # The dataset objects are only looked up in the branch that needs them, so a
        # notebook that loaded just one cancer type still works.
        if cancer_type == 'ov':
            dataset, status_col = ov, 'Sample_Tumor_Normal'
        elif cancer_type == 'colon':
            dataset, status_col = colon, 'Sample_Tumor_Normal'
        elif cancer_type == 'en':
            dataset, status_col = en, 'Proteomics_Tumor_Normal'
        else:
            # Stop here: there is no data to plot for an unknown cancer type
            print('Error: cancer_type must be "ov", "colon", or "en"')
            return

        plot_data = dataset.join_metadata_to_omics('clinical', 'proteomics',
                                                   metadata_cols = ['Patient_ID', status_col],
                                                   omics_genes = protein)
        plot_data = plot_data.loc[:, ~plot_data.columns.duplicated()]
        plot_data = plot_data.rename(columns={protein+'_proteomics': protein, status_col: 'Sample_Type'})
        if cancer_type == 'en':
            # Collapse every non-tumor label of the endometrial data into 'Normal'
            plot_data.loc[plot_data['Sample_Type'] != 'Tumor', 'Sample_Type'] = 'Normal'
        plot_data = plot_data.dropna(axis = 0)

    # Otherwise use the non-normalized data to make the plot
    else:
        plot_data = pd.DataFrame(data.loc[data.index == protein].transpose())
        # Samples whose label contains '_NM' are the normal samples
        plot_data['Sample_Type'] = 'Tumor'
        plot_data.loc[plot_data.index.str.contains('_NM'), 'Sample_Type'] = 'Normal'
        plot_data['Patient_ID'] = plot_data.index

    plot_data['Matched_Status'] = 'Unmatched'
    # Explicit copies: these frames are edited below, and copying avoids pandas'
    # chained-assignment warning without switching it off globally
    tumor_df = plot_data.loc[plot_data['Sample_Type'] == 'Tumor'].copy()
    normal_df = plot_data.loc[plot_data['Sample_Type'] == 'Normal'].copy()

    # Format to show matched patients if necessary
    if by_patient:
        # Classify samples as matched/unmatched.
        # regex=True is spelled out because the pattern is an alternation and newer
        # pandas versions treat the pattern as a literal string by default.
        normal_ids = normal_df['Patient_ID'].str.replace('_NM|N', '', regex=True)
        normal_df.loc[normal_ids.isin(tumor_df['Patient_ID']), 'Matched_Status'] = 'Matched'

        # A tumor sample is matched if any of the normal-sample spellings of its ID exists
        tumor_ids = tumor_df['Patient_ID']
        for normal_spelling in (tumor_ids + '_NM', tumor_ids + 'N', 'N' + tumor_ids):
            tumor_df.loc[normal_spelling.isin(normal_df['Patient_ID']), 'Matched_Status'] = 'Matched'

        # Label samples accordingly in plot_data
        plot_data = pd.concat([tumor_df, normal_df])
        plot_data.loc[plot_data['Matched_Status'] == 'Unmatched', 'Patient_ID'] = 'Unmatched_Sample'
        plot_data['Patient_ID'] = plot_data['Patient_ID'].str.replace('_NM|N', '', regex=True)

    if print_pvals:
        # Print results of statistical tests
        print('T-test p-value: ' + str(scipy.stats.ttest_ind(tumor_df[protein], normal_df[protein])[1]))
        print('Levene p-value: ' + str(scipy.stats.levene(tumor_df[protein], normal_df[protein])[1]))

    # Create the plot
    plt.subplots(figsize=(10, 10))

    if by_patient:
        boxplt = sns.boxplot(data=plot_data, x='Sample_Type', y=protein, color='w', showfliers=False)
        boxplt = sns.stripplot(data=plot_data, x='Sample_Type', y=protein, hue='Patient_ID', size=10, dodge=True, jitter=True)
        boxplt.get_legend().set_bbox_to_anchor((1, 1, 0, 0))
    else:
        boxplt = sns.boxplot(data=plot_data, x='Sample_Type', y=protein, showfliers=False)
        boxplt = sns.stripplot(data=plot_data, x='Sample_Type', y=protein, dodge=True, jitter=True, color='.3')

    # Add styling
    boxplt.set_title(protein + ' Proteomics', fontsize='25')
    boxplt.set_xlabel('')
    boxplt.set_ylabel('Protein Expression', fontsize='20')
    boxplt.tick_params(labelsize='15')

### plot_complex_clinical

In [62]:
"""
plot_complex_clinical
---------------------
Creates a countplot of the number of proteins in a complex that are differentially expressed in tumor/normal matched samples
Also displays the given clinical feature 

Parameters:
    cancer_type = the cancer type to analyze; only 'colon' is currently implemented (any other value prints an error)
    clinical_feature = the name of the column from the clinical dataframe that you want to visualize
    protein_list = list of proteins to analyze (most likely the proteins in a complex)
    expression_levels = increased, decreased, or nochange; will determine if the plot shows the number
        of proteins with increased expression in tumor samples, decreased in tumor samples, or no change between
        tumor/normal
    categorize = How to bin patients together based on number of proteins with differential expression.
        small: Creates bins of 0-5, 6-10, 11-15, etc. up to 31+ for the number of proteins
        large: Creates bins of 0-100, 101-200, etc. up to 801+
        any other value: No binning; each distinct number of proteins gets its own tick on the x-axis
    complex_name (optional) = The title for the plot
    
Returns:
    Displays a Seaborn countplot
    x-axis: Number of proteins with differential expression
    y-axis: Number of patients in this category
    hue: clinical feature
    
"""

def plot_complex_clinical(cancer_type, clinical_feature, protein_list, expression_levels = 'increased', categorize = 'small', complex_name = ''):
    """
    Count, per matched patient, how many proteins are increased / decreased / unchanged
    in the tumor sample relative to the normal sample, then plot those counts against a
    clinical feature.

    Parameters:
        cancer_type = only 'colon' is implemented; anything else prints an error and returns
        clinical_feature = column of the clinical dataframe used as the hue of the plot
        protein_list = list of proteins to analyze
        expression_levels, categorize, complex_name = passed through to create_complex_clinical_plot

    Returns:
        None; the plot is drawn by create_complex_clinical_plot.
    """
    if cancer_type != 'colon':
        print('Error: invalid or unimplemented cancer type')
        return

    # Start out with a df including the patient IDs and the given clinical feature.
    # .copy() so the columns added below never write into a slice of the clinical dataframe.
    plot_data = colon.get_clinical()[['Patient_ID', clinical_feature]].copy()
    plot_data['Sample_ID'] = plot_data.index

    # Add protein expression values for all proteins in the given list
    plot_data = plot_data.join(colon.get_proteomics()[protein_list])

    # Normal samples are the ones whose Patient_ID contains 'N'
    is_normal = plot_data['Patient_ID'].str.contains('N')

    # Extract the normal samples and remove the 'N' to match them with tumor samples
    normal = plot_data.loc[is_normal].copy()
    normal['Patient_ID'] = normal['Patient_ID'].str.replace('N', '', regex=False)
    normal.index = normal['Patient_ID']

    # Keep only the tumor samples in plot_data; index both frames by patient so that
    # the subtraction below lines tumor and normal rows up by patient
    plot_data = plot_data.loc[~is_normal].copy()
    plot_data.index = plot_data['Patient_ID']

    # Remove unmatched samples from each dataframe
    plot_data = plot_data.loc[plot_data['Patient_ID'].isin(normal['Patient_ID'])].copy()
    normal = normal.loc[normal['Patient_ID'].isin(plot_data['Patient_ID'])]

    # Counters for the number of proteins with increased/decreased/unchanged levels in tumor samples
    plot_data['Num_Tumor_Increased'] = 0
    plot_data['Num_Tumor_Decreased'] = 0
    plot_data['Num_No_Change'] = 0

    # Fill out the Num_* columns by comparing the protein expression for matched patients.
    # The threshold for a "change" is the absolute difference between the group means.
    # The three categories are mutually exclusive: a difference exactly on the threshold
    # counts as no change, so no protein is counted twice for the same patient.
    for protein in protein_list:
        threshold = np.abs(np.mean(plot_data[protein]) - np.mean(normal[protein]))
        diff = plot_data[protein] - normal[protein]
        plot_data.loc[diff > threshold, 'Num_Tumor_Increased'] += 1
        plot_data.loc[diff < -threshold, 'Num_Tumor_Decreased'] += 1
        plot_data.loc[diff.abs() <= threshold, 'Num_No_Change'] += 1

    # Call another function to create the plot
    create_complex_clinical_plot(plot_data, clinical_feature, protein_list, categorize, expression_levels, complex_name)

### create_complex_clinical_plot

In [64]:
"""
create_complex_clinical_plot
----------------------------
Function called by plot_complex_clinical; this prepares the dataframe for plotting and creates the actual plot

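Parameters are the same as those passed to plot_complex_clinical, except that the first one is
plot_data (the per-patient dataframe built by plot_complex_clinical) instead of cancer_type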

"""
def create_complex_clinical_plot(plot_data, clinical_feature, protein_list, categorize, expression_levels, complex_name):
    """
    Draw a countplot of patients grouped by how many proteins changed in their tumor sample.

    Parameters:
        plot_data = dataframe with one row per patient; must contain the clinical_feature
            column, the columns named in protein_list, and the count column for the
            requested expression level (Num_Tumor_Increased, Num_Tumor_Decreased or
            Num_No_Change). The caller's dataframe is not modified.
        clinical_feature = column used as the hue of the plot
        protein_list = protein columns to drop before plotting
        categorize = 'small' (bins 0-5, 6-10, ... 31+), 'large' (bins 0-100, 101-200, ... 801+);
            any other value plots every distinct count on its own tick
        expression_levels = 'increased' or 'decreased'; any other value plots the no-change counts
        complex_name = title of the plot

    Returns:
        None; displays the plot.
    """
    # Drop the proteomics data (not necessary at this point); this returns a new
    # dataframe, so the caller's copy is left untouched
    plot_data = plot_data.drop(protein_list, axis = 1)

    # Pick the count column, the name of the category column, and the x-axis label.
    # increased means increased in tumor samples
    if expression_levels == 'increased':
        count_col, category_col = 'Num_Tumor_Increased', 'Increased_Category'
        x_label = 'Number of Proteins with Elevated Expression in Tumor Samples'
    elif expression_levels == 'decreased':
        count_col, category_col = 'Num_Tumor_Decreased', 'Decreased_Category'
        x_label = 'Number of Proteins with Decreased Expression in Tumor Samples'
    else:
        count_col, category_col = 'Num_No_Change', 'No_Change_Category'
        x_label = 'Number of Proteins with No Change in Expression Between Tumor/Normal Samples'

    # (bin width, upper bound of the last closed bin) for each binning mode
    bin_modes = {'small': (5, 30), 'large': (100, 800)}

    if categorize in bin_modes:
        # Bin the patients into groups based on the number of proteins.
        # This makes it easier to view, otherwise there are lots of ticks on the x-axis.
        # Every bin label is part of the tick order, so no bin is left off the plot.
        edges, tick_order = _count_bin_scheme(*bin_modes[categorize])
        plot_data[category_col] = pd.cut(plot_data[count_col], bins = edges, labels = tick_order).astype(str)
    else:
        # Same idea as above, just without binning the patients: one tick per distinct
        # count, in ascending numeric order
        counts = plot_data[count_col].astype(int)
        plot_data[category_col] = counts.astype(str)
        tick_order = [str(value) for value in sorted(set(counts))]

    # Create the plot
    a4_dims = (10, 10)
    fig, ax = plt.subplots(figsize=a4_dims)
    countplot = sns.countplot(x=category_col, hue=clinical_feature, data=plot_data, order = tick_order)
    countplot.set_xlabel(x_label, fontsize = '15')

    tick_spacing = 2
    ax.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
    countplot.set_ylabel('Number of Individuals', fontsize = '15')
    countplot.set_title(complex_name, fontsize = '25')


def _count_bin_scheme(step, top):
    # Build the pd.cut edges and labels for the bins '0-<step>', '<step+1>-<2*step>', ..., '<top+1>+'
    inner_edges = list(range(step, top + 1, step))
    labels = ['0-' + str(step)]
    labels += [str(lower + 1) + '-' + str(lower + step) for lower in inner_edges[:-1]]
    labels.append(str(top + 1) + '+')
    # Right-closed intervals: (-inf, step], (step, 2*step], ..., (top, inf)
    edges = [-np.inf] + inner_edges + [np.inf]
    return edges, labels

### tidy_split

In [18]:
# Thanks StackOverflow! 
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

def tidy_split(df, column, sep='|', keep=False):
    """
    Explode a delimited string column so that each piece gets its own row.

    Every value of `column` is converted to a string and split on `sep`; the
    row is repeated once per piece, with `column` holding that piece. Rows are
    not filtered: a missing value is converted to its string form like any
    other value.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand (left unmodified)
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to also retain the presplit value as its own row (placed
        before its pieces; only applies when the value actually splits)

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as `df`.
    """
    row_positions = []
    pieces_out = []
    for position, cell in enumerate(df[column].astype(str)):
        pieces = cell.split(sep)
        if keep and len(pieces) > 1:
            # the unsplit value goes first, followed by its pieces
            pieces = [cell] + pieces
        row_positions.extend([position] * len(pieces))
        pieces_out.extend(pieces)

    # Positional selection repeats each source row once per piece
    expanded = df.iloc[row_positions, :].copy()
    expanded[column] = pieces_out
    return expanded