# Generate plots to show the consistency of sibling peptide ratios

In [None]:
import numpy as np
import pandas as pd

import random
import sys, os
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

In [None]:
#Read data frame (tab-separated; the first column of the file becomes the index)
data_df = pd.read_csv('preprocess_datasets/preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv', sep = '\t', index_col = 0)
print(data_df.shape)

# Eliminate rows that contain more than one protein ID (IDs are ';'-separated).
# A boolean mask is used instead of DataFrame.drop(): drop() interprets its
# argument as index *labels*, whereas np.where() produces row *positions*, and
# the two only coincide when the index happens to be exactly 0..n-1.
has_multiple_proteins = np.array([';' in v for v in data_df['Protein'].values], dtype = bool)
data_df = data_df.loc[~has_multiple_proteins]
print("Non-unique proteins eliminated")
print("Data frame shape: ", data_df.shape)


In [None]:
# Remove the 'R2', 'best mscore', 'numNA' and 'Charge 1'..'Charge 6' columns
columns_to_discard = ['R2', 'best mscore', 'numNA']
columns_to_discard += ['Charge ' + str(charge) for charge in range(1, 7)]
data_df = data_df.drop(columns = columns_to_discard)
data_df

In [None]:
# Select all pairs of peptide siblings (rows sharing a protein) for all proteins
protein_column = data_df['Protein'].values
all_proteins = np.unique(protein_column)
all_peptides = data_df['Peptide'].values
print("Number of proteins: ", len(all_proteins))
print("Unique proteins: ", all_proteins)

#For all proteins, record the positional row indices of all sibling peptides
sibling_pairs = []

for protein in all_proteins:
    print("-------")
    print(protein)

    # Take the row positions directly from the protein mask. Looking each
    # peptide up by its sequence instead is quadratic and resolves a repeated
    # sequence to its first occurrence, which can yield self-pairs or rows
    # that belong to another protein.
    sub_peptide_indices = np.flatnonzero(protein_column == protein)
    print(data_df.iloc[sub_peptide_indices, :2])

    # Every unordered pair (i < j) of this protein's rows is a sibling pair
    sibling_pairs.extend(combinations(sub_peptide_indices, 2))

#Record all sibling pairs as an array with one (row, row) pair per line
sibling_pairs = np.asarray(sibling_pairs)
print("Number of peptide pairs ", sibling_pairs.shape[0])

#Define all pairs of random runs to sample intensities from
#(columns from position 3 onwards are treated as runs)
random_run_pairs = list(combinations(np.arange(3, data_df.shape[1]), 2))
print("Total number of random run pairs ", len(random_run_pairs))


## Randomly sample sibling ratios

In [None]:
#Super function to sample sibling peptides
def randomSiblingRatios(random_run_count = 1000, supress_print = True):
    """Sample sibling-peptide ratios, optionally silencing the debug prints.

    Parameters
    ----------
    random_run_count : number of samples, forwarded to randomSiblingRatios_sub.
    supress_print : when true, the sampler runs inside HiddenPrints so its
        print output is discarded; when false, all prints are shown.

    Returns
    -------
    Whatever randomSiblingRatios_sub returns for ``random_run_count``.
    """
    #Quiet mode: run the sampler with stdout redirected
    if supress_print:
        with HiddenPrints():
            return randomSiblingRatios_sub(random_run_count)

    #Verbose mode: keep every print for debugging
    return randomSiblingRatios_sub(random_run_count)
        
#Function to sample sibling peptides
def randomSiblingRatios_sub(random_run_count = 1000):
    """Sample between-run intensity ratios for random sibling peptide pairs.

    Each iteration draws one entry of the module-level ``sibling_pairs`` (two
    row positions) and one entry of the module-level ``random_run_pairs`` (two
    column positions), and reads the resulting 2x2 block from ``data_df``.
    Samples in which any of the four values is 0 or NaN are rejected and
    redrawn.

    Siblings ratio:
        ratio1 = sibling1 in run 1 / sibling1 in run 2
        ratio2 = sibling2 in run 1 / sibling2 in run 2

    Parameters
    ----------
    random_run_count : number of accepted samples to collect.

    Returns
    -------
    numpy array of shape (random_run_count, 2) holding (ratio1, ratio2) per row.

    Raises
    ------
    ValueError
        From ``random.randrange`` when ``sibling_pairs`` or
        ``random_run_pairs`` is empty and at least one sample is requested.

    Note: the loop only ends once enough samples were accepted, so it does not
    terminate if no combination of rows and runs can pass the 0/NaN check.
    """
    final_sibling_ratios = np.zeros((random_run_count, 2))
    current_sample_count = 0
    while current_sample_count < random_run_count:

        print("----------")

        #Select a random sibling pair. randrange(n) covers 0..n-1, so the last
        #pair can be drawn too and a single-pair pool is valid.
        sibling_rows = sibling_pairs[random.randrange(len(sibling_pairs))]
        print(data_df.iloc[sibling_rows, :2])

        #Select a random experiment pair (indexing the list directly avoids
        #converting the whole list to an array on every iteration)
        random_experiment_pair = np.asarray(random_run_pairs[random.randrange(len(random_run_pairs))])
        print("Random experiment pair: ", random_experiment_pair)

        #For each pair, record the quantities (rows = siblings, columns = runs)
        quantities = data_df.iloc[sibling_rows, random_experiment_pair]
        print(quantities)

        #Convert before checking so the 0/NaN test works on a float array
        quantities = np.array(quantities.values, dtype=np.float64)

        #Reject sample if any values are 0 or missing
        if np.any(quantities == 0) or np.any(np.isnan(quantities)):
            continue

        #Calculate sibling ratios
        sibling_ratios = quantities[:, 0] / quantities[:, 1]
        print("Sibling ratios: ", sibling_ratios)

        #Record final results
        final_sibling_ratios[current_sample_count] = sibling_ratios
        current_sample_count += 1

    return final_sibling_ratios


## Randomly sample non-sibling ratios

In [None]:
#Suppressing print statements
class HiddenPrints:
    """Context manager that discards print output inside its ``with`` block.

    On entry ``sys.stdout`` is replaced by a writable handle on ``os.devnull``;
    on exit the previous ``sys.stdout`` is restored and that handle is closed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        #Keep our own reference so __exit__ closes exactly this handle
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull

    def __exit__(self, exc_type, exc_val, exc_tb):
        #Restore first, then close the handle opened in __enter__. Closing
        #sys.stdout itself would close a foreign stream (and leak devnull)
        #if code inside the block had rebound sys.stdout.
        sys.stdout = self._original_stdout
        self._devnull.close()

#Super function to sample non-sibling peptides
def randomNonsiblingRatios(random_run_count = 1000, supress_print = True):
    """Sample non-sibling peptide ratios, optionally silencing the debug prints.

    Parameters
    ----------
    random_run_count : number of samples, forwarded to
        randomNonsiblingRatios_sub.
    supress_print : when true, the sampler runs inside HiddenPrints so its
        print output is discarded; when false, all prints are shown.

    Returns
    -------
    Whatever randomNonsiblingRatios_sub returns for ``random_run_count``.
    """
    #Quiet mode: run the sampler with stdout redirected
    if supress_print:
        with HiddenPrints():
            return randomNonsiblingRatios_sub(random_run_count)

    #Verbose mode: keep every print for debugging
    return randomNonsiblingRatios_sub(random_run_count)
        
#Function to sample non-sibling peptides
def randomNonsiblingRatios_sub(random_run_count = 1000):
    """Sample between-run intensity ratios for random non-sibling peptide pairs.

    Each iteration draws two distinct row positions (the pool size is
    ``len(all_peptides)``) and rejects the draw if both rows have the same
    'Protein' value. Otherwise one entry of the module-level
    ``random_run_pairs`` (two column positions) is drawn and the 2x2 block is
    read from ``data_df``. Samples in which any of the four values is 0 or NaN
    are rejected and redrawn.

    Parameters
    ----------
    random_run_count : number of accepted samples to collect.

    Returns
    -------
    numpy array of shape (random_run_count, 2); each row holds the
    run 1 / run 2 ratio of the first and of the second sampled peptide.

    Raises
    ------
    ValueError
        From ``random.sample`` / ``random.randrange`` when fewer than two
        peptides or no run pairs are available and a sample is requested.

    Note: the loop only ends once enough samples were accepted, so it does not
    terminate if no draw can pass the rejection checks.
    """
    final_nonsibling_ratios = np.zeros((random_run_count, 2))
    current_sample_count = 0
    while current_sample_count < random_run_count:

        print("----------")

        #Select a random peptide pair. range(n) covers 0..n-1, so the last
        #peptide can be drawn as well.
        random_peptide_pair = random.sample(range(len(all_peptides)), 2)
        print(data_df.iloc[random_peptide_pair, :2])

        #Reject sample if peptides are sibling (read only the 'Protein' column
        #rather than materialising two full rows)
        pair_proteins = data_df['Protein'].iloc[random_peptide_pair].values
        if pair_proteins[0] == pair_proteins[1]:
            continue

        #Select a random experiment pair (indexing the list directly avoids
        #converting the whole list to an array on every iteration)
        random_experiment_pair = np.asarray(random_run_pairs[random.randrange(len(random_run_pairs))])
        print("Random experiment pair: ", random_experiment_pair)

        #For each pair, record the quantities (rows = peptides, columns = runs)
        quantities = data_df.iloc[random_peptide_pair, random_experiment_pair]
        print(quantities)

        #Convert before checking so the 0/NaN test works on a float array
        quantities = np.array(quantities.values, dtype=np.float64)

        #Reject sample if any values are 0 or missing
        if np.any(quantities == 0) or np.any(np.isnan(quantities)):
            continue

        #Calculate the per-peptide ratios
        sibling_ratios = quantities[:, 0] / quantities[:, 1]
        print("Sibling ratios: ", sibling_ratios)

        #Record final results
        final_nonsibling_ratios[current_sample_count] = sibling_ratios
        current_sample_count += 1

    return final_nonsibling_ratios





In [None]:
#Helper that draws one KDE joint plot of log10-transformed ratio pairs
def _plotRatioJoint(ratios, cmap, range_start, range_end, title, xlabel, ylabel):
    """Draw a filled KDE joint plot of ``log10(ratios)`` and return the grid.

    ``ratios`` must have two columns. Both axes are limited to
    [range_start, range_end] with one tick per integer, labelled as powers of
    ten, and a diagonal reference line is added.
    """
    log_ratios = np.log10(pd.DataFrame(ratios, columns = ['Ratio from run 1', 'Ratio from run 2']))
    g = sns.jointplot(data=log_ratios,
                      height = 15,
                      x="Ratio from run 1", y="Ratio from run 2",
                      kind = 'kde', cmap = cmap,
                      marginal_kws=dict(color='#2d98da', alpha=0.5), fill=True)

    tick_positions = np.arange(range_start, range_end + 1)
    g.ax_joint.set_xlim([range_start, range_end])
    g.ax_joint.set_ylim([range_start, range_end])
    g.ax_joint.set_xticks(tick_positions)
    g.ax_joint.set_yticks(tick_positions)
    g.ax_joint.grid(True)
    #Diagonal: points on it have the same ratio on both axes
    g.ax_joint.plot([range_start, range_end], [range_start, range_end], lw = 3, color = 'Black', alpha = 0.5)
    g.fig.suptitle(title, fontsize = 30)
    #Leave room for the title above the marginal plot
    g.fig.subplots_adjust(top=0.95)
    g.ax_joint.set_xlabel(xlabel, fontsize=30)
    g.ax_joint.set_ylabel(ylabel, fontsize=30)

    #The data are in log10 units, so tick i is shown as 10^i
    label_ticks = ['$10^{' + str(i) + '}$' for i in range(range_start, range_end + 1)]
    g.ax_joint.set_xticklabels(label_ticks, fontsize = 30)
    g.ax_joint.set_yticklabels(label_ticks, fontsize = 30)
    return g


#Define function for create plots
def createPlots(sibling_ratios, nonsibling_ratios, range_start = -6, range_end = 6):
    """Create one joint KDE plot for sibling and one for non-sibling ratios.

    Parameters
    ----------
    sibling_ratios, nonsibling_ratios : two-column arrays of ratios; they are
        log10-transformed before plotting.
    range_start, range_end : integer axis limits in log10 units, applied to
        both axes of both plots.

    Returns nothing; the two figures are created as a side effect.
    """
    cmap = LinearSegmentedColormap.from_list('mycmap', ['#e7f4fd', '#2d98da', '#3867d6', '#0c2d41'])

    #Create plot for siblings
    _plotRatioJoint(sibling_ratios, cmap, range_start, range_end,
                    'Distribution of sibling peptide ratios',
                    'Intensity ratio for sibling peptide 1',
                    'Intensity ratio for sibling peptide 2')

    #Create plot for non-siblings
    _plotRatioJoint(nonsibling_ratios, cmap, range_start, range_end,
                    'Distribution of non-sibling peptide ratios',
                    'Intensity ratio for non-sibling peptide 1',
                    'Intensity ratio for non-sibling peptide 2')
    

In [None]:
# Short verbose run (10 samples, result discarded) to inspect the printed
# sampling steps, followed by the silent 10000-sample run that is kept
randomSiblingRatios(10, supress_print = False)
sibling_ratios = randomSiblingRatios(10000)

In [None]:
# Short verbose run (10 samples, result discarded) to inspect the printed
# sampling steps, followed by the silent 10000-sample run that is kept
randomNonsiblingRatios(10, supress_print = False)
nonsibling_ratios = randomNonsiblingRatios(10000)

In [None]:
# Draw both joint plots with axes spanning 10^-2 .. 10^2
createPlots(sibling_ratios, nonsibling_ratios, range_start = -2, range_end = 2)

# log10 of every ratio, computed once and reused by both metrics below
log_sibling = np.log10(sibling_ratios)
log_nonsibling = np.log10(nonsibling_ratios)

# Pearson correlation between the two ratio columns (in log10 space)
from scipy.stats import pearsonr
print("Correlation for siblings: ", pearsonr(log_sibling[:, 0], log_sibling[:, 1]))
print("Correlation for non-siblings: ", pearsonr(log_nonsibling[:, 0], log_nonsibling[:, 1]))

# Mean squared difference between the two ratio columns (in log10 space)
from sklearn.metrics import mean_squared_error
print("MSE for siblings: ", mean_squared_error(log_sibling[:, 0], log_sibling[:, 1]))
print("MSE for non-siblings: ", mean_squared_error(log_nonsibling[:, 0], log_nonsibling[:, 1]))

