In [None]:
import numpy as np
import pandas as pd

import random
import sys, os
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
############################ 
#Recording sibling peptide intensities and adjusted abundances

#Suppressing print statements
class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry ``sys.stdout`` is replaced by a writable handle on ``os.devnull``;
    on exit the previous ``sys.stdout`` is put back and the handle is closed.
    Exceptions raised inside the block are not swallowed.
    """

    def __enter__(self):
        # Remember the active stream so that it can be reinstated on exit.
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink: __exit__ must close exactly this
        # handle, not whatever happens to be bound to sys.stdout at that time.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        # Returning self makes ``with HiddenPrints() as hp:`` usable.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first, close second: stdout comes back even if close() fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

#Wrapper around randomSiblingRatios_sub that optionally silences its print statements
def randomSiblingRatios(data_df, coeff_df, random_run_count = 0, supress_print = True):
    """Run ``randomSiblingRatios_sub`` with its print output optionally muted.

    data_df, coeff_df and random_run_count are forwarded unchanged.
    When ``supress_print`` is truthy the call is made inside ``HiddenPrints``
    so nothing is written to stdout; otherwise all debugging prints are shown.
    The return value is whatever ``randomSiblingRatios_sub`` returns.
    """
    if supress_print:
        #Quiet mode: stdout is redirected for the duration of the call
        with HiddenPrints():
            return randomSiblingRatios_sub(data_df, coeff_df, random_run_count)

    #Verbose mode: every print of the worker function is visible
    return randomSiblingRatios_sub(data_df, coeff_df, random_run_count)
        
#Function to sample sibling peptides
def randomSiblingRatios_sub(data_df, coeff_df, random_run_count = 0, run_start_index = 11):
    """Collect observed and coefficient-adjusted quantities for sibling peptide pairs.

    Two rows of ``data_df`` are siblings when they share the same value in the
    'Protein' column. For every sibling pair and every run column, the pair's
    two quantities are recorded together with the same quantities divided by
    the pair's coefficients. A (pair, run) combination is skipped when either
    quantity is NaN or exactly 0.0. Each accepted combination is stored twice,
    once as (a, b) and once as (b, a).

    Despite the name, nothing is drawn at random: proteins are visited in
    ``np.unique`` order and pairs in row order.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a 'Protein' column; columns from position
        ``run_start_index`` onwards are treated as run quantities.
    coeff_df : pandas.DataFrame
        Coefficients are read from its first column BY POSITION, so its rows
        must be in the same order as the rows of ``data_df``.
    random_run_count : int, default 0
        If > 0, only the first ``random_run_count`` pairs are processed
        (capped at the number of available pairs); otherwise all pairs.
    run_start_index : int, default 11
        Position of the first run column in ``data_df``.

    Returns
    -------
    list
        ``[intensities, abundances]``, two arrays of shape (n_samples, 2).
    """
    #############################
    #Define all sibling pairs
    protein_column = data_df['Protein'].values
    all_proteins = np.unique(protein_column)
    print("Number of proteins: ", len(all_proteins))

    #For all proteins, record the row positions of all sibling peptides.
    #Positions are taken straight from the 'Protein' column, so duplicate index
    #labels can no longer collapse two different rows onto the same position.
    sibling_pairs = []
    for protein in all_proteins:
        member_rows = np.flatnonzero(protein_column == protein)
        sibling_pairs.extend([first, second] for first, second in combinations(member_rows, 2))

    #Record all sibling pairs; the reshape keeps a (0, 2) shape when there are none
    sibling_pairs = np.asarray(sibling_pairs, dtype = int).reshape(-1, 2)
    print("Number of peptide pairs ", sibling_pairs.shape[0])

    #############################

    #All run columns quantities are read from
    run_columns = range(run_start_index, data_df.shape[1])
    print("Total number of random runs ", len(run_columns))

    #Observed intensities / adjusted abundances of the accepted samples
    sibling_intensities = []
    sibling_abundance_rows = []

    if random_run_count > 0:
        #Never ask for more pairs than exist (previously an IndexError)
        sample_count = min(random_run_count, len(sibling_pairs))
    else:
        sample_count = len(sibling_pairs)

    for pair in sibling_pairs[:sample_count]:
        print("----------")
        print(data_df.iloc[pair]['Protein'])

        #The coefficients depend on the pair only, not on the run
        sibling_coefficients = coeff_df.iloc[pair, 0]
        coefficient_values = sibling_coefficients.values.ravel()

        for run_column in run_columns:
            #For each peptide, record the quantities
            quantities = data_df.iloc[pair, run_column]
            quantity_values = quantities.values.ravel()
            print(quantities)
            print(pair)

            #Reject sample if any values are nan or zero
            if np.isnan(quantity_values).any() or (quantity_values == 0.0).any():
                continue

            #Calculate the abundances by dividing quantities by peptide coefficients
            print("Sibling intensities: ", quantities)
            print("Sibling coeffs: ", sibling_coefficients)

            pair_abundances = quantity_values / coefficient_values
            print("Sibling abundances: ", pair_abundances)

            #Record final results
            sibling_intensities.append(quantity_values)
            sibling_abundance_rows.append(pair_abundances)

            #Record the swapped pair as well
            sibling_intensities.append(quantity_values[::-1])
            sibling_abundance_rows.append(pair_abundances[::-1])

    all_sibling_intensities = np.array(sibling_intensities)
    all_sibling_intensities = all_sibling_intensities.reshape((len(sibling_intensities), 2))

    all_sibling_abundances = np.array(sibling_abundance_rows)
    all_sibling_abundances = all_sibling_abundances.reshape((len(sibling_abundance_rows), 2))

    print("Mean abundance: ", np.mean(all_sibling_abundances))

    return [all_sibling_intensities, all_sibling_abundances]

In [None]:
n_runs = 120
seq_length = 60

#Read dataset
data_df = pd.read_csv('preprocess_datasets/preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv', 
                      sep = '\t', index_col = 0)

print("Peptide df ", data_df.shape)
print("Peptide df ", data_df.head())

#Input 1 is peptide intensity measurements: the last n_runs columns of the table
q_df = data_df.iloc[:, -n_runs:]

#Normalize the intensities such that every column sums to the number of peptides
#NOTE(review): a NaN anywhere in a column turns that column's sum, and therefore
#the whole normalized column, into NaN - confirm the input has no missing values
X = q_df.values
print("Quants before normalization ", X.sum(axis = 0))
X = (X / X.sum(axis=0, keepdims=True)) * X.shape[0]
print("Quants after normalization ", X.sum(axis = 0))
q_df = pd.DataFrame(X, index = q_df.index, columns = q_df.columns)
data_df.iloc[:, -n_runs:] = q_df

#Input 2 is protein mappings
#Convert protein labels to int values: int_protein_labels[i] is the position of
#peptide i's protein inside the sorted array unique_proteins
protein_labels = data_df['Protein'].values
unique_proteins, int_protein_labels = np.unique(protein_labels, return_inverse = True)
n_proteins = len(unique_proteins)
print("Number of unique proteins ", n_proteins)
print("Protein labels ", int_protein_labels)
n_peptides = data_df.shape[0]

print("No of peptides: ", n_peptides)
print("No of proteins: ", n_proteins)
print("No of runs: ", n_runs)

#Split the proteins into train/test sets (no validation set is created here)
train_proteins, test_proteins = train_test_split(np.arange(n_proteins), 
                                   test_size=0.2, random_state=12345)

#Row positions of the peptides belonging to the train/test proteins.
#The integer labels computed above are reused instead of re-running np.unique
#and comparing strings once per protein.
train_peptides = np.concatenate([np.flatnonzero(int_protein_labels == p) for p in train_proteins])
test_peptides = np.concatenate([np.flatnonzero(int_protein_labels == p) for p in test_proteins])

print("No of train/test proteins: %d/%d" % (len(train_proteins), len(test_proteins)))
print("No of train/test peptides: %d/%d" % (len(train_peptides), len(test_peptides)))

#Split the runs into train/test sets
#Modified code for replicate samples: columns 2*i and 2*i+1 are kept together,
#so the split is done on pair numbers and then expanded to column positions
n_run_pairs = q_df.shape[1] // 2
assert q_df.shape[1] == 2 * n_run_pairs, "Expected an even number of run columns (adjacent pairs)"

train_run_pairs, test_run_pairs = train_test_split(np.arange(n_run_pairs), 
                                   test_size=0.2, random_state=12345)

train_runs = np.array([[2*i, 2*i+1] for i in train_run_pairs]).astype(int).ravel()
test_runs = np.array([[2*i, 2*i+1] for i in test_run_pairs]).astype(int).ravel()

print("No of train runs ", len(train_runs))
print("No of test runs ", len(test_runs))
print("Train runs ", train_runs)
print("Train runs ", q_df.columns[train_runs])
print("Test runs ", test_runs)
print("Test runs ", q_df.columns[test_runs])


In [None]:
#Build the training and test tables: rows are restricted to the peptides of the
#respective protein set, run columns to the respective run set
run_offset = data_df.shape[1] - n_runs   #position of the first run column

train_rows = data_df.iloc[train_peptides]
test_rows = data_df.iloc[test_peptides]

#Non-run columns are kept as they are; only the selected run columns are appended
data_df_train = pd.concat(
    [train_rows.iloc[:, :-n_runs], train_rows.iloc[:, run_offset + train_runs]],
    axis = 1,
)
data_df_test = pd.concat(
    [test_rows.iloc[:, :-n_runs], test_rows.iloc[:, run_offset + test_runs]],
    axis = 1,
)

data_df_test

In [None]:
#Define function to create overlaid histograms (with density curves)
#Note that the plotted quantity is the difference between the two sibling peptide values, not their ratio
def createHistogram_peptide_ratios(sibling_ratios, color, n_bins = 300):
    """Overlay histograms (with KDE curves) of sibling peptide differences.

    For both entries of ``sibling_ratios`` the plotted quantity is
    ``column 0 - column 1``. Values further than three standard deviations
    from the mean are dropped before plotting.

    Parameters
    ----------
    sibling_ratios : sequence of two array-likes of shape (n, 2)
        ``sibling_ratios[0]`` is drawn with ``color`` and labelled 'Observed';
        ``sibling_ratios[1]`` is drawn in '#8854d0' and labelled 'Adjusted'.
    color : matplotlib colour
        Colour of the 'Observed' histogram and curve.
    n_bins : int, default 300
        Number of histogram bins for both distributions.

    Notes
    -----
    * Returns None. A new 20 x 20 inch figure is created and left as the
      current figure; no axis labels or legend are added and ``plt.show()``
      is not called.
    * ``plt.rc`` changes matplotlib's global font settings, so figures created
      afterwards are affected as well.
    * ``sns.distplot`` is deprecated in recent seaborn releases; it is kept so
      that the figure stays the same.
    """
    SMALL_SIZE = 60
    MEDIUM_SIZE = 80
    BIGGER_SIZE = 90

    # Font settings must be in place BEFORE the axes are created, otherwise the
    # first figure drawn in a fresh kernel gets default tick label sizes.
    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
    plt.rc('ytick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    fig, ax = plt.subplots()
    fig.set_size_inches(20, 20)

    observed_differences = _sibling_differences_without_outliers(sibling_ratios[0])
    adjusted_differences = _sibling_differences_without_outliers(sibling_ratios[1])

    # 'Adjusted' is drawn first so that 'Observed' ends up on top of it.
    sns.distplot(adjusted_differences, bins = n_bins,
                 kde_kws={"lw": 10, "color": "#8854d0", "alpha": 1},
                 hist_kws={"linewidth": 1, "color": "#8854d0"}, label = 'Adjusted', ax = ax)

    sns.distplot(observed_differences, bins = n_bins,
                 kde_kws={"lw": 10, "color": color, "alpha": 1},
                 hist_kws={"linewidth": 1, "color": color}, label = 'Observed', ax = ax)

    ax.set_xticks([-4, -2, 0, 2, 4])
    ax.set_xlim([-5, 5])


def _sibling_differences_without_outliers(pairs, n_sigma = 3):
    """Return column 0 minus column 1 of ``pairs`` with n_sigma outliers removed."""
    pairs = np.asarray(pairs, dtype = float)
    differences = pairs[:, 0] - pairs[:, 1]

    # NaN / inf cannot be binned and would also poison the mean and std below.
    differences = differences[np.isfinite(differences)]

    # Both bounds come from ONE mean/std estimate, so the cut is symmetric
    # around the mean (np.std uses the population formula, ddof = 0).
    center = np.mean(differences)
    spread = np.std(differences)
    within_bounds = np.abs(differences - center) <= n_sigma * spread
    return differences[within_bounds]
    
    
    

In [None]:
#Load the inferred peptide coefficients (first file column becomes the index)
coeff_df = pd.read_csv(
    'trained_models/2019_guo_nci60/2019_guo_nci60_inferred_coefficients.tsv',
    sep = '\t',
    index_col = 0,
)
print("Coefficients ", coeff_df.shape)

#Only the magnitudes of the coefficients are used from here on
coeff_df = coeff_df.abs()

#Show the coefficients ordered by the column labelled '0' (the sorted copy is not stored)
coeff_df.sort_values(by = '0')

In [None]:
#Training set: sibling statistics and histogram
#Coefficients are re-ordered to follow the rows of the training table
coeff_train = coeff_df.loc[data_df_train.index]

#Verbose pass over the first 50 pairs (prints only, the result is discarded)
randomSiblingRatios(data_df_train, coeff_train, random_run_count = 50, supress_print = False)

#Silent pass over all pairs, then plot
results_train = randomSiblingRatios(data_df_train, coeff_train, supress_print = True)
createHistogram_peptide_ratios(results_train, color = '#eb4d4b', n_bins = 100)


In [None]:
#Test set: sibling statistics and histogram
#Coefficients are re-ordered to follow the rows of the test table
coeff_test = coeff_df.loc[data_df_test.index]

#Verbose pass over the first 50 pairs (prints only, the result is discarded)
randomSiblingRatios(data_df_test, coeff_test, random_run_count = 50, supress_print = False)

#Silent pass over all pairs, then plot
results_test = randomSiblingRatios(data_df_test, coeff_test, supress_print = True)
createHistogram_peptide_ratios(results_test, color = '#eb4d4b', n_bins = 100)