In [None]:
import numpy as np
import pandas as pd

import random
import sys, os
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
############################
#Recording coefficient of variation for abundances
def recordCV(data_df, coeff_df, random_run_count = 0, run_start_index = 11):
    """Compute per-protein, per-run coefficients of variation (CV) of peptide quantities.

    For every protein (unique value of ``data_df['Protein']``) and every run
    column (columns from position ``run_start_index`` onwards), the CV
    (``np.std / np.mean``) is computed across the protein's peptides, once on
    the raw quantities and once on the quantities divided by the peptide
    coefficients.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a 'Protein' column; columns at positions
        ``run_start_index`` and later are read as numeric run quantities.
    coeff_df : pandas.DataFrame
        Peptide coefficients; only the first column is used. Rows are matched
        to ``data_df`` by POSITION, so it must be row-aligned with ``data_df``.
    random_run_count : int, optional
        Unused; kept so existing callers keep working.
    run_start_index : int, optional
        Position of the first run column (default 11, the previously
        hard-coded value).

    Returns
    -------
    list
        ``[all_raw_CVs, all_adj_CVs, protein_sizes]`` where the first two are
        float arrays of shape (n_proteins, n_run_columns) in ``np.unique``
        protein order, and ``protein_sizes`` is a list with the number of
        peptides of each protein. A run in which any peptide of the protein is
        NaN or exactly 0.0 is rejected and stored as NaN, which keeps the
        arrays rectangular (use NaN-aware reductions such as ``np.nanmean``).
    """

    #############################
    #Select peptides occurring on each protein
    #A single unique/argsort pass yields the row positions of every protein,
    #instead of one full index scan per peptide
    all_proteins, protein_codes = np.unique(data_df['Protein'].values, return_inverse = True)
    print("Number of proteins: ", len(all_proteins))

    protein_codes = np.ravel(protein_codes)
    peptides_per_protein = np.bincount(protein_codes, minlength = len(all_proteins))
    #Stable sort keeps the peptides of each protein in their original row order
    rows_by_protein = np.split(np.argsort(protein_codes, kind = 'stable'),
                               np.cumsum(peptides_per_protein)[:-1])

    #Convert the run columns and the coefficients to plain arrays once
    quantity_matrix = data_df.iloc[:, run_start_index:].to_numpy(dtype = float)
    coefficient_vector = coeff_df.iloc[:, 0].to_numpy(dtype = float)
    n_run_columns = quantity_matrix.shape[1]

    raw_rows = []
    adj_rows = []
    protein_sizes = []

    #For each protein, calculate CV before and after adjustment
    #zip() stops at the number of proteins (np.split returns one empty group when there are none)
    for _, peptide_rows in zip(all_proteins, rows_by_protein):

        protein_sizes.append(len(peptide_rows))

        #Rows are sibling peptides, columns are runs
        quantities = quantity_matrix[peptide_rows]

        #Reject a run if any sibling value is nan or exactly zero
        usable_runs = ~np.isnan(quantities).any(axis = 0) & (quantities != 0.0).all(axis = 0)

        #Rejected runs are masked out below, so warnings from nan / zero divisions are noise
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            #Calculate the abundances by dividing quantities by peptide coefficients
            sibling_abundances = quantities / coefficient_vector[peptide_rows][:, None]

            raw_CV = quantities.std(axis = 0) / quantities.mean(axis = 0)
            adj_CV = sibling_abundances.std(axis = 0) / sibling_abundances.mean(axis = 0)

        #Record final CVs; NaN marks a rejected run
        raw_rows.append(np.where(usable_runs, raw_CV, np.nan))
        adj_rows.append(np.where(usable_runs, adj_CV, np.nan))

    #reshape keeps the result 2-D even when there are no proteins or no run columns
    result_shape = (len(all_proteins), n_run_columns)
    all_raw_CVs = np.array(raw_rows, dtype = float).reshape(result_shape)
    all_adj_CVs = np.array(adj_rows, dtype = float).reshape(result_shape)

    return [all_raw_CVs, all_adj_CVs, protein_sizes]


In [None]:
#Number of run (quantity) columns at the right-hand end of the table
n_runs = 120
seq_length = 60

#Read dataset
data_df = pd.read_csv('preprocess_datasets/preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv', 
                      sep = '\t', index_col = 0)

print("Peptide df ", data_df.shape)
print("Peptide df ", data_df.head())

#Input 1 is peptide intensity measurements
#Also record intensity measurements for pairs
q_df = data_df.iloc[:, -n_runs:]

#Normalize the intensities such that the sum of elements in each column is equal
#(each column is rescaled to sum to the number of peptides)
X = q_df.values
print("Quants before normalization ", X.sum(axis = 0))
X = (X / X.sum(axis = 0, keepdims = True)) * X.shape[0]
print("Quants after normalization ", X.sum(axis = 0))
q_df = pd.DataFrame(X, index = q_df.index, columns = q_df.columns)
data_df.iloc[:, -n_runs:] = q_df

#Input 2 is protein mappings
#Convert protein labels to int values
#return_inverse gives, for every peptide, the position of its protein in unique_proteins
protein_labels = data_df['Protein'].values
unique_proteins, int_protein_labels = np.unique(protein_labels, return_inverse = True)
int_protein_labels = np.ravel(int_protein_labels)
n_proteins = len(unique_proteins)
print("Number of unique proteins ", n_proteins)
print("Protein labels ", int_protein_labels)
n_peptides = data_df.shape[0]

print("No of peptides: ", n_peptides)
print("No of proteins: ", n_proteins)
print("No of runs: ", n_runs)

#Split the proteins into train/test sets
train_proteins, test_proteins = train_test_split(np.arange(n_proteins), 
                                   test_size=0.2, random_state=12345)

#Define train/test peptides: row positions of all peptides of the selected proteins,
#concatenated in the (shuffled) protein order returned by the split
train_peptides = np.concatenate([np.flatnonzero(int_protein_labels == p) for p in train_proteins])
test_peptides = np.concatenate([np.flatnonzero(int_protein_labels == p) for p in test_proteins])

print("No of train/test proteins: %d/%d" % (len(train_proteins), len(test_proteins)))
print("No of train/test peptides: %d/%d" % (len(train_peptides), len(test_peptides)))

#Split the runs into train/test sets
#Modified code for replicate samples: columns 2*i and 2*i+1 are kept together,
#so the split is done over column pairs
if q_df.shape[1] % 2 != 0:
    raise ValueError("Expected an even number of run columns (replicate pairs), got %d" % q_df.shape[1])

train_runs, test_runs = train_test_split(np.arange(q_df.shape[1] // 2), 
                                   test_size=0.2, random_state=12345)

train_runs = np.array([[2*i, 2*i+1] for i in train_runs]).astype(int).ravel()
test_runs = np.array([[2*i, 2*i+1] for i in test_runs]).astype(int).ravel()

print("No of train runs ", len(train_runs))
print("No of test runs ", len(test_runs))
print("Train runs ", train_runs)
print("Train runs ", q_df.columns[train_runs])
print("Test runs ", test_runs)
print("Test runs ", q_df.columns[test_runs])


In [None]:
#Build the training and test tables
#Rows: peptides of the train/test proteins
#Columns: every column before the last n_runs columns, followed by the train/test run columns
#(run positions are offsets into the last n_runs columns, hence the shape[1] - n_runs shift)
data_df_train = pd.concat(
    [
        data_df.iloc[train_peptides, :-n_runs],
        data_df.iloc[train_peptides, data_df.shape[1] - n_runs + train_runs],
    ],
    axis = 1,
)
data_df_test = pd.concat(
    [
        data_df.iloc[test_peptides, :-n_runs],
        data_df.iloc[test_peptides, data_df.shape[1] - n_runs + test_runs],
    ],
    axis = 1,
)

#Display the test table as the cell output
data_df_test

In [None]:
#Define function to create scatter plots
#Note that this scatter plot is for peptide ratios
def _rowMeansIgnoringNaN(cv_matrix):
    """Return the mean of each row of a 2-D array, skipping NaN entries (NaN for all-NaN rows)."""
    cv_matrix = np.asarray(cv_matrix, dtype = float)
    present = ~np.isnan(cv_matrix)
    totals = np.where(present, cv_matrix, 0.0).sum(axis = 1)
    #0 / 0 for rows without any value intentionally yields NaN
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        return totals / present.sum(axis = 1)


def _plotCVScatter(raw_scores, adj_scores, selected):
    """Draw one raw-vs-adjusted CV scatter plot for the proteins flagged in ``selected``."""
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 20)

    #Proteins with equal (or NaN) scores fall in neither group and are not drawn
    lower_after_adjustment = selected & (raw_scores > adj_scores)
    higher_after_adjustment = selected & (raw_scores < adj_scores)

    #One scatter call per colour group instead of one call per protein
    ax.scatter(raw_scores[lower_after_adjustment],
               adj_scores[lower_after_adjustment],
               s = 500, alpha = 0.5, color = "#8854d0")
    ax.scatter(raw_scores[higher_after_adjustment],
               adj_scores[higher_after_adjustment],
               s = 500, alpha = 0.5, color = '#eb4d4b')

    print("Count raw ", int(lower_after_adjustment.sum()))
    print("Count adj ", int(higher_after_adjustment.sum()))

    #Diagonal reference line: points below it have a lower CV after adjustment
    ax.plot([0.0, 2.0], [0.0, 2.0], '--', lw = 5, color = 'Black', zorder = -1)
    ax.set_xlim([0.0, 2.0])
    ax.set_ylim([0.0, 2.0])

    ax.set_xticks([0.0, 0.5, 1.0, 1.5, 2.0])
    ax.set_yticks([0.0, 0.5, 1.0, 1.5, 2.0])

    ax.set_xlabel('Coefficient of variation for observed abundances')
    ax.set_ylabel('Coefficient of variation for adjusted abundances')
    ax.grid()
    plt.show()


def createPlots(scores, n_bins = 100):
    """Plot mean raw CV against mean adjusted CV, one point per protein.

    Two figures are shown: one for all proteins and one restricted to proteins
    with more than 5 peptides. For each figure the number of proteins whose raw
    CV is larger ("Count raw") and whose adjusted CV is larger ("Count adj") is
    printed.

    Parameters
    ----------
    scores : sequence
        ``[raw_CVs, adj_CVs, protein_sizes]``: two 2-D arrays with one row per
        protein, and the number of peptides of each protein. NaN entries are
        ignored when averaging each row.
    n_bins : int, optional
        Unused; kept so existing callers keep working.

    Notes
    -----
    The font sizes are set through ``plt.rc`` and therefore stay in effect for
    figures created after this call.
    """
    SMALL_SIZE = 40
    MEDIUM_SIZE = 50
    BIGGER_SIZE = 70

    #Set the font sizes BEFORE any figure is created, so that the first figure
    #is built with them as well
    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    #Average the per-run CVs of every protein
    raw_scores = _rowMeansIgnoringNaN(scores[0])
    adj_scores = _rowMeansIgnoringNaN(scores[1])
    protein_sizes = np.asarray(scores[2])

    print("Creating a plot for all proteins...")
    _plotCVScatter(raw_scores, adj_scores, np.ones(raw_scores.shape[0], dtype = bool))

    #The filter is strictly "more than 5"; the message now says so
    print("Creating a plot for proteins with more than 5 peptides...")
    _plotCVScatter(raw_scores, adj_scores, protein_sizes > 5)

In [None]:
#Load the inferred peptide coefficients; the first column of the file becomes the index
coeff_df = pd.read_csv(
    'trained_models/2019_guo_nci60/2019_guo_nci60_inferred_coefficients.tsv',
    sep = '\t',
    index_col = 0,
)
print("Coefficients ", coeff_df.shape)

#Keep only the magnitude of every coefficient
coeff_df = coeff_df.abs()

#Cell output: coefficients ordered by column '0' (display only, coeff_df itself is not reordered)
coeff_df.sort_values(by = '0')

In [None]:
#Create test plots
#Coefficients are reordered to the rows of the test table, because recordCV pairs the two tables by row position
results_test = recordCV(
    data_df = data_df_test,
    coeff_df = coeff_df.loc[data_df_test.index],
)
createPlots(scores = results_test)