In [None]:
import numpy as np
import pandas as pd

import random
import copy
import re
import argparse, sys

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, Dense, Input, Dropout, Conv2D, BatchNormalization, Activation, Flatten, Concatenate, MaxPooling2D
from keras.wrappers.scikit_learn import KerasRegressor
import keras.backend as K
from keras.layers import Lambda
from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.callbacks import Callback
from tensorflow.keras import initializers
from tensorflow.keras.constraints import max_norm

from scipy import stats
import scipy

import os
import random

In [None]:
#Seed every random number generator used below so repeated runs are comparable.
#Note: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
#affects processes launched afterwards, not the hashing of the running kernel.
seed_value = 12345
os.environ['PYTHONHASHSEED'] = str(seed_value)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed_value)

print("Tensorflow version: ", tf.__version__)
print("Keras version: ", keras.__version__)

#Define hyperparameters (all of them are read by define_model)
seq_length = 60      #first dimension of the 'sequence' input
n_filters = 10       #number of Conv2D filters
filter_size = 3      #Conv2D kernel height; the kernel width is fixed to 20
n_layers = 4         #number of Dense + Dropout blocks after the convolution
n_nodes = 40         #units in each of those Dense layers
dropout_rate = 0.25  #rate used by every Dropout layer

#Define Pepper model
class CustomLossLayer(layers.Layer):
    """Pass-through layer that owns the trainable alphas and registers the peptide loss.

    The layer returns ``y_pred`` unchanged from ``call``; its only effects are
    holding the ``alphas`` variable and adding ``peptide_loss`` to the model
    losses via ``add_loss``.

    Args:
        n_proteins_layer: number of rows of the alpha matrix.
        n_runs_layer: number of columns of the alpha matrix.
        **kwargs: forwarded to ``layers.Layer.__init__`` (e.g. ``name``).
    """

    def __init__(self, n_proteins_layer, n_runs_layer, **kwargs):
        # Forward the standard Layer keyword arguments; they were previously
        # accepted in the signature but silently dropped.
        super(CustomLossLayer, self).__init__(**kwargs)
        self.n_proteins_layer = n_proteins_layer
        self.n_runs_layer = n_runs_layer

        #Define alpha variables that are trainable
        if n_proteins_layer == 1584 and n_runs_layer == 1:
            # NOTE(review): alpha_initial_train is not defined in this notebook as
            # far as this block shows; this branch raises NameError unless a
            # global of that name exists when the layer is built.
            self.alphas = tf.Variable(alpha_initial_train,
                                      trainable = True,
                                      dtype = 'float32')
        else:
            # Size the matrix from the constructor arguments instead of the
            # previously hard-coded (1527, 96), so the layer matches whatever
            # dimensions the model was defined with.
            init_values = np.random.rand(n_proteins_layer,
                                         n_runs_layer)
            self.alphas = tf.Variable(init_values,
                                      trainable = True,
                                      dtype = 'float32')

    def get_vars(self):
        """Return the trainable alpha variable."""
        return self.alphas

    def peptide_loss(self, y_true, y_pred):
        """Mean squared error between intensities and coefficient * |alpha|.

        Args:
            y_true: 2-D tensor; every column except the last holds intensities,
                the last column holds the row index into ``alphas`` for each
                peptide (cast to int32 here).
            y_pred: predicted coefficients, one per peptide; flattened to 1-D.

        Returns:
            Scalar tensor: sum of squared differences over entries where the
            intensity and the coefficient are both non-zero, divided by the
            number of such entries. If no entry qualifies the division is 0 / 0.
        """
        #Define all inputs
        c_pred = K.abs(y_pred)
        # Flatten to 1-D so that the expand_dims below yields one column per
        # peptide that broadcasts against the gathered alpha rows.
        c_pred = tf.reshape(c_pred,[-1])
        q_input = y_true[:, :-1] #dimension (batch_size, K)
        label_input = y_true[:, -1] #dimension (batch_size,)
        label_input = tf.cast(label_input, tf.int32)

        #Exclude missing intensities (encoded as 0) from the loss
        zero_peptides = K.not_equal(q_input, K.constant(0))
        zero_peptides = K.cast(zero_peptides, K.floatx())

        #Exclude peptides with 0 coefficients from the loss
        zero_coeffs = K.not_equal(c_pred, K.constant(0))
        zero_coeffs = K.cast(zero_coeffs, K.floatx())
        zero_coeffs = tf.expand_dims(zero_coeffs, 1)

        #Find the corresponding alpha row for each peptide
        corresponding_protein_abundances = tf.gather(K.abs(self.alphas), label_input, axis = 0)

        #Calculate adjusted abundances
        c_pred = tf.expand_dims(c_pred, 1)
        adjusted_abundances = c_pred * corresponding_protein_abundances

        #Calculate the masked squared differences
        differences = q_input - adjusted_abundances
        differences = differences * zero_peptides * zero_coeffs
        differences = K.square(differences)

        #Return the mean loss over the entries that were not masked out
        total_loss = K.sum(differences)
        all_runs = K.sum(zero_peptides * zero_coeffs)
        return total_loss / all_runs

    #We add the loss to the final model loss
    def call(self, y_true, y_pred):
        """Register the peptide loss on the layer and return ``y_pred`` unchanged."""
        self.add_loss(self.peptide_loss(y_true, y_pred))
        return y_pred

#Define alpha-based peptide loss function
def peptide_loss(x, c_pred, alphas):
    """Alpha-based peptide loss computed outside of the model.

    Args:
        x: 2-D tensor; all columns but the last are experimental intensities,
            the last column is the row index into ``alphas`` for each peptide.
        c_pred: predicted peptide coefficients; the absolute value is used.
        alphas: abundance matrix; the absolute value is used.

    Returns:
        Scalar tensor with the squared difference between intensities and
        ``|c_pred| * |alphas[label]|``, averaged over the entries whose
        intensity and coefficient are both non-zero.
    """
    coefficients = K.abs(c_pred)

    #Split the packed input into intensities and protein labels
    intensities = x[:, :-1] #dimension (batch_size, K)
    protein_labels = tf.cast(x[:, -1], tf.int32)

    #Flatten so that each peptide has exactly one coefficient
    coefficients = tf.reshape(coefficients, [-1])

    #Mask that drops missing (zero) intensities
    observed_mask = K.cast(K.not_equal(intensities, K.constant(0)), K.floatx())

    #Mask that drops peptides whose coefficient is zero
    coefficient_mask = K.cast(K.not_equal(coefficients, K.constant(0)), K.floatx())
    coefficient_mask = tf.expand_dims(coefficient_mask, 1)

    #Look up the alpha row of every peptide
    protein_abundances = tf.gather(K.abs(alphas), protein_labels, axis = 0)

    #Masked squared residuals
    predicted_intensities = tf.expand_dims(coefficients, 1) * protein_abundances
    residuals = intensities - predicted_intensities
    residuals = residuals * observed_mask * coefficient_mask
    squared_residuals = K.square(residuals)

    #Average over the entries that survived both masks
    n_valid_entries = K.sum(observed_mask * coefficient_mask)
    return K.sum(squared_residuals) / n_valid_entries

#Define model
def define_model(n_proteins_layer, n_runs_layer):
    """Build, summarise and compile the coefficient predictor.

    The network takes three inputs ('sequence', 'charge' and 'y_true'), runs the
    sequence through a convolution / pooling / dropout stack, concatenates the
    charge input, applies ``n_layers`` Dense + Dropout blocks and ends in a
    single non-negative output. The output is routed through CustomLossLayer,
    which registers the training loss, so ``compile`` is called without a loss.

    Args:
        n_proteins_layer: forwarded to CustomLossLayer.
        n_runs_layer: forwarded to CustomLossLayer; the 'y_true' input has
            ``n_runs_layer + 1`` columns.

    Returns:
        The compiled Keras model.
    """

    #Custom output activation: absolute value
    def absActivation(x):
        return K.abs(x)

    #Model inputs
    sequence_input = Input(shape=(seq_length, 20, 1), name = 'sequence')
    charge_input = Input(shape=(6,), name = 'charge')
    label_input = Input(shape=(n_runs_layer + 1,), name = 'y_true')

    #Convolutional feature extractor over the sequence input
    conv_features = Conv2D(n_filters, kernel_size=(filter_size, 20), activation="relu", input_shape=(seq_length, 20, 1))(sequence_input)
    conv_features = MaxPooling2D(pool_size=(2, 1))(conv_features)
    flat_features = Flatten()(conv_features)
    flat_features = Dropout(dropout_rate)(flat_features)

    #Append the charge input to the sequence features
    hidden = Concatenate()([flat_features, charge_input])

    #Fully connected stack
    for _ in range(n_layers):
        hidden = Dense(n_nodes, activation="relu")(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    #Single output: the predicted peptide coefficient
    coefficient_output = Dense(1, activation=absActivation)(hidden)

    #Route the prediction through the loss layer, which returns it unchanged
    loss_layer_output = CustomLossLayer(n_proteins_layer, n_runs_layer)(label_input, coefficient_output)
    model = Model(inputs = [sequence_input, charge_input, label_input], outputs = loss_layer_output)
    model.summary()

    #Compile with gradient-norm clipping; the loss comes from the custom layer
    adam = tf.optimizers.Adam(learning_rate = 1e-3, clipnorm = 1)
    model.compile(optimizer=adam)

    return model


#Rebuild the architecture with define_model(n_proteins_layer, n_runs_layer)
#and restore the trained weights from disk
pretrained_weights_path = '../trained_models/2019_guo_nci60/2019_guo_nci60_Coefficient_Predictor_Model.h5'
pretrained_model = define_model(1527, 96)
pretrained_model.load_weights(pretrained_weights_path)

    

# 1) Generate sequences with substitutions

In [None]:
#Read the peptide sequences from dataset 1
sequence_df = pd.read_csv('../preprocess_datasets/preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv',
                          sep = '\t', index_col = 0)

#Remove the (UniMod:4) and (UniMod:35) modification annotations in a single pass
peptide_sequences = [
    s.replace('(UniMod:4)', '').replace('(UniMod:35)', '')
    for s in sequence_df['Peptide'].values
]

#One column per charge state, as read from the six 'Charge N' columns
charges = sequence_df[['Charge 1', 'Charge 2', 'Charge 3', 'Charge 4', 'Charge 5', 'Charge 6']].values
print("Number of sequences ", len(peptide_sequences))

#Preview only the first few sequences; displaying the whole list floods the notebook output
peptide_sequences[:5]

In [None]:
# NOTE(review): generateSequences is neither defined nor imported in the visible
# part of this notebook, so this cell fails on a fresh kernel unless the function
# comes from elsewhere - confirm and add the definition/import.
# Later cells index the result as results[0] (a table with 'Start Position' and
# 'End Position' columns), results[1] (one-hot encoded pairs) and results[2]
# (charges of the pairs).
results = generateSequences(peptide_sequences, charges, sample_size = 5000)
results[0]

In [None]:
#Encode simulated sequences
all_sequence_pairs = results[0]
all_sequence_pairs_onehot_encoded = results[1]
all_sequence_pairs_charges = np.array(results[2])

#Name the dimensions instead of repeating 60 / 20 / 97 so that this cell stays
#consistent with the input shapes declared in define_model
n_amino_acids = 20            #width of the 'sequence' input
n_label_columns = 96 + 1      #n_runs_layer + 1 for define_model(1527, 96)
flat_sequence_length = seq_length * n_amino_acids

#The first flat_sequence_length entries of each row are taken as the first
#peptide of the pair (presumably the two encodings are concatenated - confirm
#against generateSequences)
pair1_sequences = np.array(all_sequence_pairs_onehot_encoded)[:, :flat_sequence_length]
pair1_sequences = pair1_sequences.reshape((pair1_sequences.shape[0], seq_length, n_amino_acids))
pair1_sequences = np.expand_dims(pair1_sequences, axis=3)

pair1_charges = all_sequence_pairs_charges[:, 0]

#The 'y_true' input only feeds the loss layer, which returns the prediction
#unchanged, so a constant placeholder is passed at inference time
pair1_dummy_labels = np.ones((pair1_sequences.shape[0], n_label_columns))
pair1_coefficients = pretrained_model.predict([pair1_sequences, pair1_charges,
                                               pair1_dummy_labels]).ravel()
pair1_coefficients = np.abs(pair1_coefficients)
pair1_coefficients = pd.DataFrame(pair1_coefficients)
print("Pair 1 coefficients ", pair1_coefficients)



In [None]:
#Encode simulated sequences
#Name the dimensions instead of repeating 60 / 20 / 97 so that this cell stays
#consistent with the input shapes declared in define_model
n_amino_acids = 20            #width of the 'sequence' input
n_label_columns = 96 + 1      #n_runs_layer + 1 for define_model(1527, 96)
flat_sequence_length = seq_length * n_amino_acids

#The last flat_sequence_length entries of each row are taken as the second
#peptide of the pair (presumably the two encodings are concatenated - confirm
#against generateSequences)
pair2_sequences = np.array(all_sequence_pairs_onehot_encoded)[:, -flat_sequence_length:]
pair2_sequences = pair2_sequences.reshape((pair2_sequences.shape[0], seq_length, n_amino_acids))
pair2_sequences = np.expand_dims(pair2_sequences, axis=3)

pair2_charges = all_sequence_pairs_charges[:, 1]

#The 'y_true' input only feeds the loss layer, which returns the prediction
#unchanged, so a constant placeholder is passed at inference time
pair2_dummy_labels = np.ones((pair2_sequences.shape[0], n_label_columns))
pair2_coefficients = pretrained_model.predict([pair2_sequences, pair2_charges,
                                               pair2_dummy_labels]).ravel()
pair2_coefficients = np.abs(pair2_coefficients)
pair2_coefficients = pd.DataFrame(pair2_coefficients)
print("Pair 2 coefficients ", pair2_coefficients)


In [None]:
#Absolute difference between the predicted coefficients of the two peptides of each pair
pairwise_gap = pair1_coefficients.values.ravel() - pair2_coefficients.values.ravel()
coeff_differences = pd.DataFrame(
    pairwise_gap,
    index = all_sequence_pairs.index,
    columns = ['Coefficient Difference'],
).abs()

#Copy the position columns over from the generated pairs table
for position_column in ('Start Position', 'End Position'):
    coeff_differences[position_column] = results[0][position_column].values

coeff_differences

# 2) Create substitution plots

In [None]:
#Plot the mean absolute coefficient difference per start position,
#with the standard deviation (np.std, ddof=0) as error bars

SMALL_SIZE = 60
MEDIUM_SIZE = 80
BIGGER_SIZE = 90

#rc settings are read when the axes and their labels are created, so they must
#be set BEFORE plt.subplots(); otherwise the first figure drawn on a fresh
#kernel still uses the default tick and label font sizes
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots()
fig.set_size_inches(35, 20)

#Aggregate once per start position
start_groups = coeff_differences.groupby('Start Position')['Coefficient Difference']
all_scores = start_groups.apply(lambda x: np.mean(x.abs()))

error_scores = start_groups.apply(lambda x: np.std(x.abs()))
error_scores = error_scores.fillna(0).astype(float)

#all_scores is indexed by the group keys, so its index is the x axis
ax.scatter(all_scores.index,
           all_scores,
           s = 500, lw = 10, color='#eb4d4b', alpha = 0.5)

ax.errorbar(all_scores.index, all_scores, list(error_scores),
            lw = 5, linestyle='None', marker='^', color = '#eb4d4b', alpha = 0.8)

ax.set_xlabel('N-terminus position')
#The plotted quantity is the mean of |coefficient difference|, not a coefficient
ax.set_ylabel('Mean absolute coefficient difference')
ax.grid()




In [None]:
#Plot the mean absolute coefficient difference per end position,
#with the standard deviation (np.std, ddof=0) as error bars

SMALL_SIZE = 60
MEDIUM_SIZE = 80
BIGGER_SIZE = 90

#rc settings are read when the axes and their labels are created, so they must
#be set BEFORE plt.subplots() to take effect on this figure
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots()
fig.set_size_inches(35, 20)

#Aggregate once per end position
end_groups = coeff_differences.groupby('End Position')['Coefficient Difference']
all_scores = end_groups.apply(lambda x: np.mean(x.abs()))

error_scores = end_groups.apply(lambda x: np.std(x.abs()))
error_scores = error_scores.fillna(0).astype(float)

#End positions are negated and reversed so they are drawn as negative offsets
negated_positions = -1 * all_scores.index[::-1]
reversed_scores = all_scores.iloc[::-1]
reversed_errors = list(error_scores.iloc[::-1])

ax.scatter(negated_positions,
           reversed_scores,
           s = 500, lw = 10, color='#eb4d4b', alpha = 0.5)

ax.errorbar(negated_positions, reversed_scores, reversed_errors,
            lw = 5, linestyle='None', marker='^', color = '#eb4d4b', alpha = 0.8)

#NOTE(review): this plot groups by 'End Position'; the previous 'N-terminus position'
#label appeared to be copied from the start-position plot - confirm the wording
ax.set_xlabel('C-terminus position')
#The plotted quantity is the mean of |coefficient difference|, not a coefficient
ax.set_ylabel('Mean absolute coefficient difference')
ax.grid()



