# Investigate the correlation between Pepper coefficients and the physicochemical properties of peptides

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Peptide quantification table; its first two columns are reused below as the
# 'Sequence' and 'Protein' annotations of every coefficient.
sequence_df = pd.read_csv(
    '../preprocess_datasets/preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv',
    sep='\t',
    index_col=0,
)

# Inferred peptide coefficients. The index is overwritten with the peptide
# table's index so that both frames line up when concatenated column-wise.
# NOTE(review): this assumes both files list the peptides in the same order.
coeff_df = pd.read_csv(
    '../trained_models/2019_guo_nci60/2019_guo_nci60_inferred_coefficients.tsv',
    sep='\t',
    index_col=0,
)
coeff_df.index = sequence_df.index
coeff_df = pd.concat([sequence_df.iloc[:, :2], coeff_df], axis=1)
coeff_df.columns = ['Sequence', 'Protein', 'Coefficient']

# Show the table ordered by coefficient (coeff_df itself stays unsorted)
coeff_df.sort_values(by='Coefficient')

## 1) Plot distribution of coefficients

In [None]:
# Histogram of the raw peptide coefficients (200 bins).

# Font sizes for the figure; set before the figure is created because rc
# settings only affect artists created afterwards.
SMALL_SIZE = 50
MEDIUM_SIZE = 60
BIGGER_SIZE = 70

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots(figsize=(40, 20))

# hist=True with kde=False draws raw bin counts, not a normalised density.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot is its successor).
sns.distplot(coeff_df['Coefficient'].values, bins=200,
             hist=True, kde=False, label='Peptide coefficients', ax=ax)

# The on/off flag is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, so `b=True` fails on
# current releases while the positional form works on old and new versions.
ax.grid(True, which='both')

ax.set_xlabel('Peptide coefficients')
ax.set_ylabel('Count')  # the bars are counts, see the distplot note above
ax.set_title('Distribution of peptide coefficients')



In [None]:
# Histogram of the log2-transformed peptide coefficients (200 bins).

# Font sizes for the figure; set before the figure is created because rc
# settings only affect artists created afterwards.
SMALL_SIZE = 50
MEDIUM_SIZE = 60
BIGGER_SIZE = 70

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots(figsize=(40, 20))

# log2 yields -inf / NaN for coefficients <= 0, and a histogram cannot be
# binned over a non-finite range, so such entries are left out of the plot.
# NOTE(review): coefficients are presumably all positive, in which case this
# filter removes nothing - confirm.
log_coefficients = np.log2(coeff_df['Coefficient'].values)
log_coefficients = log_coefficients[np.isfinite(log_coefficients)]

# hist=True with kde=False draws raw bin counts, not a normalised density.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot is its successor).
sns.distplot(log_coefficients, bins=200,
             hist=True, kde=False, label='Peptide coefficients (log2)', ax=ax)

# The on/off flag is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, so `b=True` fails on
# current releases while the positional form works on old and new versions.
ax.grid(True, which='both')

ax.set_xlabel('Peptide coefficients (log scaled)')
ax.set_ylabel('Count')  # the bars are counts, see the distplot note above
ax.set_title('Distribution of peptide coefficients')




## 2) Plot coefficient vs peptide length

In [None]:
# Number of characters in every peptide sequence, kept as a one-column frame
# that shares coeff_df's index so it can be appended as a new column.
sequence_lengths = pd.DataFrame(
    {'Sequence Length': [len(peptide) for peptide in coeff_df['Sequence'].values]},
    index=coeff_df.index,
)

# Append the length column and display the extended table
coeff_df = pd.concat([coeff_df, sequence_lengths], axis=1)
coeff_df

In [None]:
# Median peptide coefficient per sequence length, with standard-error bars.

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

SMALL_SIZE = 60
MEDIUM_SIZE = 80
BIGGER_SIZE = 90

# rc settings only affect artists created afterwards, so they are applied
# before the figure and its axes are created.
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots()
fig.set_size_inches(30, 20)

# Eliminate outliers: keep peptides of at most 45 residues.
# NOTE: this narrows coeff_df itself, so the following cells also work on the
# filtered table.
coeff_df = coeff_df[coeff_df['Sequence Length'] <= 45]

# Aggregate only the numeric 'Coefficient' column. Taking the median of the
# whole frame would also touch the string columns ('Sequence', 'Protein'),
# which raises a TypeError on pandas >= 2.0.
coefficients_by_length = coeff_df.groupby('Sequence Length')['Coefficient']
median_by_length = coefficients_by_length.median()
all_scores = median_by_length.values

# Standard error of the mean per length. Where it is undefined (NaN, e.g. a
# length represented by a single peptide) a zero-length error bar is drawn.
error_scores = coefficients_by_length.apply(lambda x: stats.sem(x))
error_scores[np.isnan(error_scores)] = 0
error_scores = error_scores.astype(float)

ax.scatter(median_by_length.index, all_scores,
           s=500, lw=10, color='#eb4d4b', alpha=0.5)

ax.errorbar(median_by_length.index, all_scores, list(error_scores),
            lw=5, linestyle='None', marker='^', color='#eb4d4b', alpha=0.8)

ax.set_xlabel('Sequence length')
ax.set_ylabel('Median peptide coefficient')
ax.grid()


## 3) Plot coefficient vs peptide hydrophobicity

In [None]:
# Amino-acid score table: tab-separated, first column used as the row index.
aa_df = pd.read_csv(
    'AAindex_aminoacid_scores.tsv',
    index_col=0,
    sep='\t',
)
aa_df

In [None]:
# Per-peptide property scores: for every peptide, add up the score vector of
# each of its residues (total) and divide that sum by the peptide length (mean).
#
# All rows are accumulated in one preallocated matrix and wrapped in a
# DataFrame once, instead of building and concatenating a one-row DataFrame
# per peptide. The resulting values, index and columns are unchanged.
peptide_sequences = coeff_df['Sequence'].values

# Score vector of each residue letter (one value per row of aa_df), looked up
# on first use; an unknown letter still raises KeyError as before.
residue_score_cache = {}
total_score_matrix = np.zeros((len(peptide_sequences), aa_df.shape[0]))
for row, sequence in enumerate(peptide_sequences):
    for aa in sequence:
        if aa not in residue_score_cache:
            residue_score_cache[aa] = aa_df[aa].values
        total_score_matrix[row] += residue_score_cache[aa]

# Divide every row by its own peptide length (an empty sequence yields NaN)
peptide_lengths = np.array([len(sequence) for sequence in peptide_sequences],
                           dtype=float)
mean_score_matrix = total_score_matrix / peptide_lengths[:, np.newaxis]

# One row per peptide (coeff_df's index), one column per row of aa_df
total_scores_for_sequences = pd.DataFrame(total_score_matrix,
                                          index=coeff_df.index,
                                          columns=aa_df.index)
mean_scores_for_sequences = pd.DataFrame(mean_score_matrix,
                                         index=coeff_df.index,
                                         columns=aa_df.index)

total_scores_for_sequences

In [None]:
# Median peptide coefficient per hydrophobicity bin, with standard-error bars.

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

SMALL_SIZE = 60
MEDIUM_SIZE = 80
BIGGER_SIZE = 90

# rc settings only affect artists created afterwards, so they are applied
# before the figure and its axes are created.
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots()
fig.set_size_inches(30, 20)

# Summed 'PRAM900101' score of every peptide; this is the quantity shown on
# the x-axis as peptide hydrophobicity.
hydrophobicity_scores = total_scores_for_sequences['PRAM900101'].values
coefficients = coeff_df['Coefficient'].values

# Group the coefficients into half-open hydrophobicity bins (low, high]
bins = np.arange(-250, 550, 50)
binned_coefficients = [
    coefficients[(hydrophobicity_scores > low) & (hydrophobicity_scores <= high)]
    for low, high in zip(bins[:-1], bins[1:])
]

for position, bin_coefficients in enumerate(binned_coefficients):
    if bin_coefficients.size == 0:
        # Nothing to draw for an empty bin (median and SEM would be NaN)
        continue
    bin_median = np.median(bin_coefficients)

    ax.scatter(position, bin_median,
               s=1000, lw=10, color='#22a6b3', alpha=0.5)

    ax.errorbar(position, bin_median,
                stats.sem(bin_coefficients),
                lw=5, linestyle='None', marker='^', color='#22a6b3', alpha=0.8)

ax.grid()
ax.set_xlabel('Peptide hydrophobicity')
ax.set_ylabel('Median peptide coefficient')
# One tick per bin, labelled with the bin's upper edge. Deriving positions and
# labels from `bins` keeps them in step with the plotted points, so the last
# bin is labelled as well.
plt.xticks(np.arange(len(binned_coefficients)), bins[1:], rotation=90)
plt.show()



## 4) Plot top correlated features

In [None]:
from scipy.stats import pearsonr

# Absolute Pearson correlation between every summed property score and the
# peptide coefficients, collected together with a short display name.
coefficients = coeff_df['Coefficient'].values
feature_correlations, feature_names = [], []

for f in range(total_scores_for_sequences.shape[1]):
    aa_scores = total_scores_for_sequences.iloc[:, f].values

    # Properties with a missing score for any peptide are left out
    if np.isnan(aa_scores).any():
        continue

    corr, _ = pearsonr(aa_scores, coefficients)
    abs_corr = np.abs(corr)
    feature_correlations.append(abs_corr)

    # Display name: the text of the 'Name' entry before its first '('
    # (the whole entry when it contains no parenthesis)
    feature_names.append(aa_df['Name'].values[f].partition('(')[0])
    print("Correlation ", abs_corr)

In [None]:
# Wrap the list of correlations in a single-column frame (column label 0)
# whose rows are labelled with the property display names, then show it.
feature_correlations = pd.DataFrame(
    data=feature_correlations,
    index=feature_names,
)
feature_correlations

In [None]:
# Heatmap of the 25 properties most strongly correlated with the coefficients.

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

SMALL_SIZE = 30
MEDIUM_SIZE = 40
BIGGER_SIZE = 50

# rc settings only affect artists created afterwards, so they are applied
# before the figure and its axes are created.
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

fig, ax = plt.subplots()
fig.set_size_inches(5, 30)

# Rank rows by absolute correlation, strongest first, and keep the top 25.
# numpy's argsort orders NaN last, so an undefined correlation cannot displace
# a real one. Series.argsort() instead marks NaN rows with -1, which .iloc
# would read as "last row" and thereby scramble the ranking.
abs_correlations = feature_correlations[0].abs().values
strongest_first = np.argsort(-abs_correlations, kind='stable')
scores = feature_correlations.iloc[strongest_first[:25]]

# White-to-red colour ramp
cmap = LinearSegmentedColormap.from_list('mycmap', ['#ffffff', '#eb4d4b'])

sns.heatmap(scores, cmap=cmap, annot=True,
            linecolor='#ffffff', linewidth=1,
            vmin=0.1, vmax=0.3, ax=ax)

# Ends the cell on a statement that returns None, so no object repr is echoed
print()