## Preprocessing pipeline
### 1. We identify all peptides that occur in more than one protein, and we eliminate these from the matrix. 
### 2. We identify all peptides that occur in both modified and unmodified forms, and we eliminate these from the matrix.
### 3. We eliminate all pairs of peptides that overlap one another due to missed cleavages.
### 4. Among the remaining peptides, we identify and remove singletons, i.e., peptides with no siblings.
### 5. Finally, we one-hot encode the peptide sequences.

In [None]:
import numpy as np
import pandas as pd

In [None]:
#Load the peptide-level quantification table (tab-separated)
peptide_filename = '~/../../data/ms-quant/2019_guo_nci60/proteome/processed_peptide_quants.tsv'
peptide_df = pd.read_csv(peptide_filename, sep = '\t')

#Report how many rows (peptides) and distinct protein accessions were loaded
unique_proteins = np.unique(peptide_df['protein accession number'].values)
print("Total number of peptides: ", len(peptide_df))
print("Total number of proteins: ", len(unique_proteins))
peptide_df

## 1) Eliminate shared peptides

In [None]:
#Find the shared peptides, i.e. rows whose protein field contains a ';' or a space
#(presumably a list of several accessions -- confirm against the input format)
protein_ids = peptide_df['protein accession number']
is_shared = protein_ids.str.contains(r'[; ]', na = False)

for protein in protein_ids[is_shared]:
    print(protein)

#Actually drop the shared peptides, as pipeline step 1 describes;
#this is a no-op when nothing was printed above
peptide_df = peptide_df[~is_shared]
print("Total number of remaining peptides: ", peptide_df.shape[0])

## 2) Eliminate peptides that occur in both modified and unmodified forms

In [None]:
#Find the peptide sequences that carry a modification

#Drop everything up to and including the first '_' of every transition group id
#and store the trimmed ids back in the dataframe
trimmed_ids = [
    group_id[group_id.index('_') + 1:]
    for group_id in peptide_df['transition group id'].values
]
peptide_df['transition group id'] = trimmed_ids

#Rewrite 'C(UniMod:4)' as plain 'C' so that this modification is not
#picked up by the 'UniMod' search below
peptide_sequences = [group_id.replace('C(UniMod:4)', 'C') for group_id in trimmed_ids]

#Whatever still mentions 'UniMod' carries some other modification
peptide_sequences_modified = [
    sequence for sequence in peptide_sequences if 'UniMod' in sequence
]

print("Total number of peptides with PTM: ", len(peptide_sequences_modified))
peptide_sequences_modified

In [None]:
#Find the unmodified counterpart of every modified sequence
from collections import Counter

#Count every sequence once up front so that each modified sequence needs a
#single lookup instead of a scan over the full sequence list
sequence_counts = Counter(peptide_sequences)

peptide_sequences_to_eliminate = []

for seq_mod in peptide_sequences_modified:
    print("Modified: ", seq_mod)

    #The counterpart is the same id with 'M(UniMod:35)' rewritten to 'M'
    #NOTE(review): a modified id without 'M(UniMod:35)' is its own counterpart
    #and is therefore always eliminated -- confirm this is intended
    seq = seq_mod.replace('M(UniMod:35)', 'M')

    #One report/elimination per occurrence of the counterpart in the list
    for _ in range(sequence_counts[seq]):
        print("Corresponding: ", seq)
        #Eliminate a peptide if both modified and unmodified forms exist
        peptide_sequences_to_eliminate.append(seq)
        peptide_sequences_to_eliminate.append(seq_mod)
        print("Eliminate: ", seq, " ", seq_mod)

print("Total number of peptides to eliminate: ", len(peptide_sequences_to_eliminate))

In [None]:
#Eliminate the sequences
#The elimination list was built from ids in which 'C(UniMod:4)' had been
#rewritten to 'C', whereas the 'transition group id' column still contains
#'C(UniMod:4)'. Compare against an equally normalised view of the column,
#otherwise cysteine-containing peptides never match and are silently kept.
normalized_ids = peptide_df['transition group id'].str.replace('C(UniMod:4)', 'C', regex = False)
peptide_df = peptide_df[~normalized_ids.isin(peptide_sequences_to_eliminate)]

print("Total number of remaining peptides: ", peptide_df.shape[0])
peptide_df

## 3) Eliminate missed cleavages

In [None]:
#Strip the 'C(UniMod:4)' and 'M(UniMod:35)' annotations from every id
peptide_sequences = [
    group_id.replace('C(UniMod:4)', 'C').replace('M(UniMod:35)', 'M')
    for group_id in peptide_df['transition group id'].values
]

peptide_df['transition group id'] = peptide_sequences

#Find the peptide sequences with missed cleavage: a 'K' or 'R' anywhere
#except in the last three characters of the id
cleavage_residues = ('K', 'R')
peptide_sequences_mc_indices = [
    position
    for position, sequence in enumerate(peptide_sequences)
    if any(residue in sequence[:-3] for residue in cleavage_residues)
]
print("Total number of peptides with missed cleavage: ", len(peptide_sequences_mc_indices))

peptide_sequences_mc = np.array(peptide_sequences)[peptide_sequences_mc_indices]
peptide_sequences_mc

In [None]:
#Find the corresponding sequences for the missed cleavages

#Collect every peptide that overlaps a missed-cleavage peptide
peptide_sequences_to_eliminate = []

for mc_peptide in peptide_sequences_mc:
    print("\nPeptide: ", mc_peptide)

    #A peptide matches when its id, minus the last two characters,
    #occurs as a substring of the missed-cleavage id
    all_sub_peptides = [
        candidate for candidate in peptide_sequences if candidate[:-2] in mc_peptide
    ]
    for sub_peptide in all_sub_peptides:
        print("Sub: ", sub_peptide)

    #Only matches whose id length differs from the missed-cleavage id count as overlaps
    has_overlap = any(len(sub_peptide) != len(mc_peptide) for sub_peptide in all_sub_peptides)

    if has_overlap:
        peptide_sequences_to_eliminate.extend(all_sub_peptides)
        print("Eliminate all")
    else:
        print("No need to eliminate")

print("Total number of peptides to eliminate: ", len(peptide_sequences_to_eliminate))


In [None]:
#Drop every peptide that was flagged in the missed-cleavage search
flagged_for_removal = peptide_df['transition group id'].isin(peptide_sequences_to_eliminate)
peptide_df = peptide_df[~flagged_for_removal]
print("Total number of remaining peptides: ", len(peptide_df))
peptide_df

## 4) Identify and remove singletons

In [None]:
#Keep only the proteins that are represented by more than one peptide row
print("Eliminating proteins with a single peptide...")
peptides_per_protein = peptide_df.groupby('protein accession number')['transition group id'].count()
selected_proteins = peptides_per_protein[peptides_per_protein > 1].index

has_siblings = peptide_df['protein accession number'].isin(selected_proteins)
peptide_df = peptide_df[has_siblings]
peptide_df

## Format the data

In [None]:
#Record charge states as well
print("Recording charge states...")

#Split every id once at its first '_': the part before it is kept as the
#peptide sequence, the part after it is the charge state
#(raises ValueError for an id without '_')
split_ids = [s.split('_', 1) for s in peptide_df['transition group id'].values]
peptide_sequences = [sequence for sequence, _charge in split_ids]
charges = [charge for _sequence, charge in split_ids]

charge_states = pd.DataFrame(charges, index = peptide_df.index, columns = ['Charge'])
print("Charge states ", charge_states)

# define universe of possible input values
alphabet = '123456'

# define a mapping of chars to integers
char_to_int = {c: i for i, c in enumerate(alphabet)}

# integer encode input data (KeyError for a charge outside the alphabet)
integer_encoded = [char_to_int[char] for char in charges]

# one-hot encode: a single 1 at the position of the charge, 0 elsewhere
onehot_encoded = [
    [int(position == value) for position in range(len(alphabet))]
    for value in integer_encoded
]

#Derive the column labels from the alphabet so the two cannot drift apart
charge_columns = ['Charge ' + c for c in alphabet]
charge_states = pd.DataFrame(onehot_encoded, index = peptide_df.index,
                             columns = charge_columns)
print("Charge states ", charge_states)

#Keep only the sequence part in the id column
peptide_df['transition group id'] = peptide_sequences
peptide_df

In [None]:
#Rename the id columns and place the charge-state columns in front of the
#last no_of_runs columns (presumably one quantification column per run -- confirm)
no_of_runs = 120
column_names = {"transition group id": "Peptide", "protein accession number": "Protein"}
peptide_df = peptide_df.rename(columns=column_names)

leading_columns = peptide_df.iloc[:, :-no_of_runs]
trailing_columns = peptide_df.iloc[:, -no_of_runs:]
peptide_df = pd.concat([leading_columns, charge_states, trailing_columns], axis = 1)

print("Final df ", peptide_df.shape)
peptide_df

In [None]:
#Write the formatted table to disk (tab-separated)
peptide_df.to_csv('preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv', sep = '\t')

print("Final number of peptides ", peptide_df.shape[0])
print("Final number of proteins ", len(np.unique(peptide_df['Protein'])))

#Last no_of_runs columns with missing values replaced by 0, so that
#count_nonzero counts the entries that are neither missing nor zero
run_values = peptide_df.fillna(0).values[:, -no_of_runs:]
nonzero_entries = np.count_nonzero(run_values)
total_entries = run_values.shape[0] * run_values.shape[1]

print("Final no of elements ", nonzero_entries)
print("Percentage of existing values ", 100 * nonzero_entries / total_entries)

print("Final df ", peptide_df.shape)
peptide_df


## 5) One-hot encode peptide sequences

In [None]:
# Run the external one-hot encoding script (IPython %run magic) with three
# arguments: the formatted table written by the "Record final dataset" cell,
# the string "120", and the path of the output file.
# NOTE(review): "120" presumably is the number of run columns and should match
# no_of_runs -- confirm against the script's argument handling.
%run "../../../bin/onehot_encode_peptide_sequences.py" \
             "preprocessed_datasets/2019_guo_nci60_formatted_peptide_quants.tsv" \
             "120" \
             "preprocessed_datasets/2019_guo_nci60_onehot_encoded_peptide_quants.tsv"             
             