In [1]:
import pandas as pd
import os
from Bio import SeqIO as seqio
from kinaid.utility import Utility
import gzip
import numpy as np

In [2]:
def _phosphosite_window(sequence, position, residue, flank=5):
    """Return the ``2 * flank + 1`` residue window centred on a phosphosite.

    Parameters
    ----------
    sequence : str
        Protein sequence (no trailing ``*``).
    position : int
        1-based position of the phosphorylated residue in ``sequence``.
    residue : str
        Residue letter expected at ``position``.
    flank : int, optional
        Number of residues kept on each side of the site (default 5).

    Returns
    -------
    str or None
        The window, padded with ``_`` where it runs past either end of the
        sequence, or ``None`` when ``position`` lies outside the sequence or
        the residue found there differs from ``residue``.
    """
    position = int(position)
    # Explicit lower bound: a position of 0 or below would otherwise be
    # interpreted as a negative (from-the-end) string index.
    if position < 1 or position > len(sequence):
        return None
    if sequence[position - 1] != residue:
        return None
    pad = '_' * flank
    padded = pad + sequence + pad
    # The site sits at 0-based index (position - 1 + flank) in `padded`, so the
    # window starts `flank` before it, i.e. at index position - 1.
    return padded[position - 1:position + 2 * flank]


def clean_phosphopeptides(phosphorylation_file, fasta_file, output_file):
    """Build a table of 11-residue phosphosite windows and write it as CSV.

    Reads a phosphorylation table, keeps S/T/Y sites whose protein is present
    in the FASTA file and whose annotated residue matches the sequence at the
    given 1-based position, and writes one row per remaining site.

    Parameters
    ----------
    phosphorylation_file : path-like
        CSV with at least the columns ``systematic_name``, ``p_residue``,
        ``p_position``, ``fc_log2`` and ``adj_p_value``.
    fasta_file : path-like
        Protein FASTA whose record ids are matched against
        ``systematic_name``. The identifier written to the ``SGD`` column is
        the third space-separated token of the description text that precedes
        the first ``', '`` (presumably of the form ``SGDID:...`` for SGD
        downloads; verify against the FASTA actually used).
    output_file : path-like
        Destination CSV with columns ``SGD``, ``site``, ``peptide``,
        ``log2fc`` and ``adjpvalue``.

    Notes
    -----
    * ``adjpvalue`` holds ``-log10(adj_p_value)``, not the raw adjusted p-value.
    * Rows are dropped when the residue is not S/T/Y, the protein is missing
      from the FASTA, the position is outside ``1..len(sequence)``, or the
      sequence residue at the position differs from ``p_residue``.
    * If no rows survive, a header-only CSV is written.
    """
    exp_df = pd.read_csv(phosphorylation_file)

    # Index the FASTA records by id.
    fasta_dict = seqio.to_dict(seqio.parse(fasta_file, 'fasta'))

    # .copy() so the column assignments below act on an independent frame.
    exp_df = exp_df[exp_df['p_residue'].isin(['S', 'T', 'Y'])].copy()

    sgd_by_name = {}
    seq_by_name = {}
    for name, record in fasta_dict.items():
        sgd_by_name[name] = record.description.split(', ')[0].split(' ')[2]
        # Strip the trailing '*' (stop-codon marker) so it is neither counted
        # in the length nor included in a window.
        seq_by_name[name] = str(record.seq).rstrip('*')

    exp_df['SGDID'] = exp_df['systematic_name'].map(sgd_by_name)
    exp_df['sequence'] = exp_df['systematic_name'].map(seq_by_name)

    # Drop sites whose protein is not in the FASTA file.
    exp_df = exp_df[exp_df['sequence'].notna()].copy()

    # A list comprehension (rather than DataFrame.apply(axis=1)) keeps this
    # step well-defined when the frame is empty.
    exp_df['window'] = [
        _phosphosite_window(sequence, position, residue)
        for sequence, position, residue in zip(
            exp_df['sequence'], exp_df['p_position'], exp_df['p_residue']
        )
    ]

    # None marks sites that are out of range or fail the residue check.
    exp_df = exp_df[exp_df['window'].notna()].copy()

    exp_df['adj_p_value'] = -np.log10(pd.to_numeric(exp_df['adj_p_value']))

    exp_df_filtered = exp_df[
        ['SGDID', 'p_position', 'window', 'fc_log2', 'adj_p_value']
    ].rename(
        columns={
            'SGDID': 'SGD',
            'p_position': 'site',
            'window': 'peptide',
            'fc_log2': 'log2fc',
            'adj_p_value': 'adjpvalue',
        }
    )

    exp_df_filtered.to_csv(output_file, index=False)

In [3]:
# Input/output locations used by the rest of the notebook.
test_dir = './test'
proteomes_dir = './proteomes'
published_yeast_file = 'yeast-perturbation.xlsx'
published_yeast_path = os.path.join(test_dir, published_yeast_file)
yeast_proteome_path = os.path.join(proteomes_dir, 'yeast_proteome.fasta')

os.makedirs(proteomes_dir, exist_ok=True)

# Fetch and decompress the proteome only when it is not already on disk.
if not os.path.exists(yeast_proteome_path):
    print('Downloading yeast proteome')
    archive_path = 'temp.gz'
    # Decompress to a side file first: the existence check above must never
    # see a half-written FASTA left behind by an interrupted run.
    partial_path = yeast_proteome_path + '.part'
    try:
        Utility.download_file('http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans.fasta.gz', archive_path)
        with gzip.open(archive_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            f_out.write(f_in.read())
        # Move the finished file into place in one step.
        os.replace(partial_path, yeast_proteome_path)
    finally:
        # Remove the archive (and any incomplete output) whether or not the
        # download/decompression succeeded.
        for leftover in (archive_path, partial_path):
            if os.path.exists(leftover):
                os.remove(leftover)


In [4]:
# Load the differential-regulation sheet of the published workbook.
df = pd.read_excel(published_yeast_path, sheet_name='p_site_diff_reg')

# Treatment ids to keep; an alternative list is left commented out below.
#TOR1_experiments = ['CS18', 'CS23','HS37', 'HS42', 'HS48']
TOR1_experiments = {'DP', 'LP', 'SP', 'YP'}

# Keep only the rows belonging to the selected treatments.
is_selected_treatment = df['treatment_id'].isin(TOR1_experiments)
df_exps = df[is_selected_treatment]

display(df_exps)

Unnamed: 0,reference,systematic_name,gene,p_site,p_residue,p_position,treatment_id,fc_log2,p_value,adj_p_value
44,YAL012W_S40,YAL012W,CYS3,S40,S,40,LP,0.219367,7.189179e-01,0.934251
48,YAL012W_S40,YAL012W,CYS3,S40,S,40,DP,0.876721,1.824667e-01,0.608548
70,YAL012W_S40,YAL012W,CYS3,S40,S,40,SP,3.562723,1.212967e-07,0.000013
83,YAL012W_S40,YAL012W,CYS3,S40,S,40,YP,-1.092939,1.295494e-01,0.528078
144,YAL013W_S2,YAL013W,DEP1,S2,S,2,LP,-0.030479,9.352545e-01,0.987880
...,...,...,...,...,...,...,...,...,...,...
529283,YMR005W_S49,YMR005W,TAF4,S49,S,49,YP,0.167245,9.050468e-01,0.981903
529344,YDR358W_S353,YDR358W,GGA1,S353,S,353,LP,2.012639,5.130426e-02,0.335476
529348,YDR358W_S353,YDR358W,GGA1,S353,S,353,DP,1.351322,2.241834e-01,0.658840
529370,YDR358W_S353,YDR358W,GGA1,S353,S,353,SP,-1.772703,1.143126e-01,0.499893


In [5]:
# Write one cleaned CSV per treatment into <test_dir>/TOR1.
TOR1_dir = os.path.join(test_dir, 'TOR1')
os.makedirs(TOR1_dir, exist_ok=True)

# clean_phosphopeptides reads its input from a file, so each treatment's rows
# are staged in a scratch CSV that is deleted again afterwards.
temp_file = os.path.join(TOR1_dir, 'temp.csv')

# sorted() gives a reproducible processing order (TOR1_experiments is a set).
for exp in sorted(TOR1_experiments):
    exp_df = df_exps[df_exps['treatment_id'] == exp]
    exp_file = os.path.join(TOR1_dir, f'TOR1_{exp}.csv')
    try:
        exp_df.to_csv(temp_file, index=False)
        clean_phosphopeptides(temp_file, yeast_proteome_path, exp_file)
    finally:
        # Do not leave the scratch file in the output directory, even when
        # processing this treatment fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)

