# Finding the genomic location of specific phosphosites on the substrates of human protein kinases.

Using a dataset of kinase-substrate interactions, create a dataframe with the genomic locations of particular phosphosites and their neighbouring amino acid sequences.

In [None]:
#Install required packages.
!pip install requests
!pip install pandas

In [4]:
#Import required packages
import pandas as pd
import numpy as np
import re
import requests
import collections

# Genomic location of substrate genes.
From the kinase/substrate interaction data, create a file containing only substrate gene accession numbers.

In [3]:
# Load the full kinase/substrate interaction table and keep only the first
# row for each substrate accession ID; substrates with several phosphosites
# (PS) otherwise appear more than once.
hk_sub_df = (
    pd.read_csv("new_clean_human_kinase_substrates.csv")
    .drop_duplicates(subset='SUB_ACC_ID', keep='first')
)

# Export the substrate accession numbers on their own (no header, no index).
# This file will be uploaded to the Ensembl BioMart search.
hk_sub_Acc = hk_sub_df.loc[:, 'SUB_ACC_ID']
hk_sub_Acc.to_csv('hk_sub_Acc.csv', sep=',', header=False, index=False)

Create a dataframe containing substrate gene locations by merging the substrate identifiers with the BioMart gene-location table on gene name.

In [7]:
# Build the substrate gene-location table: substrate gene names and accession
# IDs joined to the chromosomal locations exported from Ensembl BioMart.

# Gene names and accession IDs of the substrates; the genomic location
# columns are added to these by the merge below.
hk_sub_GeneNames_Acc = hk_sub_df[['SUB_GENE', 'SUB_ACC_ID']]

# Chromosomal locations of the substrate genes, generated from Ensembl BioMart.
Biomart_sub_gene_locations_df = pd.read_csv("Biomart_sub_gene_locations.csv")
# Rename the columns so the gene-name column is called 'SUB_GENE' in both tables.
Biomart_sub_gene_locations_df.columns = ['SUB_GENE', 'Chromosome', 'Karyotype band', 'Strand']

# Merge on gene name to get, per substrate gene: name, accession number and
# genomic location.
Sub_GeneLocation_df = pd.merge(hk_sub_GeneNames_Acc, Biomart_sub_gene_locations_df, on='SUB_GENE')

# Sort by chromosome, then karyotype band, and renumber the rows.
Sub_GeneLocation_df = Sub_GeneLocation_df.sort_values(
    ['Chromosome', 'Karyotype band'], axis=0, ascending=[True, True]
)
Sub_GeneLocation_df = Sub_GeneLocation_df.reset_index(drop=True)

# Save the table: the phosphosite extraction step reads
# 'Sub_GeneLocation_df.csv' from disk, so it has to be written here.
Sub_GeneLocation_df.to_csv('Sub_GeneLocation_df.csv', sep=',', header=True, index=False)


  SUB_GENE SUB_ACC_ID Chromosome Karyotype band  Strand
0    PHGDH     O43175          1            p12       1
1    CASQ2     O14958          1          p13.1      -1
2   ATP1A1     P05023          1          p13.1       1
3   CAPZA1     P52907          1          p13.2       1
4    SIKE1     Q9BRV8          1          p13.2      -1


# Extracting genomic location of phosphosites
Using a URL built from each substrate accession number, extract coordinate information for residues on the substrates.

In [9]:
# Query the EBI Proteins API coordinates endpoint for every substrate
# accession number and collect, for each phosphosite found, the amino acid,
# its position and its start/end coordinates. The result is written to
# 'PS_genomic_locations.csv'.

ps_columns = ['SUB_ACC_ID', 'Amino acid', 'Position', 'Start co', 'End co']

# Read in the substrate gene location file to get the accession numbers to query.
hk_sub_df = pd.read_csv("Sub_GeneLocation_df.csv")
hk_sub_acc_df = hk_sub_df['SUB_ACC_ID']

# Regular expressions, compiled once outside the request loop.
# Sections of the response describing modified residues (includes phosphosites).
regex_mr_info = re.compile(r'(?<=modified )(.*?)(?=feature)')
# Position of the modified residue.
regex_ps_pos = re.compile(r'(?<=position=")(.*?)(?=")')
# Amino acid name following "Phospho".
regex_ps_aa = re.compile(r'(?<=Phospho)(.*?)(?=<|;)')
# Start coordinate.
regex_ps_gs = re.compile(r'(?<=begin position=")(.*?)(?=")')
# End coordinate.
regex_ps_ge = re.compile(r'(?<=end position=")(.*?)(?=")')

# One tuple per extracted phosphosite; turned into a dataframe in one go after
# the loop (DataFrame.append was removed in pandas 2.0 and copies the whole
# frame on every call).
ps_rows = []

for accession in hk_sub_acc_df:
    url = 'https://www.ebi.ac.uk/proteins/api/coordinates/{}'.format(accession)
    # A timeout stops one stalled request from hanging the whole run.
    response = requests.get(url, timeout=60)
    if not response.ok:
        # Nothing useful to parse in an error response; report it and move on.
        print('Skipping {}: HTTP status {}'.format(accession, response.status_code))
        continue

    for entry in regex_mr_info.findall(response.text):
        # Keep only values of at most 4 characters so genomic coordinates,
        # which also match this pattern, are not taken as residue positions.
        ps_pos = [p for p in regex_ps_pos.findall(entry) if len(p) <= 4]
        ps_aa = regex_ps_aa.findall(entry)
        ps_gs = regex_ps_gs.findall(entry)
        ps_ge = regex_ps_ge.findall(entry)

        # Pair the fields within this entry only. An entry that lacks any one
        # field (for example a modified residue that is not a phosphosite)
        # yields no row, so it cannot shift the values of later entries.
        for aa, pos, start, end in zip(ps_aa, ps_pos, ps_gs, ps_ge):
            ps_rows.append((accession, aa, pos, start, end))

Sub_PS_Genomic_Loc_df = pd.DataFrame(ps_rows, columns=ps_columns)

# Convert amino acid names to one letter codes.
Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df.replace(['serine.', 'serine'], 'S')
Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df.replace(['threonine.', 'threonine'], 'T')
Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df.replace(['tyrosine.', 'tyrosine'], 'Y')

# Combine amino acid and position into a single standardised phosphosite
# label (e.g. 'S' + '12') that can later be used to merge dataframes.
Sub_PS_Genomic_Loc_df['PS'] = Sub_PS_Genomic_Loc_df['Amino acid'] + Sub_PS_Genomic_Loc_df['Position']

# Keep only the required columns, in output order; this drops the separate
# 'Amino acid' and 'Position' columns.
Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df[['SUB_ACC_ID', 'PS', 'Start co', 'End co']]

# Generate a csv file from the dataframe created.
Sub_PS_Genomic_Loc_df.to_csv('PS_genomic_locations.csv', sep=',', header=True, index=False)

Merge Gene locations and phosphosite genomic locations into one dataframe.

In [29]:
# Merge the substrate gene locations (Sub_GeneLocation_df) with the
# phosphosite coordinates (Sub_PS_Genomic_Loc_df) on substrate accession
# number. These are the dataframes built by the gene-location and phosphosite
# extraction steps of this notebook.
Genomic_Location_of_Sub_PS = pd.merge(Sub_GeneLocation_df, Sub_PS_Genomic_Loc_df, on='SUB_ACC_ID')

Merge genomic locations of phosphosites with original dataframe.

In [30]:
# Restrict the phosphosite genomic locations to the sites listed in the
# original kinase/substrate table.
hk_all_data = pd.read_csv("new_clean_human_kinase_substrates.csv")  # Original dataframe.

# Substrate identifiers needed for the merge, with SUB_MOD_RSD renamed to PS.
# The raw table can hold several rows for the same (accession, site) pair;
# de-duplicate so the merge does not repeat each genomic-location row once
# per raw row.
hk_sub_ACC_PS = (
    hk_all_data[['SUB_ACC_ID', 'SUB_MOD_RSD']]
    .rename(columns={'SUB_MOD_RSD': 'PS'})
    .drop_duplicates()
)

# Merge on substrate accession number and phosphosite.
Genomic_Location_of_Sub_PS_merged = pd.merge(
    Genomic_Location_of_Sub_PS, hk_sub_ACC_PS, on=['SUB_ACC_ID', 'PS']
)

Adding Neighbouring amino acid sequences to phosphosite genomic location database.

In [31]:
# Add the neighbouring amino acid sequence (+/-7 residues) of each phosphosite
# to the genomic-location table and write the result to CSV.

# Smaller df containing only substrate accession number, phosphosite and
# neighbouring amino acid sequence.
All_hk_sub_df = pd.read_csv("new_clean_human_kinase_substrates.csv")
N_aa = All_hk_sub_df[['SUB_ACC_ID', 'SUB_MOD_RSD', 'SITE_+/-7_AA']]

# One row per (accession, phosphosite) pair.
N_aa = N_aa.drop_duplicates(subset=['SUB_ACC_ID', 'SUB_MOD_RSD'])
N_aa.columns = ['SUB_ACC_ID', 'PS', 'Neighbouring amino acids +/-7']  # Rename columns.

# Merge the genomic locations of the phosphosites with their neighbouring sequences.
GL_and_neighbouring_aa_of_Sub_PS = pd.merge(
    Genomic_Location_of_Sub_PS_merged, N_aa, how='inner', on=['SUB_ACC_ID', 'PS']
)

# Sort by chromosome and karyotype band. sort_values returns a new dataframe,
# so the result must be assigned for the sorted order to reach the CSV.
GL_and_neighbouring_aa_of_Sub_PS = GL_and_neighbouring_aa_of_Sub_PS.sort_values(
    ['Chromosome', 'Karyotype band'], ascending=[True, True]
)

# Return as CSV file.
GL_and_neighbouring_aa_of_Sub_PS.to_csv(
    'GL_and_neighbouring_aa_of_Sub_PS.csv', sep=',', header=True, index=False
)

UPDATE-Add kinase identifiers required for database creation.

In [18]:
# Attach kinase identifiers (kinase gene and kinase accession ID) to the
# phosphosite genomic-location table and write the final CSV.

All_hk_sub_df = pd.read_csv("new_clean_human_kinase_substrates.csv")
GL_and_neighbouring_aa_of_Sub_PS = pd.read_csv("GL_and_neighbouring_aa_of_Sub_PS.csv")
# Fully identical rows carry no extra information and would be multiplied by
# the merge below, so remove them first.
GL_and_neighbouring_aa_of_Sub_PS = GL_and_neighbouring_aa_of_Sub_PS.drop_duplicates()

# Kinase identifiers per (substrate accession, phosphosite) pair. The site is
# kept as a key so each phosphosite is paired with the kinase(s) listed for
# that site, rather than with whichever kinase appears first for the substrate.
AA_and_Kinase = All_hk_sub_df[['SUB_ACC_ID', 'SUB_MOD_RSD', 'GENE', 'KIN_ACC_ID']]
AA_and_Kinase = AA_and_Kinase.drop_duplicates()
AA_and_Kinase.columns = ['SUB_ACC_ID', 'PS', 'Kinase gene', 'KIN_ACC_ID']  # Rename columns.

# Merge genomic locations and neighbouring sequences with the kinase
# identifiers on substrate accession number and phosphosite.
GL_and_neighbouring_aa_of_Sub_PS_final = pd.merge(
    GL_and_neighbouring_aa_of_Sub_PS, AA_and_Kinase, how='inner', on=['SUB_ACC_ID', 'PS']
)

# Sort by chromosome and karyotype band. sort_values returns a new dataframe,
# so the result must be assigned for the sorted order to reach the CSV.
GL_and_neighbouring_aa_of_Sub_PS_final = GL_and_neighbouring_aa_of_Sub_PS_final.sort_values(
    ['Chromosome', 'Karyotype band'], ascending=[True, True]
)

# Return as CSV file.
GL_and_neighbouring_aa_of_Sub_PS_final.to_csv(
    'GL_and_neighbouring_aa_of_Sub_PS_final.csv', sep=',', header=True, index=False
)