# Data mining - Genomic locations of phosphosites.

In [1]:
#Import required packages
import re

import pandas as pd
import requests

# Genomic location of substrate genes.
Substrate accession numbers from the kinase/substrate dataframe are used to collect the genomic locations of the substrate genes from Ensembl BioMart.

In [4]:
#Load the full kinase/substrate table and keep a single row per substrate accession.
#Most substrates have several phosphosites, so each accession is repeated in the raw file.
hk_sub_df = (
    pd.read_csv("new_clean_human_kinase_substrates.csv")
    .drop_duplicates(subset=['SUB_ACC_ID'])
)

#Write the substrate accession numbers on their own (no header, no index).
#This file will be uploaded to the Ensembl BioMart search.
hk_sub_Acc = hk_sub_df.loc[:, 'SUB_ACC_ID']
hk_sub_Acc.to_csv('hk_sub_Acc.csv', header=False, index=False)

#Keep only gene names and accession IDs; genomic locations will be added to this dataframe.
hk_sub_GeneNames_Acc = hk_sub_df.loc[:, ['SUB_GENE', 'SUB_ACC_ID']]


In [5]:
#Column labels used for the BioMart export from here on.
biomart_column_names = ['SUB_GENE', 'Chromosome', 'Karyotype band', 'Strand']

#Load the genomic locations of the substrate genes generated from Ensembl BioMart
#(database-attributes) and relabel the columns.
Biomart_sub_gene_locations_df = pd.read_csv("Biomart_sub_gene_locations.csv")
Biomart_sub_gene_locations_df.columns = biomart_column_names


# Extracting genomic coordinates of phosphosites
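For each substrate accession number, a URL is built to query the EBI Proteins API, and the genomic coordinates of each phosphosite on that substrate are extracted from the response.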

In [9]:
#Columns of the phosphosite table built in this cell.
PS_COLUMNS = ['SUB_ACC_ID', 'Amino acid', 'Position', 'Start co', 'End co']

#Seconds to wait for the EBI server before giving up on one accession.
REQUEST_TIMEOUT = 30

#Read in Substrate gene location file to obtain the substrate accession numbers.
hk_sub_df = pd.read_csv("Sub_GeneLocation_df.csv")
hk_sub_acc_df = hk_sub_df['SUB_ACC_ID']

#Regular expressions are compiled once, outside the download loop.
#Sections of the response text which describe modified residues (these include phosphosites).
regex_mr_info = re.compile('(?<=modified )(.*?)(?=feature)')
#Position
regex_ps_pos = re.compile('(?<=position=")(.*?)(?=")')
#AA
regex_ps_aa = re.compile('(?<=Phospho)(.*?)(?=<|;)')
#Start
regex_ps_gs = re.compile('(?<=begin position=")(.*?)(?=")')
#End
regex_ps_ge = re.compile('(?<=end position=")(.*?)(?=")')


def _phosphosite_rows(accession, response_text):
    """Return the phosphosite rows found in one coordinates response.

    Each row is ``[accession, amino acid, position, start, end]`` with every
    value kept as the string matched in ``response_text``.

    The fields are paired within a single modified-residue section. A section
    that lacks one of the fields (for example a modified residue that is not a
    phosphosite, so nothing follows "Phospho") yields no row, and therefore
    cannot shift its values onto a neighbouring site.
    """
    rows = []
    for entry in regex_mr_info.findall(response_text):
        #Only keep positions of at most 4 characters to avoid picking up genomic coordinates.
        positions = [p for p in regex_ps_pos.findall(entry) if len(p) <= 4]
        amino_acids = regex_ps_aa.findall(entry)
        starts = regex_ps_gs.findall(entry)
        ends = regex_ps_ge.findall(entry)
        rows.extend(
            [accession, *fields]
            for fields in zip(amino_acids, positions, starts, ends)
        )
    return rows


#Extracting phosphosite information from Ebi.
ps_rows = []
with requests.Session() as session:      #One session so the connection to the server is reused.
    for i in hk_sub_acc_df.dropna():     #A missing accession cannot be looked up.
        #Iterating through accession numbers, create a url for each accession number.
        url = 'https://www.ebi.ac.uk/proteins/api/coordinates/{}'.format(i)
        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            #One failed download should not discard the rows collected so far.
            print('Skipping {}: request failed ({})'.format(i, err))
            continue
        if not r.ok:
            #An error page holds no coordinates; report it instead of parsing it.
            print('Skipping {}: HTTP status {}'.format(i, r.status_code))
            continue
        ps_rows.extend(_phosphosite_rows(i, r.text))

#Build the dataframe once from all collected rows.
Sub_PS_Genomic_Loc_df = pd.DataFrame(ps_rows, columns=PS_COLUMNS)

#Convert amino acid names (with or without a trailing full stop) to one letter codes.
for residue_names, one_letter_code in (
    (['serine.', 'serine'], 'S'),
    (['threonine.', 'threonine'], 'T'),
    (['tyrosine.', 'tyrosine'], 'Y'),
):
    Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df.replace(residue_names, one_letter_code)

#Join amino acid code and position into one standardised phosphosite label,
#which can later be used as a key to merge dataframes.
residue_codes = Sub_PS_Genomic_Loc_df['Amino acid']
residue_positions = Sub_PS_Genomic_Loc_df['Position']
Sub_PS_Genomic_Loc_df['PS'] = residue_codes + residue_positions

#The separate amino acid and position columns are no longer required.
Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df.drop(columns=['Amino acid', 'Position'])

#Put the columns in their final order.
final_column_order = ['SUB_ACC_ID', 'PS', 'Start co', 'End co']
Sub_PS_Genomic_Loc_df = Sub_PS_Genomic_Loc_df[final_column_order]

#Save the finished table as a csv file (with header row, without index).
Sub_PS_Genomic_Loc_df.to_csv('PS_genomic_locations.csv', index=False)