# Creating a single dataframe of phosphosite genomic locations with neighbouring amino acids, substrate and kinase information.

In [1]:
#Import required packages
import pandas as pd

In [2]:
# Load the three input tables that get combined below. The paths are read in
# order and unpacked into one dataframe each:
#   kinase_df  - kinase/substrate dataframe
#   Biomart_df - genomic locations of substrate genes (from Ensembl BioMart)
#   PS_gl_df   - genomic coordinates of individual phosphosites
kinase_df, Biomart_df, PS_gl_df = map(
    pd.read_csv,
    (
        "new_clean_human_kinase_substrates.csv",
        "Biomart_sub_gene_locations.csv",
        "PS_genomic_locations.csv",
    ),
)


In [4]:
# Narrow kinase_df down to the columns that are carried through the later
# join/merge steps and into the final output; every other column is dropped.
relevant_columns = [
    'KINASE',
    'KIN_ACC_ID',
    'SUB_ACC_ID',
    'SUB_GENE',
    'SUB_MOD_RSD',
    'SITE_+/-7_AA',
]
kinase_sub_rsd_df = kinase_df[relevant_columns]


In [5]:
# Rename the BioMart 'Gene name' column to 'SUB_GENE' so that both tables
# share the same key column name.
Biomart_df = Biomart_df.rename({"Gene name": "SUB_GENE"}, axis="columns")

# Attach the BioMart genomic location of each substrate gene to the
# kinase/substrate rows. The BioMart table is indexed by 'SUB_GENE' and the
# kinase/substrate table is joined against that index through its own
# 'SUB_GENE' column, so a gene's location is repeated for each of its
# phosphosite/kinase rows.
# NOTE(review): this relies on every gene name occurring only once in
# Biomart_df; that is not checked here, and repeated gene names would
# duplicate the matching kinase/substrate rows. DataFrame.join defaults to a
# left join, so rows whose gene is missing from Biomart_df are kept with
# empty location values.
gene_locations = Biomart_df.set_index('SUB_GENE')
Biomart_and_kinase = kinase_sub_rsd_df.join(gene_locations, on='SUB_GENE')


In [6]:
# Rename the 'PS' column of PS_gl_df to 'SUB_MOD_RSD' so it matches the
# residue column of the kinase/substrate table and can be used as a merge key.
PS_gl_df = PS_gl_df.rename({"PS": "SUB_MOD_RSD"}, axis="columns")

# Add the phosphosite genomic coordinates to the combined table. One substrate
# can carry several phosphosites with different coordinates, so a row is
# identified by the pair (substrate accession, modified residue) rather than
# by the substrate alone.
# NOTE(review): merge defaults to an inner join, so rows without a match in
# both tables are dropped from the result - confirm this is intended.
merge_keys = ['SUB_ACC_ID', 'SUB_MOD_RSD']
Phosphosite_genomic_locations = Biomart_and_kinase.merge(PS_gl_df, on=merge_keys)

# Export the final table as a comma-separated file with a header row and
# without the dataframe index (comma separator and header are the to_csv
# defaults).
Phosphosite_genomic_locations.to_csv('Phosphosite_genomic_locations.csv', index=False)