In [3]:
import csv #loading csv package
import pandas as pd #loading pandas package
import requests #loading requests package
import re #loading regex package

In [8]:
# Download the GO cellular-component annotations of every kinase from QuickGO.

# Read the kinase table; the first CSV column is used as the row index.
df = pd.read_csv("clean_human_kinase.csv", index_col=0)
identifier = list(df.Uniprot_number)  # values of the Uniprot_number column, in table order

# The QuickGO annotation-search URL is assembled as url1 + <UniProt number> + url2/url3.
# Each request asks for at most 100 annotations (limit=100), so two pages are
# requested per kinase: url2 selects page 1 and url3 selects page 2.
url1 = 'https://www.ebi.ac.uk/QuickGO/services/annotation/search?includeFields=goName&geneProductId='
url2 = '&aspect=cellular_component&limit=100&page=1'
url3 = '&aspect=cellular_component&limit=100&page=2'

# Upper bound (seconds) on each request, so one stalled connection cannot hang the whole run.
REQUEST_TIMEOUT_S = 30

quickGODataList = []   # raw response text of results page 1, one entry per kinase
quickGODataList2 = []  # raw response text of results page 2, one entry per kinase
errorList = []         # identifiers for which a request could not be completed

# One session is shared by all requests so connections to the server can be reused.
with requests.Session() as session:
    for i in identifier:
        # Default to empty text so that BOTH result lists always receive exactly one
        # entry per identifier and stay aligned with `identifier`, even when a request fails.
        page1_text = ''
        page2_text = ''
        try:
            url4 = url1 + i + url2  # page 1 URL for this kinase
            url5 = url1 + i + url3  # page 2 URL for this kinase
            # The HTTP status is intentionally not checked: the response body is stored
            # whatever the status, and a body without "goName" fields yields no locations.
            # NOTE(review): confirm how QuickGO answers a request for a page that does not exist.
            page1_text = session.get(url4, timeout=REQUEST_TIMEOUT_S).text
            page2_text = session.get(url5, timeout=REQUEST_TIMEOUT_S).text
        except (requests.RequestException, TypeError):
            # RequestException: connection problem, timeout, invalid URL, ...
            # TypeError: the identifier is not a string (e.g. a missing value read as NaN).
            errorList.append(i)
        quickGODataList.append(page1_text)
        quickGODataList2.append(page2_text)

[u'{"numberOfHits":104,"results":[{"id":"UniProtKB:P31749!557970642","geneProductId":"UniProtKB:P31749","qualifier":"part_of","goId":"GO:0005634","goName":"nucleus","goEvidence":"IEA","goAspect":"cellular_component","evidenceCode":"ECO:0000322","reference":"GO_REF:0000037","withFrom":[{"connectedXrefs":[{"db":"UniProtKB-KW","id":"KW-0539"}]}],"taxonId":9606,"taxonName":null,"assignedBy":"UniProt","extensions":null,"targetSets":["BHF-UCL","Exosome","KRUK"],"symbol":"AKT1","date":"20191123","synonyms":null,"name":null},{"id":"UniProtKB:P31749!557981331","geneProductId":"UniProtKB:P31749","qualifier":"part_of","goId":"GO:0016020","goName":"membrane","goEvidence":"IEA","goAspect":"cellular_component","evidenceCode":"ECO:0000322","reference":"GO_REF:0000037","withFrom":[{"connectedXrefs":[{"db":"UniProtKB-KW","id":"KW-0472"}]}],"taxonId":9606,"taxonName":null,"assignedBy":"UniProt","extensions":null,"targetSets":["BHF-UCL","Exosome","KRUK"],"symbol":"AKT1","date":"20191123","synonyms":null,

In [6]:
# Extract the GO cellular-component names from the raw QuickGO responses and save
# one (kinase, subcellular location) row per distinct annotation.

# Capture the value of every "goName" field in the response text. Taking everything
# up to the closing quote keeps names that contain digits, punctuation or many words.
regex1 = re.compile(r'"goName":"([^"]*)"')

# The three inputs are matched by position; refuse to continue if they are not the
# same length, because locations would otherwise be attached to the wrong kinase.
if not (len(identifier) == len(quickGODataList) == len(quickGODataList2)):
    raise ValueError(
        'identifier, quickGODataList and quickGODataList2 must have the same length '
        '(got %d, %d and %d)'
        % (len(identifier), len(quickGODataList), len(quickGODataList2))
    )

# For each kinase, the list of location names found on results page 1 / page 2.
kinaseInfoList = [regex1.findall(value) for value in quickGODataList]
kinaseInfoList2 = [regex1.findall(value) for value in quickGODataList2]

# Build the long-format rows directly: one (Uniprot Number, location) pair per name.
# The names are kept as separate list items rather than joined into one string and
# split again on commas, because a name may itself contain a comma.
locationRows = [
    (uniprot_number, location)
    for uniprot_number, page1_names, page2_names
    in zip(identifier, kinaseInfoList, kinaseInfoList2)
    for location in page1_names + page2_names
]
new_df = pd.DataFrame(locationRows, columns=['Uniprot Number', 'Subcellular Location'])

# Tidy the location names: remove surrounding whitespace, then capitalise each word.
new_df['Subcellular Location'] = new_df['Subcellular Location'].str.strip().str.title()

# Drop rows that repeat the same location for the same kinase (for example a name
# that is listed more than once across the two pages).
new_df2 = new_df.drop_duplicates()

# Remove rows with an empty location and use the Uniprot number as the index of the
# frame that is actually saved.
final_df = new_df2[new_df2['Subcellular Location'] != ''].set_index('Uniprot Number')

# Save the dataframe to a csv file (the Uniprot Number index becomes the first column).
final_df.to_csv('Subcellular_location.csv')