In [1]:
import requests
import json

In [2]:
class InterPro:
    """Collects the UniProt proteins that match one InterPro entry.

    Attributes:
        ipr_number: InterPro accession the instance was created for
            (e.g. 'IPR000018').
        data_count: value of the ``count`` field of the last page downloaded
            (0 before any download).
        data_list: flat list of every item found under ``results`` in the
            downloaded pages.
    """

    def __init__(self, ipr_number):
        self.ipr_number = ipr_number
        self.data_count = 0
        self.data_list  = []

    def get_data(self, make_cursor=False):
        """Download all result pages and store them in ``data_list``.

        Pages are followed through the ``next`` URL of each response with a
        plain loop (not recursion), so the number of pages is not limited by
        the interpreter's recursion depth.

        Args:
            make_cursor: ``False`` to start from the first page of the entry,
                or the full URL of a page to resume from. A fresh start
                discards previously downloaded results so that re-running
                the call does not duplicate them; resuming from a URL
                appends to what is already stored.

        Returns:
            ``False`` as soon as any page request fails (results of the pages
            read so far are kept); ``None`` when every page was read.
        """
        if make_cursor:
            api_url = make_cursor
        else:
            api_url = f'https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/{self.ipr_number}/'
            # Fresh download: start from an empty state so the call is idempotent.
            self.data_count = 0
            self.data_list = []

        while api_url:
            print(f'working on  {api_url}')

            response = requests.get(api_url)
            if not response.ok:
                return False

            # A successful answer without a body (e.g. HTTP 204) carries no
            # results and no further page; parsing it as JSON would raise.
            if not response.text.strip():
                break

            data = json.loads(response.text)
            self.data_count = data['count']
            self.data_list.extend(data['results'])

            # 'next' is the URL of the following page, falsy on the last one.
            api_url = data['next']

    def get_accessions(self, show_reviewed=True):
        """Return the protein accessions of the downloaded results.

        Args:
            show_reviewed: when true, keep only results whose
                ``source_database`` is ``'reviewed'``; when false, return
                every accession.

        Returns:
            List of accessions, in the same order as ``data_list``.
        """
        accessions = []
        for result in self.data_list:
            metadata = result['metadata']
            if not show_reviewed or metadata['source_database'] == 'reviewed':
                accessions.append(metadata['accession'])
        return accessions

    def get_start_and_end(self):
        """Return one ``(start, end)`` tuple per downloaded result.

        Only the first fragment of the first location of the first entry of
        each result is used; further entries, locations or fragments are
        ignored.

        Returns:
            List of ``(start, end)`` tuples, in the same order as
            ``data_list`` (and therefore as ``get_accessions(False)``).
        """
        places = []
        for result in self.data_list:
            fragment = result['entries'][0]['entry_protein_locations'][0]['fragments'][0]
            places.append((fragment['start'], fragment['end']))
        return places
    

In [3]:
# Download every result page of InterPro entry IPR000018 into `data`.
data = InterPro(ipr_number='IPR000018')
data.get_data(make_cursor=False)

working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a0q3p7a8
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a1s3a7x1
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a2k5ud43
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a3b1jcg9
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a3p9bdb7
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a3q7qq50
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a4w3i869
working on  https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/InterPro/IPR000018/?cursor=a0a663ex97
working on  https://www.ebi.ac.uk/interpro/api/protein/

In [4]:
# All accessions, reviewed or not; later cells read this list under this exact name.
accesions = data.get_accessions(show_reviewed=False)
accesions

['A0A060WN59',
 'A0A060XBK7',
 'A0A087RIV8',
 'A0A087V4V8',
 'A0A087Y1F9',
 'A0A091DHW1',
 'A0A091EVS2',
 'A0A091FQX6',
 'A0A091JUB2',
 'A0A091LDW0',
 'A0A091QUP8',
 'A0A091UE59',
 'A0A093NVZ3',
 'A0A094L093',
 'A0A096N638',
 'A0A099ZG24',
 'A0A0D9SAJ4',
 'A0A0N8JYX2',
 'A0A0P6IZI8',
 'A0A0Q3P7A8',
 'A0A0S7LJM8',
 'A0A0S7LKT5',
 'A0A151NNH3',
 'A0A1A6I088',
 'A0A1A7Z7J5',
 'A0A1A7ZMD4',
 'A0A1A8C7Y5',
 'A0A1A8DRZ8',
 'A0A1A8DU39',
 'A0A1A8ER71',
 'A0A1A8IBF1',
 'A0A1A8N0F8',
 'A0A1A8N105',
 'A0A1A8NBF6',
 'A0A1A8NBG9',
 'A0A1A8NS01',
 'A0A1A8RS74',
 'A0A1A8UK96',
 'A0A1L8HB65',
 'A0A1S3A7X1',
 'A0A1S3GE20',
 'A0A1S3RSD7',
 'A0A1S3STT9',
 'A0A1U7R550',
 'A0A1U7TAD7',
 'A0A1U8D9V1',
 'A0A1V4KXY7',
 'A0A212C182',
 'A0A218UIV9',
 'A0A226N5Z6',
 'A0A226PJA9',
 'A0A2D0RK43',
 'A0A2F0BFS1',
 'A0A2I0LRI5',
 'A0A2I0TGI7',
 'A0A2I4AYA9',
 'A0A2J8UBE1',
 'A0A2K5HBN5',
 'A0A2K5L3K2',
 'A0A2K5UD43',
 'A0A2K5XPQ4',
 'A0A2K6AN49',
 'A0A2K6JN28',
 'A0A2K6NAY3',
 'A0A2P4S869',
 'A0A2R8Z6L4',
 'A0A2U3VN

In [5]:
def retrieve_ids(ids_list):
    """Map UniProt accessions to KEGG identifiers through UniProt's ID mapping.

    Args:
        ids_list: iterable of accession strings.

    Returns:
        A list of ``'<accession> <kegg_id>'`` strings, one per row of the
        tab-separated answer (header row removed); ``[]`` when ``ids_list``
        is empty; ``False`` when the HTTP request is not successful.
    """
    # NOTE(review): UniProt appears to have moved ID mapping to
    # rest.uniprot.org/idmapping in 2022 -- confirm this endpoint still answers.
    url = 'https://www.uniprot.org/uploadlists/'

    # Nothing to map: skip the round trip instead of sending an empty query.
    if not ids_list:
        return []

    params = {
        'from': 'ACC+ID',
        'to': 'KEGG_ID',
        'format': 'tab',
        'query': ' '.join(ids_list)
    }

    # The ID list can hold thousands of accessions; send it in the request
    # body (POST) rather than in the URL, which has a limited length.
    response = requests.post(url, data=params)

    if not response.ok:
        return False

    # First row is the column header; splitlines() also copes with CRLF.
    rows = response.text.splitlines()[1:]
    return [row.replace('\t', ' ') for row in rows if row.strip()]

In [6]:
# Translate the InterPro accessions into 'accession kegg_id' rows.
uniprot_kegg_list = retrieve_ids(ids_list=accesions)
uniprot_kegg_list

['A0A0D9SAJ4 csab:103232129',
 'A0A0N8JYX2 sfm:108922871',
 'A0A151NNH3 amj:102576992',
 'A0A1A7ZMD4 nfu:107378664',
 'A0A1L8HB65 xla:108710046',
 'A0A1S3RSD7 sasa:106604737',
 'A0A1S3STT9 sasa:106611738',
 'A0A1U8D9V1 asn:102375685',
 'A0A2D0RK43 ipu:108269262',
 'A0A2I0LRI5 clv:102085515',
 'A0A2I4AYA9 alim:106515297',
 'A0A2K5UD43 mcf:102131246',
 'A0A2K6JN28 rbb:108526319',
 'A0A2K6NAY3 rro:104678783',
 'A0A2R8Z6L4 pps:100972172',
 'A0A2U3VNI8 oro:101378535',
 'A0A2Y9DT78 tmu:101355448',
 'A0A2Y9FG53 pcad:102983695',
 'A0A2Y9L445 elk:111161002',
 'A0A2Y9M3Z4 dle:111167799',
 'A0A340YAU8 lve:103068968',
 'A0A384ACL9 bacu:103013742',
 'A0A384DUB1 umr:103682600',
 'A0A3P8UX94 csem:103390764',
 'A0A3P9BDB7 mze:101484656',
 'A0A3Q2G8S0 cvg:107102645',
 'A0A3Q2ZXK7 kmr:108231187',
 'A0A3Q7SB79 vvp:112907631',
 'A0A3Q7VUJ8 uah:113245647',
 'A0A4W3KDP1 cmk:103177162',
 'A0A4W6EJR4 lcf:108872542',
 'A0A6D2XAW3 ptr:473656',
 'D2H897 aml:100478065',
 'F1MDW9 bta:514653',
 'F1NVI2 gga:10085768

In [7]:
def make_dict_from_results(results_list):
    """Index whitespace-separated ``'<id> <value>'`` rows by their id.

    Args:
        results_list: iterable of strings, each holding exactly two
            whitespace-separated tokens.

    Returns:
        ``{id: {'kegg_id': value}}``; when an id occurs more than once the
        last row wins.

    Raises:
        ValueError: if a row does not split into exactly two tokens.
    """
    token_pairs = (row.split() for row in results_list)
    return {key: {'kegg_id': mapped} for key, mapped in token_pairs}

In [8]:
# Index the mapping rows by UniProt accession.
results_dict = make_dict_from_results(results_list=uniprot_kegg_list)
print(results_dict)

{'A0A0D9SAJ4': {'kegg_id': 'csab:103232129'}, 'A0A0N8JYX2': {'kegg_id': 'sfm:108922871'}, 'A0A151NNH3': {'kegg_id': 'amj:102576992'}, 'A0A1A7ZMD4': {'kegg_id': 'nfu:107378664'}, 'A0A1L8HB65': {'kegg_id': 'xla:108710046'}, 'A0A1S3RSD7': {'kegg_id': 'sasa:106604737'}, 'A0A1S3STT9': {'kegg_id': 'sasa:106611738'}, 'A0A1U8D9V1': {'kegg_id': 'asn:102375685'}, 'A0A2D0RK43': {'kegg_id': 'ipu:108269262'}, 'A0A2I0LRI5': {'kegg_id': 'clv:102085515'}, 'A0A2I4AYA9': {'kegg_id': 'alim:106515297'}, 'A0A2K5UD43': {'kegg_id': 'mcf:102131246'}, 'A0A2K6JN28': {'kegg_id': 'rbb:108526319'}, 'A0A2K6NAY3': {'kegg_id': 'rro:104678783'}, 'A0A2R8Z6L4': {'kegg_id': 'pps:100972172'}, 'A0A2U3VNI8': {'kegg_id': 'oro:101378535'}, 'A0A2Y9DT78': {'kegg_id': 'tmu:101355448'}, 'A0A2Y9FG53': {'kegg_id': 'pcad:102983695'}, 'A0A2Y9L445': {'kegg_id': 'elk:111161002'}, 'A0A2Y9M3Z4': {'kegg_id': 'dle:111167799'}, 'A0A340YAU8': {'kegg_id': 'lve:103068968'}, 'A0A384ACL9': {'kegg_id': 'bacu:103013742'}, 'A0A384DUB1': {'kegg_id':

In [9]:
def get_data_from_kegg(results_dict):
    """Fetch and print the KEGG record of every protein in ``results_dict``.

    Args:
        results_dict: mapping ``{uniprot_id: {'kegg_id': ...}}``.

    Returns:
        The same ``results_dict`` object; it is not modified. The KEGG
        records are only printed, not stored.
    """
    kegg_api_url = 'https://rest.kegg.jp/get/{}'

    for _id, entry in results_dict.items():
        kegg_id = entry['kegg_id']

        print(f'getting UniProt: {_id} -> KEGG: {kegg_id}')

        response = requests.get(kegg_api_url.format(kegg_id))

        if response.ok:
            print(response.text)
        else:
            # Report the miss instead of skipping it silently.
            print(f'KEGG request for {kegg_id} failed (HTTP {response.status_code})')

    return results_dict

In [10]:
def _parse_kegg_sequences(kegg_text):
    """Extract the AASEQ and NTSEQ blocks from the text of one KEGG record."""
    chunks = {'aa': [], 'nt': []}
    mode = None
    for line in kegg_text.strip().splitlines():
        if not line.strip():
            continue
        if line.startswith('AASEQ'):
            mode = 'aa'
        elif line.startswith('NTSEQ'):
            mode = 'nt'
        elif not line[:1].isspace():
            # Any other field name, or the '///' terminator, closes the
            # current sequence block so its text is not taken for residues.
            mode = None
        elif mode is not None:
            # Continuation line: the sequence starts after the 12-column
            # field-name area.
            chunks[mode].append(line[12:])
    return (''.join(chunks['aa']), ''.join(chunks['nt']))


def get_AA_and_NT_from_kegg(kegg_id):
    """Download one KEGG record and return its sequences.

    Args:
        kegg_id: KEGG identifier appended to the ``get`` URL
            (e.g. 'bta:514653').

    Returns:
        Tuple ``(aa, nt)`` with the concatenated AASEQ and NTSEQ lines of
        the record. A section that is absent yields ``''``; an unsuccessful
        request yields ``('', '')``.
    """
    kegg_id_url = 'https://rest.kegg.jp/get/{}'.format(kegg_id)
    response = requests.get(kegg_id_url)
    if not response.ok:
        return ('', '')
    return _parse_kegg_sequences(response.text)

In [15]:
# Look the domain coordinates up by accession. `results_dict` only holds the
# proteins that have a KEGG mapping, so zipping it positionally against the
# coordinates of *all* InterPro results would pair proteins with the wrong
# positions. get_accessions(False) and get_start_and_end() both walk
# data.data_list in order, so this zip is aligned.
places_by_accession = dict(zip(data.get_accessions(False), data.get_start_and_end()))

aa_records = []
nt_records = []
for accession, results in results_dict.items():
    if accession not in places_by_accession:
        print(f'no InterPro location for {accession}, skipping')
        continue
    start, end = places_by_accession[accession]

    kegg = get_AA_and_NT_from_kegg(results['kegg_id'])

    # InterPro positions are 1-based and inclusive, Python slices are 0-based
    # and end-exclusive: residue `start` lives at index start - 1.
    # FASTA header lines must begin with '>'.
    aa_records.append('>' + results['kegg_id'] + '\n' + kegg[0][start - 1:end] + '\n')
    # Three nucleotides per residue.
    nt_records.append('>' + results['kegg_id'] + '\n' + kegg[1][(start - 1) * 3:end * 3] + '\n')

to_aa = ''.join(aa_records)
to_nt = ''.join(nt_records)

# `with` closes the files even if a write fails.
with open("aa.fasta", "w") as aa:
    aa.write(to_aa)
with open("nt.fasta", "w") as nt:
    nt.write(to_nt)