In [12]:
import os
# Directory that the rest of the notebook treats as its working directory:
# the query table and the download folder are both opened via relative paths.
# Set the RUB_CO_OCCURRENCE_DIR environment variable to run the notebook on
# another machine; the original location is kept as the fallback so an
# existing setup behaves exactly as before.
# NOTE(review): the name `path` is rebound to the os.path module by the
# `from os import path` line in the import cell, so do not rely on it as a
# string after that cell has run.
path = os.environ.get('RUB_CO_OCCURRENCE_DIR',
                      '/Users/tonglen/downloads/rub-co-occurrence-master_2/')
os.chdir(path)

# Last expression of the cell: displays the directory actually in effect.
os.getcwd()

'/Users/tonglen/Downloads/rub-co-occurrence-master_2'

In [13]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from os import path
from matplotlib import pyplot as plt
from urllib.parse import urlencode
from urllib.request import urlretrieve

In [14]:
def _is_missing(value):
    """Return True if a search parameter should count as "not supplied".

    Covers None, float NaN (what pandas produces for an empty cell) and
    empty or whitespace-only strings.
    """
    if value is None:
        return True
    # NaN is the only float that is unequal to itself.
    if isinstance(value, float) and value != value:
        return True
    return isinstance(value, str) and not value.strip()


def format_query_url(pfam=None, interpro=None, EC=None):
    """Build a UniProtKB REST query URL for one search criterion.

    Exactly one of the three parameters is used. If several are supplied the
    first usable one wins, in the order pfam, interpro, EC.

    Args:
        pfam: Pfam identifier, e.g. 'pf00101'.
        interpro: InterPro identifier.
        EC: Enzyme Commission number, e.g. '3.1.3.37'.

    Returns:
        The full URL as a string, requesting tab-separated output sorted by
        score with the fixed column set listed below.

    Raises:
        ValueError: if none of the parameters carries a usable value
            (None, NaN and blank strings all count as missing).
    """
    # NOTE(review): this appears to be UniProt's legacy endpoint; the current
    # programmatic API is served from rest.uniprot.org with different field
    # names. Confirm this URL still responds before relying on it.
    base_url = 'https://www.uniprot.org/uniprot/?'
    columns = (  # columns we want to request
        'id,entry name,genes,genes(ORF),organism,organism-id,length,mass,sequence,ec,fragment,' +
        'comment(PATHWAY),annotation score,reviewed,' +
        'feature(INTRAMEMBRANE),feature(TRANSMEMBRANE),database(INTERPRO),database(PFAM),database(PROSITE),' +
        'lineage(SUPERKINGDOM),lineage(PHYLUM),lineage(CLASS),lineage(ORDER),comment(COFACTOR),protein names'
    )

    # Build the query from the first parameter that actually holds a value.
    if not _is_missing(pfam):
        query_str = 'database:(type:pfam %s)' % pfam
    elif not _is_missing(interpro):
        query_str = 'database:(type:interpro %s)' % interpro
    elif not _is_missing(EC):
        query_str = 'ec:%s' % EC
    else:
        # Raised explicitly rather than via `assert`, which is removed when
        # Python runs with -O and would let an empty query through.
        raise ValueError('No search parameter supplied: provide one of pfam, interpro or EC.')

    # urlencode the query parameters and append them to the base URL.
    url_query = {'query': query_str,
                 'sort': 'score',
                 'columns': columns,
                 'format': 'tab'}
    return base_url + urlencode(url_query)

In [15]:
# Load the query table: one query per row, identified by a PFAM ID, an
# INTERPRO ID or an EC class. The file is tab-separated despite its .csv name.
queries_filename = 'uniprotKB_queries.csv'
queries_raw = pd.read_csv(queries_filename, sep='\t')
# pandas reads empty cells as NaN, which does not evaluate to False, so the
# truthiness checks used when iterating over the rows would not see them as
# missing. Replace NaN with None, which does evaluate to False.
# The cast to object has to come first: a float column (e.g. one with no
# values at all) cannot hold None and would store it as NaN again.
queries_df = queries_raw.astype(object).where(queries_raw.notnull(), None)
queries_df

Unnamed: 0,gene_name,pfam,interpro,EC,notes,Unnamed: 5,Unnamed: 6
0,rubisco_small,pf00101,,,,,
1,FormIV,,,,,,
2,FormIIIc,,,,,,
3,FormIIIb,,,,,,
4,FormIe,,,,,,
5,FormIC,,,,,,
6,FormIBc,,,,,,
7,FormIAq,,,,,,
8,FormIAc,,,,,,
9,FormIA_all,,,,,,


In [16]:
# Download one result file per row of the query table into
# uniprotKB_downloads/<gene_name>.csv, skipping rows that define no query and
# files that are already present.

# Make sure the target folder exists so the first download cannot fail on it.
os.makedirs('uniprotKB_downloads', exist_ok=True)

for row_id, row in queries_df.iterrows():
    # An empty cell may arrive as None, NaN or a blank string depending on how
    # the table was loaded; normalise all of those to None so the check below
    # and format_query_url both see a plain "not supplied".
    search_terms = {}
    for field in ('pfam', 'interpro', 'EC'):
        value = row[field]
        blank = pd.isnull(value) or not str(value).strip()
        search_terms[field] = None if blank else value

    # Need at least one of the three fields to make a query.
    if all(term is None for term in search_terms.values()):
        print('No query, skipping')
        continue

    # Make the query
    q = format_query_url(**search_terms)
    print(row.gene_name)
    print(q)

    # Format the output filename
    out_fname = 'uniprotKB_downloads/%s.csv' % row.gene_name

    # skip if output exists.
    if path.exists(out_fname):
        print('file exists, skipping')
        continue

    try:
        # Read the response from the Uniprot API into a local file.
        urlretrieve(q, out_fname)
    except OSError as e:
        # URLError, HTTPError and ContentTooShortError are all OSError
        # subclasses, as are local file errors. Report and move on.
        print(e)
        # Remove any partial file so that the exists-check above does not
        # mistake it for a finished download on the next run.
        if path.exists(out_fname):
            os.remove(out_fname)

rubisco_small
https://www.uniprot.org/uniprot/?query=database%3A%28type%3Apfam+pf00101%29&sort=score&columns=id%2Centry+name%2Cgenes%2Cgenes%28ORF%29%2Corganism%2Corganism-id%2Clength%2Cmass%2Csequence%2Cec%2Cfragment%2Ccomment%28PATHWAY%29%2Cannotation+score%2Creviewed%2Cfeature%28INTRAMEMBRANE%29%2Cfeature%28TRANSMEMBRANE%29%2Cdatabase%28INTERPRO%29%2Cdatabase%28PFAM%29%2Cdatabase%28PROSITE%29%2Clineage%28SUPERKINGDOM%29%2Clineage%28PHYLUM%29%2Clineage%28CLASS%29%2Clineage%28ORDER%29%2Ccomment%28COFACTOR%29%2Cprotein+names&format=tab
file exists, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
No query, skipping
sbpase
https://www.uniprot.org/uniprot/?query=ec%3A3.1.3.37&sort=score&columns=id%2Centry+name%2Cgenes%2Cgenes%28ORF%29%2Corganism%2Corganism-id%2Clength%2Cmass%2Csequenc

hps
https://www.uniprot.org/uniprot/?query=ec%3A4.1.2.43&sort=score&columns=id%2Centry+name%2Cgenes%2Cgenes%28ORF%29%2Corganism%2Corganism-id%2Clength%2Cmass%2Csequence%2Cec%2Cfragment%2Ccomment%28PATHWAY%29%2Cannotation+score%2Creviewed%2Cfeature%28INTRAMEMBRANE%29%2Cfeature%28TRANSMEMBRANE%29%2Cdatabase%28INTERPRO%29%2Cdatabase%28PFAM%29%2Cdatabase%28PROSITE%29%2Clineage%28SUPERKINGDOM%29%2Clineage%28PHYLUM%29%2Clineage%28CLASS%29%2Clineage%28ORDER%29%2Ccomment%28COFACTOR%29%2Cprotein+names&format=tab
file exists, skipping
phi
https://www.uniprot.org/uniprot/?query=ec%3A5.3.1.27&sort=score&columns=id%2Centry+name%2Cgenes%2Cgenes%28ORF%29%2Corganism%2Corganism-id%2Clength%2Cmass%2Csequence%2Cec%2Cfragment%2Ccomment%28PATHWAY%29%2Cannotation+score%2Creviewed%2Cfeature%28INTRAMEMBRANE%29%2Cfeature%28TRANSMEMBRANE%29%2Cdatabase%28INTERPRO%29%2Cdatabase%28PFAM%29%2Cdatabase%28PROSITE%29%2Clineage%28SUPERKINGDOM%29%2Clineage%28PHYLUM%29%2Clineage%28CLASS%29%2Clineage%28ORDER%29%2Ccomment%2