In [1]:
import sys, os, json
sys.path.append('../')
from scripts.uniprot import UniprotInterface
import pandas as pd

In [2]:
def replace_char_at_index(s, i, new_char):
    """Return a copy of ``s`` in which the item at index ``i`` is replaced.

    Args:
        s: The sequence to copy (a string in this notebook).
        i: Zero-based index of the item to replace; negative indices are
            rejected rather than counted from the end.
        new_char: The value spliced in at position ``i``. It is concatenated
            as-is, so it may be longer than one character.

    Returns:
        ``s[:i] + new_char + s[i+1:]``.

    Raises:
        IndexError: If ``i`` is negative or not smaller than ``len(s)``.
    """
    out_of_range = i < 0 or i >= len(s)
    if out_of_range:
        raise IndexError("Index out of range.")
    prefix = s[:i]
    suffix = s[i + 1:]
    return prefix + new_char + suffix

In [11]:
# Query configuration used by the cells below.
# `from_db` / `to_db` are forwarded to the ID-mapping submission together with `ids`.
from_db = "UniProtKB_AC-ID"
to_db = "UniProtKB"
ids = ["Q75UA4"]

# Substring looked for in each variant's description when filtering features.
# NOTE(review): confirm this filter is expected to match variants of the accession above.
disease = "CRC"

In [12]:
# Create the project's UniProt client (imported from scripts.uniprot).
downloader = UniprotInterface()

# Submit the ID-mapping request; the returned value is kept as `job_id`
# and handed to the readiness check and result-link lookup in the next cell.
job_id = downloader.submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids)

In [13]:
# Fetch the mapping results for the submitted job.
# Fail loudly when the readiness check is falsy: otherwise `results` would stay
# undefined (or stale from an earlier run of this kernel) and the cells below
# would break later with a much less helpful error.
if not downloader.check_id_mapping_results_ready(job_id):
    raise RuntimeError(
        f"ID-mapping job {job_id!r} is not ready yet; re-run this cell to fetch the results."
    )
link = downloader.get_id_mapping_results_link(job_id)
results = downloader.get_id_mapping_results_search(link)

Fetched: 1 / 1


In [10]:
# Persist the raw mapping payload next to the notebook as JSON.
with open("results.json", "w") as results_file:
    results_file.write(json.dumps(results))

In [14]:
# Display the sequence string stored under the first mapping result's `to` entry.
results['results'][0]['to']['sequence']['value']

'MKFGKFVLLAASTALAVVGLGGPAAADSTPQAQPSIIGGSNATSGPWAARLFVNGRQNCTATIIAPQYILTAKHCVSSSGTYTFRIGSLDQTSGGTMATGSTITRYPGSADLAIVRLTTSVNATYSPLGSVGDVSVGQNVSVYGWGATSQCGSEINCQSRYLKVATVRVNSISCSDYTGGVAVCANRVNGITAGGDSGGPMFASGRQVGVASTSDRVNNTAYTNITRYRSWISQVAGV'

In [15]:
# Print the `from` value of every mapping result, one per line.
for result in results['results']:
    print(result['from'])

Q75UA4


In [None]:
def _apply_variant(sequence, start, end, replacement):
    """Return ``sequence`` with residues ``start``..``end`` swapped for ``replacement``.

    Args:
        sequence: The unmodified sequence string.
        start: 1-based position of the first affected residue.
        end: 1-based position of the last affected residue (inclusive).
        replacement: Residues inserted in place of the span; an empty string
            deletes the span.

    Raises:
        IndexError: If the span does not lie inside ``sequence``.
    """
    if start < 1 or end < start or end > len(sequence):
        raise IndexError("Index out of range.")
    # `end` is inclusive and 1-based, so the untouched tail starts at index `end`.
    return sequence[:start - 1] + replacement + sequence[end:]


def _variant_row(feature, sequence):
    """Build one ``[variant id, position, change, sequence]`` row for a variant feature."""
    location_start = feature['location']['start']['value']
    location_end = feature['location']['end']['value']

    # A variant without any alternative sequence is treated as a deletion.
    # NOTE(review): assumes deletions are encoded with an absent/empty
    # `alternativeSequences` list - confirm against the UniProt JSON schema.
    alternative = feature.get('alternativeSequence') or {}
    original_sequence = alternative.get('originalSequence', '')
    candidates = alternative.get('alternativeSequences') or []
    # Only the first listed alternative is exported.
    new_sequence = candidates[0] if candidates else ''

    if location_start == location_end:
        position = location_start
    else:
        position = f"{location_start}-{location_end}"
    change = f"{original_sequence}->{new_sequence}" if new_sequence else "missing"

    return [
        feature.get('featureId'),  # None when the feature carries no id
        position,
        change,
        _apply_variant(sequence, int(location_start), int(location_end), new_sequence),
    ]


# Export every natural variant of the first mapping result whose description
# mentions `disease`, together with the sequence that results from applying it.
entry = results['results'][0]['to']
sequence = entry['sequence']['value']
export_data = [
    _variant_row(feature, sequence)
    for feature in entry.get('features', [])
    if feature.get('type') == 'Natural variant'
    and disease in feature.get('description', '')
]
export_data

In [None]:
# Tabulate the exported rows; the column names follow the order in which
# each row was filled (feature id, position, change, resulting sequence).
df = pd.DataFrame(export_data, columns=["variant id", "position", "change", "sequence"])
df

In [None]:
# Write the variant table to results.csv, leaving out the DataFrame index.
df.to_csv("results.csv", index=False)

In [46]:
# Print the cross-reference list of every reference attached to the first mapping result.
result = results['results'][0]
for reference in result['to'].get('references', []):
    # Tolerate citations without cross references (printed as an empty list)
    # instead of aborting the whole listing with a KeyError.
    print(reference.get('citation', {}).get('citationCrossReferences', []))

[{'database': 'PubMed', 'id': '11133465'}, {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}]
[{'database': 'PubMed', 'id': '16237016'}, {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}]


In [49]:
# Collect the cross references and title of every reference of the first mapping result.
references_list = []
result = results['results'][0]

# Missing fields are handled per reference: a citation lacking a key no longer
# silently discards itself and every reference after it.
for reference in result['to'].get('references', []):
    citation = reference.get('citation', {})
    references_list.append({
        # NOTE(review): the key is spelled "citacion..." (not "citation...");
        # kept unchanged because it is part of the produced data.
        "citacionCrossReferences": citation.get('citationCrossReferences', []),
        "title": citation.get('title'),  # None when the citation has no title
    })

In [50]:
# Display the collected reference records.
references_list

[{'citacionCrossReferences': [{'database': 'PubMed', 'id': '11133465'},
   {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}],
  'title': 'Purification and characterization of an extracellular poly(L-lactic acid) depolymerase from a soil isolate, Amycolatopsis sp. strain K104-1.'},
 {'citacionCrossReferences': [{'database': 'PubMed', 'id': '16237016'},
   {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}],
  'title': 'Gene cloning and molecular characterization of an extracellular poly(L-lactic acid) depolymerase from Amycolatopsis sp. strain K104-1.'}]