# Read MANE jsons

In [1]:
import json
import pandas as pd

In [2]:
# Read in a JSON file 

def read_json(path):
    # Open and read the JSON file
    with open(path, 'r') as file:
        data = json.load(file)
    return data


In [3]:
# MANE Database
path = 'C:\\Users\\tscha\\OneDrive - FHNW\\Desktop\\Isoform DB\\IsoformDB_research\\IsoformDB_research\\data\\uniprotkb_database_MANE_Select_2025_03_19.json'
data = read_json(path)

Some Data exploration might get removed later on 

In [4]:
for i in range(19173):
    if data["results"][i]["primaryAccession"] == "A0AVT1":
        print("True")
        print(data["results"][i])
        print(i)


True
{'entryType': 'UniProtKB reviewed (Swiss-Prot)', 'primaryAccession': 'A0AVT1', 'secondaryAccessions': ['A6N8M7', 'B2RAV3', 'Q4W5K0', 'Q6UV21', 'Q86T78', 'Q86TC7', 'Q8N5T3', 'Q8N9E4', 'Q9H3T7', 'Q9NVC9'], 'uniProtkbId': 'UBA6_HUMAN', 'entryAudit': {'firstPublicDate': '2007-02-20', 'lastAnnotationUpdateDate': '2025-02-05', 'lastSequenceUpdateDate': '2006-11-28', 'entryVersion': 158, 'sequenceVersion': 1}, 'annotationScore': 5.0, 'organism': {'scientificName': 'Homo sapiens', 'commonName': 'Human', 'taxonId': 9606, 'lineage': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']}, 'proteinExistence': '1: Evidence at protein level', 'proteinDescription': {'recommendedName': {'fullName': {'value': 'Ubiquitin-like modifier-activating enzyme 6'}, 'shortNames': [{'value': 'Ubiquitin-activating enzyme 6'}], 'ecNumbers': [{'evidences': [{'evidenceCode': 'ECO:000

In [5]:
list(data["results"][17].keys())

['entryType',
 'primaryAccession',
 'secondaryAccessions',
 'uniProtkbId',
 'entryAudit',
 'annotationScore',
 'organism',
 'proteinExistence',
 'proteinDescription',
 'genes',
 'comments',
 'features',
 'keywords',
 'references',
 'uniProtKBCrossReferences',
 'sequence',
 'extraAttributes']

In [6]:
for entry in data["results"][17]["uniProtKBCrossReferences"]:
    print(entry)

{'database': 'EMBL', 'id': 'AY359880', 'properties': [{'key': 'ProteinId', 'value': 'AAQ63403.1'}, {'key': 'Status', 'value': '-'}, {'key': 'MoleculeType', 'value': 'mRNA'}]}
{'database': 'EMBL', 'id': 'EF623993', 'properties': [{'key': 'ProteinId', 'value': 'ABR25253.1'}, {'key': 'Status', 'value': '-'}, {'key': 'MoleculeType', 'value': 'mRNA'}]}
{'database': 'EMBL', 'id': 'AB014773', 'properties': [{'key': 'ProteinId', 'value': 'BAB19785.1'}, {'key': 'Status', 'value': '-'}, {'key': 'MoleculeType', 'value': 'mRNA'}]}
{'database': 'EMBL', 'id': 'AK001670', 'properties': [{'key': 'ProteinId', 'value': 'BAA91824.1'}, {'key': 'Status', 'value': 'ALT_INIT'}, {'key': 'MoleculeType', 'value': 'mRNA'}]}
{'database': 'EMBL', 'id': 'AK094969', 'properties': [{'key': 'ProteinId', 'value': 'BAC04463.1'}, {'key': 'Status', 'value': '-'}, {'key': 'MoleculeType', 'value': 'mRNA'}]}
{'database': 'EMBL', 'id': 'AK314371', 'properties': [{'key': 'ProteinId', 'value': 'BAG37000.1'}, {'key': 'Status', '

In [7]:
data["results"][2341]["uniProtKBCrossReferences"][2].keys()

dict_keys(['database', 'id', 'properties'])

In [None]:
"""Find all entries in the MANE Database which have a PDB entry,
which means that they have an experimentally determined structure.
The Plan is to use these as canonical Proteins to find Isoforms
so that we later can compare Alphafold 2 Predicted Isoforms to 
experimentally determind Isoforms.  Some Proteins in this DB already
have Isoforms associated with them so these are also extracted."""


df = pd.DataFrame(columns=["UNIPROT_ACCES", "ISOFORM_IDS"])
mult_pdb_entries = []
for i in range(19173): #loop over the whole MANE Database 
    pdb_counter = 0 # Track if a MANE entry has multiple pdb entries
    for database in data["results"][i]["uniProtKBCrossReferences"]: #check if in the MANE Database has a PDB entry
        if database["database"] == "PDB":
            if database["properties"][0]["value"] != "NMR": #filter and NMR pdb entries as they are not usefull for us now 
                pdb_counter += 1
                isoforms = []
                if "comments" in data["results"][i]: #Check if a entry has Isoforms assocciated with it 
                    for j in data["results"][i]["comments"]:
                        if j["commentType"] == "ALTERNATIVE PRODUCTS":
                                    for k in j["isoforms"]:
                                        isoforms.append(k["isoformIds"][0])
    if (len(isoforms) > 0) and pdb_counter > 1:
        df.loc[i] = [data["results"][i]["primaryAccession"], isoforms]



In [22]:
df

Unnamed: 0,UNIPROT_ACCES,ISOFORM_IDS
17,A0AVT1,"[A0AVT1-1, A0AVT1-2, A0AVT1-3, A0AVT1-4]"
43,A1Z1Q3,"[A1Z1Q3-2, A1Z1Q3-1, A1Z1Q3-4, A1Z1Q3-5, A1Z1Q..."
54,A2RUC4,"[A2RUC4-1, A2RUC4-2]"
62,A4D1E9,"[A4D1E9-1, A4D1E9-2, A4D1E9-3]"
63,A4D1P6,"[A4D1P6-1, A4D1P6-2, A4D1P6-3]"
...,...,...
17243,Q9BSC4,"[Q9BSC4-1, Q9BSC4-2, Q9BSC4-3, Q9BSC4-4]"
17409,Q9P016,"[Q9P016-1, Q9P016-2]"
17431,Q9UGV2,"[Q9UGV2-1, Q9UGV2-2, Q9UGV2-3]"
17463,Q9Y343,"[Q9Y343-1, Q9Y343-2]"


In [8]:
df

Unnamed: 0,UNIPROT_ACCES,PDB_ID,METHOD,ISOFORM_IDS
13,A0AV96,2DIS,NMR,"[A0AV96-1, A0AV96-2]"
16,A0AVK6,4YO2,X-ray,[]
17,A0AVT1,7SOL,X-ray,"[A0AVT1-1, A0AVT1-2, A0AVT1-3, A0AVT1-4]"
24,A0PK00,7F73,EM,[]
26,A1A4S6,2MIO,NMR,[]
...,...,...,...,...
18673,Q9P298,2LON,NMR,[]
18678,Q9UIY3,2DAW,NMR,"[Q9UIY3-1, Q9UIY3-2]"
18684,Q9UNZ5,8INF,EM,[]
19072,E9PRG8,8INF,EM,[]


In [9]:
df["UNIPROT_ACCES"].unique()

array(['A0AV96', 'A0AVK6', 'A0AVT1', ..., 'Q9UNZ5', 'E9PRG8', 'Q96QC4'],
      shape=(8182,), dtype=object)

In [10]:
canonical_prots_w_isoforms = df[df["ISOFORM_IDS"].apply(lambda x: isinstance(x, list) and len(x) > 0)]
canonical_prots_w_isoforms


Unnamed: 0,UNIPROT_ACCES,PDB_ID,METHOD,ISOFORM_IDS
13,A0AV96,2DIS,NMR,"[A0AV96-1, A0AV96-2]"
17,A0AVT1,7SOL,X-ray,"[A0AVT1-1, A0AVT1-2, A0AVT1-3, A0AVT1-4]"
42,A1XBS5,8CEG,X-ray,"[A1XBS5-1, A1XBS5-2, A1XBS5-3, A1XBS5-4, A1XBS..."
43,A1Z1Q3,6Y73,X-ray,"[A1Z1Q3-2, A1Z1Q3-1, A1Z1Q3-4, A1Z1Q3-5, A1Z1Q..."
54,A2RUC4,3AL6,X-ray,"[A2RUC4-1, A2RUC4-2]"
...,...,...,...,...
17431,Q9UGV2,6L4H,X-ray,"[Q9UGV2-1, Q9UGV2-2, Q9UGV2-3]"
17463,Q9Y343,8U9G,X-ray,"[Q9Y343-1, Q9Y343-2]"
17464,Q9Y3B1,6I4Y,X-ray,"[Q9Y3B1-1, Q9Y3B1-2]"
17466,Q9Y3C1,8FKY,EM,"[Q9Y3C1-1, Q9Y3C1-2, Q9Y3C1-3]"


In [None]:
df.to_csv("Canonical_PDB_entries", index = False)
canonical_prots_w_isoforms.to_csv("canonical_prots_w_isoforms", index = False)

## Identify if a Protein has annotaed Isoforms 

In [None]:
iso = read_json('C:\\Users\\tscha\\OneDrive - FHNW\\Desktop\\Isoform DB\\IsoformDB_research\\IsoformDB_research\\data\\uniprotkb_accession_A0A1B0GTW7_2025_03_19.json'
)

In [None]:
iso

In [None]:
iso["results"][0].keys()

In [None]:
for i in iso["results"][0]["comments"]:
    print(i)

In [None]:
for i in iso["results"][0]["comments"]:
    if i["commentType"] == "ALTERNATIVE PRODUCTS":
        for j in i["isoforms"]:
            print(j["isoformIds"])

In [14]:
isoform_ids = pd.DataFrame(columns=["ISOFORM_ID"])
print("start")
for i, entry in canonical_prots_w_isoforms.iterrows():
    iso_list = entry["ISOFORM_IDS"]
    if i%5 == 0:
        iso_list = entry["ISOFORM_IDS"]
        print(iso_list)
    for j, id in enumerate(iso_list):
        if j == 0:
            continue
        else:
            new_row = {'ISOFORM_ID': id}
            isoform_ids.loc[len(isoform_ids)] = new_row

isoform_ids

start
['A6H8Y1-1', 'A6H8Y1-2', 'A6H8Y1-3', 'A6H8Y1-4', 'A6H8Y1-5', 'A6H8Y1-6', 'A6H8Y1-7', 'A6H8Y1-8']
['A6NJ78-1', 'A6NJ78-2', 'A6NJ78-3', 'A6NJ78-4']
['L0R8F8-1', 'Q9NQG6-1', 'Q9NQG6-2']
['O00167-1', 'O00167-2', 'O00167-3']
['O00206-1', 'O00206-2', 'O00206-3']
['O00264-1', 'O00264-2']
['O00338-1', 'O00338-2']
['O00410-1', 'O00410-2', 'O00410-3']
['O00423-1', 'O00423-3']
['O00444-1', 'O00444-2', 'O00444-3']
['O00470-1', 'O00470-2']
['O00481-1', 'O00481-2', 'O00481-3', 'O00481-4']
['O00499-1', 'O00499-2', 'O00499-3', 'O00499-4', 'O00499-5', 'O00499-6', 'O00499-7', 'O00499-8', 'O00499-9', 'O00499-10', 'O00499-11']
['O00541-1', 'O00541-2']
['O00584-1', 'O00584-2']
['O00746-1', 'O00746-2']
['O14490-1', 'O14490-2', 'O14490-3', 'O14490-4', 'O14490-5', 'O14490-6', 'O14490-7']
['O14497-1', 'O14497-2', 'O14497-3']
['O14523-1', 'O14523-2']
['O14662-1', 'O14662-2', 'O14662-3', 'O14662-4', 'O14662-5', 'O14662-6']
['O14727-1', 'O14727-2', 'O14727-3', 'O14727-4', 'O14727-5', 'O14727-6']
['O14786-1'

Unnamed: 0,ISOFORM_ID
0,A0AV96-2
1,A0AVT1-2
2,A0AVT1-3
3,A0AVT1-4
4,A1XBS5-2
...,...
10985,Q9Y343-2
10986,Q9Y3B1-2
10987,Q9Y3C1-2
10988,Q9Y3C1-3


In [15]:
isoform_ids.to_csv("isoform_ids", index=False)