In [1]:
import os
import json

# Numberings are 1-indexed

# Gather every distinct feature "type" that appears in the per-protein JSON files.
files = os.listdir("PTM_indiv")
types = set()
for file in files:
    # The text before the first "." in the file name is used as the key into the JSON.
    protein_id = file.split(".")[0]
    with open("PTM_indiv/" + file, "r") as handle:
        records = json.load(handle)
    types.update(record["type"] for record in records[protein_id])

print(types)
print(len(types))

{'Disulfide bond', 'Glycosylation', 'Signal', 'Chain', 'Modified residue', 'Cross-link', 'Peptide'}
7


In [2]:
# Load the protein property table; its keys are the protein IDs scanned by the cells below.
with open("../PCFs/files_for_ml/protein_props.json") as props_file:
    uniprot_human_proteins = json.load(props_file)

In [3]:
# Spot-check a single record: read the "Sequence Length" field stored for protein A0A0A0MRZ7.
uniprot_human_proteins["A0A0A0MRZ7"]["Sequence Length"]

120

We realise that not every location is covered by a Chain or Signal entry. However, the presence of a chain may still be informative.

Check how many proteins have signals, and how many signals each of them has.


In [6]:
# Count "Signal" entries per protein. A protein only gets a key once a signal is seen,
# so len(signal_count) is the number of proteins that have at least one signal.
signal_count = {}
for protein in uniprot_human_proteins:
    with open("PTM_indiv/" + protein + ".json", "r") as handle:
        records = json.load(handle)
    for record in records[protein]:
        if record["type"] != "Signal":
            continue
        signal_count[protein] = signal_count.get(protein, 0) + 1

print(max(signal_count.values()))
print(len(signal_count))

1
3612


Therefore, only 3612 proteins have signals, and each of them has exactly one. We can encode this as a boolean indicating whether a signal is present. Whether we should also categorise the signal itself is an open question.

Now check whether Glycosylation and Modified residue entries each occur at a single location.

In [10]:
# Report every Glycosylation / Modified residue entry whose start and end positions differ.
for protein in uniprot_human_proteins:
    with open("PTM_indiv/" + protein + ".json", "r") as handle:
        records = json.load(handle)
    for record in records[protein]:
        kind = record["type"]
        if kind not in ("Glycosylation", "Modified residue"):
            continue
        span = record["location"]
        if span["start"]["value"] == span["end"]["value"]:
            continue
        if kind == "Glycosylation":
            print("Protein", protein, "has a glycosylation site with multiple locations")
        else:
            print("Protein", protein, "has a modified residue with multiple locations")

The previous cell produced no output, meaning that every Modified residue and Glycosylation entry occurs at a single location.

We store the number of Glycosylation, Cross-link, Modified residue, Signal and Disulfide bond for each protein in PTM_counts.csv

In [12]:
import pandas as pd

# Tally, for each protein, how many entries of each tracked type its JSON file contains.
# Types that are not keys of the per-protein counter are ignored.
PTM_info = {}
for protein in uniprot_human_proteins:
    with open("PTM_indiv/" + protein + ".json", "r") as handle:
        records = json.load(handle)
    counts = {"Glycosylation": 0, "Cross-link": 0, "Modified residue": 0, "Signal": 0, "Disulfide bond": 0}
    for record in records[protein]:
        kind = record["type"]
        if kind in counts:
            counts[kind] += 1
    PTM_info[protein] = counts

# One row per protein, one column per tracked type.
df = pd.DataFrame.from_dict(PTM_info, orient='index')
df.to_csv("files_for_ml/PTM_counts.csv")

We plan to categorise Glycosylations as either N-linked or O-linked

In [66]:
# For each glycosylation site, count how many of the known link labels occur in its
# description, and report sites that match none or more than one.
possible_links = ["O-linked", "N-linked", "C-linked", "N-beta-linked", "S-linked", "O-alpha-linked"]
for protein in uniprot_human_proteins:
    with open("PTM_indiv/" + protein + ".json", "r") as handle:
        records = json.load(handle)
    for record in records[protein]:
        if record["type"] != "Glycosylation":
            continue
        contains_link = sum(1 for link in possible_links if link in record["description"])
        if contains_link == 0:
            print("Protein", protein, "has a glycosylation site that is not linked to any type")
        elif contains_link > 1:
            print("Protein", protein, "has a glycosylation site that is linked to multiple types")

Each glycosylation site has exactly one link type: O-linked, N-linked, C-linked, N-beta-linked, S-linked or O-alpha-linked.

We extract all the above glycosylation info for every protein and store in glycosylation.csv

In [24]:
# Per protein, count glycosylation sites by link label. Each site is credited to the
# first label (in possible_links order) found in its description, and to no other.
possible_links = ["O-linked", "N-linked", "C-linked", "N-beta-linked", "S-linked", "O-alpha-linked"]
glycosylation_info = {}

for protein in uniprot_human_proteins:
    link_counts = {"O-linked": 0, "N-linked": 0, "C-linked": 0, "N-beta-linked": 0, "S-linked": 0, "O-alpha-linked": 0}
    glycosylation_info[protein] = link_counts
    with open("PTM_indiv/" + protein + ".json", "r") as handle:
        records = json.load(handle)
    for record in records[protein]:
        if record["type"] != "Glycosylation":
            continue
        description = record["description"]
        matched = next((link for link in possible_links if link in description), None)
        if matched is not None:
            link_counts[matched] += 1

len(glycosylation_info)

20434

In [25]:
# Turn the nested dict into a table (outer keys become the row index) and write it to CSV.
df = pd.DataFrame.from_dict(
    glycosylation_info,
    orient='index',
)
df.to_csv("files_for_ml/glycosylation.csv")

Number of proteins with a non-zero count of each glycosylation link type:
1. O-linked = 371
2. N-linked = 4436
3. C-linked = 20
4. N-beta-linked = 11
5. S-linked = 3
6. O-alpha-linked = 4


Let's study the categories within the Modified residue entity type.

In [78]:
# Keywords looked for (case-insensitively, as substrings) in the description of every
# "Modified residue" entry. The list is built once here rather than once per entry.
possible_residues = [
    "Phosphoserine", "Phosphothreonine", "Phosphotyrosine", "acetylmethionine",
    "methylarginine", "Dimethylated arginine", "acetyllysine", "acetylalanine",
    "acetylthreonine", "Cysteine", "hydroxyarginine", "succinyllysine",
    "acetylserine", "polyglutamate", "methyllysine", "hydroxyasparagine",
    "pyridoxal phosphate", "Citrulline", "ribosylserine",
    "acetylglycine", "Phenylalanine", "Proline", "Pyrrolidone", "lipoyllysine",
    "biotinyllysine", "glutamine", "pantetheine 4'-phosphoryl", "carboxyglutamate",
    "acetylvaline", "Phosphohistidine", "Sulfotyrosine",
    "Methionine amide", "Leucine amide", "methylhistidine",
]
# Lower-case the keywords up front so the inner loop only lower-cases each description once.
residue_keywords = [residue.lower() for residue in possible_residues]

none_count = 0      # modified residues whose description matches no keyword
multiple_count = 0  # modified residues whose description matches more than one keyword
for protein in uniprot_human_proteins:
    with open("PTM_indiv/" + protein + ".json", "r") as f:
        data = json.load(f)
    for entity in data[protein]:
        if entity["type"] != "Modified residue":
            continue
        description = entity["description"].lower()
        contains_residue = sum(1 for keyword in residue_keywords if keyword in description)

        if contains_residue == 0:
            none_count += 1
            if none_count == 1:
                # Print only the first unmatched protein, as an example to inspect.
                print("Protein", protein, "has a modified residue that is not of the known types")
        elif contains_residue > 1:
            multiple_count += 1

print(none_count)
print(multiple_count)  # 0 in the recorded run: no description matched more than one keyword

Protein O60563 has a modified residue that is not of the known types
2193
0
