In [1]:
import json

# Read the per-protein structure annotation file; its top-level keys are the
# protein IDs used throughout the rest of the notebook.
with open('entire_pdb_info.json') as pdb_file:
    pdb_info = json.load(pdb_file)

# Iterating a dict yields its keys, so this matches list(pdb_info.keys()).
proteins = list(pdb_info)

print("Total Human proteins from Uniprot that are Swissprot Verified:", len(proteins))

Total Human proteins from Uniprot that are Swissprot Verified: 20434


In [3]:
# Read the per-protein property records (later cells use their "Sequence"
# and "Sequence Length" fields).
with open("../PCFs/files_for_ml/protein_props.json") as props_file:
    uniprot_data = json.load(props_file)

print("Total Human proteins from Uniprot:", len(uniprot_data))

Total Human proteins from Uniprot: 20434


In [6]:
# Spot-check a single record: show its sequence, then its recorded length,
# each on its own line.
record = uniprot_data["P05067"]
print(record["Sequence"], record["Sequence Length"], sep="\n")

MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN
770


In [19]:
def generate_pdb_sequence(uniprot_id):
    """Build a per-residue secondary-structure label string for one protein.

    Reads the notebook-level ``pdb_info`` and ``uniprot_data`` dictionaries.
    Every position starts as "N"; each annotated component then overwrites
    its residue range with a one-letter code:

    * "Helix"       -> "H"
    * "Beta strand" -> "E"
    * "Turn"        -> "T"

    Components of any other type are reported with ``print`` and skipped.
    Components are applied in list order, so a later component overwrites an
    earlier one where their ranges overlap.

    Parameters
    ----------
    uniprot_id : str
        Key present in both ``pdb_info`` and ``uniprot_data``.

    Returns
    -------
    tuple[str, str]
        ``(seq, pdb_seq)``: the stored sequence and the label string. The
        label string always has exactly ``"Sequence Length"`` characters.

    Raises
    ------
    KeyError
        If ``uniprot_id`` is missing from either dictionary, or a component
        lacks the expected ``type`` / ``location`` fields.
    """
    info = pdb_info[uniprot_id]
    seqlen = uniprot_data[uniprot_id]["Sequence Length"]
    seq = uniprot_data[uniprot_id]["Sequence"]

    symbols = {"Helix": "H", "Beta strand": "E", "Turn": "T"}

    # A mutable buffer avoids rebuilding the whole string for every component
    # and, unlike slice-and-concatenate, can never change the total length.
    labels = ["N"] * seqlen
    for component in info:
        start = component["location"]["start"]["value"]  # 1-indexed, inclusive
        end = component["location"]["end"]["value"]      # 1-indexed, inclusive
        comp_type = component["type"]

        symbol = symbols.get(comp_type)
        if symbol is None:
            print("Unknown component type:", comp_type)
            continue

        # Convert to a 0-indexed half-open range and clamp it to the sequence,
        # so out-of-range or inverted coordinates cannot misalign the labels.
        lo = max(start - 1, 0)
        hi = min(end, seqlen)
        if lo < hi:
            labels[lo:hi] = [symbol] * (hi - lo)

    return seq, "".join(labels)


In [20]:
# Try the labelling on one protein and display both lengths followed by the
# two strings themselves.
example = generate_pdb_sequence("P05067")
seq, pdb_seq = example
(len(seq), len(pdb_seq)) + example

(770,
 770,
 'MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN',
 'NNNNNNNNNNNNNNNNNNNNNNNNNHHHNNNNEEENNNNNNNEEENTTTNNEEENEEENNNNNNNHHHHHHHHHHHNNNNNEEEEEENNNNEEENNEEETTTEEEENNNEEENNEEEEENNNNNNNNNNNNNNEEEEEENNNNNNNHHHHHHHHHHHHHHNNEEEEEEEEEEEETTTEEEEEEEEEEENNNNNNNNNNNNNNNNNNNNNNN

In [33]:
# Label every protein and keep both strings under its ID.
pdb_sequences = {}
for protein in proteins:
    seq, pdb_seq = generate_pdb_sequence(protein)
    pdb_sequences[protein] = dict(seq=seq, pdb_seq=pdb_seq)

# Write the mapping to disk as JSON indented by 4 spaces.
with open("files_for_ml/pdb_sequences.json", "w") as out_file:
    json.dump(pdb_sequences, out_file, indent=4)