In [3]:
def parse_input_file(file_path):
    """
    Reads a file containing tab-separated data and returns a dictionary.
    The first entry is the key, and the second is the SMILES string.
    SMILES strings are broken into lines of maximally 70 characters for readability.
    """
    peptide_dict = {}

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                entries = line.split('\t')
                if len(entries) == 2:
                    key, smiles = entries
                    # Break the SMILES string into 70-character lines
                    smiles_lines = [smiles[i:i+70] for i in range(0, len(smiles), 70)]
                    peptide_dict[key] = smiles_lines

    return peptide_dict

def write_output_file(peptide_dict, output_file):
    """
    Writes the peptide dictionary to a Python file with formatted SMILES strings.
    """
    with open(output_file, 'w') as file:
        file.write("peptide_dict = {\n")
        for key, smiles_lines in peptide_dict.items():
            file.write(f"    \"{key}\": (\n")
            for line in smiles_lines:
                file.write(f"        \"{line}\"\n")
            file.write("    ),\n")
        file.write("}\n")

def main():
    input_file = "/Users/ntw/Downloads/1276607468946545165.txt"  # Modify this if your input file has a different name
    output_file = "/Users/ntw/Downloads/peptide_dictionary.txt"

    peptide_dict = parse_input_file(input_file)
    write_output_file(peptide_dict, output_file)

    print(f"Dictionary has been written to {output_file}")

if __name__ == "__main__":
    main()


Dictionary has been written to /Users/ntw/Downloads/peptide_dictionary.txt


In [60]:
##Modified from a ChatGPT draft.
##Prompt: Using the uniprot ID as an input, make an input file for locally-installed 
## Boltz-1 with the protein sequence and post-translational modification data from uniprot, 
## including phosphorylation, methylation, glycosylation, and cross-linking. 


import requests
import os
import re

def download_uniprot_fasta(uniprot_id, fasta_outfile):
    """
    Downloads the FASTA sequence for a given UniProt ID.
    
    Args:
        uniprot_id (str): The UniProt accession number.

    Returns:
        str: The FASTA formatted protein sequence.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)
    
    if response.status_code == 200:
        with open(fasta_outfile, "w") as file:
            file.write(response.text)
    else:
        raise ValueError(f"Error downloading sequence for {uniprot_id}: {response.status_code}")

def append_matching_parts(text, possibilities, result_list):
    """
    Appends the parts of a string that match a list of possibilities to a result list.

    Args:
        text: The string to search within.
        possibilities: A list of possible strings to match.
        result_list: The list to append the matching parts to.
    """
    
    pattern = r"(" + "|".join(re.escape(p) for p in possibilities) + r")"
    
    for match in re.finditer(pattern, text):
        result_list.append(match.group(0))



def file_to_list(file_path):
    lines = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                lines.append(line.rstrip('\n'))
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    return lines

# Dictionary to map common PTMs to their CCD identifiers
ptm_to_ccd = {
    "phosphorylation": {
        "phosphoserine": "phs",     # Phosphorylation on serine
        "phosphothreonine": "pht",  # Phosphorylation on threonine
        "phosphotyrosine": "phty"   # Phosphorylation on tyrosine
    },
    "methylation": {
        "methylation (Lysine)": "me",       # Methylation on lysine
        "methylation (Arginine)": "mearg",  # Methylation on arginine
        "dimethylation (Lysine)": "me2",    # Dimethylation on lysine
        "trimethylation (Lysine)": "me3"    # Trimethylation on lysine
    },
    "glycosylation": {
        "N-linked Glycosylation": "ngly",   # N-linked glycosylation
        "O-linked Glycosylation": "ogly"    # O-linked glycosylation
    },
    "acetylation": {
        "acetylation (Lysine)": "ace"  # Acetylation on lysine
    },
    "ubiquitination": {
        "ubiquitination": "ubiq"  # Ubiquitin attachment
    },
    "sumoylation": {
        "sumoylation": "sumo"  # SUMO attachment
    },
    "crosslinking": {
        "disulfide Bond": "dsb",  # Disulfide bond
        "isopeptide Bond": "iso"  # Isopeptide bond
    }
}

seqGLP1R = { "Tirzepatide" : "YAEGTFTSDYSIALDKIAQKAFVQWLIAGGPSSGAPPPS", \
           "Exenatide" : "HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS", \
           "Liraglutide" : "HAEGTFTSDVSSYLEGQAAKEEFIAWLVRGRG", \
           "Lixisenatide" : "HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPSKKKKKK", \
           "Albiglutide" : "HGEGTFTSDVSSYLEGQAAKEFIAWLVKGRHGEGTFTSDVSSYLEGQAAKEFIAWLVKGR" \
           "DAHKSEVAHRFKDLGEENFKALVLIAFAQYLQQCPFEDHVKLVNEVTEFAKTCVADESAE" \
           "NCDKSLHTLFGDKLCTVATLRETYGEMADCCAKQEPERNECFLQHKDDNPNLPRLVRPEV" \
           "DVMCTAFHDNEETFLKKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLP" \
           "KLDELRDEGKASSAKQRLKCASLQKFGERAFKAWAVARLSQRFPKAEFAEVSKLVTDLTK" \
           "VHTECCHGDLLECADDRADLAKYICENQDSISSKLKECCEKPLLEKSHCIAEVENDEMPA" \
           "DLPSLAADFVESKDVCKNYAEAKDVFLGMFLYEYARRHPDYSVVLLLRLAKTYETTLEKC" \
           "CAAADPHECYAKVFDEFKPLVEEPQNLIKQNCELFEQLGEYKFQNALLVRYTKKVPQVST" \
           "PTLVEVSRNLGKVGSKCCKHPEAKRMPCAEDYLSVVLNQLCVLHEKTPVSDRVTKCCTES" \
           "LVNRRPCFSALEVDETYVPKEFNAETFTFHADICTLSEKERQIKKQTALVELVKHKPKAT" \
           "KEQLKAVMDDFAAFVEKCCKADDKETCFAEEGKKLVAASQAALGL", \
           "Dulaglutide" : "HGEGTFTSDVSSYLEEQAAKEFIAWLVKGGGGGGGSGGGGSGGGGSAESKYGPPCPPCPA" \
           "PEAAGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSQEDPEVQFNWYVDGVEVHNAKTKP" \
           "REEQFNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKGLPSSIEKTISKAKGQPREPQVYTL" \
           "PPSQEEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSRLT" \
           "VDKSRWQEGNVFSCSVMHEALHNHYTQKSLSLSLG", \
           "Semaglutide" : "HAEGTFTSDVSSYLEGQAAKEFIAWLVRGRG", \
            ##Photorhabdus Aegyptia
           "A0A2K2CCW6" : "MTCFSFLCGRRIDSSQESQHMGRDDEELGSIKNVRCYTYRELRNATEGFS" \
           "AANKIGEGGFGSVYKGRLKHGKIAAIKVLSAESRQGVEEFLAEIKAMSEIEHENLVKLYGCCVEDN" \
           "HRILVYNYLENNSLAQTLLDGGHSHSNIQFSWRTRTKICIGVARGLTFLHEEVKPYIVHRDIKASN" \
           "ILLDKDLTAKISDFGLAKLIPDNQTHVSTRVAGTLGYLAPEYAIRGKLTRKADLYSFGVLLLEIVS" \
           "GRNNTNTRLPVEEQYLLERTWELYERRELVSLVDASLNGDFNAEEACRFLKIGLLCTQDDPNLRPSM" \
           "STVVKMLTGRKNFDERKITKPGLISDFMDLKVRAPSKTKASASTSFNVSSGSDNQDTSILTSENSSSV" \
           "TMTAFTELYNRSI", \
           "A0A7X5QNK9" : "MFNLSIQSANTRIDINHNELSQATSTDKTCHCCEPLLNNTIDNPLTAPTSVIT" \
           "QWNEKSVVTNKQIEANISRLASESTAAHKTVDSLLSNLVKLFVRVEGNELTAITKMLNAYTGDKPGFS" \
           "ILGQLGAGGFTRIMSEAQQSGVRPDRANTEPLTLREKATAYYDLTNSAYFRDTLEKIYSSPELKSEFK" \
           "DIIDIGYLESKNFAPARGSTKENPLPDQLALYTFKNENKFNASENSELYQVTKQEKGKEVISNPALAG" \
           "NSLTKVQNDFRTPAPDKDSVWSSAASLEADNRLTNRELEFARLNPRNRLDRTDYQFGQSGPIKAHQEL" \
           "AKENIIVQRGNGFAVWNVKENTGFSKDTALHNLPTVAAPSGTTDRFITAARLLGTGLKNDLSLGTPA" \
           "NGESPEQSIQRGEREMKELTRWLATGYLVDDNHHSMIEVNLGAANHGLESQWGLNLYTEPFSSPIHA" \
           "KGFSISSQEILAELENREDVRTDYSTFKKDLYGGSRAVVNANGSIRTNSR", 
           "A0A022PL40" : "MPDFSIQRANTRIDIHQTELHQPTATDKTSHCCEPLLNNTIDNPLTAPTSVIT" \
           "RWNEKSVVTNKQIETNISRLASESTAAHKTVDSLLSNLVKLFVLAEGNELTAVTKMLNAYTGDKPGFS" \
           "IVGQLGAGGFTRIISEARQNGVRPDRADTEPLTLREKATAYYDLTNSAYFRDTLEKIYSSPELKSEFK" \
           "DIIDIGYLESKNFAPARGSTKEKPLPDQLNLYTFKNENKLNASENPELYQVTKQKDGKEVISNPALAG" \
           "NSLTEVQRDFRTAAPDKDSVWATAASLEADNRLTNRELAFASLNPRNRLDRTDYQFGRSEPIKAHQEL" \
           "AKENIIVQRGNGFSVWNVKENTGFSQDAALHNLPTVAAPSGTTDRFITAARLLGAGLKNDLALGTPTN" \
           "GESTEQSIQRGDREMKELTRWLATGYLVDDNHHSMIEVNLGAANHGLAPQWGLNLYTEPFSSPIYAKG" \
           "FSVSSQEILAELEGRDDVHTDYSTFRKDLYGGNRATVNADGSIKTSSK", 
           "A0A329XCW0" : "MPDFSIQRANTRIDIHQTELHQPTATDKTCHCCEPLLNNTIDNPLTAPTSVITG" \
           "WNEKSVVANKQIEANISRLASESTAAHKTVDSLLSNLVKLFVRAEGNELTAITKMLNAYTGDKPGFSI" \
           "VGQLGAGGFTRIISEARQNGVRPDRANTEPLTLREKATAYYDITNSAYFRDTLEKVYSSPELKSEFKD" \
           "IIDIGYLESKNFAPARGSTKEKPLPDQLALYTFKNENKFNASENPELYQVTKQANGQEVISNPALAGN" \
           "SLTKVQNDFRAPAPDKDSVWSTAASLEADNRLTNRELAFASLNPRNRLDRTDYQFGQSEPIKAHQELA" \
           "KENIIVQRGNGFSVWNVKENTGFSKDAALHNLPTVAAPSGTTDRFITAARLLSAGLKNDLALGAPTNG" \
           "ESPEQSIQRGEREMKELTRWLATGYLVDDNHHSMIEVNLGAANHGLAPQWGLNLYTEPFSSPIHAKGF" \
           "SVSSQEILAELEGRDDVHTDYSIFRKDLYGGNRATVNADGSIKTSSK",
            "Big_dynorphin": (
                "CC[C@H](C)[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H]("
                "CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC2=CNC3=CC="
                "CC=C32)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCC(=O)N"
                ")C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CC4=CC=C(C=C4"
                ")O)C(=O)NCC(=O)NCC(=O)N[C@@H](CC5=CC=CC=C5)C(=O)N[C@@H](CC(C)C)C(=O)N["
                "C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(=O)N)C(=O)N["
                "C@@H](CC6=CC=CC=C6)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H]("
                "C(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)O)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H"
                "](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC7=CC=CC=C7)NC(=O)CNC(=O)"
                "CNC(=O)[C@H](CC8=CC=C(C=C8)O)N"
            ),
            "Dynorphin_A_1-6": (
                "CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)O)NC(=O)C(CC1=CC=CC=C1)NC(=O)CNC(=O)CN"
                "C(=O)C(CC2=CC=C(C=C2)O)N"
            ),
            "Dynorphin_A_1-13": (
                "CC[C@H](C)[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H]("
                "CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)O)NC(=O)[C@H](CCCN=C"
                "(N)N)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC2=CC=CC=C"
                "2)NC(=O)CNC(=O)CNC(=O)[C@H](CC3=CC=C(C=C3)O)N"
            ),
            "Dynorphin_A_2-17": (
                "CC[C@H](C)[C@@H](C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H]("
                "CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC2=CNC3=CC="
                "CC=C32)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCC(=O)N"
                ")C(=O)O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C"
                ")C)NC(=O)[C@H](CC4=CC=CC=C4)NC(=O)CNC(=O)CN"
            ),
            "Dynorphin_B": (
                "C[C@@H]([C@@H](C(=O)O)NC(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC(=O)[C@H]("
                "CCCCN)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CCCN=C"
                "(N)N)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC2=CC=CC=C"
                "2)NC(=O)CNC(=O)CNC(=O)[C@H](CC3=CC=C(C=C3)O)N)O"
            ),
            "alpha-Neoendorphin": (
                "CC(C)C[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C"
                "C1=CC=C(C=C1)O)C(=O)N2CCC[C@H]2C(=O)N[C@@H](CCCCN)C(=O)O)NC(=O)[C@H](C"
                "C3=CC=CC=C3)NC(=O)CNC(=O)CNC(=O)[C@H](CC4=CC=C(C=C4)O)N"
            ),
            "beta-Neoendorphin": (
                "CC(C)C[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C"
                "C1=CC=C(C=C1)O)C(=O)N2CCC[C@H]2C(=O)O)NC(=O)[C@H](CC3=CC=CC=C3)NC(=O)C"
                "NC(=O)CNC(=O)[C@H](CC4=CC=C(C=C4)O)N"
            ),
            "Leu-Enkephalin": (
                "CC(C)C[C@@H](C(=O)O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)CNC(=O)CNC(=O)[C@H]"
                "(CC2=CC=C(C=C2)O)N"
            ),
            "Met-Enkephalin": (
                "CSCC[C@@H](C(=O)O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)CNC(=O)CNC(=O)[C@H](C"
                "C2=CC=C(C=C2)O)N"
            ),
            "Adrenorphin": (
                "CC(C)C(C(=O)N)NC(=O)C(CCCN=C(N)N)NC(=O)C(CCCN=C(N)N)NC(=O)C(CCSC)NC(=O"
                ")C(CC1=CC=CC=C1)NC(=O)CNC(=O)CNC(=O)C(CC2=CC=C(C=C2)O)N"
            ),
            "Peptide_E": (
                "CC(C)C[C@@H](C(=O)O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)CNC(=O)CNC(=O)[C@H]"
                "(CC2=CC=C(C=C2)O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCCN)NC(=O)[C@H]("
                "CCC(=O)N)NC(=O)[C@H](CC3=CC=C(C=C3)O)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](C"
                "CSC)NC(=O)[C@H](CC4=CNC5=CC=CC=C54)NC(=O)[C@H](CC6=CNC7=CC=CC=C76)NC(="
                "O)[C@H](CCC(=O)O)NC(=O)[C@@H]8CCCN8C(=O)[C@H](CCCNC(=N)N)NC(=O)CNC(=O)"
                "[C@H](C(C)C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H]("
                "CCSC)NC(=O)[C@H](CC9=CC=CC=C9)NC(=O)CNC(=O)CNC(=O)[C@H](CC1=CC=C(C=C1)"
                "O)N"
            ),
            "Amidorphin": (
                "C[C@@H](C(=O)N[C@@H](CC(=O)N)C(=O)NCC(=O)NCC(=O)N[C@@H](CCC(=O)O)C(=O)"
                "N[C@@H](C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N)NC(=O)[C@H](CCC(=O)O)NC(=O)[C"
                "@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CCC(=O"
                ")O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC2=CC=C(C=C2)O)NC("
                "=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](C"
                "CSC)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCSC)NC(=O)[C@H](C"
                "C3=CC=CC=C3)NC(=O)CNC(=O)CNC(=O)[C@H](CC4=CC=C(C=C4)O)N"
            ),
            "DADLE": (
                "C[C@H](C(=O)NCC(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@H](CC(C)C)C(=O)O)NC(="
                "O)[C@H](CC2=CC=C(C=C2)O)N"
            ),
            "DPDPE": (
                "CC1([C@H](C(=O)NCC(=O)N[C@H](C(=O)N[C@H](C(SS1)(C)C)C(=O)O)CC2=CC=CC=C"
                "2)NC(=O)[C@H](CC3=CC=C(C=C3)O)N)C"
            ),
            "DSLET": (
                "C[C@H]([C@@H](C(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O"
                ")CNC(=O)[C@@H](CO)NC(=O)[C@H](CC2=CC=C(C=C2)O)N)O"
            ),
            "DTLET": (
                "C[C@@H]([C@H](C(=O)NCC(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@@H](CC(C)C)C(="
                "O)N[C@H]([C@@H](C)O)C(=O)O)NC(=O)[C@H](CC2=CC=C(C=C2)O)N)O"
            ),
            "Nociceptin": (
                "C[C@H]([C@@H](C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@"
                "H](CCCCN)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N"
                "[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C)C(=O)N[C@@H](CC(=O)N)C"
                "(=O)N[C@@H](CCC(=O)N)C(=O)O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)CNC(=O)CNC("
                "=O)[C@H](CC2=CC=CC=C2)N)O"
            ),
            "beta-Endorphin": (
                "CC[C@H](C)[C@@H](C(=O)N[C@@H]([C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@"
                "@H](CC(=O)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CC1=CC=C(C=C1)O)C(=O)N[C@@H](C"
                "CCCN)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CCC(=O)O)C(=O)O)NC(=O)[C@H"
                "](C)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC2=CC=CC=C2)NC("
                "=O)[C@H](CC(C)C)NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](C(C)C)NC(=O)[C@H](C"
                "C(C)C)NC(=O)[C@@H]3CCCN3C(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](CCC(=O)N)NC("
                "=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(="
                "O)[C@H]([C@@H](C)O)NC(=O)[C@@H](CCSC)NC(=O)[C@H](CC4=CC=CC=C4)NC(=O)CN"
                "C(=O)CNC(=O)[C@H](CC5=CC=C(C=C5)O)N"
            ),
            "beta-Endorphin_1-27": (
                "CC[C@H](C)[C@@H](C(=O)N[C@@H]([C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@"
                "@H](CC(=O)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CC1=CN=CN1)C(=O)O)NC(=O)[C@H]("
                "C)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O"
                ")[C@H](CC(C)C)NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC("
                "C)C)NC(=O)[C@@H]3CCCN3C(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](CCC(=O)N)NC(=O"
                ")[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)"
                "[C@H]([C@@H](C)O)NC(=O)[C@H](CCSC)NC(=O)[C@H](CC4=CC=CC=C4)NC(=O)CNC(="
                "O)CNC(=O)[C@H](CC5=CC=C(C=C5)O)N"
            ),
            "gamma-Endorphin": (
                "CC(C)C[C@@H](C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](C(C)O)C(=O)N[C@@H](CC(C)C"
                ")C(=O)O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](C(C)O)NC(=O)[C@H](CCC(=O)N)NC(=O)"
                "[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)["
                "C@H](C(C)O)NC(=O)[C@H](CCSC)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)CNC(=O)CNC("
                "=O)[C@H](CC3=CC=C(C=C3)O)N"
            ),
            "Endomorphin-1": (
                "C1C[C@H](N(C1)C(=O)[C@H](CC2=CC=C(C=C2)O)N)C(=O)N[C@@H](CC3=CNC4=CC=CC"
                "=C43)C(=O)N[C@@H](CC5=CC=CC=C5)C(=O)N"
            ),
            "Endomorphin-2": (
                "C1C[C@H](N(C1)C(=O)[C@H](CC2=CC=C(C=C2)O)N)C(=O)N[C@@H](CC3=CC=CC=C3)C"
                "(=O)N[C@@H](CC4=CC=CC=C4)C(=O)N"
            ),
            "Endomorphin_1": (
                "C1C[C@H](N(C1)C(=O)[C@H](CC2=CC=C(C=C2)O)N)C(=O)N[C@@H](CC3=CNC4=CC=CC"
                "=C43)C(=O)N[C@@H](CC5=CC=CC=C5)C(=O)N"
            ),
            "Endomorphin_2": (
                "C1C[C@H](N(C1)C(=O)[C@H](CC2=CC=C(C=C2)O)N)C(=O)N[C@@H](CC3=CC=CC=C3)C"
                "(=O)N[C@@H](CC4=CC=CC=C4)C(=O)N"
            ),
            "Thiorphan": (
                "C1=CC=C(C=C1)CC(CS)C(=O)NCC(=O)O"
            ),
            "Spinorphin": (
                "C[C@H]([C@@H](C(=O)O)NC(=O)[C@H](CC1=CNC2=CC=CC=C21)NC(=O)[C@@H]3CCCN3"
                "C(=O)[C@H](CC4=CC=C(C=C4)O)NC(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC(=O)["
                "C@H](CC(C)C)N)O"
            ),
            "Kyotorphin": (
                "C1=CC(=CC=C1C[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)O)N)O"
            ),
            "Morphiceptin": (
                "C1C[C@H](N(C1)C(=O)[C@H](CC2=CC=C(C=C2)O)N)C(=O)N[C@@H](CC3=CC=CC=C3)C"
                "(=O)N4CCC[C@H]4C(=O)N"
            ),
            "D-Ala2-Deltorphin_II": (
                "C[C@H](C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](C("
                "C)C)C(=O)N[C@@H](C(C)C)C(=O)NCC(=O)N)NC(=O)[C@H](CC2=CC=C(C=C2)O)N"
            ),
            "Hemorphin-4": (
                "C[C@H]([C@@H](C(=O)O)NC(=O)[C@H](CC1=CNC2=CC=CC=C21)NC(=O)[C@@H]3CCCN3"
                "C(=O)[C@H](CC4=CC=C(C=C4)O)N)O"
            ),
            "TIPP": (
                "C1[C@H](N(CC2=CC=CC=C21)C(=O)[C@H](CC3=CC=C(C=C3)O)N)C(=O)N[C@@H](CC4="
                "CC=CC=C4)C(=O)N[C@@H](CC5=CC=CC=C5)C(=O)O"
            ),
            "beta-Casomorphin_5": (
                "C1C[C@H](N(C1)C(=O)[C@H](CC2=CC=C(C=C2)O)N)C(=O)N[C@@H](CC3=CC=CC=C3)C"
                "(=O)N4CCC[C@H]4C(=O)NCC(=O)O"
            ),
}


#Dictionary to convert the number of entities to their alphabetic equivalents
def to_alphabet_equiv(n):
    result = ''
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        result = chr(65 + remainder) + result
    return result
entity_to_id={i: to_alphabet_equiv(i) for i in range(1,200)}
#entity_to_id={i: chr(64 + i) for i in range(1, 27)}

# Example usage: Retrieve CCD identifier for Phosphoserine
ptm_type = "phosphorylation"
modification = "phosphoserine"
ccd_id = ptm_to_ccd[ptm_type][modification]
#print(f"The CCD identifier for {modification} is: {ccd_id}")



# Function to fetch protein data from UniProt
def fetch_uniprot_data(uniprot_id):
    # Define the UniProt API endpoint for retrieving protein data in JSON format
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.json"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error: Unable to fetch data for UniProt ID {uniprot_id}.")
        return None
    
    # Parse the JSON response
    data = response.json()
    
    # Extract protein sequence and PTMs from the response
    sequence = data['sequence']['value']
    features = data.get('features', [])

    #print(features)
    
    ptms = {
        'phosphorylation': [],
        'methylation': [],
        'glycosylation': [],
        'constraints': [],
        'crosslinking': []
    }
    
    # Collect the PTM data
    for feature in features:
        feature_type = feature['type']
        description = feature.get('description', '').lower()
        if feature['type'] == 'Modified residue':
            if 'phosphothreonine' in description or 'phosphoserine' in description or 'phosphotyrosine' in description:
                text = description
                possibilities = ['phosphothreonine', 'phosphoserine', 'phosphotyrosine']
                result_list = []
                append_matching_parts(text, possibilities, result_list)
                ptms['phosphorylation'].append({
                'position': feature['location']['start']['value'],
                'modification': result_list[0]
            })
        elif feature['type'] == 'Methylation':
            ptms['methylation'].append({
                'position': feature['location']['start'],
                'modification': feature['type']
            })
        elif feature['type'] == 'Glycosylation':
            ptms['glycosylation'].append({
                'position': feature['location']['start'],
                'modification': feature['type'],
                'description': feature['description']
            })
            ptms['constraints'].append({
                'position': feature['location']['start'],
                'modification': feature['type']
            })
            
        elif feature['type'] == 'Cross-link':
            ptms['crosslinking'].append({
                'position': feature['location']['start'],
                'modification': feature['type']
            })
    return sequence, ptms

# Function to create the Boltz-1 input file
def create_boltz1_input(entities, glp1r, sequence, ptms, output_filename):
    gly_constraints=[]
    # Create a string for the input file
    input_content = "version: 1\n"
    input_content += "sequences:\n"
    input_content += "  - ligand:\n"
    input_content += "      id: " + str(entity_to_id[entities]) + "\n"
    input_content += f"      smiles: {seqGLP1R[glp1r]}\n"
    entities+=1


    
    input_content += "  - protein:\n"
    input_content += "      id: " + str(entity_to_id[entities]) + "\n"
    input_content += f"      sequence: {sequence}\n"
    input_content += "      msa: /lustre/orion/proj-shared/syb111/Personal/twalker/Combined_Pipeline/Sequence_Alignments/" + uniprot_id + "_" + glp1r + ".a3m\n"
    entity_for_mod=entities
    # Add PTM information to the input

    # Phosphorylation
    if ptms['phosphorylation']:
        input_content += "    modifications:\n"
        for mod in ptms['phosphorylation']:
            input_content += f"      - position: {mod['position']}"
            input_content += "\n"
            input_content += "        ccd: "
            input_content += ptm_to_ccd['phosphorylation'][mod['modification']]
            input_content += "\n"
            
    # Methylation
    #if ptms['methylation']:
    #    input_content += "Methylation:\n"
    #    for mod in ptms['methylation']:
    #        input_content += f"  Position: {mod['position']}, Modification: {mod['modification']}\n"
    
    # Glycosylation
    if ptms['glycosylation']:
        for mod in ptms['glycosylation']:
            entities += 1
            input_content += f"  - ligand: \n" 
            #{mod['position']}
            #Note: This assumes that glycosylation occurs between nitrogen and anomeric carbon,
            # nitrogen from the amino acid residue and the glycan's anomeric carbon.
            input_content += "      id: "  + str(entity_to_id[entities]) + "\n"
            if "N-linked (GlcNAc" in mod['description']:
                input_content += "      ccd: "
                input_content += "NAG \n"
                gly_constraints.append([str(entity_to_id[entity_for_mod]), mod['position']['value'], 'N'])
                gly_constraints.append([str(entity_to_id[entities]), 1, 'C1'])
            if "O-linked (GlcNAc" in mod['description']:
                input_content += "      ccd: "
                input_content += "NAG \n"
                gly_constraints.append([str(entity_to_id[entity_for_mod]), mod['position']['value'], 'O'])
                gly_constraints.append([str(entity_to_id[entities]), 1, 'C1'])
    
    # Cross-linking
    #if ptms['crosslinking']:
    #    input_content += "Cross-linking:\n"
    #    for mod in ptms['crosslinking']:
    #        input_content += f"  Position: {mod['position']}, Modification: {mod['modification']}\n"
    
    if ptms['glycosylation']: #or ptms['crosslinking']: not yet implemented
        i=0
        input_content += f"constraints: \n" 
        for mod in ptms['glycosylation']:
            try:
                input_content += f"  - bond: \n" 
                input_content += "      atom1" + ": [" + \
                str(gly_constraints[i]).replace('\'', '').replace('[', '').replace(']', '') + "]\n"
                i += 1
                input_content += "      atom2" + ": [" + \
                str(gly_constraints[i]).replace('\'', '').replace('[', '').replace(']', '') + "]\n"
                i += 1
            except IndexError:
                continue

            
            
    
    # Write to the output file
    print(input_content)
    #print(gly_constraints)
    with open(output_filename, 'w') as f:
        f.write(input_content)
    
    print(f"Boltz-1 input file has been created: {output_filename}")

# Example usage
#   1) Assign the GLP1R agonist to pull its sequence from the seqGLP1R dictionary.
# Options include: 'Tirzepatide', 'Exenatide', 'Liraglutide', 
#'Lixisenatide', 'Albiglutide', 'Dulaglutide', 'Semaglutide.' 
# Use seqGLP1R.keys() to see all options.






In [53]:
print(seqGLP1R["Big_dynorphin"])

CC[C@H](C)[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC2=CNC3=CC=CC=C32)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CC4=CC=C(C=C4)O)C(=O)NCC(=O)NCC(=O)N[C@@H](CC5=CC=CC=C5)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC6=CC=CC=C6)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)O)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC7=CC=CC=C7)NC(=O)CNC(=O)CNC(=O)[C@H](CC8=CC=C(C=C8)O)N


In [56]:
for i in seqGLP1R.keys():
    print(i)

Tirzepatide
Exenatide
Liraglutide
Lixisenatide
Albiglutide
Dulaglutide
Semaglutide
A0A2K2CCW6
A0A7X5QNK9
A0A022PL40
A0A329XCW0
Big_dynorphin
Dynorphin_A_1-6
Dynorphin_A_1-13
Dynorphin_A_2-17
Dynorphin_B
alpha-Neoendorphin
beta-Neoendorphin
Leu-Enkephalin
Met-Enkephalin
Adrenorphin
Peptide_E
Amidorphin
DADLE
DPDPE
DSLET
DTLET
Nociceptin
beta-Endorphin
beta-Endorphin_1-27
gamma-Endorphin
Endomorphin-1
Endomorphin-2
Endomorphin_1
Endomorphin_2
Thiorphan
Spinorphin
Kyotorphin
Morphiceptin
D-Ala2-Deltorphin_II
Hemorphin-4
TIPP
beta-Casomorphin_5


In [58]:
uniProtIDs='Protein_list.txt'
glp1r="Tirzepatide"
output_prefix = '/Users/ntw/Desktop/OA_Tests/'
suffix= '.yaml'

file_path = output_prefix + uniProtIDs
protList = file_to_list(file_path)
#print(my_list)
for a in seqGLP1R.keys():
    glp1r=str(a)
    for i in range(len(protList)):
        uniprot_id = (protList[i])
        
        output_filename = output_prefix + uniprot_id + "_" + glp1r + "/job" + suffix
        output_directory = output_prefix + uniprot_id + "_" + glp1r + "/"
        fasta_outfile = output_directory +"job.fasta"
        print(output_filename)
        
        os.makedirs(output_directory, exist_ok=True)
        entities=0
        data = fetch_uniprot_data(uniprot_id)
        
        if data:
            sequence, ptms = data
            entities += 1
        
            #print(ptms)
            # Create the Boltz-1 input file
            create_boltz1_input(entities, glp1r, sequence, ptms, output_filename)
            download_uniprot_fasta(uniprot_id, fasta_outfile)

/Users/ntw/Desktop/OA_Tests/P25106_Tirzepatide/job.yaml
version: 1
sequences:
  - ligand:
      id: A
      smiles: YAEGTFTSDYSIALDKIAQKAFVQWLIAGGPSSGAPPPS
  - protein:
      id: B
      sequence: MDLHLFDYSEPGNFSDISWPCNSSDCIVVDTVMCPNMPNKSVLLYTLSFIYIFIFVIGMIANSVVVWVNIQAKTTGYDTHCYILNLAIADLWVVLTIPVWVVSLVQHNQWPMGELTCKVTHLIFSINLFGSIFFLTCMSVDRYLSITYFTNTPSSRKKMVRRVVCILVWLLAFCVSLPDTYYLKTVTSASNNETYCRSFYPEHSIKEWLIGMELVSVVLGFAVPFSIIAVFYFLLARAISASSDQEKHSSRKIIFSYVVVFLVCWLPYHVAVLLDIFSILHYIPFTCRLEHALFTALHVTQCLSLVHCCVNPVLYSFINRNYRYELMKAFIFKYSAKTGLTKLIDASRVSETEYSALEQSTK
      msa: /lustre/orion/proj-shared/syb111/Personal/twalker/Combined_Pipeline/Sequence_Alignments/Tirzepatide_P25106.a3m
    modifications:
      - position: 347
        ccd: phs
      - position: 350
        ccd: phs
      - position: 355
        ccd: phs
  - ligand: 
      id: C
      ccd: NAG 
  - ligand: 
      id: D
      ccd: NAG 
  - ligand: 
      id: E
      ccd: NAG 
constraints: 
  - bond: 
      atom1: [B, 13, N]
      atom