In [143]:
##Modified from a ChatGPT draft.
##Prompt: Using the uniprot ID as an input, make an input file for locally-installed 
## Boltz-1 with the protein sequence and post-translational modification data from uniprot, 
## including phosphorylation, methylation, glycosylation, and cross-linking. 


import requests

# Dictionary to map common PTMs to their CCD identifiers
ptm_to_ccd = {
    "phosphorylation": {
        "phosphoserine": "phs",     # Phosphorylation on serine
        "phosphothreonine": "pht",  # Phosphorylation on threonine
        "phosphotyrosine": "phty"   # Phosphorylation on tyrosine
    },
    "methylation": {
        "methylation (Lysine)": "me",       # Methylation on lysine
        "methylation (Arginine)": "mearg",  # Methylation on arginine
        "dimethylation (Lysine)": "me2",    # Dimethylation on lysine
        "trimethylation (Lysine)": "me3"    # Trimethylation on lysine
    },
    "glycosylation": {
        "N-linked Glycosylation": "ngly",   # N-linked glycosylation
        "O-linked Glycosylation": "ogly"    # O-linked glycosylation
    },
    "acetylation": {
        "acetylation (Lysine)": "ace"  # Acetylation on lysine
    },
    "ubiquitination": {
        "ubiquitination": "ubiq"  # Ubiquitin attachment
    },
    "sumoylation": {
        "sumoylation": "sumo"  # SUMO attachment
    },
    "crosslinking": {
        "disulfide Bond": "dsb",  # Disulfide bond
        "isopeptide Bond": "iso"  # Isopeptide bond
    }
}

#Dictionary to convert the number of entities to their albhabetic equivalents
entity_to_id={i: chr(64 + i) for i in range(1, 27)}

# Example usage: Retrieve CCD identifier for Phosphoserine
ptm_type = "phosphorylation"
modification = "phosphoserine"
ccd_id = ptm_to_ccd[ptm_type][modification]
print(f"The CCD identifier for {modification} is: {ccd_id}")



# Function to fetch protein data from UniProt
def fetch_uniprot_data(uniprot_id):
    # Define the UniProt API endpoint for retrieving protein data in JSON format
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.json"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error: Unable to fetch data for UniProt ID {uniprot_id}.")
        return None
    
    # Parse the JSON response
    data = response.json()
    
    # Extract protein sequence and PTMs from the response
    sequence = data['sequence']['value']
    features = data.get('features', [])

    print(features)
    
    ptms = {
        'phosphorylation': [],
        'methylation': [],
        'glycosylation': [],
        'constraints': [],
        'crosslinking': []
    }
    
    # Collect the PTM data
    for feature in features:
        feature_type = feature['type']
        description = feature.get('description', '').lower()
        if feature['type'] == 'Modified residue':
            if 'phosphothreonine' in description or 'phosphoserine' in description or 'phosphotyrosine' in description:
                ptms['phosphorylation'].append({
                'position': feature['location']['start']['value'],
                'modification': description
            })
        elif feature['type'] == 'Methylation':
            ptms['methylation'].append({
                'position': feature['location']['start'],
                'modification': feature['type']
            })
        elif feature['type'] == 'Glycosylation':
            ptms['glycosylation'].append({
                'position': feature['location']['start'],
                'modification': feature['type'],
                'description': feature['description']
            })
            ptms['constraints'].append({
                'position': feature['location']['start'],
                'modification': feature['type']
            })
            
        elif feature['type'] == 'Cross-link':
            ptms['crosslinking'].append({
                'position': feature['location']['start'],
                'modification': feature['type']
            })
    return sequence, ptms

# Function to create the Boltz-1 input file
def create_boltz1_input(entities, sequence, ptms, output_filename):
    gly_constraints=[]
    # Create a string for the input file
    input_content = "version: 1\n"
    input_content += "sequences:\n"
    input_content += "  - protein:\n"
    input_content += "\tid: " + str(entity_to_id[entities]) + "\n"
    input_content += f"\tsequence: {sequence}\n"
    input_content += "\tmsa: --use_msa_server\n"
    # Add PTM information to the input
    input_content += "\tmodifications:\n"

    # Phosphorylation
    if ptms['phosphorylation']:
        for mod in ptms['phosphorylation']:
            input_content += f"\t\t  - position: {mod['position']}"
            input_content += "\n"
            input_content += "\t\t    ccd:"
            input_content += ptm_to_ccd['phosphorylation'][mod['modification']]
            input_content += "\n"
    
    # Methylation
    if ptms['methylation']:
        input_content += "Methylation:\n"
        for mod in ptms['methylation']:
            input_content += f"  Position: {mod['position']}, Modification: {mod['modification']}\n"
    
    # Glycosylation
    if ptms['glycosylation']:
        for mod in ptms['glycosylation']:
            entities += 1
            input_content += f"  - ligand: \n" 
            #{mod['position']}
            input_content += "\tid: "  + str(entity_to_id[entities]) + "\n"
            if "N-linked (GlcNAc" in mod['description']:
                input_content += "\tccd: "
                input_content += "nag \n"
                gly_constraints.append([str(entity_to_id[entities]), mod['position']['value'], 'N'])
            if "O-linked (GlcNAc" in mod['description']:
                input_content += "\tccd: "
                input_content += "nag \n"
                gly_constraints.append([str(entity_to_id[entities]), mod['position']['value'], 'O'])
    
    # Cross-linking
    if ptms['crosslinking']:
        input_content += "Cross-linking:\n"
        for mod in ptms['crosslinking']:
            input_content += f"  Position: {mod['position']}, Modification: {mod['modification']}\n"
    
    if ptms['glycosylation'] or ptms['crosslinking']:
        i=0
        for mod in ptms['glycosylation']:
            input_content += f"constraints: \n"
            input_content += "\tbond: "
            input_content += "atom" + str(i+1) + ": " + \
            str(gly_constraints[i]).replace('\'', '').replace('[', '').replace(']', '') + "\n"
            i += 1
            
    
    # Write to the output file
    print(input_content)
    with open(output_filename, 'w') as f:
        f.write(input_content)
    
    print(f"Boltz-1 input file has been created: {output_filename}")

# Example usage
uniprot_id = 'P21731'  # Replace with your UniProt ID
output_filename = '/Users/ntw/Desktop/boltz1_input.yaml'

# Fetch the UniProt data
entities=0
data = fetch_uniprot_data(uniprot_id)

if data:
    sequence, ptms = data
    entities += 1
    #print(ptms)
    # Create the Boltz-1 input file
    create_boltz1_input(entities, sequence, ptms, output_filename)




The CCD identifier for phosphoserine is: phs
[{'type': 'Chain', 'location': {'start': {'value': 1, 'modifier': 'EXACT'}, 'end': {'value': 343, 'modifier': 'EXACT'}}, 'description': 'Thromboxane A2 receptor', 'featureId': 'PRO_0000070138'}, {'type': 'Topological domain', 'location': {'start': {'value': 1, 'modifier': 'EXACT'}, 'end': {'value': 29, 'modifier': 'EXACT'}}, 'description': 'Extracellular', 'evidences': [{'evidenceCode': 'ECO:0000255'}]}, {'type': 'Transmembrane', 'location': {'start': {'value': 30, 'modifier': 'EXACT'}, 'end': {'value': 52, 'modifier': 'EXACT'}}, 'description': 'Helical; Name=1', 'evidences': [{'evidenceCode': 'ECO:0000255'}]}, {'type': 'Topological domain', 'location': {'start': {'value': 53, 'modifier': 'EXACT'}, 'end': {'value': 66, 'modifier': 'EXACT'}}, 'description': 'Cytoplasmic', 'evidences': [{'evidenceCode': 'ECO:0000255'}]}, {'type': 'Transmembrane', 'location': {'start': {'value': 67, 'modifier': 'EXACT'}, 'end': {'value': 87, 'modifier': 'EXACT'

In [5]:
ptm_to_ccd["phosphorylation"]["phosphoserine"]

'phs'

In [25]:
entities_to_ids[1]

'A'