In [1]:
## ChatGPT: Provide a script to automatically generate AlphaFold 3 input json 
## files for a protein given the uniprot identifier for that protein, pulling 
## and including post-translational modification data from the Uniprot entry, 
## including phosphorylation, glycosylation, acetylation, and methylation. 
## The glycosylation format should be compatible with a local installation of AlphaFold 3

import requests
import json

# Function to fetch data from UniProt
def fetch_uniprot_data(uniprot_id):
    # Define the UniProt API endpoint
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"

    # Send the request to the UniProt API
    response = requests.get(url)

    # Check if the response is valid
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data for UniProt ID {uniprot_id}. HTTP Status Code: {response.status_code}")

    # Return the JSON response
    return response.json()

# Function to extract relevant data from the UniProt entry
def extract_protein_info(uniprot_data):
    #print(uniprot_data)
    # Extract protein sequence
    #print(uniprot_data['sequence']['value'])
    sequence = uniprot_data['sequence']['value']

    # Extract PTM data (phosphorylation, glycosylation, acetylation, and methylation)
    ptms = []
    print(uniprot_data.get('features', []))
    for feature in uniprot_data.get('features', []):
        feature_type = feature['type']
        description = feature.get('description', '').lower()
        print(feature_type)

        # Collect PTM features of interest
        if feature_type == 'Modified residue':
            # Check for specific PTMs
            print(description)
            if 'phosphothreonine' in description or 'phosphoserine' in description or 'phosphotyrosine' in description:
                ptms.append({
                    'type': 'Phosphorylation',
                    'description': feature.get('description', 'No description'),
                    'location': feature['location']['start']
                })
                print(feature.get('description', 'No description'))
            elif 'acetylation' in description:
                ptms.append({
                    'type': 'Acetylation',
                    'description': feature.get('description', 'No description'),
                    'location': feature['location']
                })
            elif 'methylation' in description:
                ptms.append({
                    'type': 'Methylation',
                    'description': feature.get('description', 'No description'),
                    'location': feature['location']
                })

        # If it's a cross-link, include it as a general PTM type
        elif feature_type == 'Glycosylation':
            ptms.append({
                'type': 'Glycosylation',
                'description': feature.get('description', 'No description'),
                'location': feature['location']['start'],
                'glycan_type': 'N-linked'  # Default assumption, could be updated to O-linked if specified
                })
        
        elif feature_type == 'Disulfide bond':
            ptms.append({
                'type': 'Disulfide bond',
                'description': feature.get('description', 'No description'),
                'location': feature['location']['start'],
                'end': feature['location']['end']
            })

    # Extract other annotations or metadata (e.g., protein name)
    protein_name = uniprot_data.get('proteinDescription', {}).get('recommendedName', {}).get('fullName', {}).get('value', 'Unknown')

    return {
        'sequence': sequence,
        'ptms': ptms,
        'protein_name': protein_name
    }

# Function to generate the AlphaFold input JSON structure
def generate_alphafold_input_json(protein_info):
    # Construct the AlphaFold input JSON structure
    alphafold_input = {
        "sequence": protein_info['sequence'],
        "name": protein_info['protein_name'],
        "ptms": protein_info['ptms'],
        "metadata": {
            "description": "Protein sequence and PTM data for AlphaFold",
            "source": "UniProt"
        }
    }

    # Format glycosylation PTMs to match the AlphaFold 3 format
    for ptm in alphafold_input['ptms']:
        if ptm['type'] == 'Glycosylation':
            # Check if glycosylation is N-linked or O-linked based on the description
            if 'N-linked' in ptm['description']:
                ptm['glycan_type'] = 'N-linked'
            elif 'O-linked' in ptm['description']:
                ptm['glycan_type'] = 'O-linked'
            else:
                ptm['glycan_type'] = 'Unknown'  # You could refine this logic further

    return alphafold_input

# Function to save the JSON to a file
def save_to_json_file(data, filename):
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

# Main function to orchestrate the process
def main(uniprot_id, output_filename):
    # Step 1: Fetch UniProt data
    print(f"Fetching data for UniProt ID: {uniprot_id}")
    uniprot_data = fetch_uniprot_data(uniprot_id)

    # Step 2: Extract relevant protein information
    print("Extracting protein information...")
    protein_info = extract_protein_info(uniprot_data)

    # Step 3: Generate AlphaFold input JSON
    print("Generating AlphaFold input JSON...")
    alphafold_input = generate_alphafold_input_json(protein_info)

    # Step 4: Save to JSON file
    print(f"Saving input to {output_filename}...")
    save_to_json_file(alphafold_input, output_filename)

    print(f"AlphaFold input JSON for {uniprot_id} saved as {output_filename}")

# Example usage
if __name__ == "__main__":
    # Specify UniProt ID and output filename
    uniprot_id = "P11229"  # UniProt ID for SOD1
    output_filename = "alphafold_input_P11229.json"

    # Run the main function
    main(uniprot_id, output_filename)


Fetching data for UniProt ID: P11229
Extracting protein information...
[{'type': 'Chain', 'location': {'start': {'value': 1, 'modifier': 'EXACT'}, 'end': {'value': 460, 'modifier': 'EXACT'}}, 'description': 'Muscarinic acetylcholine receptor M1', 'featureId': 'PRO_0000069015'}, {'type': 'Topological domain', 'location': {'start': {'value': 1, 'modifier': 'EXACT'}, 'end': {'value': 22, 'modifier': 'EXACT'}}, 'description': 'Extracellular', 'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '32646996'}, {'evidenceCode': 'ECO:0007744', 'source': 'PDB', 'id': '6WJC'}]}, {'type': 'Transmembrane', 'location': {'start': {'value': 23, 'modifier': 'EXACT'}, 'end': {'value': 48, 'modifier': 'EXACT'}}, 'description': 'Helical; Name=1', 'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '32646996'}, {'evidenceCode': 'ECO:0007744', 'source': 'PDB', 'id': '6WJC'}]}, {'type': 'Topological domain', 'location': {'start': {'value': 49, 'modifier': 'EXACT'}, 'end