In [None]:
#Objective:

##This will create a single .json file containing a *separate* job entry for each protein in the .faa file

#Checklist before running:

##Change input file & directory as needed
##Change output file directory as needed

#Note: 

##This only works for proteins *without* post-translational modifications (PTMs) such as glycosylation, citrullination, phosphorylation, SUMOylation, etc.

In [2]:
#Importing modules
import json
import os
from Bio import SeqIO
import re

In [3]:
#Defining our first function "create_json_structure" to create a json file structure based off of the AF3 server requirements
def create_json_structure(proteins):
    """Build the list of AF3 server job entries, one job per protein.

    Args:
        proteins: Iterable of dicts, each providing a 'name' and a
            'sequence' key.

    Returns:
        A list of job dicts in the same order as ``proteins``. Every job
        carries the protein's name, an empty "modelSeeds" list, and a
        "sequences" list holding one "proteinChain" with the protein's
        sequence and a count of 1.

    Raises:
        KeyError: If an entry lacks the 'name' or 'sequence' key.
    """
    return [
        {
            "name": entry['name'],
            "modelSeeds": [],
            # Exactly one chain per job, present as a single copy.
            "sequences": [
                {"proteinChain": {"sequence": entry['sequence'], "count": 1}}
            ],
        }
        for entry in proteins
    ]

#Defining a second function for cleaning the protein's name so that it is in a standardized format
def sanitize_name(name):
    """Return ``name`` reduced to word characters, with spaces as underscores.

    Every character that is neither a word character (``\\w``) nor
    whitespace is removed. Each remaining space character is then replaced
    by an underscore, one for one. Other whitespace, such as tabs, is left
    untouched.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', name)
    # Splitting on a single space keeps empty pieces, so runs of spaces
    # become runs of underscores.
    return '_'.join(without_punctuation.split(' '))

#Defining a third function to process our input .faa file into our output .json file
def process_faa_file(faa_file, output_directory):
    """Convert one FASTA (.faa) file into a single .json file of AF3 jobs.

    Each FASTA record becomes one job whose name is the sanitized record
    description. The .json file is written to a sub-directory of
    ``output_directory`` named after the input file (extension removed,
    sanitized). The sub-directory is created if it does not exist, and the
    path of the written file is printed.

    Args:
        faa_file: Path of the FASTA file to read.
        output_directory: Directory under which the per-file sub-directory
            is created.
    """
    # Read every record up front; the sanitized description is the job name.
    parsed_proteins = [
        {"name": sanitize_name(entry.description), "sequence": str(entry.seq)}
        for entry in SeqIO.parse(faa_file, "fasta")
    ]
    json_data = create_json_structure(parsed_proteins)

    # The input file's name without its extension labels both folder and file.
    species_name, _ = os.path.splitext(os.path.basename(faa_file))
    target_directory = os.path.join(output_directory, sanitize_name(species_name))
    os.makedirs(target_directory, exist_ok=True)

    # Note: the folder name is sanitized, the .json file name is not.
    output_json_file = os.path.join(target_directory, f"{species_name}.json")
    with open(output_json_file, 'w') as jsonfile:
        json.dump(json_data, jsonfile, indent=4)

    print(f"JSON file created successfully at {output_json_file}")

#Defining a fourth function to process every .faa file in an input directory, writing each one's .json file to the output directory
def process_directory(input_directory, output_directory):
    """Run ``process_faa_file`` on every ".faa" entry of ``input_directory``.

    Only the directory's immediate entries are considered (no recursion),
    and the ".faa" suffix match is case-sensitive.

    Args:
        input_directory: Directory whose entries are scanned.
        output_directory: Passed through unchanged to ``process_faa_file``.
    """
    faa_names = (
        entry for entry in os.listdir(input_directory) if entry.endswith(".faa")
    )
    for faa_name in faa_names:
        process_faa_file(os.path.join(input_directory, faa_name), output_directory)

#Time to use the functions! ^.^
#Replace both placeholder paths below before running; faa_file must point at the input .faa file itself
faa_file = "/Path/To/Your/.faa/File"
output_json_directory = "/Path/To/Your/Output/Directory"
#Writes the .json file into a sub-directory of output_json_directory named after the input file
process_faa_file(faa_file, output_json_directory)

JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP002_AF3_EBV/OUTPUT/EBV_PROTEINS_JSON_FILES/EBV_Proteins/EBV_Proteins.json
