In [None]:
#Objective:

##This will create a .json file formatted for the AF3 server from a .gb file

#Checklist before running:

##Change the .gb input directory as needed
##Change the output directory as needed

#Note:
##If you do not want a new output subfolder to be created, you may need to comment out the code section that creates it
##This only works for protein sequences *without* post-translational modifications (PTMs) such as glycosylation, citrullination, phosphorylation, SUMOylation, etc.
##You may need to change "gene" to "protein_id" in the `process_gb_file` function section, see note below

In [None]:
#Importing our modules
#re, os, and json are part of the Python standard library and do not need to be installed
#The `Bio` module is provided by the Biopython package:
#%pip install biopython
import json
import os
from Bio import SeqIO
import re

In [None]:
def create_json_structure(proteins):
    """Build one AF3-server job entry per protein.

    Args:
        proteins: Iterable of dicts, each providing a 'name' and a
            'sequence' key.

    Returns:
        A list of job dicts in input order. Each job carries the protein's
        name, an empty "modelSeeds" list, and a "sequences" list holding a
        single "proteinChain" entry (the protein sequence with a count of 1).
    """
    return [
        {
            "name": entry['name'],
            "modelSeeds": [],
            #One job per protein: a single copy of one protein chain
            "sequences": [
                {"proteinChain": {"sequence": entry['sequence'], "count": 1}}
            ],
        }
        for entry in proteins
    ]

def sanitize_name(name):
    """Return `name` with punctuation removed and spaces turned into underscores.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is dropped; each remaining space character is
    then replaced by an underscore. Other whitespace (e.g. tabs) is left as-is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', name)
    return without_punctuation.replace(' ', '_')

def extract_organism_name(record):
    """Look up the organism name stored on a record's "source" feature.

    Args:
        record: Object with a `features` iterable; each feature exposes a
            `type` string and a `qualifiers` mapping.

    Returns:
        The first "organism" qualifier value of the first "source" feature
        that has one, or "Unknown_Organism" when no such feature exists.
    """
    #Lazy generator: only the first matching feature is ever inspected
    organism_values = (
        feat.qualifiers["organism"][0]
        for feat in record.features
        if feat.type == "source" and "organism" in feat.qualifiers
    )
    return next(organism_values, "Unknown_Organism")

def process_gb_file(file_path, output_directory, name_qualifier="gene"):
    """Convert one GenBank file into a JSON job file for the AF3 server.

    Every CDS feature that carries a "translation" qualifier becomes one job
    (see `create_json_structure`). The JSON is written to
    ``<output_directory>/<sanitized file stem>/<file stem>.json``; the
    subfolder is created if it does not exist yet.

    Args:
        file_path: Path to the GenBank (.gb/.gbk) file to read.
        output_directory: Directory under which the per-file subfolder and
            the JSON file are created.
        name_qualifier: CDS qualifier whose first value is used as the job
            name. Defaults to "gene"; pass "protein_id" if the gene encodes
            multiple proteins or if the gene name differs from the protein
            name. CDS features lacking this qualifier are named
            "Unknown_Protein".

    Returns:
        None. The path of the written file is printed.
    """
    proteins = []
    with open(file_path, "r") as input_handle:
        for record in SeqIO.parse(input_handle, "genbank"):
            for feature in record.features:
                #Only CDS features with a translated protein sequence are usable
                if feature.type != "CDS" or "translation" not in feature.qualifiers:
                    continue
                raw_name = feature.qualifiers.get(name_qualifier, ["Unknown_Protein"])[0]
                proteins.append({
                    "name": sanitize_name(raw_name),
                    "sequence": feature.qualifiers["translation"][0],
                })

    json_data = create_json_structure(proteins)

    #The output subfolder and file are both named after the input file's stem;
    #only the folder name is passed through sanitize_name
    base_name = os.path.basename(file_path)
    species_name = os.path.splitext(base_name)[0]
    species_output_directory = os.path.join(output_directory, sanitize_name(species_name))
    os.makedirs(species_output_directory, exist_ok=True)

    output_json_file = os.path.join(species_output_directory, f"{species_name}.json")
    with open(output_json_file, 'w') as jsonfile:
        json.dump(json_data, jsonfile, indent=4)

    print(f"JSON file created successfully at {output_json_file}")

def process_directory(input_directory, output_directory):
    """Run `process_gb_file` on each GenBank file directly inside a directory.

    Args:
        input_directory: Directory whose entries are scanned (not recursive).
        output_directory: Passed through unchanged to `process_gb_file` for
            every matching file.

    Only entries whose names end in ".gb" or ".gbk" are processed.
    """
    genbank_suffixes = (".gb", ".gbk")
    for entry_name in os.listdir(input_directory):
        if not entry_name.endswith(genbank_suffixes):
            continue
        process_gb_file(os.path.join(input_directory, entry_name), output_directory)

#Time to use the functions :)
#Path to the single GenBank file to convert (placeholder - edit before running)
gb_file = "/Path/To/Your/File.gb"
#NOTE: despite its name, this variable holds the output *directory*; it is passed as `output_directory` below
output_json_file = "/This/Is/Your/Output/Directory"
#Converts only the one file above; `process_directory` is defined earlier but is not called here
process_gb_file(gb_file, output_json_file)