In [2]:
#Objective:

##This will create *separate* folders/directories for each input .gb file in your specified input directory and *separate* .json files for each input .gb file formatted for use with AF3 server

#Checklist before running:

##Change the .gb input directory as needed
##Change the output directory as needed

#Note:
##May need to comment out code section that creates new folders for each file if you want all the files in one folder
##This only works for protein sequences *without* post-translational modifications (PTMs) such as glycosylation, citrullination, phosphorylation, SUMOylation, etc.

In [3]:
#Importing modules:
import os
import json
import re
from Bio import SeqIO

In [4]:
#Defining our first function "create_json_structure" to create a json file structure based off of the AF3 server requirements
def create_json_structure(proteins):
    """Build one AF3-server job entry per protein.

    Args:
        proteins: iterable of mappings, each indexed by 'name' and 'sequence'.

    Returns:
        A list of job dicts. Each job carries the protein's name, an empty
        "modelSeeds" list, and a single "proteinChain" entry with count 1.
    """
    def _as_job(entry):
        #Look up 'name' before 'sequence' so a malformed entry fails on the same key as before
        job_name = entry['name']
        chain = {"sequence": entry['sequence'], "count": 1}
        return {
            "name": job_name,
            "modelSeeds": [],
            "sequences": [{"proteinChain": chain}],
        }

    return [_as_job(entry) for entry in proteins]

#Defining a second function for cleaning the protein's name so that it is in a standardized format
def sanitize_name(name):
    """Return `name` with punctuation removed and spaces turned into underscores.

    Characters that are neither word characters nor whitespace are dropped;
    afterwards every literal space becomes '_'. Other whitespace (tabs,
    newlines) is left as-is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', name)
    return '_'.join(without_punctuation.split(' '))

#Defining a third function to extract the organism's name from a record's "source" feature (note: the .json file name is taken from the input file name, not from this value)
def extract_organism_name(record):
    """Return the organism of the first "source" feature that has one.

    Scans `record.features` in order and returns the first value of the
    "organism" qualifier found on a feature of type "source". Falls back to
    "Unknown_Organism" when no such feature exists.
    """
    organisms = (
        feat.qualifiers["organism"][0]
        for feat in record.features
        if feat.type == "source" and "organism" in feat.qualifiers
    )
    #The generator is lazy, so only features up to the first match are inspected
    return next(organisms, "Unknown_Organism")

#Defining a fourth function to actually process the .gb files into a formatted .json file
def process_gb_file(file_path, output_directory, separate_folders=True):
    """Convert one GenBank file into an AF3-server formatted .json file.

    Every CDS feature that carries a "translation" qualifier becomes one job,
    named after its sanitized "protein_id" (or "Unknown_Protein" when the
    qualifier is missing).

    Args:
        file_path: path to the input .gb/.gbk file.
        output_directory: root directory for the output.
        separate_folders: when True (default) the .json is written into a
            sub-folder of `output_directory` named after the sanitized input
            file name; when False it is written directly into
            `output_directory`. Use this flag instead of commenting out the
            folder creation, which would leave the output path pointing at a
            folder that does not exist.

    Returns:
        None. The .json file is written to disk and its path is printed.
    """
    proteins = []
    with open(file_path, "r") as input_handle:
        for record in SeqIO.parse(input_handle, "genbank"):
            for feature in record.features:
                if feature.type != "CDS" or "translation" not in feature.qualifiers:
                    continue
                protein_id = feature.qualifiers.get("protein_id", ["Unknown_Protein"])[0]
                proteins.append({
                    "name": sanitize_name(protein_id),
                    "sequence": feature.qualifiers["translation"][0],
                })

    if not proteins:
        #Still write the file so every input has an output, but make the empty result visible
        print(f"Warning: no CDS features with a translation found in {file_path}; the JSON job list will be empty")

    json_data = create_json_structure(proteins)

    #The output is named after the input file (extension stripped), not after the organism
    species_name = os.path.splitext(os.path.basename(file_path))[0]
    if separate_folders:
        target_directory = os.path.join(output_directory, sanitize_name(species_name))
    else:
        target_directory = output_directory
    #Create the target in both modes so the write below cannot fail on a missing folder
    os.makedirs(target_directory, exist_ok=True)

    output_json_file = os.path.join(target_directory, f"{species_name}.json")
    with open(output_json_file, 'w') as jsonfile:
        json.dump(json_data, jsonfile, indent=4)

    print(f"JSON file created successfully at {output_json_file}")

#Defining a fifth function to find the .gb files in the input directory, process each one, and deposit the resulting .json file in the output directory
def process_genbank_directory(input_directory, output_directory):
    """Run `process_gb_file` on every GenBank file directly inside a directory.

    Args:
        input_directory: directory to scan (not recursive) for files whose
            name ends in ".gb" or ".gbk" (case-insensitive).
        output_directory: root directory handed to `process_gb_file`.

    Returns:
        None.
    """
    genbank_extensions = (".gb", ".gbk")
    #Sorted so the processing (and log) order is the same on every run and platform
    for filename in sorted(os.listdir(input_directory)):
        if not filename.lower().endswith(genbank_extensions):
            continue
        file_path = os.path.join(input_directory, filename)
        #A sub-directory that happens to be named like a GenBank file cannot be opened as one
        if not os.path.isfile(file_path):
            continue
        process_gb_file(file_path, output_directory)

#Run the conversion: point both paths at your own folders, then execute this cell
input_directory = "/Path/To/Your/Directory/Of/Genbank/Files"  #folder holding the .gb/.gbk files
output_directory = "/Path/To/Your/Output/Directory"  #folder that receives the .json output
process_genbank_directory(
    input_directory=input_directory,
    output_directory=output_directory,
)

JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV166REF/HPV166REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV123REF/HPV123REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV97REF/HPV97REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV222REF/HPV222REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV57REF/HPV57REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV12REF/HPV12REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV73REF/HPV73REF.json
JSON file created successfully at /Users/rileyjones/Desk

JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV136REF/HPV136REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV173REF/HPV173REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV82REF/HPV82REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV213REF/HPV213REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV112REF/HPV112REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV157REF/HPV157REF.json
JSON file created successfully at /Users/rileyjones/Desktop/UCSD/BIOINFORMATICS/EXP003_AF3_HPV/OUTPUT/HPV_JSON/HPV23REF/HPV23REF.json
JSON file created successfully at /Users/rileyjones/