In [10]:
import json, os, copy
import datetime as dt

## 1. Preprocessing

In [14]:
def fa_to_dict(fa_path):

    """
    convert a fasta file into a python dictionary
    input: path to a fasta file
    output: python dict mapping each sequence name to its full sequence string

    The sequence name is the header text after ">" up to (not including) the
    first space or "|" character, e.g.
        ">EGX35_RS00135.1 response regulator transcription factor 28084:28705" --> "EGX35_RS00135.1"
        ">orange1.1g019260m" --> "orange1.1g019260m"
    Change the name extraction below to fit your needs!

    Notes:
        * blank lines are skipped
        * a header that is not followed by any sequence line maps to ""
        * lines that appear before the first header belong to no record and are ignored
        * if two headers give the same name, the later sequence replaces the earlier one
    """

    records = {}         # name -> sequence, in the order headers first appear
    current_name = None  # name of the record being read; None until the first header
    chunks = []          # sequence lines collected for the current record

    # open fasta file
    with open(fa_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()  # remove the trailing "\n" and surrounding whitespace
            if not line:
                continue

            # if the line starts with ">", this line is a header
            if line[:1] == ">":
                # close the previous record first; storing it even when it is empty keeps
                # every name paired with its own sequence
                if current_name is not None:
                    records[current_name] = "".join(chunks)

                # remove ">" and cut at " " or "|" for the case of alphafoldserver.com
                current_name = line[1:].split(" ")[0].split("|")[0]
                chunks = []

            # otherwise this is a sequence line of the current record
            elif current_name is not None:
                chunks.append(line)

    # the last record has no following header to close it, so store it here
    if current_name is not None:
        records[current_name] = "".join(chunks)

    return records
    

## 2. Prepare json files for AlphaFold3

In [15]:
def prep_json(fa1_path, fa2_path=None, n=20, out_dir="./"):

    """
    prepare json files that contain pairs of 2 proteins for AF3 on both alphafoldserver.com and HiPerGator

    4 input parameters:
        1) fa1_path: fasta file. If it is the only fasta file, every protein is paired with
           itself and with every protein listed after it in the file
        2) fa2_path: optional second fasta file. If given, every protein of file 1 is paired
           with every protein of file 2 (all vs all)
        3) n is the number of pairs in a json file, it is recommended as 20 for alphafoldserver.com,
           <300 for HiPerGator due to the GPU time limitation (14 days)
        4) out_dir is where you want to save the prepared json files

    output: json files written to "<out_dir>/<YYYYMMDD>_af3_jsons/", named
            "<YYYYMMDD>_<fasta1 stem>_<fasta2 stem>-<k>.json" with k = 1, 2, ...
            Each json file has at most n pairs of 2-proteins. Nothing is returned, and
            nothing is written when there are no pairs.

    raises: ValueError if n is smaller than 1
    """
    # a batch size below 1 can never be filled (and n=0 used to end in a ZeroDivisionError)
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # get the protein dictionary of fasta file1
    proteins1 = fa_to_dict(fa1_path)
    names1 = list(proteins1.keys())

    # get the stems of fasta file names for naming json files
    fa1_stem = os.path.splitext(os.path.basename(fa1_path))[0]
    fa2_stem = ""

    # if fasta file2 is available
    proteins2 = None
    if fa2_path:
        proteins2 = fa_to_dict(fa2_path)
        fa2_stem = os.path.splitext(os.path.basename(fa2_path))[0]

    # the date and the output folder are fixed once per call, so one run always
    # lands in a single folder even if it runs past midnight
    today = dt.date.today().strftime("%Y%m%d")
    new_dir = os.path.join(out_dir, f"{today}_af3_jsons")

    def _write_batch(jobs, index):
        # save one batch of jobs as "<...>-<index>.json"; create the folder if not exist
        os.makedirs(new_dir, exist_ok=True)
        path = f"{new_dir}/{today}_{fa1_stem}_{fa2_stem}-{index}.json"
        with open(path, "w") as jsonFile:
            json.dump(jobs, jsonFile)

    # helpers
    batch = []       # jobs collected for the json file currently being filled
    files_saved = 0  # number of full json files written so far

    for i, name1 in enumerate(names1):

        if proteins2 is not None:
            # two fasta files: all proteins of file 2
            partners = list(proteins2.items())
        else:
            # one fasta file: this protein itself and everything after it,
            # so each unordered pair appears exactly once
            partners = [(name, proteins1[name]) for name in names1[i:]]

        for name2, seq2 in partners:

            # a fresh dictionary per pair, so no job shares nested objects with another
            batch.append({'name': name1 + "_" + name2,
                          'modelSeeds': [2025],
                          'sequences': [{'proteinChain': {'sequence': proteins1[name1], 'count': 1}},
                                        {'proteinChain': {'sequence': seq2, 'count': 1}}
                                       ]
                         })

            # if the batch stored n items, save as a json file
            if len(batch) == n:
                files_saved += 1
                _write_batch(batch, files_saved)
                batch = []  # reset the batch!

    # save the leftover pairs (fewer than n) as the last json file
    if batch:
        _write_batch(batch, files_saved + 1)
                

## one fasta file

Prepare JSON files from ONE FASTA file: every protein is paired with itself and with each protein listed after it. Each JSON file contains at most 20 protein-protein pairs.

In [16]:
# single-file mode; n is not passed, so the default batch size of 20 pairs per json file applies
prep_json(fa1_path="BDM_complex.fa", out_dir="BDM_20")

Prepare JSON files from ONE FASTA file. Each JSON file contains at most 40 protein-protein pairs.

In [24]:
# single-file mode with a larger batch: up to 40 pairs per json file
prep_json(fa1_path="ZCY_complex.fa", n=40, out_dir="ZCY_40")

## two fasta files

Prepare JSON files from TWO FASTA files: every protein in the first file is paired with every protein in the second. Each JSON file contains at most 20 protein-protein pairs.

In [21]:
# two-file mode: all proteins of the first file vs all proteins of the second, 20 pairs per json file
prep_json(
    fa1_path="BDM_complex.fa",
    fa2_path="ZCY_complex.fa",
    n=20,
    out_dir="BDM_ZCY_20",
)

Prepare JSON files from TWO FASTA files. Each JSON file contains at most 100 protein-protein pairs.

In [23]:
# two-file mode with up to 100 pairs per json file
prep_json(
    fa1_path="BDM_complex.fa",
    fa2_path="ZCY_complex.fa",
    n=100,
    out_dir="BDM_ZCY_100",
)