In [1]:

import os
import glob
from Bio.PDB import *
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

from Bio.SeqUtils.ProtParam import ProteinAnalysis
import Bio.SeqUtils as SeqUtils

import numpy as np
import pandas as pd
#import truncator
#from truncator import insert_into_fasta, replace_into_fasta
#!/home/aljubetic/conda/envs/domesticator/bin/python  /home/aljubetic/gits/domesticator3/domesticator3 --no_idt --help

### Functions

In [3]:
# Per-residue hydrophobicity values keyed by one-letter amino-acid code.
# NOTE(review): the name and values suggest the Kyte-Doolittle hydropathy
# scale -- confirm the source before citing it.
kd_hydrophobicity = {
    'A': 1.8,
    'R': -4.5,
    'N': -3.5,
    'D': -3.5,
    'C': 2.5,
    'Q': -3.5,
    'E': -3.5,
    'G': -0.4,
    'H': -3.2,
    'I': 4.5,
    'L': 3.8,
    'K': -3.9,
    'M': 1.9,
    'F': 2.8,
    'P': -1.6,
    'S': -0.8,
    'T': -0.7,
    'W': -0.9,
    'Y': -1.3,
    'V': 4.2,
}

# pKa of each ionisable side chain and of the two termini, taken from
# http://www.petercollingridge.co.uk/book/export/html/474
pKa = {
    'D': 3.9, 'E': 4.3, 'H': 6.1, 'C': 8.3, 'Y': 10.1, 'K': 10.5, 'R': 12,
    'N-terminus': 8, 'C-terminus': 3.1,
}
# Sign of the charge each group carries when ionised: +1 for groups that are
# charged while protonated, -1 for groups that are charged once deprotonated.
charges = {
    'D': -1., 'E': -1., 'H': 1., 'C': -1., 'Y': -1., 'K': 1., 'R': 1.,
    'N-terminus': 1., 'C-terminus': -1.,
}


def amino_acid_charge(amino_acid, pH=7.5):
    """Return the average charge of a single ionisable group at ``pH``.

    Parameters
    ----------
    amino_acid : str
        A key of ``pKa`` (one-letter residue code, ``'N-terminus'`` or
        ``'C-terminus'``). Any other value is treated as non-ionisable.
    pH : float
        pH at which the charge is evaluated.

    Returns
    -------
    int or float
        ``0`` for groups without a ``pKa`` entry; otherwise a value in
        (0, 1) for positively charged groups or in (-1, 0) for negatively
        charged groups.
    """
    group_pKa = pKa.get(amino_acid, 0)
    if group_pKa == 0:
        # No table entry: the group does not contribute any charge.
        return 0

    # Fraction of the group that is protonated at this pH.
    protonated_fraction = 1. / (1. + 10 ** (pH - group_pKa))

    carries_positive_charge = charges[amino_acid] == 1
    # Positive groups are charged when protonated; negative groups are
    # charged when deprotonated, i.e. -(1 - protonated_fraction).
    return protonated_fraction if carries_positive_charge else protonated_fraction - 1

# One-letter codes covered by get_charge_scale. Besides the 20 canonical
# residues the string also contains the codes B and Z.
standard_amino_acids = "ARNDBCEQZGHILKMFPSTWYV"


def get_charge_scale(pH=7.5):
    """Map every code in ``standard_amino_acids`` to its charge at ``pH``.

    Returns a dict ``{one_letter_code: charge}`` where each charge is the
    value ``amino_acid_charge`` gives for that code at the requested pH.
    """
    scale = {}
    for residue in standard_amino_acids:
        scale[residue] = amino_acid_charge(residue, pH=pH)
    return scale


def protein_charge(sequence, pH=7.5, blocked=False):
    """Estimate the net charge of ``sequence`` at ``pH``.

    Parameters
    ----------
    sequence : str
        Protein sequence; only its ``count`` method is used.
    pH : float
        pH at which the charge is evaluated.
    blocked : bool
        When True the contributions of the free N- and C-terminus are left
        out; when False (default) they are included.

    Returns
    -------
    int or float
        Sum of the terminal charges (unless blocked) and, for every key of
        ``pKa``, its number of occurrences in ``sequence`` times its charge.
    """
    if blocked:
        net_charge = 0
    else:
        net_charge = amino_acid_charge('N-terminus', pH)
        net_charge += amino_acid_charge('C-terminus', pH)

    # The terminus keys are visited here as well; in a one-letter sequence
    # string they occur zero times, so they add nothing in this loop.
    for group in pKa:
        net_charge += sequence.count(group) * amino_acid_charge(group, pH)

    return net_charge


def analyse_sequence(seq, name=None, pH=7.5, initial_dict=None):
    """Return sequence properties in a dictionary.

    Parameters
    ----------
    seq : str
        Protein sequence to analyse.
    name : optional
        Stored under ``'name'`` when it is not None.
    pH : float
        pH used for the net-charge entry; it is also part of that entry's
        key (``f'charge_pH{pH}'``).
    initial_dict : dict, optional
        When given, the results are written into this dictionary (it is
        modified in place) and the same object is returned.

    Returns
    -------
    dict
        Keys: ``sequence``, ``molecular_weight``,
        ``molecular_weight_monoisotopic``, ``extinction_280nm_reduced``,
        ``extinction_280nm_cys_cys``, ``Abs_1mg_ml_280nm_reduced``,
        ``Abs_1mg_ml_280nm_cys_cys``, ``isoelectric_point``,
        ``charge_pH<pH>``, ``gravy`` and optionally ``name``.
    """
    res = {} if initial_dict is None else initial_dict

    analysis = ProteinAnalysis(seq)

    if name is not None:
        res['name'] = name
    res['sequence'] = seq

    # Average and monoisotopic molecular weight, in that order.
    for key, use_monoisotopic in (('molecular_weight', False),
                                  ('molecular_weight_monoisotopic', True)):
        res[key] = SeqUtils.molecular_weight(seq, monoisotopic=use_monoisotopic, seq_type='protein')

    # The two elements are stored as the "reduced" and "cys_cys" values.
    ext_reduced, ext_cys_cys = analysis.molar_extinction_coefficient()[:2]
    res['extinction_280nm_reduced'] = ext_reduced
    res['extinction_280nm_cys_cys'] = ext_cys_cys

    # Extinction coefficient divided by the (average) molecular weight.
    avg_weight = res['molecular_weight']
    res['Abs_1mg_ml_280nm_reduced'] = ext_reduced / avg_weight
    res['Abs_1mg_ml_280nm_cys_cys'] = ext_cys_cys / avg_weight

    res['isoelectric_point'] = analysis.isoelectric_point()
    res[f'charge_pH{pH}'] = protein_charge(seq, pH)
    res['gravy'] = analysis.gravy()

    return res

# VECTORS (TS2)
# Single-element lists holding the residues that are prepended (add_N) and
# appended (add_C) to each design before the full construct is analysed.
add_N = ["MSHHHHHHHHSENLYFQSGSG"]
add_C = ["GS"]

### Generate GENEBLOCKS

In [14]:
# Label of this design batch (presumably the prefix of the model file names -- TODO confirm).
name = "TEVp-240412"
# Placeholder: replace with the path of the CSV that lists the selected designs.
seq_df = pd.read_csv("enter file")
# Keep only the text after the last "/" of every `seq` entry.
seq_df["seq_split"] = seq_df["seq"].map(lambda chains: chains.rsplit("/", 1)[-1])

In [15]:
# Placeholder: replace with the directory that should receive all generated files.
out_loc = "enter out location"
# Create the directory (including parents); an existing directory is not an error.
os.makedirs(out_loc, exist_ok=True)

In [16]:
# Show summary statistics (count, mean, std, min, quartiles, max) of the loaded design table.
seq_df.describe()

Unnamed: 0.1,Unnamed: 0,level_0,index,score,plddt,i_ptm,i_pae,i_con,rmsd,ddg,...,cms,vbuns_bound,vbuns_unbound,vbuns_int,sbuns_bound,sbuns_unbound,sbuns_int,af_binder_plddt,af_pae_int_tot,af_rmsd
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,...,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,891.642857,2567.071429,21723.642857,0.836281,0.934489,0.864432,5.366604,1.890162,0.471005,-38485.682143,...,472.620795,13.285714,14.5,-1.214286,27.428571,25.642857,1.785714,82.0673,6.819961,1.068525
std,631.534469,843.344666,7214.286292,0.072171,0.014183,0.026588,0.629025,0.167893,0.205201,12349.904484,...,51.149919,1.266647,1.60528,1.251373,2.502746,2.060886,1.368805,5.600169,2.050238,0.099713
min,84.0,1000.0,258.0,0.729149,0.911048,0.805931,4.312702,1.520251,0.203888,-53707.0,...,407.351807,11.0,12.0,-4.0,24.0,22.0,0.0,72.921215,4.849621,0.905245
25%,445.25,2151.75,21507.5,0.792626,0.923759,0.851549,4.972703,1.787251,0.321911,-43386.5,...,432.918716,13.0,13.25,-2.0,26.0,24.0,1.0,77.720386,5.12534,1.016407
50%,722.0,2505.0,23420.5,0.814988,0.937027,0.859041,5.195222,1.902655,0.459303,-40182.0,...,476.007736,13.5,14.5,-1.0,27.0,26.0,2.0,83.60035,5.817297,1.101588
75%,1248.5,3289.5,25632.75,0.906684,0.942887,0.883917,5.708463,2.015462,0.583119,-37895.0,...,495.215393,14.0,16.0,0.0,28.0,27.0,2.75,87.098703,9.251669,1.124751
max,1957.0,3674.0,28292.0,0.953882,0.958589,0.913087,6.919047,2.144219,0.860646,-46.55,...,553.960693,15.0,17.0,0.0,32.0,28.0,4.0,88.044112,9.549179,1.199238


In [19]:
# For every design in seq_df:
#   1. write the design sequence (with a trailing "W") to df1.fasta, and
#   2. compute the properties of the full construct (add_N + design + add_C).
# The properties are then stored as new columns of seq_df and the table is
# saved as final.csv.

# seq_df column name -> key of the dictionary returned by analyse_sequence.
analysis_columns = {
    "molecular_weight": "molecular_weight",
    "molecular_weight_monoisotopic": "molecular_weight_monoisotopic",
    "extinction_280nm_reduced": "extinction_280nm_reduced",
    "extinction_280nm_cys_cys": "extinction_280nm_cys_cys",
    "Abs_1mg_ml_280nm_reduced": "Abs_1mg_ml_280nm_reduced",
    "Abs_1mg_ml_280nm_cys_cys": "Abs_1mg_ml_280nm_cys_cys",
    "isoelectric_point": "isoelectric_point",
    # analyse_sequence names this key after its pH argument (default 7.5).
    "charge_pH7_5": "charge_pH7.5",
    "gravy": "gravy",
}

# One dictionary per design, in the row order of seq_df.
design_records = []

with open(f"{out_loc}/df1.fasta", "w") as fasta_handle:
    for _, row in seq_df.iterrows():
        # A single Trp is appended to every design
        # (presumably so the construct absorbs at 280 nm -- TODO confirm).
        final_seq = row["seq_split"] + "W"
        # File name of the model up to its first ".". A dedicated variable is
        # used so the notebook-level `name` (the batch label) is not overwritten.
        design_name = row["model_path"].split("/")[-1].split(".")[0]

        fasta_handle.write(f">{design_name}\n{final_seq}\n")

        # The analysis is done on the full construct, not on the FASTA entry.
        full_seq = add_N[0] + final_seq + add_C[0]
        properties = analyse_sequence(full_seq, design_name)

        record = {column: properties[key] for column, key in analysis_columns.items()}
        record["final_sequence"] = full_seq
        design_records.append(record)

# Add the columns positionally (same order as the rows were visited above).
for column in [*analysis_columns, "final_sequence"]:
    seq_df[column] = [record[column] for record in design_records]

seq_df.to_csv(f"{out_loc}/final.csv", index=False)

In [20]:
fasta_file=f"{out_loc}/df1.fasta"
vector_file='/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/helper_scripts/gg_insert.gb'
#TS2_params.gb use for params - DO NOT USE FOR ORDERING
#gg_insert.gb use for ordering DNA

!cd {out_loc} &&  /home/aljubetic/conda/envs/domesticator/bin/python /home/aljubetic/gits/domesticator3/domesticator3 --nstruct 10 --no_idt {fasta_file} {vector_file} 

Attempting optimization of TEVp-240412_lcb3_9_7_17_2_6_1__GG_insert
iteration 1/10
objective:   0%|                    | 0/2 [00:00<?, ?it/s, now=MinimizeNum8mers]
location:   0%|                                 | 0/1 [00:00<?, ?it/s, now=None][A
location:   0%|                            | 0/1 [00:00<?, ?it/s, now=63-258(+)][A
objective:  50%|▌| 1/2 [00:00<00:00,  3.69it/s, now=MaximizeCAI[63-258(+)](e_...[A
location:   0%|                           | 0/13 [00:00<?, ?it/s, now=63-258(+)][A
location:   0%|                               | 0/13 [00:00<?, ?it/s, now=69-72][A
                                                                                [A
location:   0%|                               | 0/13 [00:00<?, ?it/s, now=69-72][A
DEBUG! boosting MinimizeNum8mers by 0. Value is now 10                          [A
iteration 2/10
objective:   0%|                    | 0/2 [00:00<?, ?it/s, now=MinimizeNum8mers]
location:   0%|                                 | 0/1 [00:00<?, ?it/

### Prepare Order excel file

In [22]:
# Collect every GenBank (*.gb) file in out_loc. sorted() orders the paths
# lexicographically, which keeps the order stable between runs.
gb_files = sorted(glob.glob(f"{out_loc}/*.gb"))
print(gb_files)

['/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/TEVp-240412/filtered_sequences/order/TEVp-240412_27_3H_14_0_25_1_3_0__GG_insert.gb', '/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/TEVp-240412/filtered_sequences/order/TEVp-240412_51_3H_15_4_4_1_2_4__GG_insert.gb', '/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/TEVp-240412/filtered_sequences/order/TEVp-240412_lcb3_15_7_3_2_1_3__GG_insert.gb', '/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/TEVp-240412/filtered_sequences/order/TEVp-240412_lcb3_17_9_23_3_3_4__GG_insert.gb', '/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/TEVp-240412/filtered_sequences/order/TEVp-240412_lcb3_21_7_35_3_5_1__GG_insert.gb', '/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/TEVp-240412/filtered_sequences/order/TEVp-240412_lcb3_22_9_49_0_24__GG_insert.gb', '/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output

In [23]:
from Bio import GenBank

# Positions of a 96-well plate in row-major order: A1..A12, B1..B12, ..., H1..H12.
well_positions = [f"{row}{col}" for row in "ABCDEFGH" for col in range(1, 13)]

locus_list = []
sequence_list = []
sequence_length = []

for gb_file in gb_files:
    # Parse the single record of the file; the handle is closed right after.
    with open(gb_file) as file:
        record = GenBank.read(file)

    # Records whose locus is exactly "GG_insert" are left out of the order.
    if record.locus == "GG_insert":
        continue

    locus_list.append(record.locus)
    sequence_list.append(record.sequence)
    sequence_length.append(len(record.sequence))

# Without this check a longer list would only surface as a pandas
# "arrays must be of the same length" error when the DataFrame is built.
if len(locus_list) > len(well_positions):
    raise ValueError(
        f"{len(locus_list)} sequences do not fit on one plate "
        f"({len(well_positions)} wells); split the order into several plates."
    )

# One row per ordered sequence; wells are assigned in the order of gb_files.
data = {"Well": well_positions[:len(locus_list)],
        "Name": locus_list,
        "Sequence": sequence_list,
        "Length": sequence_length}
df = pd.DataFrame(data)

# Save the order table to an Excel file.
df.to_excel(f"{out_loc}/idt_order.xlsx", index=False)

In [24]:
# final_df is a second name for seq_df (no copy is made), so the column
# added below also appears on seq_df.
final_df = seq_df
# Name to match against the order table: model file name up to its first "."
# followed by "__GG_insert".
final_df["insert_name"] = [
    model_path.split("/")[-1].split(".")[0] + "__GG_insert"
    for model_path in final_df["model_path"]
]

In [25]:
# Columns written to the extended order sheet: the plate/order information
# followed by a subset of the computed protein properties.
order_columns = [
    "Well", "Name", "Sequence", "Length",
    'molecular_weight', 'molecular_weight_monoisotopic',
    'extinction_280nm_reduced', 'extinction_280nm_cys_cys',
    'Abs_1mg_ml_280nm_reduced', 'Abs_1mg_ml_280nm_cys_cys',
    'isoelectric_point',
]

# Right join: every row of the order table `df` is kept and extended with the
# design whose insert_name equals the row's Name.
final_merged_idt = final_df.merge(df, left_on="insert_name", right_on="Name", how="right")
final_merged_idt = final_merged_idt.loc[:, order_columns]
final_merged_idt.to_excel(f"{out_loc}/idt_order_add.xlsx", index=False)

In [26]:
# Display the merged order table (wells, names, DNA sequences and protein properties).
final_merged_idt

Unnamed: 0,Well,Name,Sequence,Length,molecular_weight,molecular_weight_monoisotopic,extinction_280nm_reduced,extinction_280nm_cys_cys,Abs_1mg_ml_280nm_reduced,Abs_1mg_ml_280nm_cys_cys,isoelectric_point
0,A1,TEVp-240412_27_3H_14_0_25_1_3_0__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,298,8907.735,8902.36915,16960,16960,1.903963,1.903963,5.852734
1,A2,TEVp-240412_51_3H_15_4_4_1_2_4__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,298,7551.9934,7547.528769,8480,8480,1.122882,1.122882,5.739908
2,A3,TEVp-240412_lcb3_15_7_3_2_1_3__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,9491.1311,9485.568294,8480,8480,0.893466,0.893466,5.663005
3,A4,TEVp-240412_lcb3_17_9_23_3_3_4__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,10136.9705,10130.825875,17420,17420,1.718462,1.718462,5.540688
4,A5,TEVp-240412_lcb3_21_7_35_3_5_1__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,9788.5476,9782.719394,6990,6990,0.7141,0.7141,5.403478
5,A6,TEVp-240412_lcb3_22_9_49_0_24__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,10010.7035,10004.749065,12950,12950,1.293615,1.293615,5.265644
6,A7,TEVp-240412_lcb3_22_9_49_3_0_2__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,9835.72,9829.820731,9970,9970,1.013652,1.013652,5.702679
7,A8,TEVp-240412_lcb3_25_6_14_0_46__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,10073.8703,10067.778591,19940,19940,1.979378,1.979378,5.568425
8,A9,TEVp-240412_lcb3_28_2_6_2_2_4__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,9614.4479,9608.780913,6990,6990,0.727031,0.727031,5.387905
9,A10,TEVp-240412_lcb3_28_2_6_2_4_0__GG_insert,ATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGCGCGCTA...,334,9660.6358,9654.898237,6990,6990,0.723555,0.723555,5.568425


- Check complexity on IDT before ordering
- Add .gb files to Benchling and assemble final constructs