In [2]:
#packages for utilizing the mutagenesis code
import argparse
import re #regular expressions
from Bio import Entrez, SeqIO
# Contact address Biopython sends to NCBI with every Entrez request.
# NOTE(review): this value contains no '@', so it is not a valid e-mail address;
# NCBI asks Entrez users to supply a real contact address -- confirm and replace.
Entrez.email = 'sethfrazer.edu'
from skbio import TabularMSA
from skbio import Protein
from skbio.alignment import global_pairwise_align_protein
from Bio.Align import substitution_matrices
import itertools
#packages for utilizing deepBreaks
import os
import subprocess
from deepBreaks.utils import load_obj
from deepBreaks.preprocessing import read_data
import joblib
import pandas as pd
import itertools

In [3]:
def getAcc(accession, check):
    """Return a protein sequence, either typed in by the user or fetched from NCBI.

    Parameters
    ----------
    accession : str
        NCBI nucleotide accession to fetch, or the literal string "manual" to
        prompt for a sequence on standard input instead.
    check : int
        Only used in manual mode: 0 prompts for the reference sequence, any
        other value prompts for the target sequence.

    Returns
    -------
    str
        The text entered by the user, or the ``translation`` qualifier of the
        record's CDS feature (the last one, if the record has several).

    Raises
    ------
    ValueError
        If the fetched record has no CDS feature carrying a translation.
    """
    if accession == "manual":
        prompt = "Enter Reference Sequence: " if check == 0 else "Enter Target Sequence: "
        return input(prompt)

    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "gb")
    finally:
        # Release the connection even when the record cannot be parsed.
        handle.close()

    aa = None
    for feature in record.features:
        if feature.type == 'CDS' and 'translation' in feature.qualifiers:
            aa = feature.qualifiers['translation'][0]

    if aa is None:
        # Previously this fell through to an UnboundLocalError on `aa`.
        raise ValueError(f'No translated CDS feature found in record {accession!r}')
    return aa

In [4]:
def getCombos(mut_list, wt_seq, mut_acc_file):
    """Build a label for every non-empty combination of the given mutations.

    Each label has the form ``<wt_seq>_<mut>[,<mut>...]``. Labels are ordered
    by combination size (singles first), and within a size in the order
    produced by ``itertools.combinations``. Every label is printed, and the
    full list is written to ``mut_acc_file`` one label per line (the file is
    overwritten).

    Parameters
    ----------
    mut_list : list of str
        Mutation names, e.g. ``['S298A', 'M44I']``.
    wt_seq : str
        Accession (or other identifier) that prefixes every label.
    mut_acc_file : str
        Path of the text file that receives the labels.

    Returns
    -------
    list of str
        The generated labels; empty when ``mut_list`` is empty.
    """
    prefix = f'{wt_seq}_'
    mut_combos = []

    for size in range(1, len(mut_list) + 1):
        for combo in itertools.combinations(mut_list, size):
            acc = prefix + ','.join(combo)
            print(acc)
            mut_combos.append(acc)

    with open(mut_acc_file, 'w') as f:
        f.writelines(f'{mut_acc}\n' for mut_acc in mut_combos)

    return mut_combos

In [5]:
def _ref_site_to_column(aligned_ref, site):
    """Return the alignment column of the ``site``-th (0-based) non-gap residue of ``aligned_ref``."""
    seen = -1
    for column, residue in enumerate(aligned_ref):
        if residue != '-':
            seen += 1
            if seen == site:
                return column
    raise ValueError(
        f'ERROR: site {site + 1} is outside the reference sequence '
        f'(sites are numbered from 1 to {seen + 1})'
    )


def getMut(mut_combos, raccession, output_file):
    """Apply labelled point mutations to a target sequence and append the mutants to a FASTA file.

    Each label has the form ``<accession>_<mut>[,<mut>...]`` where every
    ``<mut>`` looks like ``S298A`` (old residue, 1-based site, new residue).
    Sites are numbered on the reference sequence ``raccession``: the target is
    globally aligned to the reference (BLOSUM45) and each site is mapped to its
    alignment column before the residue is swapped. Gaps are removed from the
    mutated sequence before it is written.

    Parameters
    ----------
    mut_combos : list of str, or path
        The labels themselves, or the path of a text file holding one label
        per line (as written by ``getCombos``).
    raccession : str
        Accession of the reference sequence, passed to ``getAcc``.
    output_file : str
        FASTA file to append to. It is opened in append mode, so running the
        same labels twice produces duplicate records.

    Returns
    -------
    str
        The literal ``'All Done!'``.

    Raises
    ------
    ValueError
        If a label or mutation name is malformed, a site lies outside the
        reference, or the residue found at a site differs from the label.
    """
    switch = 0
    bovine = Protein(getAcc(raccession, switch))
    switch += 1

    # A path means "read the labels from this file"; anything else is taken as
    # the labels themselves. A missing file now raises instead of being ignored.
    if isinstance(mut_combos, (str, os.PathLike)):
        with open(mut_combos) as handle:
            mut_combos = handle.readlines()

    # Loop-invariant setup.
    mutation_regex = re.compile(r'([A-Z])(\d+)([A-Z])')
    substitution_matrix = substitution_matrices.load("BLOSUM45")
    # accession -> (aligned reference, aligned target); avoids re-fetching and
    # re-aligning the same target for every combination.
    alignments = {}

    for lines in mut_combos:
        entry = lines.strip()  # drops the newline kept by readlines()
        if not entry:
            continue

        # The mutation list follows the last underscore, so accessions that
        # themselves contain an underscore (e.g. NM_001014890) stay intact.
        accession, sep, mutsuite = entry.rpartition('_')
        if not sep or not accession or not mutsuite:
            raise ValueError(f'ERROR: Expecting a label of the form <accession>_<mutations>, got {entry!r}')

        if accession != "manual" and accession in alignments:
            aligned_bovine, aligned_wt = alignments[accession]
        else:
            wt = Protein(getAcc(accession, switch))
            alignment, score, start_end_positions = global_pairwise_align_protein(
                bovine, wt, substitution_matrix=substitution_matrix)
            dic = alignment.to_dict()
            aligned_bovine = str(dic[0])
            aligned_wt = str(dic[1])
            if accession != "manual":
                # Manual entries are typed in each time, so they are never reused.
                alignments[accession] = (aligned_bovine, aligned_wt)
        print(aligned_wt)

        mutated = aligned_wt  # mutations accumulate on a copy of the aligned target
        for mutation in mutsuite.split(","):
            mu = mutation_regex.fullmatch(mutation.strip())
            if mu is None:
                raise ValueError("ERROR: Expecting mutation name in the format of XaY where X=old amino acid a=number Y=new amino acid")
            old = mu.group(1)
            new = mu.group(3)
            # subtract one to account for index starting at zero
            mutsite = int(mu.group(2)) - 1

            # Walk the aligned reference to find the column of this site. Counting
            # gaps only in the first `mutsite` columns misses gaps that lie between
            # that prefix and the real column.
            column = _ref_site_to_column(aligned_bovine, mutsite)

            match = mutated[column]
            if match != old:
                print(f'The AA at this site is {match} but the mutation label is {old}')
                raise ValueError("ERROR: The mutation label does not match the sequence")
            mutated = mutated[:column] + new + mutated[column + 1:]

        mutated = mutated.replace('-', '')
        newacc = accession + "_" + mutsuite

        with open(output_file, 'a') as f:
            f.write(f'>{newacc}\n')
            f.write(f'{mutated}\n')
        print(newacc, end='\t\t')
        print(mutated)

    return 'All Done!'

In [6]:
def getMutPred(report_dir, seq_add, seqFileName, metaDataFileName, mafft_exe, mt, seq_type, gap_threshold, top_per_mod, prediction_report_dir):
    """Add sequences to the training alignment with MAFFT and predict with a saved model.

    Runs ``mafft --add <seq_add> --keeplength <seqFileName>``, stores the
    result in ``<report_dir>/<alignment stem>_mutant_aligned.fasta``, reads it
    back with ``read_data``, keeps the rows after the original alignment's row
    count, and writes the model's predictions for those rows to a CSV.

    Parameters
    ----------
    report_dir : str
        Directory that receives the combined alignment.
    seq_add : str
        FASTA file with the sequences to add.
    seqFileName : str
        FASTA alignment the model was trained on.
    metaDataFileName, mt
        Accepted for call compatibility; not read by this function.
    mafft_exe : str
        Path of the MAFFT executable or launcher script.
    seq_type, gap_threshold
        Forwarded to ``read_data`` for both alignments.
    top_per_mod : str
        Path of the saved model, loaded with ``load_obj``.
        NOTE(review): presumably a pickle -- only load model files you trust.
    prediction_report_dir : str
        Despite the name, the path of the output CSV file (overwritten).

    Returns
    -------
    str
        The literal ``'Model Predictions Complete!'``.

    Raises
    ------
    RuntimeError
        If MAFFT exits with a non-zero status.
    """
    load_top_mod = load_obj(top_per_mod)

    # Name the combined alignment after the training alignment's file stem.
    stem = os.path.basename(seqFileName).split(".")[0]
    mut_ali = f'{stem}_mutant_aligned.fasta'
    print(mut_ali)
    # One path for both writing and reading, whether or not report_dir ends in '/'.
    aligned_path = os.path.join(report_dir, mut_ali)

    # MAFFT writes the alignment to stdout; capture it directly instead of
    # passing '>' as an argument, which only redirects when a shell parses it.
    cmd = [mafft_exe, '--add', seq_add, '--keeplength', seqFileName]
    with open(aligned_path, 'w') as aligned_fh:
        aligner = subprocess.run(cmd, stdout=aligned_fh, stderr=subprocess.PIPE)
    if aligner.returncode != 0:
        err = aligner.stderr.decode('utf8', errors='replace')
        raise RuntimeError(f'mafft exited with status {aligner.returncode}:\n{err}')

    mut_test = read_data(aligned_path, seq_type=seq_type, is_main=True, gap_threshold=gap_threshold)
    ref_copy = read_data(seqFileName, seq_type=seq_type, is_main=True, gap_threshold=gap_threshold)
    last_seq = ref_copy.shape[0]
    print(ref_copy.shape)
    # Rows beyond the training alignment's row count are taken to be the added
    # sequences -- assumes read_data keeps file order; TODO confirm.
    mut_test = mut_test.iloc[last_seq:].copy()
    print(mut_test)

    predictions = load_top_mod.predict(mut_test)
    print(predictions)
    mut_fin = pd.DataFrame(index=mut_test.index)
    mut_fin['Prediction'] = predictions
    mut_fin.to_csv(path_or_buf=prediction_report_dir, mode="w")

    return 'Model Predictions Complete!'

In [None]:
# Accession of the sequence to mutate, and of the reference whose numbering the mutation sites use.
wt = 'AB087809'
raccession = 'NM_001014890'
# Point mutations; every non-empty combination of them is generated.
wds_mut_list = ['C162T', 'K67T', 'Y178F', 'L72Q', 'E247M', 'K311I']
# Outputs: the list of combination labels and the mutant sequences (FASTA).
mut_acc_file = 'dan_rer_mut_acc.txt'
mutant_seq_file = 'dan_rer_sws_mut_seqs.fasta'

combos = getCombos(mut_list=wds_mut_list, wt_seq=wt, mut_acc_file=mut_acc_file)
getMut(mut_combos=combos, raccession=raccession, output_file=mutant_seq_file)

In [8]:
# Accession of the sequence to mutate, and of the reference whose numbering the mutation sites use.
wt = 'AB087809'
raccession = 'NM_001014890'
# Point mutations; every non-empty combination of them is generated.
wds_ni_mut_list = ['S298A', 'M44I', 'C162I', 'P215C', 'M257E', 'R314F']
# Outputs: the list of combination labels and the mutant sequences (FASTA).
mut_acc_file = 'dan_rer_ni_mut_acc.txt'
mutant_seq_file = 'dan_rer_sws_ni_mut_seqs.fasta'

combos = getCombos(mut_list=wds_ni_mut_list, wt_seq=wt, mut_acc_file=mut_acc_file)
getMut(mut_combos=combos, raccession=raccession, output_file=mutant_seq_file)

AB087809_S298A
AB087809_M44I
AB087809_C162I
AB087809_P215C
AB087809_M257E
AB087809_R314F
AB087809_S298A,M44I
AB087809_S298A,C162I
AB087809_S298A,P215C
AB087809_S298A,M257E
AB087809_S298A,R314F
AB087809_M44I,C162I
AB087809_M44I,P215C
AB087809_M44I,M257E
AB087809_M44I,R314F
AB087809_C162I,P215C
AB087809_C162I,M257E
AB087809_C162I,R314F
AB087809_P215C,M257E
AB087809_P215C,R314F
AB087809_M257E,R314F
AB087809_S298A,M44I,C162I
AB087809_S298A,M44I,P215C
AB087809_S298A,M44I,M257E
AB087809_S298A,M44I,R314F
AB087809_S298A,C162I,P215C
AB087809_S298A,C162I,M257E
AB087809_S298A,C162I,R314F
AB087809_S298A,P215C,M257E
AB087809_S298A,P215C,R314F
AB087809_S298A,M257E,R314F
AB087809_M44I,C162I,P215C
AB087809_M44I,C162I,M257E
AB087809_M44I,C162I,R314F
AB087809_M44I,P215C,M257E
AB087809_M44I,P215C,R314F
AB087809_M44I,M257E,R314F
AB087809_C162I,P215C,M257E
AB087809_C162I,P215C,R314F
AB087809_C162I,M257E,R314F
AB087809_P215C,M257E,R314F
AB087809_S298A,M44I,C162I,P215C
AB087809_S298A,M44I,C162I,M257E
AB08780

'All Done!'

In [11]:
# Accession of the sequence to mutate, and of the reference whose numbering the mutation sites use.
wt = 'AB087809'
raccession = 'NM_001014890'
# Point mutations; every non-empty combination of them is generated.
nmoc_mut_list = ['F46L', 'E181H', 'T116F', 'C264A', 'A269T', 'P303S']
# Outputs: the list of combination labels and the mutant sequences (FASTA).
mut_acc_file = 'dan_rer_nmoc_mut_acc.txt'
mutant_seq_file = 'dan_rer_sws_nmoc_mut_seqs.fasta'

combos = getCombos(mut_list=nmoc_mut_list, wt_seq=wt, mut_acc_file=mut_acc_file)
getMut(mut_combos=combos, raccession=raccession, output_file=mutant_seq_file)

AB087809_F46L
AB087809_E181H
AB087809_T116F
AB087809_C264A
AB087809_A269T
AB087809_P303S
AB087809_F46L,E181H
AB087809_F46L,T116F
AB087809_F46L,C264A
AB087809_F46L,A269T
AB087809_F46L,P303S
AB087809_E181H,T116F
AB087809_E181H,C264A
AB087809_E181H,A269T
AB087809_E181H,P303S
AB087809_T116F,C264A
AB087809_T116F,A269T
AB087809_T116F,P303S
AB087809_C264A,A269T
AB087809_C264A,P303S
AB087809_A269T,P303S
AB087809_F46L,E181H,T116F
AB087809_F46L,E181H,C264A
AB087809_F46L,E181H,A269T
AB087809_F46L,E181H,P303S
AB087809_F46L,T116F,C264A
AB087809_F46L,T116F,A269T
AB087809_F46L,T116F,P303S
AB087809_F46L,C264A,A269T
AB087809_F46L,C264A,P303S
AB087809_F46L,A269T,P303S
AB087809_E181H,T116F,C264A
AB087809_E181H,T116F,A269T
AB087809_E181H,T116F,P303S
AB087809_E181H,C264A,A269T
AB087809_E181H,C264A,P303S
AB087809_E181H,A269T,P303S
AB087809_T116F,C264A,A269T
AB087809_T116F,C264A,P303S
AB087809_T116F,A269T,P303S
AB087809_C264A,A269T,P303S
AB087809_F46L,E181H,T116F,C264A
AB087809_F46L,E181H,T116F,A269T
AB08780

'All Done!'

In [27]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_wds_aligned_db_fmt_Lambda_Max_2023-08-10_14-38-42/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'wds_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'wds_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}lgbm.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'wds_sws_mutant_predictions.csv'

# FASTA of sequences to add to the training alignment
# (the mutagenesis cell's `mutant_seq_file` could be reused instead).
seq_add = 'dan_rer_sws_mut_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
gap_threshold = 0.6

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

wds_aligned_db_fmt_mutant_aligned.fasta
(891, 355)
                                              p1   p2   p3 p4 p5 p6 p7 p8 p9  \
AB087809_F46L                                NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_E181H                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_T116F                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_C264A                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_A269T                               NaN  NaN  NaN  M  K  F  E  D  F   
...                                          ...  ...  ... .. .. .. .. .. ..   
AB087809_F46L,E181H,T116F,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,E181H,C264A,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,T116F,C264A,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_E181H,T116F,C264A,A269T,P303S       NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,E181H,T116F,C264A,A269T,P303S  NaN  NaN  NaN  M  K  F  

'Model Predictions Complete!'

In [28]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_wds_ni_aligned_db_fmt_Lambda_Max_2023-08-10_12-01-59/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'wds_ni_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'wds_ni_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}lgbm.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'wds_ni_sws_mutant_predictions.csv'

# FASTA of sequences to add to the training alignment
# (the mutagenesis cell's `mutant_seq_file` could be reused instead).
seq_add = 'dan_rer_sws_ni_mut_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
# NOTE(review): 0.61 here, but the untested-sequence cell for this same run directory
# uses 0.60 and the directory is named gapt_60 -- confirm which value is intended.
gap_threshold = 0.61

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

wds_ni_aligned_db_fmt_mutant_aligned.fasta
(791, 356)
                                              p1   p2   p3 p4 p5 p6 p7 p8 p9  \
AB087809_F46L                                NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_E181H                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_T116F                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_C264A                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_A269T                               NaN  NaN  NaN  M  K  F  E  D  F   
...                                          ...  ...  ... .. .. .. .. .. ..   
AB087809_F46L,E181H,T116F,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,E181H,C264A,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,T116F,C264A,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_E181H,T116F,C264A,A269T,P303S       NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,E181H,T116F,C264A,A269T,P303S  NaN  NaN  NaN  M  K  

'Model Predictions Complete!'

In [14]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_nmoc_mut_added_aligned_db_fmt_Lambda_Max_2023-08-10_11-37-58/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'nmoc_mut_added_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'nmoc_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}BayesianRidge.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'nmoc_sws_mutant_predictions.csv'

# FASTA of sequences to add to the training alignment
# (the mutagenesis cell's `mutant_seq_file` could be reused instead).
seq_add = 'dan_rer_sws_nmoc_mut_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
gap_threshold = 0.6

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

nmoc_mut_added_aligned_db_fmt_mutant_aligned.fasta
(891, 355)
                                              p1   p2   p3 p4 p5 p6 p7 p8 p9  \
AB087809_F46L                                NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_E181H                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_T116F                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_C264A                               NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_A269T                               NaN  NaN  NaN  M  K  F  E  D  F   
...                                          ...  ...  ... .. .. .. .. .. ..   
AB087809_F46L,E181H,T116F,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,E181H,C264A,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,T116F,C264A,A269T,P303S        NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_E181H,T116F,C264A,A269T,P303S       NaN  NaN  NaN  M  K  F  E  D  F   
AB087809_F46L,E181H,T116F,C264A,A269T,P303S  NaN  NaN  NaN

'Model Predictions Complete!'

In [None]:
# Accession of the sequence to mutate, and of the reference whose numbering the mutation sites use.
wt = 'U08131'
raccession = 'NM_001014890'
# Point mutations; every non-empty combination of them is generated.
wds_mut_list = ['V162T', 'K67T', 'Y178F', 'L72Q', 'E247M', 'R311I']
# Outputs: the list of combination labels and the mutant sequences (FASTA).
mut_acc_file = 'anol_car_mut_acc.txt'
mutant_seq_file = 'anol_car_lws_mut_seqs.fasta'

combos = getCombos(mut_list=wds_mut_list, wt_seq=wt, mut_acc_file=mut_acc_file)
getMut(mut_combos=combos, raccession=raccession, output_file=mutant_seq_file)

In [19]:
# Accession of the sequence to mutate, and of the reference whose numbering the mutation sites use.
wt = 'U08131'
raccession = 'NM_001014890'
# Point mutations; every non-empty combination of them is generated.
wds_ni_mut_list = ['A298S', 'M44I', 'V162I', 'P215C', 'M257E', 'R314F']
# Outputs: the list of combination labels and the mutant sequences (FASTA).
# The labels file gets its own name (matching the dan_rer cells' *_ni_* pattern) so this
# cell no longer overwrites 'anol_car_mut_acc.txt', which other anol_car cells also write.
mut_acc_file = 'anol_car_ni_mut_acc.txt'
mutant_seq_file = 'anol_car_lws_ni_mut_seqs.fasta'

combos = getCombos(mut_list=wds_ni_mut_list, wt_seq=wt, mut_acc_file=mut_acc_file)
getMut(mut_combos=combos, raccession=raccession, output_file=mutant_seq_file)

U08131_A298S
U08131_M44I
U08131_V162I
U08131_P215C
U08131_M257E
U08131_R314F
U08131_A298S,M44I
U08131_A298S,V162I
U08131_A298S,P215C
U08131_A298S,M257E
U08131_A298S,R314F
U08131_M44I,V162I
U08131_M44I,P215C
U08131_M44I,M257E
U08131_M44I,R314F
U08131_V162I,P215C
U08131_V162I,M257E
U08131_V162I,R314F
U08131_P215C,M257E
U08131_P215C,R314F
U08131_M257E,R314F
U08131_A298S,M44I,V162I
U08131_A298S,M44I,P215C
U08131_A298S,M44I,M257E
U08131_A298S,M44I,R314F
U08131_A298S,V162I,P215C
U08131_A298S,V162I,M257E
U08131_A298S,V162I,R314F
U08131_A298S,P215C,M257E
U08131_A298S,P215C,R314F
U08131_A298S,M257E,R314F
U08131_M44I,V162I,P215C
U08131_M44I,V162I,M257E
U08131_M44I,V162I,R314F
U08131_M44I,P215C,M257E
U08131_M44I,P215C,R314F
U08131_M44I,M257E,R314F
U08131_V162I,P215C,M257E
U08131_V162I,P215C,R314F
U08131_V162I,M257E,R314F
U08131_P215C,M257E,R314F
U08131_A298S,M44I,V162I,P215C
U08131_A298S,M44I,V162I,M257E
U08131_A298S,M44I,V162I,R314F
U08131_A298S,M44I,P215C,M257E
U08131_A298S,M44I,P215C,R314F
U08

'All Done!'

In [22]:
# Accession of the sequence to mutate, and of the reference whose numbering the mutation sites use.
wt = 'U08131'
raccession = 'NM_001014890'
# Point mutations; every non-empty combination of them is generated.
nmoc_mut_list = ['F46L', 'H181E', 'T116F', 'C264A', 'T269A', 'P303S']
# Outputs: the list of combination labels and the mutant sequences (FASTA).
# The labels file gets its own name (matching the dan_rer cells' *_nmoc_* pattern) so this
# cell no longer overwrites 'anol_car_mut_acc.txt', which other anol_car cells also write.
mut_acc_file = 'anol_car_nmoc_mut_acc.txt'
mutant_seq_file = 'anol_car_lws_nmoc_mut_seqs.fasta'

combos = getCombos(mut_list=nmoc_mut_list, wt_seq=wt, mut_acc_file=mut_acc_file)
getMut(mut_combos=combos, raccession=raccession, output_file=mutant_seq_file)

U08131_F46L
U08131_H181E
U08131_T116F
U08131_C264A
U08131_T269A
U08131_P303S
U08131_F46L,H181E
U08131_F46L,T116F
U08131_F46L,C264A
U08131_F46L,T269A
U08131_F46L,P303S
U08131_H181E,T116F
U08131_H181E,C264A
U08131_H181E,T269A
U08131_H181E,P303S
U08131_T116F,C264A
U08131_T116F,T269A
U08131_T116F,P303S
U08131_C264A,T269A
U08131_C264A,P303S
U08131_T269A,P303S
U08131_F46L,H181E,T116F
U08131_F46L,H181E,C264A
U08131_F46L,H181E,T269A
U08131_F46L,H181E,P303S
U08131_F46L,T116F,C264A
U08131_F46L,T116F,T269A
U08131_F46L,T116F,P303S
U08131_F46L,C264A,T269A
U08131_F46L,C264A,P303S
U08131_F46L,T269A,P303S
U08131_H181E,T116F,C264A
U08131_H181E,T116F,T269A
U08131_H181E,T116F,P303S
U08131_H181E,C264A,T269A
U08131_H181E,C264A,P303S
U08131_H181E,T269A,P303S
U08131_T116F,C264A,T269A
U08131_T116F,C264A,P303S
U08131_T116F,T269A,P303S
U08131_C264A,T269A,P303S
U08131_F46L,H181E,T116F,C264A
U08131_F46L,H181E,T116F,T269A
U08131_F46L,H181E,T116F,P303S
U08131_F46L,H181E,C264A,T269A
U08131_F46L,H181E,C264A,P303S
U08

'All Done!'

In [26]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_wds_aligned_db_fmt_Lambda_Max_2023-08-10_14-38-42/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'wds_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'wds_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}lgbm.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'nmoc_lws_mutant_predictions.csv'

# FASTA of sequences to add to the training alignment.
# NOTE(review): this wds run is fed the nmoc mutant FASTA and writes an 'nmoc_lws_...' CSV,
# whereas the matching dan_rer cell uses 'dan_rer_sws_mut_seqs.fasta' and 'wds_sws_...'.
# Confirm that 'anol_car_lws_mut_seqs.fasta' / 'wds_lws_mutant_predictions.csv' was not intended.
seq_add = 'anol_car_lws_nmoc_mut_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
gap_threshold = 0.6

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

wds_aligned_db_fmt_mutant_aligned.fasta
(891, 355)
                                          p1 p2 p3 p4 p5 p6 p7 p8 p9 p10  ...  \
U08131_F46L                                F  A  A  R  R  R  D  S  L   F  ...   
U08131_H181E                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_T116F                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_C264A                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_T269A                               F  A  A  R  R  R  D  S  L   F  ...   
...                                       .. .. .. .. .. .. .. .. ..  ..  ...   
U08131_F46L,H181E,T116F,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,H181E,C264A,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,T116F,C264A,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_H181E,T116F,C264A,T269A,P303S       F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,H181E,T116F,C264A,T269A,P303S  F  A  A  R  R  

'Model Predictions Complete!'

In [25]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_wds_ni_aligned_db_fmt_Lambda_Max_2023-08-10_12-01-59/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'wds_ni_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'wds_ni_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}lgbm.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'wds_ni_lws_mutant_predictions.csv'

# FASTA of sequences to add to the training alignment
# (the mutagenesis cell's `mutant_seq_file` could be reused instead).
seq_add = 'anol_car_lws_ni_mut_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
# NOTE(review): 0.61 here, but the untested-sequence cell for this same run directory
# uses 0.60 and the directory is named gapt_60 -- confirm which value is intended.
gap_threshold = 0.61

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

wds_ni_aligned_db_fmt_mutant_aligned.fasta
(791, 356)
                                          p1 p2 p3 p4 p5 p6 p7 p8 p9 p10  ...  \
U08131_F46L                                F  A  A  R  R  R  D  S  L   F  ...   
U08131_H181E                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_T116F                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_C264A                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_T269A                               F  A  A  R  R  R  D  S  L   F  ...   
...                                       .. .. .. .. .. .. .. .. ..  ..  ...   
U08131_F46L,H181E,T116F,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,H181E,C264A,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,T116F,C264A,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_H181E,T116F,C264A,T269A,P303S       F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,H181E,T116F,C264A,T269A,P303S  F  A  A  R  

'Model Predictions Complete!'

In [24]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_nmoc_mut_added_aligned_db_fmt_Lambda_Max_2023-08-10_11-37-58/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'nmoc_mut_added_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'nmoc_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}BayesianRidge.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'nmoc_lws_mutant_predictions.csv'

# FASTA of sequences to add to the training alignment
# (the mutagenesis cell's `mutant_seq_file` could be reused instead).
seq_add = 'anol_car_lws_nmoc_mut_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
gap_threshold = 0.6

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

nmoc_mut_added_aligned_db_fmt_mutant_aligned.fasta
(891, 355)
                                          p1 p2 p3 p4 p5 p6 p7 p8 p9 p10  ...  \
U08131_F46L                                F  A  A  R  R  R  D  S  L   F  ...   
U08131_H181E                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_T116F                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_C264A                               F  A  A  R  R  R  D  S  L   F  ...   
U08131_T269A                               F  A  A  R  R  R  D  S  L   F  ...   
...                                       .. .. .. .. .. .. .. .. ..  ..  ...   
U08131_F46L,H181E,T116F,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,H181E,C264A,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,T116F,C264A,T269A,P303S        F  A  A  R  R  R  D  S  L   F  ...   
U08131_H181E,T116F,C264A,T269A,P303S       F  A  A  R  R  R  D  S  L   F  ...   
U08131_F46L,H181E,T116F,C264A,T269A,P303S  F  A

'Model Predictions Complete!'

PREDICTIONS FOR UNTESTED VERT OPSINS:

In [None]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_wds_aligned_db_fmt_Lambda_Max_2023-08-10_14-38-42/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'wds_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'wds_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}lgbm.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'wds_untested_vert_seqs_predictions.csv'

# FASTA of sequences to add to the training alignment.
seq_add = 'untested_vert_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
gap_threshold = 0.6

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

In [None]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_wds_ni_aligned_db_fmt_Lambda_Max_2023-08-10_12-01-59/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'wds_ni_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'wds_ni_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}lgbm.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'wds_ni_untested_verts_predictions.csv'

# FASTA of sequences to add to the training alignment.
seq_add = 'untested_vert_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
# NOTE(review): 0.60 here, but the mutant-prediction cells for this same run directory
# use 0.61 -- confirm which value is intended.
gap_threshold = 0.60

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)

In [None]:
# Run directory of the trained model; the model-related paths below are built from it.
report_dir = './gapt_60_nmoc_mut_added_aligned_db_fmt_Lambda_Max_2023-08-10_11-37-58/'
# Primary alignment used for training the model, and its metadata file.
seqFileName = report_dir + 'nmoc_mut_added_aligned_db_fmt.fasta'
metaDataFileName = report_dir + 'nmoc_meta.tsv'
# Top-performing model saved by training (path entered by hand so the cell works on a fresh kernel).
top_per_mod = f'{report_dir}BayesianRidge.pkl'
# CSV file that will receive the predictions.
prediction_report_dir = report_dir + 'nmoc_untested_verts_predictions.csv'

# FASTA of sequences to add to the training alignment.
seq_add = 'untested_vert_seqs.fasta'
# Location of the mafft.bat launcher.
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'

# Phenotype name and sequence type.
mt = 'Lambda_Max'
seq_type = 'aa'
# Positions whose gap proportion across the dataset exceeds this threshold are dropped.
gap_threshold = 0.6

getMutPred(
    report_dir=report_dir,
    seq_add=seq_add,
    seqFileName=seqFileName,
    metaDataFileName=metaDataFileName,
    mafft_exe=mafft_exe,
    mt=mt,
    seq_type=seq_type,
    gap_threshold=gap_threshold,
    top_per_mod=top_per_mod,
    prediction_report_dir=prediction_report_dir,
)