# Instruction for Prott5 embedding

In [None]:
from embedding import Embedding


In [21]:
# NOTE: it is better to convert this into python file if you don't want to have an interactive section
# in embedding.py there is a line warnings.filterwarnings("ignore")
# which ignore all the warning when running, if you want to turn that off, comment on that line

EMBEDDING_INPUT_FILE  = './data/mutated_iter2.fasta' # input fasta file + its directory
EMBEDDING_OUTPUT_NAME = 'iter3_blind' # output file name
EMBEDDING_OUTPUT_DIR = "data" # output file directory, remember don't put / at the end
Embedder = Embedding(in_file=EMBEDDING_INPUT_FILE,
                     out_name = EMBEDDING_OUTPUT_NAME, 
                     out_dir = EMBEDDING_OUTPUT_DIR,
                     level='protein', # change to "protein" if want to embed in protein level
                     embed='prott5')

Embedder.embedding()

'./data/iter3_blind_protein_prott5.csv'

In [None]:
def start_embedding(iter, inputFile = None):
    
    EMBEDDING_INPUT_FILE  = f'./data/mutated_iter{iter}.fasta' # input fasta file + its directory
    if inputFile != None:
        EMBEDDING_INPUT_FILE  = inputFile
    EMBEDDING_OUTPUT_NAME = f'iter{iter+1}_blind' # output file name
    if inputFile != None:
        EMBEDDING_INPUT_FILE  = f'temp_blind'
    EMBEDDING_OUTPUT_DIR = "data" # output file directory, remember don't put / at the end
    Embedder = Embedding(in_file=EMBEDDING_INPUT_FILE,
                         out_name = EMBEDDING_OUTPUT_NAME, 
                         out_dir = EMBEDDING_OUTPUT_DIR,
                         level='protein', # change to "protein" if want to embed in protein level
                         embed='prott5')

    Embedder.embedding()

In [None]:
from Bio import SeqIO
import random
import copy
import joblib
import numpy as np
import pandas as pd
import pickle

In [None]:
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] # XZBJ added 
exclusion_list = ['B', 'Z', 'J', 'O', 'U', 'X'] # XZBJ added
mutated_sequences = []

In [None]:
def single_replacement(sequence, iterations):
    result = []
    data = sequence.seq
    desc = sequence.description
    new_seq = sequence
    for i in range(iterations):
        new_seq = copy.deepcopy(sequence)
        
        # pick a random position
        pos = random.randint(0, len(data)-1)
        # amino acid list without the current amino acid

        removed_aa = amino_acids.copy()
        if data[pos] not in ['X' ,'Z', 'B','J']:
            removed_aa.remove(data[pos])
        # pick a random base
        new_base = random.choice(removed_aa)
        # replace the base at that position
        # print("Replacing", data[pos], "with", new_base, "at index", pos)
        mutated_data = data[:pos] + new_base + data[pos+1:]

        new_desc = desc + f"|_{i+1}-SR-{data[pos]}{pos}{new_base}"
        new_seq.seq = mutated_data
        new_seq.description = new_desc
        result.append(new_seq)
    
    return result
    

In [None]:
def double_replacement(sequence, iterations):
    result = []
    data = sequence.seq
    desc = sequence.description
    new_seq = sequence
    for i in range(iterations):
        new_seq = copy.deepcopy(sequence)
        new_desc = desc + f"|_{i+1}-DR"
        # pick a random position
        pos1 = random.randint(0, len(data)-1)
        pos2 = random.randint(0, len(data)-1)
        while pos1 == pos2:
            pos2 = random.randint(0, len(data)-1)

        for pos in [pos1, pos2]:
            # amino acid list without the current amino acid
            removed_aa = amino_acids.copy()
            if data[pos] not in ['X' ,'Z', 'B','J']:
                removed_aa.remove(data[pos])
            # pick a random base
            new_base = random.choice(removed_aa)
            # replace the base at that position
    
            #print("Replacing", data[pos], "with", new_base, "at index", pos)
            new_desc += f"-{data[pos]}{pos}{new_base}"
            mutated_data = data[:pos] + new_base + data[pos+1:]

        new_seq.seq = mutated_data
        new_seq.description = new_desc
        result.append(new_seq)

    return result

In [None]:
def swap(sequence, iterations):
    result = []
    data = sequence.seq
    desc = sequence.description
    new_seq = sequence
    for i in range(iterations):
        new_seq = copy.deepcopy(sequence)
        new_data = data
        # pick a random position
        pos1 = random.randint(0, len(data)-1)
        pos2 = random.randint(0, len(data)-1)
        while pos1 == pos2:
            pos2 = random.randint(0, len(data)-1)
        #print("Swapping", new_data[pos1], "and", new_data[pos2], "at indices", pos1, "and", pos2)
        new_desc = desc + f"|_{i+1}-S-{new_data[pos1]}{pos1}{new_data[pos2]}{pos2}"
        new_data = new_data[:pos1] + new_data[pos2] + new_data[pos1+1:]
        new_data = new_data[:pos2] + new_data[pos1] + new_data[pos2+1:]

        new_seq.seq = new_data
        new_seq.description = new_desc
        result.append(new_seq)

    return result

In [None]:
def mutate_sequences(iterations, file):
    mutated_sequences = [] #empty the list
    fasta_file = file
    # Read the FASTA file
    sequences = SeqIO.parse(fasta_file, "fasta")

    for sequence in sequences:
        # Access the sequence ID and sequence data
        sequence_desc = sequence.description
        sequence_data = sequence.seq

        mutated_sequences.extend(single_replacement(sequence,20))
        mutated_sequences.extend(double_replacement(sequence,20))
        mutated_sequences.extend(swap(sequence,10))


    # Specify the path for the new FASTA file
    output_file = f"./data/mutated_iter{iterations}.fasta"

    # Write the sequences to the new FASTA file
    SeqIO.write(mutated_sequences, output_file, "fasta")
    

In [None]:
def grab_top_20(iterations):
    df = pd.read_csv(f"./data/iter{iterations+1}_blind_protein_prott5.csv")
    proteinId = df['ProteinID']
    embeddings = df.drop(columns=['ProteinID'])
    kcat = joblib.load(r"./ridgeModels/kcat mean 0.01.pkl")
    sco = joblib.load(r"./ridgeModels/Sco mean 0.01.pkl")
    kcat_predictions = kcat.predict(embeddings)
    sco_predictions = sco.predict(embeddings)


    kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
    sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
    new_df = pd.concat([proteinId, kcat_predictions, sco_predictions], axis=1)
    new_df.to_csv(f"./data/Iter-{iterations+1}-predictions.csv", index=False)
    print(new_df.shape)
    top_20 = new_df.nlargest(20, 'Kcat')
    top_20 = top_20['ProteinID']
    top_20 = top_20.to_list()
    top_20 = [string.replace(">", "") for string in top_20]
    
    return top_20

In [None]:
def grab_top(iterations, number):
    df = pd.read_csv(f"./data/iter{iterations+1}_blind_protein_prott5.csv")
    proteinId = df['ProteinID']
    embeddings = df.drop(columns=['ProteinID'])
    kcat = joblib.load(r"./ridgeModels/kcat mean 0.01.pkl")
    sco = joblib.load(r"./ridgeModels/Sco mean 0.01.pkl")
    kcat_predictions = kcat.predict(embeddings)
    sco_predictions = sco.predict(embeddings)


    kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
    sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
    new_df = pd.concat([proteinId, kcat_predictions, sco_predictions], axis=1)
    new_df.to_csv(f"./data/Iter-{iterations+1}-predictions.csv", index=False)
    print(new_df.shape)
    top_20 = new_df.nlargest(number, 'Kcat')
    top_20 = top_20['ProteinID']
    top_20 = top_20.to_list()
    top_20 = [string.replace(">", "") for string in top_20]
    
    return top_20

In [None]:
def grab_OG_sep_top(iterations, number, multi=-1):
    df = pd.read_csv(f"./data/iter{iterations+1}_blind_protein_prott5.csv")
    proteinId = df['ProteinID']
    print(proteinId.shape)

    top_ids = ['gi|313473685|dbj|BAJ40208.1|', 'gi|932247975|gb|ALG62823.1|', 'gi|932248269|gb|ALG62965.1|', 'gi|932248239|gb|ALG62950.1|', 'gi|932247944|gb|ALG62808.1|', 'gi|932247942|gb|ALG62807.1|', 'gi|932247938|gb|ALG62805.1|', 'gi|932248235|gb|ALG62948.1|', 'gi|932248233|gb|ALG62947.1|', 'gi|932247946|gb|ALG62809.1|', 'tr|A0A1C3HPS9|A0A1C3HPS9_PUCDI', 'tr|A0A1C3HPT0|A0A1C3HPT0_9POAL', 'YP_009573569.1', 'AFB70630.1', 'AGT56139.1', 'YP_899415.1', 'SCM15160.1', 'SCM15158.1', 'tr|A0A1C3HPM4|A0A1C3HPM4_9POAL', 'tr|A0A6C0SV93|A0A6C0SV93_ERATE']
    kcat = joblib.load(r"./ridgeModels/kcat mean 0.01.pkl")
    sco = joblib.load(r"./ridgeModels/Sco mean 0.01.pkl")

    top_sequences = []
    for i in top_ids:
        new_df = pd.DataFrame()
        for j in proteinId: 
            if i in j:
                print("found")
                data = df.loc[df['ProteinID'] == j]
                new_df = pd.concat([new_df, data], ignore_index=True)


        embeddings = new_df.drop(['ProteinID'], axis=1)
        
        kcat_predictions = kcat.predict(embeddings)
        sco_predictions = sco.predict(embeddings)

        kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
        sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
        new_pred_df = pd.concat([new_df['ProteinID'], kcat_predictions, sco_predictions], axis=1)
        
        print(new_pred_df.shape)
        top_20 = new_pred_df.nlargest(number, 'Kcat') # top x number proteins with highest Kcat
        top_20 = top_20['ProteinID']
        top_20 = top_20.to_list()
        top_20 = [string.replace(">", "") for string in top_20]
        
        # add to top_sequences
        top_sequences.extend(top_20)


    embeddings = df.drop(columns=['ProteinID'])
    kcat_predictions = kcat.predict(embeddings)
    sco_predictions = sco.predict(embeddings)


    kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
    sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
    new_df =pd. DataFrame()
    new_df = pd.concat([proteinId, kcat_predictions, sco_predictions], axis=1)

    # save the predictions of every protein
    if multi >= 0:
        new_df.to_csv(f"./data/Iter-{iterations+1}-predictions-{multi}.csv", index=False)
    else:
        new_df.to_csv(f"./data/Iter-{iterations+1}-predictions.csv", index=False)

    return top_sequences
 

In [None]:
def save_top_seq_fasta(iterations, top_20, multi=-1):
    # Specify the path to the FASTA file
    fasta_file = f"./data/mutated_iter{iterations}.fasta"


    # Read the FASTA file
    sequences = SeqIO.parse(fasta_file, "fasta")

    top_sequences = []
    # Iterate over the sequences
    for sequence in sequences:
        # Access the sequence ID and sequence data
        sequence_desc = sequence.description
        sequence_data = sequence.seq
        if sequence_desc not in top_20:
            continue
        # Do something with the sequence ID and sequence data
        #print(f"Sequence ID: {sequence_desc}")
        #print(f"Sequence Data: {sequence_data}")
        top_sequences.append(sequence)

    # Specify the path for the new FASTA file
    if multi>=0:
        output_file = f"./data/top_sequences_iter{iterations+1}-{multi}.fasta"
    else:
        output_file = f"./data/top_sequences_iter{iterations+1}.fasta"
    # Write the sequences to the new FASTA file
    SeqIO.write(top_sequences, output_file, "fasta")


In [None]:
# Grab top 20 from 100 sequences 

read the top predictions 
fetch the top 20
fetch last top 20


In [None]:
# Specify the path to the FASTA file
fasta_file = r"/users/jadvg3/Downloads/pt5_embedding/pt5_embedding/data/top_sequences.fasta"

# Read the FASTA file
sequences = SeqIO.parse(fasta_file, "fasta")

top_sequences = []
seq_id_list = [] 
# Iterate over the sequences
for sequence in sequences:
    # Access the sequence ID and sequence data
    sequence_desc = sequence.description
    sequence_data = sequence.seq
    seq_id = sequence.id


    seq_id_list.append(seq_id)

unique_seq_ids = set(seq_id_list)

if len(unique_seq_ids) == len(seq_id_list):
    print("All seq_id values are unique.")
else:
    print("There are duplicate seq_id values.")
print (seq_id_list)

In [31]:
start_embedding(9)

In [None]:
#start predicting and grab top 5

# Load models using pickle instead of joblib
kcat = joblib.load(r"./ridgeModels/kcat mean 0.01.pkl")
sco = joblib.load(r"./ridgeModels/Sco mean 0.01.pkl")

for i in range(10):
    iterations = i

    mutated_sequences = [] #empty the list
    temp_sequences = [] #empty the list
    
    fasta_file = f"./data/top_sequences_iter{iterations}.fasta"
    if i==0:
        fasta_file = f"./data/top_sequences.fasta"
    # Read the FASTA file
    sequences = SeqIO.parse(fasta_file, "fasta")

    for sequence in sequences:
        # Access the sequence ID and sequence data
        sequence_desc = sequence.description
        sequence_data = sequence.seq
        
        print(i)
        temp_sequences.append(sequence)

        temp_sequences.extend(single_replacement(sequence,20))

        temp_sequences.extend(double_replacement(sequence,20))

        temp_sequences.extend(swap(sequence,10))

        # Write the sequences to the new FASTA file
        SeqIO.write(temp_sequences, './data/tempfile.fasta', "fasta")

        start_embedding(iterations, './data/tempfile.fasta')

        df = pd.read_csv(f"./data/iter{iterations+1}_blind_protein_prott5.csv")
        proteinId = df['ProteinID']
        embeddings = df.drop(columns=['ProteinID'])
        kcat_predictions = kcat.predict(embeddings)
        sco_predictions = sco.predict(embeddings)

        kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
        sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
        new_df = pd.concat([proteinId, kcat_predictions, sco_predictions], axis=1)
        new_df.to_csv(f"./data/Iter-{iterations+1}-predictions.csv", index=False)

        top_5 = new_df.nlargest(5, 'Kcat')
        top_5 = top_5['ProteinID']
        top_5 = top_5.to_list()
        top_5 = [string.replace(">", "") for string in top_5]


        # Specify the path to the FASTA file
        temp_fasta_file = f"./data/tempfile.fasta"

        # Read the FASTA file
        tsequences = SeqIO.parse(temp_fasta_file, "fasta")

        # Iterate over the sequences
        for tsequence in tsequences:
            # Access the sequence ID and sequence data
            sequence_desc = tsequence.description
            sequence_data = tsequence.seq
            if sequence_desc not in top_5:
                continue
            mutated_sequences.append(tsequence)


    # Specify the path for the new FASTA file
    output_file = f"./data/mutated_iter{iterations}.fasta"

    # Write the sequences to the new FASTA file
    SeqIO.write(mutated_sequences, output_file, "fasta")
    
    mutate_sequences(iterations, output_file)
    
    start_embedding(iterations)
    
    top_20 = grab_top_20(iterations)
    
    save_top_seq_fasta(iterations, top_20)


In [14]:

# Load models using pickle instead of joblib
kcat = joblib.load(r"./ridgeModels/kcat mean 0.01.pkl")
sco = joblib.load(r"./ridgeModels/Sco mean 0.01.pkl")

iterations = 0

mutated_sequences = [] #empty the list
temp_sequences = [] #empty the list

fasta_file = f"./data/top_sequences.fasta"
# Read the FASTA file
sequences = SeqIO.parse(fasta_file, "fasta")

for sequence in sequences:
    temp_sequences = []
    # Access the sequence ID and sequence data
    sequence_desc = sequence.description
    sequence_data = sequence.seq

    temp_sequences.append(sequence)

    temp_sequences.extend(single_replacement(sequence,20))

    temp_sequences.extend(double_replacement(sequence,20))

    temp_sequences.extend(swap(sequence,10))

    # Write the sequences to the new FASTA file
    SeqIO.write(temp_sequences, './data/tempfile.fasta', "fasta")

    start_embedding(iterations, './data/tempfile.fasta')

    df = pd.read_csv(f"./data/temp_blind_protein_prott5.csv")
    proteinId = df['ProteinID']
    embeddings = df.drop(columns=['ProteinID'])
    kcat_predictions = kcat.predict(embeddings)
    sco_predictions = sco.predict(embeddings)

    kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
    sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
    new_df = pd.concat([proteinId, kcat_predictions, sco_predictions], axis=1)
    new_df.to_csv(f"./data/Iter-{iterations+1}-predictions.csv", index=False)

    top_5 = new_df.nlargest(5, 'Kcat')
    top_5 = top_5['ProteinID']
    top_5 = top_5.to_list()
    top_5 = [string.replace(">", "") for string in top_5]


    # Specify the path to the FASTA file
    temp_fasta_file = f"./data/tempfile.fasta"

    # Read the FASTA file
    tsequences = SeqIO.parse(temp_fasta_file, "fasta")

    # Iterate over the sequences
    for tsequence in tsequences:
        # Access the sequence ID and sequence data
        sequence_desc = tsequence.description
        sequence_data = tsequence.seq
        if sequence_desc not in top_5:
            continue
        mutated_sequences.append(tsequence)

output_file = f"./data/mutated_iter{iterations}.fasta"

# Write the sequences to the new FASTA file
SeqIO.write(mutated_sequences, output_file, "fasta")

start_embedding(iterations)

top_100 = grab_top(iterations, 100)

save_top_seq_fasta(iterations, top_100)




100

In [39]:
for i in range(1,10):
    print(i)
    mutate_sequences(i, f"./data/top_sequences_iter{i}.fasta")

    start_embedding(i)

    top_seq = grab_OG_sep_top(i, 5)

    save_top_seq_fasta(i, top_seq)
    

5
(5000,)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found


found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

In [15]:
for j in range(4):
    print(j)
    # Load models using pickle instead of joblib
    kcat = joblib.load(r"./ridgeModels/kcat mean 0.01.pkl")
    sco = joblib.load(r"./ridgeModels/Sco mean 0.01.pkl")

    iterations = 0

    mutated_sequences = [] #empty the list
    temp_sequences = [] #empty the list

    fasta_file = f"./data/top_sequences.fasta"
    # Read the FASTA file
    sequences = SeqIO.parse(fasta_file, "fasta")

    for sequence in sequences:
        temp_sequences = []
        # Access the sequence ID and sequence data
        sequence_desc = sequence.description
        sequence_data = sequence.seq

        temp_sequences.append(sequence)

        temp_sequences.extend(single_replacement(sequence,20))

        temp_sequences.extend(double_replacement(sequence,20))

        temp_sequences.extend(swap(sequence,10))

        # Write the sequences to the new FASTA file
        SeqIO.write(temp_sequences, './data/tempfile.fasta', "fasta")

        start_embedding(iterations, './data/tempfile.fasta')

        df = pd.read_csv(f"./data/temp_blind_protein_prott5.csv")
        proteinId = df['ProteinID']
        embeddings = df.drop(columns=['ProteinID'])
        kcat_predictions = kcat.predict(embeddings)
        sco_predictions = sco.predict(embeddings)

        kcat_predictions = pd.DataFrame(kcat_predictions, columns=['Kcat'])
        sco_predictions = pd.DataFrame(sco_predictions, columns=['Sc/o'])
        new_df = pd.concat([proteinId, kcat_predictions, sco_predictions], axis=1)
        new_df.to_csv(f"./data/Iter-{iterations+1}-predictions.csv", index=False)

        top_5 = new_df.nlargest(5, 'Kcat')
        top_5 = top_5['ProteinID']
        top_5 = top_5.to_list()
        top_5 = [string.replace(">", "") for string in top_5]


        # Specify the path to the FASTA file
        temp_fasta_file = f"./data/tempfile.fasta"

        # Read the FASTA file
        tsequences = SeqIO.parse(temp_fasta_file, "fasta")

        # Iterate over the sequences
        for tsequence in tsequences:
            # Access the sequence ID and sequence data
            sequence_desc = tsequence.description
            sequence_data = tsequence.seq
            if sequence_desc not in top_5:
                continue
            mutated_sequences.append(tsequence)

    output_file = f"./data/mutated_iter{iterations}.fasta"

    # Write the sequences to the new FASTA file
    SeqIO.write(mutated_sequences, output_file, "fasta")

    start_embedding(iterations)

    top_100 = grab_top(iterations, 100)

    save_top_seq_fasta(iterations, top_100)

    for i in range(1,10):
        print(i)
        mutate_sequences(i, f"./data/top_sequences_iter{i}.fasta")

        start_embedding(i)

        top_seq = grab_OG_sep_top(i, 5, j+2)

        save_top_seq_fasta(i, top_seq, j+2)
        
    

0
9
(5000,)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
(250, 3)
2
9
(5000,)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
found
f

found
found
found
found
found
(250, 3)
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
f

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
(250, 3)
