# Instructions for ProtT5 embedding and Kcat / Sc/o prediction

In [1]:
from embedding import Embedding
from Bio import SeqIO
import random
import copy
import joblib
import numpy as np
import pandas as pd
import pickle

2024-04-25 10:42:41.108891: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
def start_embedding(inputFile=None, type='protein'):
    """
    Compute ProtT5 embeddings for the sequences in ``./data/{inputFile}.fasta``.

    Configures an ``Embedding`` object with ``embed='prott5'`` and runs its
    ``embedding()`` method. Results are written by ``Embedding`` into the
    ``data`` directory under the base name ``blind_{inputFile}``.

    Args:
        inputFile (str): Base name of the FASTA file, i.e. without the
            ``./data/`` prefix and without the ``.fasta`` extension.
            Required in practice; the ``None`` default is kept only so the
            signature stays backward-compatible.
        type (str): Embedding level, forwarded to ``Embedding`` as ``level``.
            Default is 'protein'. (The name shadows the ``type`` builtin; it
            is kept because callers may pass it by keyword.)

    Returns:
        None

    Raises:
        ValueError: If ``inputFile`` is None or an empty string.
    """
    # Fail fast: without this check a missing name is silently formatted into
    # the path ("./data/None.fasta") and the error only surfaces later, inside
    # the embedder, with a confusing message.
    if not inputFile:
        raise ValueError(
            "inputFile is required: pass the FASTA base name, "
            "e.g. start_embedding('example') for ./data/example.fasta"
        )

    fasta_path = f'./data/{inputFile}.fasta'
    output_name = f'blind_{inputFile}'  # base name of the output file
    output_dir = "data"  # output directory; must not end with a trailing "/"

    # NOTE(review): the exact output filename is decided by Embedding; the
    # sibling make_preds() reads ./data/blind_{inputFile}_protein_prott5.csv,
    # so presumably level and embed are appended to out_name -- confirm.
    embedder = Embedding(
        in_file=fasta_path,
        out_name=output_name,
        out_dir=output_dir,
        level=type,
        embed='prott5'
    )

    embedder.embedding()

In [3]:
def make_preds(inputFile):
    """
    Predict Kcat and Sc/o from a ProtT5 embedding table and save them as CSV.

    Reads ``./data/blind_{inputFile}_protein_prott5.csv``, treats every column
    except ``ProteinID`` as model input, runs the two pre-trained models stored
    under ``./ridgeModels`` and writes a table with the columns ``ProteinID``,
    ``Kcat`` and ``Sc/o`` to ``./data/{inputFile}-predictions.csv``. The shape
    of the written table and the output path are printed.

    Args:
        inputFile (str): Base name that was used for the embedding step.

    Returns:
        None

    Raises:
        FileNotFoundError: If the embedding CSV or a model file is missing.
        KeyError: If the embedding CSV has no ``ProteinID`` column.
    """
    embedding_table = pd.read_csv(f"./data/blind_{inputFile}_protein_prott5.csv")
    protein_ids = embedding_table['ProteinID']
    features = embedding_table.drop(columns=['ProteinID'])

    # Output column label -> pickled model; dict order fixes the column order.
    model_files = {
        'Kcat': r"./ridgeModels/kcat mean 0.01.pkl",
        'Sc/o': r"./ridgeModels/Sco mean 0.01.pkl",
    }

    # Three separate passes keep the original sequencing: load both models,
    # then predict with both, then wrap both results.
    models = {label: joblib.load(path) for label, path in model_files.items()}
    raw_predictions = {
        label: model.predict(features) for label, model in models.items()
    }
    prediction_frames = [
        pd.DataFrame(values, columns=[label])
        for label, values in raw_predictions.items()
    ]

    result = pd.concat([protein_ids, *prediction_frames], axis=1)
    result.to_csv(f"./data/{inputFile}-predictions.csv", index=False)
    print(result.shape)
    print(f"Output File: ./data/{inputFile}-predictions.csv")

In [4]:
# NOTE: Make sure to put the file in the data folder
# (it is read from ./data/<inputFile>.fasta).
inputFile = "example"

# Step 1: compute the embeddings; step 2: predict from them and write the CSV.
start_embedding(inputFile=inputFile)
make_preds(inputFile=inputFile)

(15, 3)
Output File: ./data/example-predictions.csv
