In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


<h2><b> Prompt engineering is used to assess the LLM on binding-affinity label prediction from a compound's SMILES string and a protein sequence. The predicted labels are written to a file from which accuracy and validity scores can be calculated. </b></h2>

---



# Create Prompts

In [None]:
def create_binding_affinity_prompt(examples):
    """Assemble a few-shot binding-affinity prompt in Llama-2 chat markup.

    Args:
        examples: sequence of rows where index 0 is the SMILES string,
            index 1 the protein sequence and index 2 the affinity label.

    Returns:
        The system header followed by one solved example per row. Consecutive
        examples are separated by an ``[INST]`` line; nothing is appended
        after the final example.
    """
    system_header = """[INST] <<SYS>>
    You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES) and
    given protein sequence (Sequence) using your experienced chemical property prediction knowledge.
    Please strictly follow the format, no other information can be provided.
    Given the SMILES string of compound and Protein Sequence, predict the binding affinity based on training compound-protein pair dataset.
    Please answer with one word: High, Medium, Low corresponding to binding affinity label.
    <</SYS>>\n"""
    solved_shots = [
        f"SMILES: {shot[0]}\nSequence: {shot[1]}\n[/INST]\nLabel: {shot[2]}\n"
        for shot in examples
    ]
    # Joining places the "[INST]" opener between shots only, never after the last one.
    return system_header + "[INST]\n".join(solved_shots)

def create_bace_prompt(input_smiles, pp_examples):
    """Build a few-shot BACE-1 inhibition prompt that ends with the query molecule.

    Args:
        input_smiles: SMILES string of the molecule the model should classify.
        pp_examples: sequence of solved examples; index 0 of each row is the
            SMILES string and the last element is the Yes/No answer.

    Returns:
        The instruction line, one "SMILES / BACE-1 Inhibit" pair per example,
        and finally the query SMILES with an empty "BACE-1 Inhibit:" slot for
        the model to complete.
    """
    # Trailing newline keeps the instruction from running into the first "SMILES:" line.
    prompt = "You are an expert chemist tasked with predicting molecule properties based on chemical structure. Given a molecule's SMILES string, predict if it inhibits (Yes) the Beta-site Amyloid Precursor Protein Cleaving Enzyme 1 (BACE1) or not (No) and provide response as Yes or No.\n"
    for example in pp_examples:
        prompt += f"SMILES: {example[0]}\nBACE-1 Inhibit: {example[-1]}\n"
    # The query molecule was previously never added, so the model had nothing to answer.
    prompt += f"SMILES: {input_smiles}\nBACE-1 Inhibit:\n"
    return prompt

# Generate Train and Test for Binding Affinity Data

In [None]:
import random

# Random sampling
def generate_train_random(binding_affinity_train, sample_size):
    """Draw a class-balanced, shuffled list of training examples.

    Args:
        binding_affinity_train: DataFrame with "SMILES", "Sequence" and
            "Label" columns.
        sample_size: number of rows to draw for each of the labels
            'High', 'Medium' and 'Low'.

    Returns:
        A shuffled list of ``[smiles, sequence, label]`` lists containing
        ``3 * sample_size`` entries.
    """
    label_column = binding_affinity_train["Label"]
    # Tiers are drawn in the fixed order High, Medium, Low.
    drawn_frames = [
        binding_affinity_train[label_column == tier].sample(sample_size)
        for tier in ('High', 'Medium', 'Low')
    ]

    print("Parsing SMILES, Sequence, Label")

    smiles_per_tier = [frame["SMILES"].to_list() for frame in drawn_frames]
    proteins_per_tier = [frame["Sequence"].to_list() for frame in drawn_frames]
    affinity_per_tier = [frame["Label"].to_list() for frame in drawn_frames]

    print("Successfully Parsed")

    sampled_list = [
        [smiles, protein, affinity]
        for tier_smiles, tier_proteins, tier_affinity in zip(
            smiles_per_tier, proteins_per_tier, affinity_per_tier
        )
        for smiles, protein, affinity in zip(tier_smiles, tier_proteins, tier_affinity)
    ]

    random.shuffle(sampled_list)

    return sampled_list

def generate_test_random(binding_affinity_test, sample_size):
    """Return ``sample_size`` rows drawn with a fixed seed, as plain lists.

    Args:
        binding_affinity_test: DataFrame to sample from; all of its columns
            are kept, in their original order.
        sample_size: number of rows to draw.

    Returns:
        A list of row lists. The draw uses ``random_state=42``, so repeated
        calls on the same frame return the same rows.
    """
    return binding_affinity_test.sample(n=sample_size, random_state=42).values.tolist()

def generate_test_proportional(binding_affinity_test, sample_size):
    """Placeholder with no implementation yet; always returns ``None``.

    TODO: neither argument is used. Callers currently receive ``None``.
    """
    return None

# Introduce Noise

In [None]:
def add_noise(data, x):
    """Swap the SMILES and Sequence values on a random fraction of rows.

    Args:
        data: DataFrame with 'SMILES' and 'Sequence' columns. It is modified
            in place.
        x: fraction of rows to corrupt, as a decimal (e.g. 0.1 for 10%).
            ``int(x * len(data))`` rows are chosen without replacement.

    Returns:
        The same ``data`` object, with the chosen rows' SMILES and Sequence
        values exchanged.

    Raises:
        ValueError: from ``np.random.choice`` when more rows are requested
            than ``data`` contains (x > 1).
    """
    import numpy as np  # local import: the notebook only imports numpy in a later cell

    sample_size = int(x * len(data))
    if sample_size <= 0:
        # Nothing to corrupt.
        return data

    # These are row POSITIONS, so they must go through .iloc. Using them as
    # .loc labels breaks on any frame whose index is not 0..n-1.
    row_positions = np.random.choice(len(data), size=sample_size, replace=False)
    smiles_col = data.columns.get_loc('SMILES')
    sequence_col = data.columns.get_loc('Sequence')

    # Snapshot the values in swapped column order, then write them back once.
    swapped_values = data.iloc[row_positions, [sequence_col, smiles_col]].to_numpy()
    data.iloc[row_positions, [smiles_col, sequence_col]] = swapped_values

    return data

## Generate train and test with fixed sample size

In [None]:
import pandas as pd

def create_train_test_list(train_size, test_size):
    """Load the binding-affinity CSVs from Drive and sample train/test lists.

    Args:
        train_size: rows to draw per label for the training list
            (passed to ``generate_train_random``).
        test_size: total rows to draw for the test list
            (passed to ``generate_test_random``).

    Returns:
        A ``(train_list, test_list)`` tuple.
    """
    print("Entering function")
    # Both CSVs name the column 'Canonical SMILE'; the rest of the notebook expects 'SMILES'.
    smiles_alias = {'Canonical SMILE': 'SMILES'}

    train_frame = pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_train.csv')
    train_frame = train_frame.rename(columns=smiles_alias)
    print("Read train data")

    test_frame = pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv')
    test_frame = test_frame.rename(columns=smiles_alias)

    print("Creating lists")

    sampled_train = generate_train_random(train_frame, train_size)
    sampled_test = generate_test_random(test_frame, test_size)
    return sampled_train, sampled_test

# Install required libraries

In [None]:
!pip install langchain CTransformers unstructured sentence-transformers faiss-cpu transformers pathlib huggingface-hub

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting CTransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unstructured
  Downloading unstructured-0.13.6-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━

# Create random labels based on training data label frequency

In [None]:
import numpy as np

def random_model_prompt(test_count):
    """Build a random-guess baseline: test rows paired with randomly drawn labels.

    Labels are drawn with probabilities equal to each label's frequency in the
    TRAINING CSV. The rows themselves are sampled from the test CSV.

    Args:
        test_count: number of test rows to sample and label.

    Returns:
        A tuple ``(output_list, test_prompt)``:
        - ``output_list``: list of ``[sequence, smiles, random_label]`` rows.
          Sequence comes first because ``random_accuracy_calculator`` matches
          ``entry[0]`` against "Sequence" and ``entry[1]`` against "SMILES".
        - ``test_prompt``: instruction text followed by one
          SMILES / Sequence / Label block per sampled row.
    """
    train_data = pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_train.csv').rename(columns={'Canonical SMILE': 'SMILES'})

    label_counts = train_data["Label"].value_counts()
    probabilities = label_counts / len(train_data)

    print(probabilities)

    # Take names and probabilities from the same Series so they cannot get out
    # of step. A hard-coded label order only matches while value_counts()
    # happens to sort the classes that way.
    labels = probabilities.index.to_list()
    probabilities_list = probabilities.to_list()

    test_data = pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv').rename(columns={'Canonical SMILE': 'SMILES'})

    # Select the two input columns by name: positions 0 and 1 of the full CSV
    # are not SMILES/Sequence (other cells read them at positions 2 and 6).
    X_test = test_data[["SMILES", "Sequence"]]

    test_list = generate_test_random(X_test, test_count)

    entries = np.random.choice(labels, size=test_count, p=probabilities_list)

    output_list = []
    test_prompt = "You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES) and given protein sequence (Sequence) using your experienced chemical property prediction knowledge. Please strictly follow the format, no other information can be provided. Given the SMILES string of compound and Protein Sequence, predict the binding affinity based on training compound-protein pair dataset. Please answer with one word: High, Medium, Low corresponding to binding affinity label.\n"
    for (smiles, sequence), random_label in zip(test_list, entries):
        test_prompt += f"SMILES: {smiles}\nSequence: {sequence}\nLabel:{random_label}\n"
        output_list.append([sequence, smiles, random_label])

    return output_list, test_prompt

# Build a 100-row random baseline; the call also prints the label frequencies it drew from.
random_list, random_prompt = random_model_prompt(100)

Label
Medium    0.549768
Low       0.393653
High      0.056580
Name: count, dtype: float64


In [None]:
# Plot accuracy of labels against test sample size

In [None]:
import matplotlib.pyplot as plt

# Plot accuracies with increasing test sample size
def accuracy_graph(test_sample_size_list):
    """Plot random-baseline accuracy against the number of test rows sampled.

    Args:
        test_sample_size_list: test sample sizes to evaluate; used as the
            x-axis values.

    For each size a fresh random baseline is generated with
    ``random_model_prompt`` and scored with ``random_accuracy_calculator``.
    The resulting curve is shown with ``plt.show()``; nothing is returned.
    """
    accuracy_list = []
    for test_size in test_sample_size_list:
        baseline_rows, _unused_prompt = random_model_prompt(test_size)
        # Score against the number of rows actually produced for this size.
        accuracy_list.append(random_accuracy_calculator(baseline_rows, len(baseline_rows)))

    plt.plot(test_sample_size_list, accuracy_list)

    plt.xlabel('Test Sample Size')
    plt.ylabel('Accuracy (%)')
    plt.title('Plot of Accuracies with Increasing Test Sample Size')

    plt.show()

def random_accuracy_calculator(predictions, total_count, test_data=None):
    """Return the percentage of predictions whose label matches the test set.

    Args:
        predictions: iterable of ``[sequence, smiles, predicted_label]`` rows.
        total_count: denominator for the accuracy.
        test_data: optional DataFrame with "Sequence", "SMILES" and "Label"
            columns to score against. When omitted, the test CSV is read from
            Drive. Passing it in avoids re-reading the file on every call.

    Returns:
        Accuracy as a float percentage in [0, 100]. Returns 0.0 when
        ``total_count`` is zero. A prediction whose (sequence, smiles) pair is
        not found in ``test_data`` counts as incorrect.
    """
    if not total_count:
        # Avoid ZeroDivisionError for an empty evaluation.
        return 0.0

    if test_data is None:
        test_data = pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv').rename(columns={'Canonical SMILE': 'SMILES'})

    correct_prediction_count = 0
    for entry in predictions:
        matched_row = test_data[(test_data["Sequence"] == entry[0]) &
                                (test_data["SMILES"] == entry[1])]

        if matched_row.empty:
            # No ground-truth row for this pair: count it as a miss, do not crash.
            continue

        # If the pair occurs more than once, the first match decides.
        if str(matched_row["Label"].iloc[0]) == entry[2]:
            correct_prediction_count += 1

    accuracy = float(correct_prediction_count / total_count) * 100

    return accuracy

## Create model using embeddings

In [None]:
# Import required libraries and modules
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
import os


def textTokenizer(prompt_path):
    """Embed every line of a text file and index the result in a FAISS store.

    Args:
        prompt_path: path of the text file to read; each line, stripped of
            surrounding whitespace, becomes one indexed text.

    Returns:
        The FAISS vector store built from those lines.
    """
    with open(prompt_path, "r") as handle:
        text_lines = [raw_line.strip() for raw_line in handle]

    # Sentence-transformer embeddings, computed on CPU.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': "cpu"},
    )

    return FAISS.from_texts(text_lines, embedder)

# Function to create a conversational retrieval chain model
def createModel(prompt_path, model_name, temperature, kval, maxTokens, token):
    """Build a conversational retrieval chain over the lines of ``prompt_path``.

    Args:
        prompt_path: text file whose lines are embedded and indexed by
            ``textTokenizer``.
        model_name: model path or identifier passed to ``CTransformers``.
        temperature: sampling temperature placed in the LLM config.
        kval: number of indexed texts the retriever returns per query.
        maxTokens: value used for the config's ``max_new_tokens``.
        token: forwarded to ``CTransformers`` as ``token``.

    Returns:
        A ``ConversationalRetrievalChain`` wired to the LLM, the retriever and
        a fresh chat-history memory.
    """
    vector_store = textTokenizer(prompt_path)

    # NOTE(review): `token` and `n_ctx` are passed as plain keyword arguments;
    # confirm the wrapper honours them rather than ignoring them.
    generation_config = {'max_new_tokens': maxTokens, 'temperature': temperature}
    llm = CTransformers(
        model=model_name,
        model_type="llama",
        token=token,
        config=generation_config,
        n_ctx=4096,
    )

    # Each chain gets its own memory object for the chat history.
    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    retriever = vector_store.as_retriever(search_kwargs={"k": kval})

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type='stuff',
        retriever=retriever,
        memory=chat_memory,
    )

## Run the Model With and Without Embeddings

In [None]:
from google.colab import userdata
import time

# Path of the Llama-2 chat weights file on the mounted Drive (a single string here).
# NOTE: a later cell rebinds `model_engine` to a list of paths; functions in this
# cell that read the global will see whichever assignment ran last.
model_engine = "/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q2_K.bin"
# Not referenced by any function visible in this notebook.
detail_save_folder = '/content/' # path to save the generated result

# Token is read from the Colab user-data store under the name 'HF_TOKEN' (not hard-coded).
hf_token = userdata.get('HF_TOKEN')

def model_with_embeddings(model_name, test_prompt, file_path):
    """Query a retrieval-augmented chain with one prompt and log the result.

    Args:
        model_name: model path or identifier handed to ``createModel``.
        test_prompt: full prompt text to send to the chain.
        file_path: log file; the prompt, the answer and the elapsed time are
            appended to it.

    A new chain (and therefore a new, empty chat memory) is built on every
    call, so earlier prompts do not leak into later ones. Timing covers only
    the chain call, not model construction.
    """
    # Use the caller's model_name. The global `model_engine` is rebound to a
    # list by a later cell, which would break this call.
    chain = createModel('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/preprocessed.tsv', model_name, 0.01, 2, 4096, hf_token)

    start_time = time.time()
    result = chain(test_prompt)
    time_taken = time.time() - start_time

    # A chain call returns a mapping of inputs/outputs; the generated text is
    # under "answer". Accept a plain value too so that concatenation below
    # never fails on a non-string.
    response = result["answer"] if isinstance(result, dict) else str(result)

    print("Model Response:", response)
    print("Time taken:", time_taken)

    with open(file_path, 'a') as file:
        file.write(test_prompt + response + '\n')
        file.write("Time taken: {:.2f} seconds\n".format(time_taken))


def create_binding_affinity_test_prompt_1(test_list):
    """Send one zero-shot binding-affinity prompt per row through the retrieval chain.

    Args:
        test_list: sequence of rows; index 0 is used as the SMILES string and
            index 1 as the protein sequence.
            NOTE(review): rows produced by ``generate_test_random`` on the full
            test CSV are read at positions 2 and 6 elsewhere in this notebook;
            confirm the row layout passed here.

    Each response is appended to '/content/smallersamplesize_no_embed_2k.txt'
    by ``model_with_embeddings``.
    """
    # The system block is identical for every row, so build it once.
    system_block = """[INST] <<SYS>>
        You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES) and given protein sequence (Sequence) using your experienced chemical property prediction knowledge. Please strictly follow the format, no other information can be provided. Please answer with one word: High, Medium, Low corresponding to binding affinity label.
        <</SYS>>\n"""
    for row in test_list:
        prompt = system_block + f"SMILES: {row[0]}\nSequence: {row[1]}\n[/INST]\nLabel:\n"
        model_with_embeddings(model_engine, prompt, '/content/smallersamplesize_no_embed_2k.txt')

In [None]:
from google.colab import userdata
import time

# Candidate weight files on the mounted Drive; callers pick one by index.
# NOTE: this rebinds `model_engine` from the single string assigned in the previous
# cell to a list, so code that passes the bare global as a model path will break
# once this cell has run.
model_engine = ["/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q2_K.bin", "/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q4_0.bin"]
# Not referenced by any function visible in this notebook.
detail_save_folder = '/content/' # path to save the generated result

# Token is read from the Colab user-data store under the name 'HF_TOKEN' (not hard-coded).
hf_token = userdata.get('HF_TOKEN')

# Loaded models keyed by model path, so the weights are read once per path
# instead of once per prompt.
_LLM_CACHE = {}


def _get_cached_llm(model_name):
    """Return the CTransformers LLM for ``model_name``, loading it on first use only."""
    if model_name not in _LLM_CACHE:
        _LLM_CACHE[model_name] = CTransformers(
            model=model_name, model_type="llama",
            config={'max_new_tokens': 128, 'temperature': 0.01}, token=hf_token, n_ctx=4096)
    return _LLM_CACHE[model_name]


def model_without_embeddings(model_name, test_prompt, smiles, sequence, label, file_path):
    """Send one prompt straight to the LLM (no retrieval) and log the outcome.

    Args:
        model_name: model path or identifier for ``CTransformers``.
        test_prompt: full prompt text to send.
        smiles: accepted for interface compatibility; not used.
        sequence: accepted for interface compatibility; not used.
        label: ground-truth label, written to the log after the response.
        file_path: log file; prompt, response, elapsed time and the actual
            label are appended to it.

    Timing covers only the generation call, not model loading.
    """
    llm = _get_cached_llm(model_name)

    start_time = time.time()
    response = llm(test_prompt)
    time_taken = time.time() - start_time

    print("Model Response:", response)
    print("Time taken:", time_taken)

    with open(file_path, 'a') as file:
        file.write(test_prompt + response + '\n')
        file.write("Time taken: {:.2f} seconds\n".format(time_taken))
        # Formatting (not "+") so a non-string label, e.g. a missing value, does not raise.
        file.write(f"Actual Label: {label}\n")


def create_binding_affinity_test_prompt_2(test_list):
    """Run one zero-shot binding-affinity prompt per test row, without retrieval.

    Args:
        test_list: sequence of rows; position 2 is read as the SMILES string,
            position 6 as the protein sequence and position 7 as the label.

    Each row is sent to the first entry of ``model_engine`` and the result is
    appended to '/content/smallersamplesize_no_embed_2k.txt'.
    """
    system_block = """[INST] <<SYS>>
        You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES)
        and given protein sequence (Sequence) using your experienced chemical property prediction knowledge.
        Please strictly follow the format, no other information can be provided.
        Please answer with one of the 3 labels: High OR Medium OR Low, corresponding to binding affinity label.
        No other information can be provided.
        <</SYS>>\n"""
    for row in test_list:
        smiles, sequence, label = row[2], row[6], row[7]
        full_prompt = system_block + f"SMILES: {smiles}\nSequence: {sequence}\n[/INST]\nLabel:\n"
        model_without_embeddings(
            model_engine[0],
            full_prompt,
            smiles,
            sequence,
            label,
            '/content/smallersamplesize_no_embed_2k.txt',
        )

In [None]:
# 20 rows per label for the train list (High/Medium/Low, so 60 in total) and 20 test rows.
train_list, test_list = create_train_test_list(20, 20)


Entering function
Read train data
Creating lists
Parsing SMILES, Sequence, Label
Successfully Parsed


In [None]:
# Runs the LLM once per sampled test row; responses are appended to
# /content/smallersamplesize_no_embed_2k.txt.
create_binding_affinity_test_prompt_2(test_list)



Model Response: High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High

Time taken: 386.18470454216003
Model Response: High

The compound you provided has a high binding affinity for the protein sequence. The molecule's structure and functional groups suggest a strong interaction with the protein, resulting in a high binding affinity.
Time taken: 273.4255323410034




Model Response: High

The compound you provided has a high binding affinity for the protein sequence. The molecule's shape and size are well-suited for binding to the protein, with good hydrophobic and hydrogen bonding interactions. The molecular interactions. The molecules. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The molecular interactions. The mole
Time taken: 324.43233251571655




Model Response: High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High

Time taken: 380.25996589660645
Model Response: High
Time taken: 156.41199946403503
Model Response: High

The compound you provided has a high binding affinity for the protein sequence. The molecule's structure and functional groups suggest a strong interaction with the protein, resulting in a high binding affinity. The presence of aromatic rings and polar functional groups on the molecule will also contribute to this prediction.
Time taken: 263.31946897506714
Model Response: High

Based on my knowledge of chemistry and the SMILES string you provided, I predict that this compound will have a high binding affinity for the protein sequence you provided. The 



Model Response: High

The compound you provided has a high binding affinity for the protein sequence MTRDEALPDSHSAQDFYENYEPKEILGRGVSSVVRRCIHKPTSQEYAVKVIDVTGGGSFSPEEVRELREATLKEVREATLKEVREATLKEVREATLKEVREATLKEVREATLKEVREATLKEVREATLKEVREATLKEVRKEVREATLKEVREATKETALKEVREATLKEVREATLKE
Time taken: 319.6832675933838




Model Response: High

The compound you provided has a strong binding affinity for the protein sequence. The SMILES string you provided represents a molecule with a high degree of polarity and aromaticity, which are desirable properties for protein-ligand interactions. Additionally, the molecule has a good hydropharmory structure that many hydropharmchelong-a number of a relatively few rotationality and acidic electrons donated rings and alkyl and efficient pi-
a number of a high electron-a number of a reasonable hydropharmonic and polar functional groups that many hydropharm
Time taken: 317.15400218963623




Model Response: High
Time taken: 445.0781321525574




Model Response: High

The compound you provided has a strong interactions with high binding affinity and interesting properties that has a strong interactions with high binding affinity and interesting chemistry and interesting chemical properties that has a strong interactions with high binding affinity and interesting properties that has a strong interactions with high binding affinity and interesting properties that has a strong interactions with high binding affinity and interesting properties that has a strong interactions with high binding affinity and interesting properties that has a strong interactions with high binding affinity and interesting chemistry and interesting chemical properties that has a strong interactions with high binding affinity and interesting properties that has a strong interactions with high
Time taken: 358.4790961742401




Model Response: High]  ]  Medium]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  Medium]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]  ]
Time taken: 410.3279173374176




Model Response: High
Time taken: 308.883802652359




Model Response: High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High
High

Time taken: 452.21503043174744
Model Response: High

The compound you provided has a high binding affinity for the protein sequence. The molecule's shape and size are compatible with the active site of the protein, and its functional groups are well-positioned to interact with key residues on the protein surface. These factors contribute to a strong binding affinity.
Time taken: 246.44161677360535


