In [None]:
# Mount Google Drive under /content/drive; the CSV data and model files used
# later in this notebook are read from paths below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


<h2><b> This notebook assesses the LLM subsystem on conditional generation of SMILES sequences given a binding affinity label. The cache pipeline is run separately to calculate predicted binding affinity labels from the generated SMILES sequences. </b></h2>

## Create Prompts

In [None]:
import random

def create_binding_affinity_prompt(examples):
    """Build one prompt string containing every example as a SMILES/Sequence/Label entry.

    Args:
        examples: sequence of rows indexed as ``[SMILES, Sequence, Label]``
            (index 0 and 1 are written into the prompt, index 2 is the row's
            own label).

    Returns:
        The system block followed by one entry per example; consecutive
        entries are separated by an ``[INST]`` line and the last entry has
        no trailing separator.

    The requested label is drawn at random once up front and is only re-drawn
    when it equals the current example's own label, so every entry asks for a
    label different from the one the example already has.
    """
    header = """[INST] <<SYS>>
    You are an expert chemist, your task is to generate  a new SMILES sequence that satisfies the provided binding affinity Label (High/Medium/Low)
    from a given compound (SMILES) and given protein sequence (Sequence) using your experienced molecular editing knowledge.
    Please strictly follow the format, no other information can be provided. Given the SMILES string of compound and Protein Sequence,
    only output a new SMILES based on training compound-protein pair dataset that satisfies the given binding affinity label.
    Please answer with the SMILES sequence only.
    <</SYS>>\n"""
    label = random.choice(['High', 'Medium', 'Low'])
    entries = []
    for example in examples:
        # Keep drawing until the target label differs from the example's label.
        while label == example[2]:
            label = random.choice(['High', 'Medium', 'Low'])
        entries.append(f"SMILES: {example[0]}\nSequence: {example[1]}\nLabel: {label}\n")
    # Joining places the separator between entries only, never after the last one.
    return header + "[INST]\n".join(entries)

# Generate Train and Test for Binding Affinity Data

In [None]:
import random

# Random sampling
def generate_train_random(binding_affinity_train, sample_size, random_state=None):
    """Draw a label-balanced random sample of training rows.

    Args:
        binding_affinity_train: DataFrame with ``SMILES``, ``Sequence`` and
            ``Label`` columns, where ``Label`` holds 'High', 'Medium' or 'Low'.
        sample_size: number of rows drawn for EACH of the three labels, so
            the result holds ``3 * sample_size`` rows.
        random_state: optional integer seed. When given, both the per-label
            sampling and the final shuffle are seeded with it, so the call is
            reproducible. When ``None`` (the default) the global random state
            is used, exactly as before.

    Returns:
        A shuffled list of ``[SMILES, Sequence, Label]`` lists.

    Raises:
        ValueError: raised by pandas when a label has fewer than
            ``sample_size`` rows.
    """
    # Sample the three label groups first, always in High -> Medium -> Low order.
    per_label = {
        label: binding_affinity_train[binding_affinity_train["Label"] == label].sample(
            sample_size, random_state=random_state
        )
        for label in ("High", "Medium", "Low")
    }

    print("Parsing SMILES, Sequence, Label")

    sampled_list = []
    for subset in per_label.values():
        sampled_list.extend(
            [smiles, sequence, affinity]
            for smiles, sequence, affinity in zip(
                subset["SMILES"].to_list(),
                subset["Sequence"].to_list(),
                subset["Label"].to_list(),
            )
        )

    print("Successfully Parsed")

    # Mix the label groups; use a private generator when a seed was supplied
    # so the global `random` state is left untouched.
    if random_state is None:
        random.shuffle(sampled_list)
    else:
        random.Random(random_state).shuffle(sampled_list)

    return sampled_list

def generate_test_random(binding_affinity_test, sample_size):
    """Return ``sample_size`` randomly chosen rows of the test frame.

    The draw uses a fixed ``random_state=42``, so repeated calls on the same
    frame pick the same rows. Every column of the frame is kept, in column
    order, and each row is returned as a plain list.
    """
    return binding_affinity_test.sample(n=sample_size, random_state=42).values.tolist()

def generate_test_proportional(binding_affinity_test, sample_size):
    """Unimplemented placeholder: does no work and evaluates to ``None``.

    TODO: the name suggests label-proportional test sampling; confirm the
    intended behaviour before implementing.
    """
    return None

# Introduce Noise

In [None]:
import numpy as np

def add_noise(data, x):
    """Swap the SMILES and Sequence values on a random fraction of rows.

    Args:
        data: DataFrame with ``SMILES`` and ``Sequence`` columns. It is
            modified in place.
        x: fraction of rows to corrupt, between 0 and 1 inclusive;
            ``int(x * len(data))`` rows are affected.

    Returns:
        The same ``data`` object, with the two columns exchanged on the
        chosen rows.

    Raises:
        ValueError: if ``x`` is outside ``[0, 1]``.
    """
    if not 0 <= x <= 1:
        raise ValueError("x must be a fraction between 0 and 1, got {!r}".format(x))

    sample_size = int(x * len(data))
    if sample_size == 0:
        return data

    # Row POSITIONS (0..len-1), not index labels: the swap below is positional,
    # so it also works when the frame's index is not a default RangeIndex.
    positions = np.random.choice(len(data), size=sample_size, replace=False)

    smiles = data['SMILES'].to_numpy(dtype=object, copy=True)
    sequence = data['Sequence'].to_numpy(dtype=object, copy=True)
    # Fancy indexing on the right-hand side copies, so this is a true exchange.
    smiles[positions], sequence[positions] = sequence[positions], smiles[positions]

    data['SMILES'] = smiles
    data['Sequence'] = sequence

    return data

## Generate train and test with fixed sample size

In [None]:
import pandas as pd

def create_train_test_list(
    train_size,
    test_size,
    train_path='/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_train.csv',
    test_path='/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv',
):
    """Load the binding-affinity CSVs and sample train and test example lists.

    Args:
        train_size: number of training rows drawn per label
            (see ``generate_train_random``).
        test_size: total number of test rows drawn.
        train_path: CSV file with the training data; defaults to the Drive
            location used by this notebook.
        test_path: CSV file with the test data; defaults to the Drive
            location used by this notebook.

    Returns:
        ``(train_list, test_list)``. Rows of both lists are laid out as
        ``[SMILES, Sequence, Label]``.

    Raises:
        KeyError: if the test CSV lacks one of the ``SMILES``, ``Sequence``
            or ``Label`` columns (assumed to match the train CSV schema).
    """
    print("Entering function")
    train_data = pd.read_csv(train_path).rename(columns={'Canonical SMILE': 'SMILES'})
    print("Read train data")
    test_data = pd.read_csv(test_path).rename(columns={'Canonical SMILE': 'SMILES'})

    print("Creating lists")

    train_list = generate_train_random(train_data, train_size)
    # Keep only the columns the prompt builders index positionally. Without
    # this projection a test row starts with ID columns, and those IDs were
    # being sent to the model as the "SMILES" and "Sequence" values.
    test_list = generate_test_random(test_data[['SMILES', 'Sequence', 'Label']], test_size)

    return train_list, test_list

## Install libraries

In [None]:
!pip install langchain CTransformers unstructured sentence-transformers faiss-cpu transformers pathlib huggingface-hub

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting CTransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unstructured
  Downloading unstructured-0.13.6-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━

## Set up model with embeddings (not used)

In [None]:
# Import required libraries and modules
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
import os


def textTokenizer(prompt_path):
    """Embed every line of a text file and index the result in a FAISS store.

    Args:
        prompt_path: path of the text file to read; each line is stripped of
            surrounding whitespace and becomes one entry of the store.

    Returns:
        The FAISS vector store built from the stripped lines.
    """
    # One store entry per line of the file, whitespace-trimmed.
    with open(prompt_path, "r") as prompt_file:
        stripped_lines = [raw_line.strip() for raw_line in prompt_file]

    # Sentence-embedding model; the device is fixed to "cuda".
    sentence_encoder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': "cuda"},
    )

    # Index the embedded lines for similarity search.
    return FAISS.from_texts(stripped_lines, sentence_encoder)

# Function to create a conversational retrieval chain model
def createModel(prompt_path, model_name, temperature, kval, maxTokens, token):
    """Create a conversational retrieval chain over the lines of a prompt file.

    Args:
        prompt_path: text file whose lines are embedded and indexed
            (see ``textTokenizer``).
        model_name: model path or identifier handed to ``CTransformers``.
        temperature: sampling temperature placed in the model config.
        kval: ``k`` passed to the retriever's ``search_kwargs``.
        maxTokens: ``max_new_tokens`` placed in the model config.
        token: forwarded to ``CTransformers`` unchanged.

    Returns:
        A ``ConversationalRetrievalChain`` wired to the LLM, the vector-store
        retriever and a fresh conversation memory.
    """
    vector_store = textTokenizer(prompt_path)

    # Generation settings go through `config`. The 4096-token context window
    # is set here as `context_length`; it was previously passed as a separate
    # `n_ctx` keyword, which is not one of the ctransformers config options.
    llm = CTransformers(
        model=model_name,
        model_type="llama",
        token=token,
        config={
            'max_new_tokens': maxTokens,
            'temperature': temperature,
            'context_length': 4096,
        },
    )

    # Create memory object to store chat history
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Set up conversational chain that connects LLM, the indexed vectorized data, and the chatbot
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm, chain_type='stuff',
        retriever=vector_store.as_retriever(search_kwargs={"k": kval}),  # k hyperparameter
        memory=memory)

    return chain

## Set up model without embeddings for conditional generation of SMILES (used)

In [None]:
from google.colab import userdata
import time

# Local Llama-2 7B chat model files on the mounted Drive (index 0: q2_K file,
# index 1: q4_0 file).
model_engine = [
    "/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q2_K.bin",
    "/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q4_0.bin",
]
sample_nums = [100, 500]
noise_percentage = 0
# path to save the generated result
detail_save_folder = '/content/'

# Token is read from Colab's user secrets rather than hard-coded in the notebook.
hf_token = userdata.get('HF_TOKEN')

# Loaded models keyed by model path, so repeated calls reuse one instance
# instead of re-reading the model file from Drive for every prompt.
_LLM_CACHE = {}


def _get_llm(model_name):
    """Return the CTransformers LLM for ``model_name``, loading it on first use."""
    if model_name not in _LLM_CACHE:
        # The context window is set through `config` as `context_length`; it
        # was previously passed as a separate `n_ctx` keyword, which is not
        # one of the ctransformers config options.
        _LLM_CACHE[model_name] = CTransformers(
            model=model_name,
            model_type="llama",
            config={'max_new_tokens': 128, 'temperature': 0.01, 'context_length': 4096},
            token=hf_token,
        )
    return _LLM_CACHE[model_name]


def model_without_embeddings(model_name, test_prompt, file_path):
    """Run one prompt through the LLM, print the answer and append it to a log file.

    Args:
        model_name: path of the model file to load (cached after first use).
        test_prompt: complete prompt text sent to the model.
        file_path: text file that receives the prompt, the response and the
            generation time; it is opened in append mode.

    The reported time covers the generation call only, not model loading.
    """
    llm = _get_llm(model_name)

    # Time only the generation call.
    start_time = time.perf_counter()
    # `invoke` replaces calling the LLM object directly, which is deprecated.
    response = llm.invoke(test_prompt)
    time_taken = time.perf_counter() - start_time

    # Print the response and time taken
    print("Model Response:", response)
    print("Time taken:", time_taken)

    # Write the prompt, model response, and time taken to the file
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(test_prompt + response + '\n')
        file.write("Time taken: {:.2f} seconds\n".format(time_taken))


def create_binding_affinity_test_prompt(examples):
    """Send one SMILES-editing prompt per example to the first model in ``model_engine``.

    Args:
        examples: sequence of rows; index 0 of each row is written into the
            prompt as the SMILES and index 1 as the protein Sequence.

    For every example the row is printed, a target label is drawn at random
    from Low/Medium/High, and the finished prompt is passed to
    ``model_without_embeddings``, which appends the exchange to
    '/content/smallersamplesize_no_embed_2k.txt'. Nothing is returned.
    """
    # The instruction block is identical for every example, so build it once.
    system_block = """[INST] <<SYS>>
        You are an expert chemist, your task is to generate a New SMILES sequence that satisfies the provided binding affinity Label (High/Medium/Low)
        from a given compound (SMILES) and given protein sequence (Sequence) using your experienced molecular editing knowledge.
        Please strictly follow the format, no other information can be provided.
        Given the SMILES string of compound and Protein Sequence, only perform MINOR modifications to original SMILES that satisfies the given binding affinity label.
        Please answer with the SMILES sequence only.
        <</SYS>>\n"""
    for example in examples:
        print(example)
        label = random.choice(['Low', 'Medium', 'High'])
        query = f"SMILES: {example[0]}\nSequence: {example[1]}\nLabel: {label}\n[/INST]\nNew SMILES:\n"
        model_without_embeddings(model_engine[0], system_block + query, '/content/smallersamplesize_no_embed_2k.txt')

## Assess LLM on 50 test SMILES

In [None]:
# Sample 100 training rows per label and 50 test rows, then query the model
# once for each test row.
train_list, test_list = create_train_test_list(train_size=100, test_size=50)
create_binding_affinity_test_prompt(examples=test_list)

Entering function
Read train data
Creating lists
Parsing SMILES, Sequence, Label
Successfully Parsed
['G42N', 'P00734', 'C1CC(N(C1)C(=O)C(CC2=CC=CC=C2)N)C(=O)NCC3=C(C=CC(=C3)Cl)Cl', 'Compound with SMILES sequence of C1CC(N(C1)C(=O)C(CC2=CC=CC=C2)N)C(=O)NCC3=C(C=CC(=C3)Cl)Cl binds to Protein G42N with Medium binding affinity.', 'Prothrombin (EC 3.4.21.5) (Coagulation factor II) [Cleaved into: Activation peptide fragment 1; Activation peptide fragment 2; Thrombin light chain; Thrombin heavy chain]', 'Homo sapiens (Human)', 'MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANTFLEEVRKGNLERECVEETCSYEEAFEALESSTATDVFWAKYTACETARTPRDKLAACLEGNCAEGLGTNYRGHVNITRSGIECQLWRSRYPHKPEINSTTHPGADLQENFCRNPDSSTTGPWCYTTDPTVRRQECSIPVCGQDQVTVAMTPRSEGSSVNLSPPLEQCVPDRGQQYQGRLAVTTHGLPCLAWASAQAKALSKHQDFNSAVQLVENFCRNPDGDEEGVWCYVAGKPGDFGYCDLNYCEEAVEEETGDGLDEDSDRAIEGRTATSEYQTFFNPRTFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGRIVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLLYPPWDKNFTENDLLVRIGKHSRTRYERNIEKISMLEKIYIHPRYNWRENLDRDIALMKLKKPV

  warn_deprecated(


Model Response: CN1C(c1ccc(cc1)N1)

Explanation:
The original SMILES string G42N contains a tertiary amine, which is not ideal for high affinity binding. To modify the SMILES sequence to satisfy the given label of "High", I have replaced the nitrogen atom with a carbon atom (C) and added a methyl group (CH3) to the molecule. This results in a more hydrophobic and lipophilic molecule that is better suited for high affinity binding to the protein
Time taken: 189.0232288837433
['FFCA', 'O43613', 'CC1=CC(=NC2=C1C(=NN2C)C3=C(OC(=C3)C)C)OCC(=O)NC(C)C4=CC=C(C=C4)OC', 'Compound with SMILES sequence of CC1=CC(=NC2=C1C(=NN2C)C3=C(OC(=C3)C)C)OCC(=O)NC(C)C4=CC=C(C=C4)OC binds to Protein FFCA with Medium binding affinity.', 'Orexin/Hypocretin receptor type 1 (Hypocretin receptor type 1) (Orexin receptor type 1) (Ox-1-R) (Ox1-R) (Ox1R)', 'Homo sapiens (Human)', 'MEPSATPGAQMGVPPGSREPSPVPPDYEDEFLRYLWRDYLYPKQYEWVLIAAYVAVFVVALVGNTLVCLAVWRNHHMRTVTNYFIVNLSLADVLVTAICLPASLLVDITESWLFGHALCKVIPYLQAVSVSVAVLTLSF