In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this notebook
# live under /content/drive/MyDrive. `force_remount=True` is passed so the mount is
# redone even when the cell is re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


<h2><b> Fine-tuning of LLM using QLoRA and assessing binding affinity label prediction </b></h2>

Ref: https://colab.research.google.com/drive/1GGIzC3QdQmCpRUZlYusTvcomEcQYruri#scrollTo=nAMzy_0FtaUZ



## Install libraries

In [None]:
# Install (quietly, -q) and upgrade (-U) the libraries used for QLoRA fine-tuning.
# NOTE(review): versions are unpinned, so a later run may resolve different releases —
# consider pinning them for reproducibility.
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes wandb SmilesPE CTransformers langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [None]:
from google.colab import userdata

# Defined in the secrets tab in Google Colab
# Hugging Face access token; it is passed as `token=` when pushing to the Hub
# near the end of the notebook.
hf_token = userdata.get('HF_TOKEN')

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
import pandas as pd

# Model
# base_model = "/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q2_K.bin"

# Dataset: read each CSV split and normalise the SMILES column name in one chain.
train_dataset = (
    pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_train.csv')
    .rename(columns={'Canonical SMILE': 'SMILES'})
)
test_dataset = (
    pd.read_csv('/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv')
    .rename(columns={'Canonical SMILE': 'SMILES'})
)

# Set up fine-tuning configuration

In [None]:
from transformers import AutoModel

# Model
# `base_model` is the Hub id of the checkpoint that gets quantized and fine-tuned.
# NOTE(review): `new_model` is reassigned to "huggingface-test" in the upload cell
# and is not read anywhere before that, so this value is effectively unused.
base_model = "NousResearch/Llama-2-7b-hf"
new_model = "llama-2-7b-miniplatypus"


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
# Pad with the unknown token (presumably because the base tokenizer defines no
# pad token of its own — confirm) and pad on the right-hand side of each sequence.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

Downloading readme:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Quantization configuration: load the weights in 4-bit NF4 with double
# quantization, and run the compute in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA configuration: rank-16 adapters (alpha 32, dropout 0.05, no bias terms)
# attached to the attention projections (q/k/v/o) and the MLP projections
# (gate/up/down) of the causal LM.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load base model, quantized, with every module placed on device 0.
# `do_sample` is a text-generation option rather than a loading option, so it is
# not passed here; sampling is requested explicitly where the inference pipeline
# is built.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



# Update Vocab List of BERT Model with SMILESPE

In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# code inside it executes; the notebook only echoes the string back as output.
# NOTE(review): inside the string, `add_tokens` receives the vocabulary file *path*
# (`vocab_file_path`) rather than tokens read from that file — confirm the intent
# before re-enabling this code.
''' from transformers import BertTokenizerFast, BertModel
checkpoint = 'unikei/bert-base-smiles'
bert_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
bert_model = BertModel.from_pretrained(checkpoint)

# Initialize empty lists to store encoded inputs
input_ids_list = []
attention_mask_list = []

vocab_file_path = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/SPE_ChEMBL.txt'

def update_tokens(tokeniz, modell, vocab_file_path):
  # Update BERT tokens
  tokeniz.add_tokens(vocab_file_path)

  # Save the updated tokenizer
  tokeniz.save_pretrained(checkpoint)

  # If you modified the vocabulary file, make sure to load it in the model
  modell.resize_token_embeddings(len(tokeniz))

  # Save updated model
  modell.save_pretrained(checkpoint)

  return tokeniz, modell

bert_tokenizer, bert_model = update_tokens(bert_tokenizer, bert_model, vocab_file_path) '''

" from transformers import BertTokenizerFast, BertModel\ncheckpoint = 'unikei/bert-base-smiles'\nbert_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)\nbert_model = BertModel.from_pretrained(checkpoint)\n\n# Initialize empty lists to store encoded inputs\ninput_ids_list = []\nattention_mask_list = []\n\nvocab_file_path = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/SPE_ChEMBL.txt'\n\ndef update_tokens(tokeniz, modell, vocab_file_path):\n  # Update BERT tokens\n  tokeniz.add_tokens(vocab_file_path)\n\n  # Save the updated tokenizer\n  tokeniz.save_pretrained(checkpoint)\n\n  # If you modified the vocabulary file, make sure to load it in the model\n  modell.resize_token_embeddings(len(tokeniz))\n\n  # Save updated model\n  modell.save_pretrained(checkpoint)\n\n  return tokeniz, modell\n\nbert_tokenizer, bert_model = update_tokens(bert_tokenizer, bert_model, vocab_file_path) "

# Tokenize the protein sequence and SMILES (using BERT) (Future Development)

In [None]:
# Disabled experiment ("Future Development"): the whole cell is one bare string
# literal, so `fine_tuned_df` is never actually defined or run.
# NOTE(review): the code inside the string reads train_dataset['Canonical SMILE'],
# but the dataset-loading cell renames that column to 'SMILES'; it also relies on
# `bert_tokenizer`, which is only created inside the other disabled cell.
''' from sklearn.preprocessing import LabelEncoder
import torch
import pandas as pd
from transformers import BertTokenizer

def fine_tuned_df():
    # Initialize the BERT tokenizer for character-level tokenization
    protein_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True, tokenization="char")

    # Extract specific columns for tokenization
    smiles_column = train_dataset['Canonical SMILE']
    sequence_column = train_dataset["Sequence"]
    label_column = train_dataset["Label"]

    # Calculate maximum length to allow padding
    max_smiles_length = smiles_column.str.len().max()
    max_sequence_length = sequence_column.str.len().max()
    print("Max SMILES Length: ", max_smiles_length)
    print("Max Sequence Length: ", max_sequence_length)

    # Encode the label column
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(train_dataset["Label"])

    # Initialize lists to store tokenized inputs
    input_ids_list = []
    attention_mask_list = []

    # Define the maximum length for the concatenated input
    max_length_smiles = max_smiles_length + 1  # Verify if model size is an issue
    max_length_proteins = max_sequence_length + 1

    # Iterate over each SMILES string and combine with other columns
    for smiles, sequence, label_encoded in zip(smiles_column, sequence_column, labels_encoded):
        # Tokenize the SMILES string
        tokenized_smiles = bert_tokenizer(smiles, padding='max_length', max_length=max_length_smiles, truncation=True, return_tensors="pt")

        # Tokenize the sequence of amino acids at character level
        tokenized_sequence = protein_tokenizer(sequence, padding='max_length', max_length=max_length_proteins, truncation=True, return_tensors="pt")

        # Encode the label
        label_tensor = torch.tensor(label_encoded).unsqueeze(0)  # Convert label to tensor

        # Concatenate tokenized SMILES, sequence, and label
        # Explore inserting wildcards to align finetuning and inference
        input_ids = torch.cat((tokenized_smiles["input_ids"], tokenized_sequence["input_ids"], label_tensor.unsqueeze(0)), dim=1)

        # Create attention mask with the same shape as input_ids
        ones_mask = torch.ones_like(input_ids, dtype=torch.long)

        # Append tokenized inputs to lists
        input_ids_list.append(input_ids)
        attention_mask_list.append(ones_mask)

    # Stack the tokenized inputs into tensors
    input_ids = torch.stack(input_ids_list)
    attention_mask = torch.stack(attention_mask_list)
    index_list = [i for i in range(len(input_ids_list))]

    # Define the fine-tuning task using the entire dataset
    task_dataset = {"input_ids": input_ids_list, "attention_mask": attention_mask_list}
    task_df = pd.DataFrame(task_dataset)

    return task_dataset, task_df

 '''

' from sklearn.preprocessing import LabelEncoder\nimport torch\nimport pandas as pd\nfrom transformers import BertTokenizer\n\ndef fine_tuned_df():\n    # Initialize the BERT tokenizer for character-level tokenization\n    protein_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True, tokenization="char")\n\n    # Extract specific columns for tokenization\n    smiles_column = train_dataset[\'Canonical SMILE\']\n    sequence_column = train_dataset["Sequence"]\n    label_column = train_dataset["Label"]\n\n    # Calculate maximum length to allow padding\n    max_smiles_length = smiles_column.str.len().max()\n    max_sequence_length = sequence_column.str.len().max()\n    print("Max SMILES Length: ", max_smiles_length)\n    print("Max Sequence Length: ", max_sequence_length)\n\n    # Encode the label column\n    label_encoder = LabelEncoder()\n    labels_encoded = label_encoder.fit_transform(train_dataset["Label"])\n\n    # Initialize lists to store tokenized inp

# Preparing the fine-tuning and testing dataset

## Data Preparation Function (deprecated)

In [None]:
from datasets import Dataset
import pandas as pd
import random

# Model
# base_model = "/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q2_K.bin"

def data_preparation(sample_size):
    """(Deprecated) Sample both CSV splits and attach a sentence-style description.

    Args:
        sample_size: number of rows drawn from the train CSV and, separately,
            from the test CSV.

    Returns:
        A ``(train_dataset, test_dataset)`` pair of ``datasets.Dataset`` objects,
        each carrying an added ``Textual_Description`` column.
    """
    seed = 42
    # Kept for callers that rely on the global `random` state being seeded.
    random.seed(seed)

    data_dir = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data'

    def _load_split(file_name):
        """Read one CSV split, sample it reproducibly and add the description column."""
        frame = pd.read_csv(f"{data_dir}/{file_name}")
        # pandas draws `.sample()` rows from NumPy's RNG, so seeding `random`
        # alone does not make the sample repeatable; pass the seed explicitly.
        frame = frame.sample(sample_size, random_state=seed)
        # 'SELFIES' is dropped only when present: the CSV header displayed further
        # down in the notebook has no such column, so an unconditional drop raises.
        frame = (
            frame
            .rename(columns={'Canonical SMILE': 'SMILES'})
            .drop(columns=['SELFIES'], errors='ignore')
        )
        # Adjacent f-strings instead of a backslash continuation, which used to
        # embed the next line's indentation into the sentence.
        frame['Textual_Description'] = frame.apply(
            lambda row: (
                f"Compound with SMILES sequence of {row['SMILES']} binds to Protein "
                f"{row['Sequence']} with {row['Label']} binding affinity."
            ),
            axis=1,
        )
        # Convert to datasets.Dataset object from dataframe
        return Dataset.from_pandas(frame)

    train_dataset = _load_split('binding_affinity_train.csv')
    test_dataset = _load_split('binding_affinity_test.csv')

    return train_dataset, test_dataset

## Data Split Function (current)

In [None]:
import numpy as np
import pandas as pd
import random
from datasets import Dataset

# Helper functions used to build the fine-tuning text (the get_uniprot_id helper once planned here is not defined in this notebook)

def split_obtain_first(name):
    """Return the part of `name` before its first comma (all of it if there is none)."""
    head, _separator, _remainder = name.partition(",")
    return head

def data_process_parts(dataset):
    """Turn a raw binding-affinity frame into five text-only training parts.

    Each row becomes one instruction-formatted string built from its SMILES,
    its first listed protein name, its organism and its label. The rows are then
    cut, in order, into five consecutive parts whose sizes differ by at most one
    (the earlier parts take the extra rows). The input frame is not modified.

    Args:
        dataset: DataFrame with the columns 'Canonical SMILE' (or 'SMILES'),
            'Sequence', 'Protein names', 'Organism' and 'Label'.

    Returns:
        A 5-tuple of single-column ('Textual_Description') DataFrames.
    """
    n_parts = 5

    # rename/drop return new frames, so the caller's DataFrame stays untouched.
    frame = dataset.rename(columns={'Canonical SMILE': 'SMILES'}).drop(columns='Sequence')
    # Keep only the first of the comma-separated protein names.
    frame['Protein names'] = frame['Protein names'].str.split(",").str[0]
    # Built column-wise: unlike a row-wise `apply`, this also works for zero rows.
    frame['Textual_Description'] = [
        f"<s> [INST] SMILES: {smiles}, Protein Name: {protein}, Protein Organism: {organism}, [/INST] Output Label:{label} </s>"
        for smiles, protein, organism, label in zip(
            frame['SMILES'], frame['Protein names'], frame['Organism'], frame['Label']
        )
    ]
    descriptions = frame[['Textual_Description']]

    # Slice positionally rather than calling np.array_split on a DataFrame, which
    # goes through the deprecated DataFrame.swapaxes. The first `extra` parts get
    # one additional row, which is the same partition np.array_split produces.
    base_size, extra = divmod(len(descriptions), n_parts)
    parts = []
    start = 0
    for part_index in range(n_parts):
        stop = start + base_size + (1 if part_index < extra else 0)
        parts.append(descriptions.iloc[start:stop])
        start = stop

    return tuple(parts)

def data_split(sample_size, seed=42):
    """Sample both CSV splits and cut each sample into five text-only parts.

    Args:
        sample_size: number of rows drawn from the train CSV and, separately,
            from the test CSV.
        seed: seed used for the row sampling (default 42).

    Returns:
        ``(list_of_parts, train_dataset, test_dataset)``. ``list_of_parts`` holds
        ten ``datasets.Dataset`` objects — the train parts at indices 0-4 and the
        test parts at indices 5-9. The two DataFrames are the sampled rows
        exactly as read from the CSV files.
    """
    # Kept for callers that rely on the global `random` state being seeded.
    random.seed(seed)

    # pandas draws `.sample()` rows from NumPy's RNG, so `random.seed` alone does
    # not make the sampling repeatable; the seed is passed as `random_state`.
    # Read the training dataset
    train_dataset = pd.read_csv(
        '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_train.csv'
    ).sample(sample_size, random_state=seed)

    # Read the test dataset
    test_dataset = pd.read_csv(
        '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv'
    ).sample(sample_size, random_state=seed)

    # Train parts first, then test parts, each converted to a datasets.Dataset.
    part_frames = [*data_process_parts(train_dataset), *data_process_parts(test_dataset)]
    list_of_parts = [Dataset.from_pandas(part_frame) for part_frame in part_frames]

    return list_of_parts, train_dataset, test_dataset


In [None]:
# Sample 5 rows per split. `list_of_parts` holds the five train parts followed by
# the five test parts; `train_dataset` / `test_dataset` are the sampled DataFrames.
list_of_parts, train_dataset, test_dataset = data_split(5)
train_dataset.head(5)

Unnamed: 0,DeepAffinity Protein ID,Uniprot ID,Canonical SMILE,Textual Description,Protein names,Organism,Sequence,Label
9631,8Y2J,Q96D53,CC1=C(C=C(C=C1)O)NC2=NC(=NC=C2)NC3=CC=CC(=C3)C...,Compound with SMILES sequence of CC1=C(C=C(C=C...,"Atypical kinase COQ8B, mitochondrial (EC 2.7.-...",Homo sapiens (Human),MWLKVGGLLRGTGGQLGQTVGWPCGALGPGPHRWGPCGGSWAQKFY...,Medium
2508,988A,Q05097,COC1C(C(C(C(O1)CO)O)O)O,Compound with SMILES sequence of COC1C(C(C(C(O...,PA-I galactophilic lectin (PA-IL) (Galactose-b...,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,MAWKGEVLANNEAGQVTSIIYNPGDVITIVAAGWASYGPTQKWGPQ...,Low
7759,WJNG,P43166,C1=CC(=CC=C1NC2=C(C(=NC=N2)Cl)[N+](=O)[O-])S(=...,Compound with SMILES sequence of C1=CC(=CC=C1N...,Carbonic anhydrase 7 (EC 4.2.1.1) (Carbonate d...,Homo sapiens (Human),MTGHHGWGYGQDDGPSHWHKLYPIAQGDRQSPINIISSQAVYSPSL...,Medium
12405,7IXG,P52333,CN(C1CN=C(NC1=O)NC(=O)N)C(=O)CC(CCCN)N.Cl.Cl,Compound with SMILES sequence of CN(C1CN=C(NC1...,Tyrosine-protein kinase JAK3 (EC 2.7.10.2) (Ja...,Homo sapiens (Human),MAPPSEETPLIPQRSCSLLSTEAGALHVLLPARGPGPPQRLSFSFG...,Medium
1925,XGLW,O43614,CC1=C(C(=NN1C)C)C(C)NC(=O)COC2=NC3=C(C(=C2)C(F...,Compound with SMILES sequence of CC1=C(C(=NN1C...,Orexin receptor type 2 (Ox-2-R) (Ox2-R) (Ox2R)...,Homo sapiens (Human),MSGTKLEDSPPCRNWSSASELNETQEPFLNPTDYDDEEFLRYLWRE...,Medium


In [None]:
# The CSV ships its own 'Textual Description' column; rename it so it matches the
# "Textual_Description" field name the trainer is configured to read.
test_dataset.rename(columns={'Textual Description': 'Textual_Description'}, inplace=True)

In [None]:
# Convert the sampled test frame to a datasets.Dataset; it is used as the
# evaluation set in the final fine-tuning round.
# NOTE(review): this rebinds `test_dataset` from a DataFrame to a Dataset, so the
# cell is not meant to be run twice in a row.
test_dataset = Dataset.from_pandas(test_dataset)
# (train part index, test part index) pairs into `list_of_parts`:
# train parts sit at 0-4 and test parts at 5-9.
index_pairs = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9)]

## Generate prompt for the fine-tuned LLM

In [None]:
# Build the prompt that the text-generation pipeline will receive for the fine-tuned model
def prompt_generation(test_csv_path='/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/binding_affinity_test.csv'):
    """Build an inference prompt from one randomly chosen row of the test CSV.

    The prompt is the fixed task instructions followed, on a new line, by the
    row's SMILES, protein name(s) and organism.

    Args:
        test_csv_path: CSV file to draw the example from; defaults to the test
            split on Google Drive.

    Returns:
        The complete prompt string.
    """
    test_dataset = pd.read_csv(test_csv_path)
    test_dataset = test_dataset.rename(columns={'Canonical SMILE': 'SMILES'})

    # Only one example is needed, so sample first and format just that row.
    # (A 'Protein_ID' column used to be derived here through a helper that is not
    # defined in this notebook; the column was never read, so it is not built.)
    example = test_dataset.sample(1).iloc[0]

    prompt = """You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES), given protein name (Protein Name), given Protein Organism (Protein Organism) using your experienced chemical property prediction knowledge.
    Please strictly follow the format, no other information can be provided. Please answer with one word: High, Medium, Low corresponding to binding affinity label."""

    # Format the row's values directly. Rendering the column with
    # Series.to_string() cuts each value at pandas' display.max_colwidth and
    # appends "...", which silently removed the protein name and organism.
    instruction = f"SMILES: {example['SMILES']}, Protein Name: {example['Protein names']}, Protein Organism: {example['Organism']}?"
    # Collapse runs of whitespace, as the previous split/join step did.
    instruction = ' '.join(instruction.split())

    # TODO: few-shot prompting (prepending labelled examples) was prototyped here
    # and is currently disabled.

    # Keep the example on its own line so it does not run into the instructions.
    return prompt + "\n" + instruction

## Perform fine-tuning in dataset batches

In [None]:
from transformers import TrainingArguments, Trainer

# Same comparison for default and fine-tuned model
    # Save your test batch somewhere (as separate files)
    # Explore eval_dataset = entire test dataset for fine-tuning
# In first 4 iterations, set train and validation set to train batch 1,2,3,4
# In last iteration, set train and validation set to train batch and overall test set

# Set training arguments (shared by every fine-tuning round)
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    evaluation_strategy="steps",
    eval_steps=5,
    logging_steps=25,
    save_steps=25,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    warmup_ratio=0.03,
)

trainer = None
final_round = len(index_pairs) - 1

# One fine-tuning round per (train part, test part) pair.
for round_index, (part1_index, part2_index) in enumerate(index_pairs):
    train_batch = list_of_parts[part1_index]
    # Held-out part paired with this round; kept for inspection, the trainer does
    # not consume it.
    test_batch = list_of_parts[part2_index]

    # Every round but the last validates on its own training part; the final
    # round validates on the sampled test set.
    eval_batch = test_dataset if round_index == final_round else train_batch

    # Attach the LoRA adapters exactly once. In later rounds the already-adapted
    # model from the previous trainer is reused and no peft_config is passed.
    # NOTE(review): handing `peft_config` to SFTTrainer again would re-apply it to
    # the base model that already carries the adapters, instead of continuing to
    # train the existing ones — confirm against the installed trl/peft versions.
    first_round = trainer is None

    # Set supervised fine-tuning parameters
    trainer = SFTTrainer(
        model=model if first_round else trainer.model,
        train_dataset=train_batch,
        eval_dataset=eval_batch,
        peft_config=peft_config if first_round else None,
        dataset_text_field="Textual_Description",
        max_seq_length=1024,
        args=training_arguments,
        tokenizer=tokenizer,
        dataset_batch_size=10,
    )

    # Train model
    trainer.train()

    # Save after every round so the latest adapters are always on disk.
    trainer.model.save_pretrained('huggingface-test')

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


<b> It is very important that we delete the model, pipe and trainer and then load the fine-tuned model separately: because of memory limitations, only one model can be loaded at a time. </b>

In [None]:
import gc

# Drop the notebook's references to the large objects. Each name gets its own
# try block so that a name which was never defined (e.g. `pipe` on a first run)
# does not stop the others from being released. Only NameError is expected here;
# anything else should surface instead of being swallowed.
try:
    del model
except NameError as e:
    print("Failed to delete 'model':", e)

try:
    del pipe
except NameError as e:
    print("Failed to delete 'pipe':", e)

try:
    del trainer
except NameError as e:
    print("Failed to delete 'trainer':", e)

# Collect the now-unreferenced Python objects first ...
gc.collect()

# ... then hand PyTorch's cached, unoccupied GPU memory back so that the next
# model load can use it.
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Failed to delete 'pipe': name 'pipe' is not defined


412

In [None]:
# Build one prompt from a randomly sampled row of the test CSV.
prompt = prompt_generation()
# Set after the prompt has been built, so it only changes how pandas objects are
# displayed from this point on.
pd.set_option('display.max_colwidth', None)  # Display full content of columns without truncation
prompt

'You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES), given protein name (Protein Name), given Protein Organism (Protein Organism) using your experienced chemical property prediction knowledge. Please strictly follow the format, no other information can be provided. Please answer with one word: High, Medium, Low corresponding to binding affinity label.SMILES: C1CC(N(C1)C(=O)C(CC2=CC=CC=C2)N)C(=O)N...'

## Load fine-tuned model and perform assessment

In [None]:
# Text-generation pipeline over 'huggingface-test', the local directory that the
# fine-tuning loop writes with save_pretrained(). Sampling is enabled with
# temperature 0.5.
# NOTE(review): the prompt asks for a one-word answer while max_length=1024 lets
# generation run long (see the rambling output below) — consider a small
# max_new_tokens instead.
pipe = pipeline(
    task="text-generation",
    model='huggingface-test',
    tokenizer=tokenizer,
    max_length=1024,
    temperature=0.50,
    do_sample=True
  )

# The pipeline returns a list with one dict per generated sequence; print the
# text of the first one.
result = pipe(prompt)
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



You are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES), given protein name (Protein Name), given Protein Organism (Protein Organism) using your experienced chemical property prediction knowledge. Please strictly follow the format, no other information can be provided. Please answer with one word: High, Medium, Low corresponding to binding affinity label.SMILES: C1CC(N(C1)C(=O)C(CC2=CC=CC=C2)N)C(=O)N...
...﻿Chemistry 101 Lab Report #2: The Effect of Temperature on the Rate of a Chemical Reaction Introduction: The purpose of this lab was to determine the effect of temperature on the rate of a chemical reaction. The chemical reaction that was used in this experiment was the reaction between hydrogen peroxide and iodide ions. The reaction between hydrogen peroxide and iodide ions is a decomposition reaction. A decomposition reaction is a reaction in which a compound breaks down into two or more simpler substances. The reaction between hyd

## Uploading model to Huggingface

In [None]:
# Log in to the Hugging Face Hub from the notebook before uploading.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Repository name on the Hub (same name as the local save directory).
new_model = "huggingface-test"
hf_token =  userdata.get('HF_TOKEN')
# `model` and `trainer` were deleted in the memory-cleanup cell, so referring to
# `model` here raises NameError on a top-to-bottom run. The fine-tuned weights
# that are still loaded belong to the inference pipeline, so push those.
# NOTE(review): confirm that `pipe.model` is the object you want published.
pipe.model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

## Alternative test

In [None]:
# Run text generation pipeline with our model
from transformers import pipeline

# System part of the prompt.
# NOTE(review): the bracketed markers ([s], [[SYS]], [[/SYS]]) differ from the
# "<s> [INST] ... [/INST]" markers used in the fine-tuning text — confirm which
# format is intended.
prompt = """[s][INST] [[SYS]]\nYou are an expert chemist, your task is to predict the binding affinity (Label), given a compound (SMILES)
and given protein sequence (Sequence) using your experienced chemical property prediction knowledge.
Please strictly follow the format, no other information can be provided.
Please answer with one word: High, Medium, Low corresponding to binding affinity label.\n[[/SYS]]\n\n"""

# NOTE(review): `test_list` is not defined anywhere in the visible notebook; it
# presumably holds (SMILES, sequence) pairs — define it before running this cell.
instruction = f"SMILES: {test_list[2][0]}, Sequence: {test_list[2][1]} [/INST]"

# Send the system prompt together with the instruction. Previously only
# `instruction` reached the pipeline, so the task description was never seen by
# the model.
full_prompt = prompt + instruction
pipe = pipeline(task="text-generation", model=new_model, tokenizer=tokenizer, max_length=1024)
result = pipe(full_prompt)
# Print only the continuation, i.e. the generated text after the prompt.
print(result[0]['generated_text'][len(full_prompt):])