<a target="_blank" href="https://colab.research.google.com/github/gihanpanapitiya/llm/blob/main/LLM_Finetune_For_Solubility_Data_SFTT.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Install the packages

In [None]:
%%capture
# Install the fine-tuning stack into the kernel's environment; %%capture hides the installer output.
# NOTE(review): versions are unpinned — the keyword arguments used later in this notebook
# (e.g. for `from_pretrained` and `SFTTrainer`) may differ between releases; consider pinning.
%pip install accelerate peft bitsandbytes transformers trl

In [None]:
import os
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from random import randrange
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model


# Upload train data

A few Darwin datasets are packaged in the provided 'datasets.zip', so you can simply upload it using the command below and unzip it.

Or choose any preferred dataset from
https://github.com/MasterAI-EAM/Darwin/tree/main/dataset. Compress and upload it.




In [None]:
from google.colab import files
# Prompt the user to upload the dataset archive (the zip file described above);
# whatever `files.upload()` returns is kept in `uploaded`.
uploaded = files.upload()

In [None]:
!unzip datasets
# !unzip waterStability.zip
# !unzip waterStability.zip -d datasets/

unzip:  cannot find or open ESOL.zip, ESOL.zip.zip or ESOL.zip.ZIP.


# Create train/test data

In [None]:
# Read the JSON records; the rows are taken from the 'train' split below.
data = load_dataset('json', data_files='datasets/waterStability/waterStability.json')

# Rows 0..299 are used for fine-tuning, every remaining row for testing.
all_rows = data['train']
dataset = all_rows.select(range(0, 300))
test_dataset = all_rows.select(range(300, len(all_rows)))

## use a smaller subset to get quick results.
# dataset = data['train'].select([i for i in range(0, 100)])
# test_dataset = data['train'].select([i for i in range(100, 200)])

# Define instruction tuning template

In [None]:
def format_instruction(sample):
    """Render one dataset record as a complete instruction-tuning prompt.

    Args:
        sample: Mapping with 'instruction' and 'output' keys and, optionally,
            an 'input' key that provides further context for the task.

    Returns:
        The prompt text including the expected response. The template with an
        "### Input:" section is used when 'input' is present in ``sample``;
        otherwise the shorter instruction-only template is used.
    """
    if 'input' in sample:
        return f"""The following is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ### Instruction:
    {sample['instruction']}
    ### Input:
    {sample['input']}
    ### Response:
    {sample['output']}
    """
    else:
        # The instruction must be read from the record; a bare `instruction`
        # name is not defined in this scope and raised NameError.
        return f"""The following is an instruction that describes a task. Write a response that appropriately completes the request.
    ### Instruction:
    {sample['instruction']}
    ### Response:
    {sample['output']}
    """

In [None]:
# Sanity check: show the rendered training prompt for the first training record.
print(format_instruction(dataset[0]))

# Prepare the model

In [None]:
# NOTE(review): this repeats the prompt preview printed by the previous cell.
print(format_instruction(dataset[0]))
# Flag read below for the attention setting, and later for the batch size and layer upcasting.
use_flash_attention = False

# Hugging Face model id
model_id = "NousResearch/Llama-2-7b-hf"  # non-gated
# model_id = "meta-llama/Llama-2-7b-hf" # gated


# BitsAndBytesConfig int-4 config (4-bit loading, double quantization, "nf4" quant type)
# NOTE(review): the compute dtype is bfloat16 here while the TrainingArguments cell
# sets fp16=True / bf16=False — confirm this combination is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model with the configuration above
# NOTE(review): `use_flash_attention_2` may be deprecated in newer transformers releases — verify against the installed version.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
)
# NOTE(review): presumably disables the tensor-parallel emulation from pretraining — confirm.
model.config.pretraining_tp = 1


# define the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token and pad on the right side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
)


# prepare model for training
# NOTE(review): re-running this cell reloads the model from scratch before preparing it again.
model = prepare_model_for_kbit_training(model)

### Instruction:
Write a possible SMILES of given compound. -> 

### Input:
 p-Cresol


### Output:
 Cc1ccc(O)cc1




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Train

In [None]:
# Hyperparameters for the fine-tuning run; checkpoints go under "output" once per epoch.
# NOTE(review): `warmup_ratio` is set together with a "constant" scheduler — confirm
# whether the warmup is actually applied with this scheduler type.
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=2,
    per_device_train_batch_size=6 if use_flash_attention else 4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=50,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=False,
    fp16=True,
    tf32=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False,  # False keeps the tqdm progress bar enabled; set True to hide it
)

In [None]:
# Upcast layers for flash attention (skipped unless the flag is enabled)
if use_flash_attention:
    from utils.llama_patch import upcast_layer_for_flash_attention

    # Choose the dtype that matches the precision flags set in `args`.
    if args.bf16:
        torch_dtype = torch.bfloat16
    elif args.fp16:
        torch_dtype = torch.float16
    else:
        torch_dtype = torch.float32
    model = upcast_layer_for_flash_attention(model, torch_dtype)

# Wrap the prepared model using the LoRA settings in `peft_config`.
model = get_peft_model(model, peft_config)

In [None]:
max_seq_length = 128 # max sequence length for model and packing of the dataset

# Build the supervised fine-tuning trainer: each record is rendered to text by
# `format_instruction`, and packing is enabled with the sequence length above.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction,
    args=args,
)

# train
trainer.train() # `args` sets disable_tqdm=False, so the tqdm progress bar is not disabled
# Persist the trained model (presumably into `args.output_dir` — confirm).
trainer.save_model()

# Inference

In [None]:
def generate_prompt_eval(sample):
    """Render one dataset record as an inference prompt with the response left open.

    Args:
        sample: Mapping with an 'instruction' key and, optionally, an 'input'
            key that provides further context for the task.

    Returns:
        The prompt text ending right after the "### Response:" marker. The
        template with an "### Input:" section is used when 'input' is present
        in ``sample``; otherwise the shorter instruction-only template is used.
    """
    if 'input' in sample:
        return f"""The following is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ### Instruction:
    {sample['instruction']}
    ### Input:
    {sample['input']}
    ### Response:
    """
    else:
        # The instruction must be read from the record; a bare `instruction`
        # name is not defined in this scope and raised NameError.
        return f"""The following is an instruction that describes a task. Write a response that appropriately completes the request.
    ### Instruction:
    {sample['instruction']}
    ### Response:
    """

Select a data sample from the test set.

In [None]:
# Pick one record of the test split to run inference on.
# NOTE(review): index 20 assumes the test split has at least 21 rows — confirm for other datasets.
sample = test_dataset[20]

In [None]:
# Inputs are moved to the GPU before generation.
device = torch.device('cuda')

# Build the response-less prompt for the selected sample and show it.
prompt = generate_prompt_eval(sample)
print("prompt: ", prompt)

# Tokenize the prompt, sample a continuation, and decode the first returned sequence.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
sampling_options = dict(max_new_tokens=100, do_sample=True, top_p=0.9, temperature=0.9)
generated_ids = model.generate(input_ids=input_ids, **sampling_options)
response = tokenizer.decode(generated_ids[0])

In [None]:
print("Generated output")
# Keep only the text that follows the first "### Response:" marker of the prompt
# template. The previous parsing split on 'Response: ' (which the template never
# contains) and picked a fixed line index that landed on the prompt itself.
# If the marker is missing, the whole decoded text is shown instead.
response.split('### Response:', 1)[-1].strip()