## Workspace Setup

In [None]:
#@title Neccessary Installs

!pip install -q groq

#!pip install torch
!pip install -U accelerate
!pip install -U bitsandbytes
!pip install -U datasets
!pip install -U evaluate
!pip install -U ninja
!pip install -U packaging
!pip install -U peft
!pip install -U sentencepiece
!pip install -U transformers
!pip install -U trl

In [None]:
#@title Google Colab Drive Helper

# Colab-only helpers: `userdata` is used by later cells to read notebook
# secrets, `drive` mounts Google Drive into the runtime's filesystem.
from google.colab import userdata, drive

# This will prompt for authorization
drive.mount('/content/drive')

# Set the working directory (the project folder on the mounted Drive)
%cd '/content/drive/MyDrive/postedBlogs/llama3RE'

In [None]:
#@title Hugging Face Credentials

# For Hugging Face Hub setting
from huggingface_hub import login

# Read the Hugging Face token (should have WRITE access) from the Colab
# secret named 'HF'; `userdata` is imported in the Drive helper cell.
HF = userdata.get('HF')

# This is needed to upload the model to HuggingFace; the token is also
# stored as a git credential.
login(token=HF,add_to_git_credential=True)

In [None]:
#@title Path Variables

# Project root on the mounted Google Drive; every path below derives from it
project_root = '/content/drive/MyDrive/postedBlogs/llama3RE/'

# Folder that holds all JSON datasets
data_path = project_root + 'datas/'

# SFT dataset contains extracted sentences and gold_re
sft_data_path = data_path + 'sft_dataset.json'

# Data collected from the mini-test
mini_data_path = data_path + 'mini_data.json'

# Test data containing all three outputs
all_tests_data = data_path + 'all_tests.json'

# The adjusted training dataset
train_data_path = data_path + 'sft_train_data.json'

# Local folder where the SFT model is saved
sft_model_path = project_root + 'Llama3_RE/'

# Relation Extraction Synthetic Dataset with Llama3-70B

## Load & Prepare Dataset

In [None]:
#@title Load Dolly-15k Dataset

from datasets import load_dataset

# Fetch the Dolly-15k dataset by its Hugging Face Hub identifier
dataset = load_dataset("databricks/databricks-dolly-15k")

# Display the first instance of the train split
dataset['train'][0]

In [None]:
#@title Determine Available Categories in Dataset

# Distinct values of the "category" field across the train split
dataset_categories = {row["category"] for row in dataset["train"]}
dataset_categories

In [None]:
#@title Parse Data

def _first_sentence(text):
    """Return the text up to its first period, with the period re-appended."""
    return text.split('.', 1)[0] + '.'

# Exclusive character-length bounds for the sentences that are kept
min_chars, max_chars = 30, 170

# Keep only the instances of the desired category
ie_category = [row for row in dataset["train"]
               if row["category"] == "information_extraction"]

# Retain only the context of each instance
ie_context = [row["context"] for row in ie_category]

# Reduce every context to its first sentence
reduced_context = [_first_sentence(context) for context in ie_context]

# Retain sequences whose character length lies strictly between the bounds
sampler = [sentence for sentence in reduced_context
           if len(sentence) > min_chars and len(sentence) < max_chars]

print(f"There are {len(sampler)} instances in the dataset.\n")

# Display several samples from the selected dataset
sampler[110:120]

## Build the Synthetic RE Dataset

In [None]:
#@title Create a System Message

# Instruction prompt used as the system turn of every request: it sets the
# annotator role, the pipe-delimited triple output format, and one worked example.
# NOTE(review): the prompt text contains typos ("annontator", "entitity2").
# The same string is redefined verbatim in later cells (fine-tuning and
# inference), so any correction must be applied to all copies together to
# keep the prompts identical.
system_message = """You are an experienced annontator. Extract all entities and the relations between them from the following text. Write the answer as a triple entity1|relationship|entitity2. Do not add anything else.
Example Text: Alice is from France.
Answer: Alice|is from|France.
"""

In [None]:
#@title Build the Messages List

# One two-turn conversation (system instruction + user sentence) per sampled sentence
messages = [
    [
        {"role": "system", "content": system_message},
        {"role": "user", "content": sentence},
    ]
    for sentence in sampler
]
messages[10]

In [None]:
#@title Instantiate Groq Client

import os
from groq import Groq

# The API key is read from the Colab secret named "GROQ"; `userdata` comes
# from the google.colab import in the Drive helper cell.
gclient = Groq(
    api_key=userdata.get("GROQ"),
)

In [None]:
#@title Helper Functions

import time
from tqdm import tqdm

def process_data(prompt, model="llama3-70b-8192"):

    """Send one chat request through the Groq client and return the generated text.

    Args:
        prompt: List of chat messages (dicts with "role" and "content"),
            forwarded unchanged as the `messages` argument.
        model: Model identifier according to GroqCloud labeling. Defaults to
            the 70B model, so existing `process_data(prompt)` calls behave
            exactly as before.

    Returns:
        The `content` of the first choice in the completion response.
    """

    chat_completion = gclient.chat.completions.create(
        messages=prompt, # input prompt to send to the model
        model=model, # according to GroqCloud labeling
        temperature=0.5, # controls diversity
        max_tokens=128, # max number tokens to generate
        top_p=1, # proportion of likelihood weighted options to consider
        stop=None, # string that signals to stop generating
        stream=False, # if set partial messages are sent
    )
    return chat_completion.choices[0].message.content


def send_messages(messages, batch_size=10, pause_seconds=10):

    """Process messages in batches with a pause between batches.

    Args:
        messages: Sequence of prompts; each one is passed to `process_data`.
        batch_size: Number of requests sent back-to-back before pausing.
        pause_seconds: Seconds to sleep between two consecutive batches.

    Returns:
        List with one generation per message, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    answers = []

    for start in tqdm(range(0, len(messages), batch_size)):

        # Requests inside a batch are sent sequentially
        for message in messages[start:start + batch_size]:
            answers.append(process_data(message))

        # Pause only when another batch follows, never after the last one
        if start + batch_size < len(messages):
            time.sleep(pause_seconds)

    return answers

In [None]:
#@title Generate the Data

# Run every prompt through `send_messages`; the result holds one generation
# per entry of `messages`, in the same order.
answers = send_messages(messages)
len(answers)

In [None]:
#@title Combine Data with Generated Dataset

# Pair each source sentence with the generation at the same position
combined_dataset = [
    dict(text=sentence, gold_re=generation)
    for sentence, generation in zip(sampler, answers)
]

# Show one entry to check the result
combined_dataset[22]

In [None]:
#@title Save the Combined Dataset

import json

# Write the full list of {'text', 'gold_re'} records as one JSON document
with open(sft_data_path, 'w') as file:
    json.dump(combined_dataset, file)

# Evaluate Llama3-8B on Relation Extraction Task

In [None]:
#@title Build a Samples Dataset

import random

# Fixed seed so the draw below is repeatable for a given `combined_dataset`
random.seed(17)

# Select 20 random entries
mini_data = random.sample(combined_dataset, k=20)

# Build conversational format: system instruction followed by the sentence
parsed_mini_data = [
    [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': entry['text']},
    ]
    for entry in mini_data
]

parsed_mini_data[1]

In [None]:
#@title Create a Training Set for FineTuning

# Sentences reserved for the mini test set. Filtering on the sentence text
# with a set lookup avoids comparing whole dicts against a list for every
# item, and also removes any duplicate sentence that was generated with a
# different `gold_re`, so no test sentence leaks into the training data.
held_out_texts = {item['text'] for item in mini_data}

train_data = [item for item in combined_dataset
              if item['text'] not in held_out_texts]
len(train_data)

In [None]:
#@title Helper Function

def process_data(prompt, model="llama3-8b-8192"):

    """Send one chat request through the Groq client and return the generated text.

    NOTE: this redefines the `process_data` helper declared earlier in the
    notebook. After this cell has run, any caller that does not pass `model`
    (including `send_messages`) talks to the 8B model.

    Args:
        prompt: List of chat messages (dicts with "role" and "content"),
            forwarded unchanged as the `messages` argument.
        model: Model identifier according to GroqCloud labeling. Defaults to
            the 8B model, so existing `process_data(prompt)` calls behave
            exactly as before.

    Returns:
        The `content` of the first choice in the completion response.
    """

    chat_completion = gclient.chat.completions.create(
        messages=prompt, # input prompt to send to the model
        model=model, # according to GroqCloud labeling
        temperature=0.5, # controls diversity
        max_tokens=128, # max number tokens to generate
        top_p=1, # proportion of likelihood weighted options to consider
        stop=None, # string that signals to stop generating
        stream=False, # if set partial messages are sent
    )
    return chat_completion.choices[0].message.content

In [None]:
#@title Perform RE on Samples Data with Llama-8B

# One generation per sample conversation, in the order of `parsed_mini_data`
outputs = [process_data(conversation) for conversation in parsed_mini_data]

outputs[3]

In [None]:
#@title Combine the Samples Data with Generated RE Data

# Add the key 'test_re' to every sample, taking the value at the same
# position in `outputs`. The dicts are modified in place; when `mini_data`
# comes from the random.sample cell these are the same dict objects that
# `combined_dataset` holds, so those entries gain the key as well.
for i, dct in enumerate(mini_data):
    dct['test_re'] = outputs[i]

mini_data[2]

In [None]:
#@title Display Llama3 70B and 8B RE Outputs on Samples

import pandas as pd
# Show full cell contents instead of truncating long strings
pd.set_option('display.max_colwidth', None)

# Create a dataframe from collected data (one row per sample, one column per key)
df = pd.DataFrame(mini_data)
df

In [None]:
#@title Save the Datasets

import json

# (destination path, payload) pairs, written in this order:
#   1. data collected from the mini-test
#   2. the adjusted training dataset
_exports = (
    (mini_data_path, mini_data),
    (train_data_path, train_data),
)

for _path, _payload in _exports:
    with open(_path, 'w') as file:
        json.dump(_payload, file)

# Supervised Fine-Tuning Llama3-8B

In [None]:
#@title Display Libraries Versions

import torch
import datasets
import transformers
import trl

# Print the installed versions so a run can be matched to its environment
print(f"The PyTorch version is {torch.__version__}.")
print(f"Datasets version is {datasets.__version__}.")
print(f"Transformers version is {transformers.__version__}.")
print(f"TRL version is {trl.__version__}.")

In [None]:
#@title Assert Cuda Capabilities for Flash Attention

# Assert Cuda Capability for Flash Attention
major_version, minor_version = torch.cuda.get_device_capability()
print(f"Cuda major version: {major_version}.\nCuda minor version: {minor_version}")

# adapted from: https://github.com/mlabonne/llm-course
if torch.cuda.get_device_capability()[0] >= 8:
    # Limit the number of jobs to accomodate the compute capabilities
    %env MAX_JOBS=2 # for Google Colab

    # Install flash attention - for Ampere GPUs
    %pip install flash-attn -q --no-build-isolation

    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"

else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

print(f"torch_dtype = {torch_dtype}")
print(f"attn_implementation = {attn_implementation}")

In [None]:
#@title Resources Estimation

# Divisor that converts the byte counts reported by torch (1024 * 1024 * 1024)
bytes_per_gb = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)

# Reserved memory so far and the total memory of device 0, both rounded to 3 decimals
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
#@title LLM Model Name

# Hugging Face Hub identifier of the model that is loaded for fine-tuning
model_id  =  "meta-llama/Meta-Llama-3-8B"

## Prepare the SFT Dataset

In [None]:
#@title Load the SFT Dataset
import json

# Read the training split back from `train_data_path`
with open(train_data_path, 'rb') as f:
    train_data = json.loads(f.read())

train_data[123]

In [None]:
#@title Function to Parse to Conversational Format

# System message used as the first turn of every training conversation.
# It is a verbatim copy of the prompt defined when the dataset was generated.

system_message = """You are an experienced annontator. Extract all entities and the relations between them from the following text. Write the answer as a triple entity1|relationship|entitity2. Do not add anything else.
Example Text: Alice is from France.
Answer: Alice|is from|France.
"""

def create_conversation(sample):
    """Turn one record into a three-turn chat conversation.

    Args:
        sample: Mapping with the keys "text" (input sentence) and
            "gold_re" (reference answer).

    Returns:
        A dict with the single key "messages": system, user and assistant
        turns, in that order.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": sample["text"]}
    assistant_turn = {"role": "assistant", "content": sample["gold_re"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}


In [None]:
#@title Convert Data to HuggingFace Format

from datasets import load_dataset, Dataset

# Build a Dataset from the list of {'text', 'gold_re'} records
train_dataset = Dataset.from_list(train_data)

# Transform to conversational format: each record is mapped through
# `create_conversation` one at a time, and the original columns are dropped
# so that only the new "messages" column remains.
train_dataset = train_dataset.map(create_conversation,
                      remove_columns=train_dataset.features,
                      batched=False)
print(train_dataset)

In [None]:
#@title Display a Sample

# Fetch row 123 first, then read its "messages" field
train_dataset[123]["messages"]

## Tokenizer and Chat Template

In [None]:
#@title Load the Tokenizer


from transformers import AutoTokenizer

# Tokenizer for the model named by `model_id`
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          use_fast=True,
                                          trust_remote_code=True)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id =  tokenizer.eos_token_id
# NOTE(review): this tokenizer is later handed to SFTTrainer; TRL's
# documentation recommends right padding for training - confirm that left
# padding is intended here.
tokenizer.padding_side = 'left'

# Set a maximum length
tokenizer.model_max_length = 512

In [None]:
#@title Quantization Parameters

from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; the compute dtype is the
# `torch_dtype` selected in the flash-attention cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype
)

In [None]:
#@title Device Map

# Place the whole model (key "") on the current CUDA device when one is
# available; otherwise leave the device map unset.
if torch.cuda.is_available():
    device_map = {"": torch.cuda.current_device()}
else:
    device_map = None

In [None]:
#@title Load Model

from transformers import AutoModelForCausalLM
from peft import prepare_model_for_kbit_training
from trl import setup_chat_format

# Load the model with the quantization config, device map and attention
# implementation chosen in the previous cells
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    attn_implementation=attn_implementation,
    quantization_config=bnb_config
)

# `setup_chat_format` returns a (model, tokenizer) pair; both names are rebound.
# NOTE(review): presumably this installs a chat template and its special
# tokens, which may override the pad/eos settings made in the tokenizer cell -
# verify against the TRL documentation.
model, tokenizer = setup_chat_format(model, tokenizer)
# PEFT helper applied to the quantized model before training
model = prepare_model_for_kbit_training(model)

In [None]:
#@title LoRA Configuration

from peft import LoraConfig

# According to Sebastian Raschka findings
# (the trailing "#32" / "#16" comments record alternative values for alpha and r)
peft_config = LoraConfig(
        lora_alpha=128, #32
        lora_dropout=0.05,
        r=256,  #16
        bias="none",
        # Adapters are attached to the attention and MLP projection layers listed here
        target_modules=["q_proj", "o_proj", "gate_proj", "up_proj",
                        "down_proj", "k_proj", "v_proj"],
        task_type="CAUSAL_LM",
)

In [None]:
# @title Training Arguments

from transformers import TrainingArguments

# Mixed-precision flags follow the dtype selected in the flash-attention cell:
# bf16/tf32 only when bfloat16 was chosen (compute capability >= 8), fp16
# otherwise. Hard-coding bf16/tf32 to True contradicts the float16 fallback
# and makes TrainingArguments reject the configuration on older GPUs.
use_bf16 = torch_dtype == torch.bfloat16

# Adapted from  Phil Schmid blogpost
args = TrainingArguments(
    output_dir=sft_model_path,              # directory to save the model and repository id
    num_train_epochs=2,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory, use in distributed training
    #gradient_checkpointing_kwargs={"use_reentrant": False}, # for more stability in distributed training, it can use more memory
    optim="adamw_8bit",                     # choose paged_adamw_8bit if not enough memory
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=use_bf16,                          # bfloat16 precision where it was selected
    fp16=not use_bf16,                      # float16 mixed precision otherwise
    tf32=use_bf16,                          # tf32 only alongside bf16 (needs Ampere or newer)
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    # NOTE(review): the plain "constant" scheduler presumably ignores
    # warmup_ratio ("constant_with_warmup" is the variant that uses it) - confirm.
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                      # push model to Hugging Face hub
    hub_model_id="llama3-8b-sft-qlora-re",
    report_to="tensorboard",               # report metrics to tensorboard
)

In [None]:
# @title Initialize the SFTTrainer

from trl import SFTTrainer

# Wire together the quantized model, the training arguments, the
# conversational dataset, the LoRA config and the tokenizer.
# max_seq_length matches the tokenizer.model_max_length set earlier (512).
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=512,
    tokenizer=tokenizer,
    packing=False, # True if the dataset is large
    dataset_kwargs={
        "add_special_tokens": False,  # the template adds the special tokens
        "append_concat_token": False, # no need to add additional separator token
    }
)

In [None]:
#@title Train and Save the Model

# Run the fine-tuning, then save the result through the trainer
# (the arguments set output_dir=sft_model_path and push_to_hub=True)
trainer.train()
trainer.save_model()

In [None]:
#@title Save Model Locally

#trainer.save_model()

In [None]:
#@title Clear Memory

import torch
import gc

# Drop every notebook-level reference to the training objects. `trainer` was
# built around `model`, so it has to go as well; otherwise the weights stay
# referenced and the GPU memory cannot be reclaimed.
# Popping from globals() (instead of `del`) keeps the cell re-runnable when a
# name is already gone, e.g. when training was skipped in this session.
for _name in ("trainer", "model", "tokenizer"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects, then release cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

# Inference with SFT Model

In [None]:
#@title Load Peft Model

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import torch

# HF model: Hub repository that holds the fine-tuned adapter
peft_model_id = "solanaO/llama3-8b-sft-qlora-re"

# Load Model with PEFT adapter, in float16, letting `device_map="auto"`
# decide the placement
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16,
  offload_buffers=True
)

In [None]:
#@title Load Tokenizer

# Load the tokenizer from the same Hub repository as the adapter
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Reuse the end-of-sequence token for padding and pad on the left
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id =  tokenizer.eos_token_id
tokenizer.padding_side = 'left'

In [None]:
#@title Text Generation Pipeline

# Text-generation pipeline around the fine-tuned model and its tokenizer
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
#@title Load the Samples Dataset
import json

# Read the mini test set back from `mini_data_path`
with open(mini_data_path, 'rb') as f:
    mini_data = json.loads(f.read())

mini_data[12]

In [None]:
#@title Function to Parse to Conversational Format

# System message used as the first turn of every inference prompt.
# It is a verbatim copy of the prompt used earlier in the notebook.

system_message = """You are an experienced annontator. Extract all entities and the relations between them from the following text. Write the answer as a triple entity1|relationship|entitity2. Do not add anything else.
Example Text: Alice is from France.
Answer: Alice|is from|France.
"""

def create_input_prompt(sample):
    """Turn one record into a two-turn (system + user) chat prompt.

    Args:
        sample: Mapping with the key "text" (input sentence).

    Returns:
        A dict with the single key "messages": the system turn followed by
        the user turn.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": sample["text"]}
    return {"messages": [system_turn, user_turn]}

In [None]:
#@title Convert Data to HuggingFace Format

from datasets import Dataset

# Build a Dataset from the mini test records
test_dataset = Dataset.from_list(mini_data)

# Transform to conversational format: each record is mapped through
# `create_input_prompt` one at a time, and the original columns are dropped
# so that only the new "messages" column remains.
test_dataset = test_dataset.map(create_input_prompt,
                      remove_columns=test_dataset.features,
                      batched=False)
print(test_dataset)

## One Sample Test

In [None]:
#@title Generate the Input Prompt

# Render the first two turns (system + user) of sample 10 as a prompt string
# (tokenize=False) that ends with the generation prompt for the assistant turn.
prompt = pipe.tokenizer.apply_chat_template(test_dataset[10]["messages"][:2],
                                            tokenize=False,
                                            add_generation_prompt=True)
print(prompt)

In [None]:
#@title Generate the Output

# Generate up to 128 new tokens for the single prompt, with sampling enabled
# at a very low temperature and a narrow top_p.
# Note: `outputs` is rebound here; an earlier cell used the same name for the
# list of Llama3-8B generations.
outputs = pipe(prompt,
              max_new_tokens=128,
              do_sample=True,
              temperature=0.01,
              top_k=50,
              top_p=0.1,
              )

In [None]:
#@title Display Sample Outputs

# Compare, for sample 10: the input sentence, the stored 'gold_re' and
# 'test_re' answers, and the fine-tuned model's generation. The generated
# text is sliced past the prompt length to keep only what follows the prompt.
print(f"Question: {mini_data[10]['text']}\n")
print(f"Gold-RE: {mini_data[10]['gold_re']}\n")
print(f"LLama3-8B-RE: {mini_data[10]['test_re']}\n")
print(f"SFT-Llama3-8B-RE: {outputs[0]['generated_text'][len(prompt):].strip()}")

In [None]:
#@title Test on All 20 Samples

from tqdm import tqdm

def evaluate(sample, max_new_tokens=128, temperature=0.7, top_k=50, top_p=0.95):
    """Generate the fine-tuned model's answer for one conversational sample.

    The generation settings are keyword parameters so that callers can reuse
    the settings of the one-sample test (or any other) without editing the
    function; the defaults reproduce the previous hard-coded values.

    Args:
        sample: Mapping with a "messages" list; only the first two turns
            (system and user) are rendered into the prompt.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature passed to the pipeline.
        top_k: Top-k value passed to the pipeline.
        top_p: Top-p value passed to the pipeline.

    Returns:
        The generated text that follows the prompt, stripped of surrounding
        whitespace.
    """
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2],
                                                tokenize=False,
                                                add_generation_prompt=True)
    outputs = pipe(prompt,
                   max_new_tokens=max_new_tokens,
                   do_sample=True,
                   temperature=temperature,
                   top_k=top_k,
                   top_p=top_p
                   )

    # Slice past the prompt length to keep only the text generated after it
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    return predicted_answer


# Predict for every sample of the test dataset, keeping the dataset order
sft_generation = [
    evaluate(sample)
    for sample in tqdm(test_dataset, desc="Processing dataset")
]

In [None]:
#@title Combine All Test Data and Save
import json

# Store each fine-tuned generation on the sample at the same position
for record, generation in zip(mini_data, sft_generation):
    record.update(sft_re=generation)

# Persist the mini-test records, now carrying all collected outputs
with open(all_tests_data, 'w') as file:
    file.write(json.dumps(mini_data))

In [None]:
#@title Display the Tests Results

import pandas as pd
# Show full cell contents instead of truncating long strings
pd.set_option('display.max_colwidth', None)

# One row per test sample, one column per key stored on the sample
df = pd.DataFrame(mini_data)
df