In [1]:
!pip install -q datasets
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q accelerate
!pip install -q trl
!pip install -q wandb
!pip install -q -U google-generativeai

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

### Train Model

In [5]:
# General parameters
model_name = "google/gemma-2-2b"  # The model that you want to train from the Hugging Face hub
new_model = "gemma-prescription-json-generator"  # The name for fine-tuned LoRA Adaptor

In [4]:
# LoRA parameters
lora_r = 64
lora_alpha = lora_r * 2
lora_dropout = 0.1
target_modules = ["q_proj", "v_proj", "k_proj"]

In [6]:
# QLoRA parameters
load_in_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
bnb_4bit_use_double_quant = False

In [7]:
# TrainingArguments parameters
num_train_epochs = 10
fp16 = False
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
learning_rate = 0.00015
weight_decay = 0.01
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 10

# SFT parameters
max_seq_length = None
packing = False
device_map = {"": 0}

# Dataset parameters
use_special_template = True
response_template = " ### Answer:"
instruction_prompt_template = '"### Human:"'
use_llama_like_model = True

In [8]:
def load_data(file_path):
    percent_of_train_dataset = 0.90
    data = load_dataset("json", data_files=file_path, split="train")

    split_dataset = data.train_test_split(
        train_size=int(data.num_rows * percent_of_train_dataset), seed=19, shuffle=False
    )
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
    print(f"Size of the train set: {len(train_dataset)}. Size of the validation set: {len(eval_dataset)}")
    return train_dataset, eval_dataset

dataset_dict = {"seizure": {}, "prescription": {}}
dataset_dict["prescription"]["train_dataset"], dataset_dict["prescription"]["eval_dataset"] = load_data(
    file_path="/content/prescription.jsonl")

# dataset_dict["seizure"]["train_dataset"], dataset_dict["seizure"]["eval_dataset"] = load_data(
#     file_path="/content/seizure_frequency.jsonl")

Size of the train set: 264. Size of the validation set: 30


In [9]:
# Load LoRA configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

In [10]:
# Load QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
)

In [11]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, device_map=device_map,
    attn_implementation='eager')
model.config.use_cache = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    gradient_checkpointing=gradient_checkpointing,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    do_eval=True,
    evaluation_strategy="steps"
)



In [13]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
if not tokenizer.chat_template:
    tokenizer.chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"

In [14]:
def special_formatting_prompts(example):
    output_texts = []
    for i in range(len(example["instruction"])):
        text = f"{instruction_prompt_template}{example['instruction'][i]}\n{response_template} {example['output'][i]}"
        output_texts.append(text)
    return output_texts


def normal_formatting_prompts(example):
    output_texts = []
    for i in range(len(example["instruction"])):
        chat_temp = [
            {"role": "system", "content": example["instruction"][i]},
            {"role": "assistant", "content": example["output"][i]},
        ]
        text = tokenizer.apply_chat_template(chat_temp, tokenize=False)
        output_texts.append(text)
    return output_texts

In [15]:
if use_special_template:
    formatting_func = special_formatting_prompts
    if use_llama_like_model:
        response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template_ids, tokenizer=tokenizer)
    else:
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
else:
    formatting_func = normal_formatting_prompts

In [16]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_dict["prescription"]["train_dataset"],
    eval_dataset=dataset_dict["prescription"]["eval_dataset"],
    peft_config=peft_config,
    formatting_func=formatting_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



In [17]:
if torch.cuda.device_count() > 1:
  model.is_parallelizable = True
  model.model_parallel = True

# Train model
trainer.train()

# Save fine tuned Lora Adaptor
trainer.model.save_pretrained(new_model)

[34m[1mwandb[0m: Currently logged in as: [33maksonsam[0m ([33maksonsam-swansea-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
10,0.6304,0.30057
20,0.2354,0.265491
30,0.2786,0.232769
40,0.3959,0.221747
50,0.1952,0.229352
60,0.2111,0.232206
70,0.3537,0.22192
80,0.1698,0.208419
90,0.2861,0.22534
100,0.1829,0.214873


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

In [None]:
# import torch
# import gc


# def clear_hardwares():
#     torch.clear_autocast_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.empty_cache()
#     gc.collect()


# clear_hardwares()
# clear_hardwares()

In [17]:
def generate(model, prompt: str, kwargs):
    tokenized_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)

    prompt_length = len(tokenized_prompt.get("input_ids")[0])

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**tokenized_prompt, **kwargs) if kwargs else model.generate(**tokenized_prompt)
        output = tokenizer.decode(output_tokens[0][prompt_length:], skip_special_tokens=True)
    return output

In [18]:
new_model

'gemma-prescription-json-generator'

In [19]:
new_model_path = f"/content/{new_model}"

In [20]:
ft_model = AutoModelForCausalLM.from_pretrained(new_model_path)

OSError: /content/gemma-prescription-json-generator does not appear to have a file named config.json. Checkout 'https://huggingface.co//content/gemma-prescription-json-generator/tree/None' for available files.

In [21]:
peft_model = PeftModel.from_pretrained(ft_model, new_model)
# del base_model

NameError: name 'ft_model' is not defined

In [None]:
progress_bar = tqdm(total=len(dataset_dict["prescription"]["eval_dataset"]))
gen_kwargs = {"max_new_tokens": 100}
generated_texts_list = []
for eval_sample in dataset_dict["prescription"]["eval_dataset"]:
  chat_temp = [{"role": "system", "content": eval_sample["instruction"]}]
  prompt = tokenizer.apply_chat_template(chat_temp, tokenize=False,
                                           add_generation_prompt=True)
  generated_texts_list.append(generate(model=peft_model, prompt=prompt,
                                       kwargs=gen_kwargs))
  progress_bar.update(1)
progress_bar.close()

### Evaluation

In [None]:
with open("/content/api_key.txt", "r") as f:
  api_key = f.read()

In [None]:
!export API_KEY=api_key

In [None]:
import os

os.environ["API_KEY"] = api_key

In [None]:
import google.generativeai as genai

In [None]:
import json

def string_to_json(string):
  """Converts a string to a JSON object.

  Args:
    string: The string to convert.

  Returns:
    The JSON object.

  Raises:
    json.JSONDecodeError: If the string is not valid JSON.
  """

  try:
    return json.loads(string)
  except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    return None



In [None]:
from tqdm.notebook import tqdm

In [None]:
genai.configure(api_key=os.environ["API_KEY"])

In [None]:
sample = dataset_dict["prescription"]["eval_dataset"][25]
if use_special_template:
      prompt = f"{instruction_prompt_template}{sample['instruction']}\n{response_template}"
else:
    chat_temp = [{"role": "system", "content": sample["instruction"]}]
    prompt = tokenizer.apply_chat_template(chat_temp, tokenize=False,
                                          add_generation_prompt=True)

gen_kwargs = {"max_new_tokens": 100}
generated_texts = generate(model=peft_model, prompt=prompt, kwargs=gen_kwargs)

  with torch.cuda.amp.autocast():


In [None]:
generated_texts

" {'entity': 'Prescription', 'start_index': '117', 'end_index': '126', 'text': 'Levetiracetam to start', 'attributes': {'DrugName': 'Levetiracetam', 'DrugDose': '250', 'DoseUnit': 'mg', 'Frequency': '1'}} {'entity': 'Prescription', 'start_index': '131', 'end_index': '140', 'text"

In [None]:
# gen_model_name = "NousResearch/Llama-2-7b-chat-hf"
gen_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# gen_model = genai.GenerativeModel("gemini-1.5-flash")

In [None]:
import transformers

In [None]:
# Load base model
device_map = {"": 0}
# Load QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
)
gen_model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bnb_config, device_map=device_map)
gen_model.config.use_cache = True
gen_tokenizer = AutoTokenizer.from_pretrained(model_name)
pipeline = transformers.pipeline(
    "text-generation",
    model=gen_model,
    torch_dtype=torch.float16,
    tokenizer=gen_tokenizer,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
generated_texts

" {'entity': 'Prescription', 'start_index': '117', 'end_index': '126', 'text': 'Levetiracetam to start', 'attributes': {'DrugName': 'Levetiracetam', 'DrugDose': '250', 'DoseUnit': 'mg', 'Frequency': '1'}} {'entity': 'Prescription', 'start_index': '131', 'end_index': '140', 'text"

In [None]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.float16},
    device_map="auto"
)
pipeline_tokenizer = tokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
messages = [
    {"role": "system", "content": "Your task is to validate and extract json from the given text "},
    {"role": "user", "content": f"please extract a valid JSON from the given text below.  text: {generated_texts}"}
]
inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
outputs = pipeline(inputs, max_new_tokens=100)
print(outputs)

[{'generated_text': "<|im_start|>system\nYour task is to validate and extract json from the given text <|im_end|>\n<|im_start|>user\nplease extract a valid JSON from the given text below.  text:  {'entity': 'Prescription', 'start_index': '117', 'end_index': '126', 'text': 'Levetiracetam to start', 'attributes': {'DrugName': 'Levetiracetam', 'DrugDose': '250', 'DoseUnit': 'mg', 'Frequency': '1'}} {'entity': 'Prescription', 'start_index': '131', 'end_index': '140', 'text<|im_end|>\n<|im_start|>system\nYour task is to validate and extract json from the given text <|im_end|>\n<|im_start|>user\nplease extract a valid JSON from the given text below.  text:  {'entity': 'Prescription','start_index': '117', 'end_index': '126', 'text': 'Levetiracetam to start', 'attributes': {'DrugName': 'Levetiracetam',"}]


In [None]:
generated_texts

" {'entity': 'Prescription', 'start_index': '117', 'end_index': '126', 'text': 'Levetiracetam to start', 'attributes': {'DrugName': 'Levetiracetam', 'DrugDose': '250', 'DoseUnit': 'mg', 'Frequency': '1'}} {'entity': 'Prescription', 'start_index': '131', 'end_index': '140', 'text"

In [None]:
generated_results = {}
progress = tqdm(total=len(dataset_dict["prescription"]["eval_dataset"]))
for idx, sample in enumerate(dataset_dict["prescription"]["eval_dataset"]):
  if use_special_template:
      prompt = f"{instruction_prompt_template}{sample['instruction']}\n{response_template}"
  else:
      chat_temp = [{"role": "system", "content": sample["instruction"]}]
      prompt = tokenizer.apply_chat_template(chat_temp, tokenize=False,
                                            add_generation_prompt=True)

  gen_kwargs = {"max_new_tokens": 100}
  generated_texts = generate(model=peft_model, prompt=prompt, kwargs=gen_kwargs)

  # fragmentation fix

  gen_model = genai.GenerativeModel("gemini-1.5-flash")

  prompt = f"""
  Given the below are fragments of JSON, please provide one single valid JSON File
  that contains all the fragments.
  {generated_texts}
  """
  response = gen_model.generate_content(prompt)
  if response is not None:
    json_object = string_to_json(response.text.replace("`", "").replace("json", ""))
    if json_object is not None:
      generated_results[idx] = json_object

  progress.update(1)

progress.close()

  0%|          | 0/30 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Error decoding JSON: Extra data: line 16 column 1 (char 235)


ValueError: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.

In [None]:
prompt

"\n  Given the below are fragments of JSON, please provide one single valid JSON File\n  that contains all the fragments.  \n   {'entity': 'Prescription', 'start_index': '234', 'end_index': '246', 'text': 'Sodium Valproate', 'attributes': {'DrugName': 'SodiumValproate', 'DrugDose': '800', 'DoseUnit': 'mg', 'Frequency': '1'}}\n\n\nIn case of multiple prescriptions, extract only the first one from the clinical text in JSON format.\n\nGive me the output in the json format as\n  "

In [None]:
  genai.configure(api_key=os.environ["API_KEY"])
  model = genai.GenerativeModel("gemini-1.5-flash")
  response = model.generate_content(prompt)
  print(response.text)

ValueError: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.

In [None]:
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "finish_reason": "SAFETY",
          "index": 0,
          "safety_ratings": [
            {
              "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HATE_SPEECH",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HARASSMENT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
              "probability": "MEDIUM"
            }
          ]
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 131,
        "total_token_count": 131
      }
    }),
)

In [None]:
prompt1 = f"""
Given below string in the delimiter ```, please extract all the jsons. please extract
only jsons.

```{generated_texts}```
"""
model.generate_content(prompt1).text

ValueError: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.

In [None]:
prompt1

"\nGiven below string in the delimiter ```, please extract all the jsons. please extract\nonly jsons.\n\n``` {'entity': 'Prescription', 'start_index': '234', 'end_index': '246', 'text': 'Sodium Valproate', 'attributes': {'DrugName': 'SodiumValproate', 'DrugDose': '800', 'DoseUnit': 'mg', 'Frequency': '1'}}\n\n\nIn case of multiple prescriptions, extract only the first one from the clinical text in JSON format.\n\nGive me the output in the json format as```\n"