<a href="https://colab.research.google.com/github/Talida-M/BIO_NLP_PLABA_2023/blob/main/plaba_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Imports


In [None]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install rouge_score
!pip install evaluation
!pip install datasets
!pip install numpy
!pip install torch
!pip install nltk
!pip install sacrebleu sacremoses

In [None]:
import json
import os
import unicodedata
import math
import argparse
import random
import os

import pandas as pd
from unsloth import FastLanguageModel
from sklearn.model_selection import train_test_split
import torch

# Loading base Mistral model


In [None]:
# Loader settings reused by every from_pretrained call in this notebook.
max_seq_length = 2048  # any value works; RoPE scaling is handled by the loader
dtype = None           # None = auto detection (Float16 on T4/V100, Bfloat16 on Ampere+)
load_in_4bit = True    # 4bit quantization to reduce memory usage; may be False

# Collect the keyword arguments first so the load call itself stays short.
_base_loader_kwargs = dict(
    model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
base_model, base_tokenizer = FastLanguageModel.from_pretrained(**_base_loader_kwargs)

# Switch the freshly loaded model to unsloth's inference mode.
FastLanguageModel.for_inference(base_model)

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


# Data loading and preprocessing

In [None]:
# Load Data from Google Drive or locally
COLAB_ENABLED=True # Loads from Google Drive if True

# DATA_PATH always ends with a path separator. Cells in this notebook build file
# names both as DATA_PATH + 'name' and DATA_PATH + '/name'; only a trailing
# separator makes both forms resolve to a file inside the data directory.
if COLAB_ENABLED:
    DATA_PATH = "/content/drive/MyDrive/BIO_NLP/"
    from google.colab import drive
    drive.mount('/content/drive')
else:
    DATA_PATH = "./data/"

# Load the dataset (a dict keyed by question id)
with open(os.path.join(DATA_PATH, 'data.json'), 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print a one-line summary only: dumping the whole JSON exceeds the notebook's
# IOPub data rate limit and produces no usable output.
print(f"Loaded {len(data)} questions from {DATA_PATH}")

Mounted at /content/drive


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Flatten the nested JSON into one row per (abstract sentence, adapted sentence)
# pair and cache the result as a CSV next to the raw data.
# The same path is used for the existence check, the write and the read.
dataset_csv_path = os.path.join(DATA_PATH, 'test_dataset.csv')
dataset_columns = ["question_id", "question", "question_type", "title", "adaptations", "abstracts"]

if not os.path.exists(dataset_csv_path):
    records = []
    for question_id, question in data.items():
        for entry in question.values():
            # Only the nested dict entries carry 'abstract' / 'adaptations';
            # the remaining values of a question are plain fields.
            if not isinstance(entry, dict):
                continue
            abstract_sentences = entry['abstract']
            for adaptation in entry['adaptations'].values():
                for sentence_key, adapted_sentence in adaptation.items():
                    records.append({
                        "question_id": question_id,
                        "question": question["question"],
                        "question_type": question["question_type"],
                        # NOTE(review): "title" is filled with the question text, as in the
                        # previous version of this cell -- confirm whether the abstract's
                        # own title was intended.
                        "title": question["question"],
                        "adaptations": adapted_sentence,
                        # The abstract sentence that shares the key of the adapted sentence.
                        "abstracts": abstract_sentences[sentence_key],
                    })
    # A single DataFrame built from all records gets a unique 0..n-1 index.
    df = pd.DataFrame.from_records(records, columns=dataset_columns)
    df.to_csv(dataset_csv_path, index=False)
else:
    df = pd.read_csv(dataset_csv_path, header=0)

# Keep only the rows where both the abstract sentence and its adaptation are present
df = df.dropna(subset=["abstracts", "adaptations"])

df.head()

Unnamed: 0,question_id,question,question_type,title,adaptations,abstracts
0,1,What causes muscle spasm?,C,What causes muscle spasm?,Muscle cramps are a common problem represented...,Muscle cramps are a common problem characteriz...
1,1,What causes muscle spasm?,C,What causes muscle spasm?,"These true cramps, coming from nerves outside ...","These true cramps, which originate from periph..."
2,1,What causes muscle spasm?,C,What causes muscle spasm?,"Medical history, physical check-up, and lab sc...","Medical history, physical examination, and a l..."
3,1,What causes muscle spasm?,C,What causes muscle spasm?,"Despite their harmless nature, cramps are unco...","Despite the ""benign"" nature of cramps, many pa..."
4,1,What causes muscle spasm?,C,What causes muscle spasm?,Experience and limited medical studies guide t...,Treatment options are guided both by experienc...


In [None]:
# Split up dataset into train/val/test -> 70/15/15
# The three splits are cached as CSV files under DATA_PATH and reused when present.
split_names = ('train', 'val', 'test')
split_paths = {name: os.path.join(DATA_PATH, name + '.csv') for name in split_names}

if not os.path.exists(split_paths['train']):
    # Define test size (15%) and validation size (15%)
    test_size = 0.15
    val_size = 0.15
    train_size = 1 - test_size - val_size

    # Cast question_id on a copy so the DataFrame shown by the previous cell is not mutated.
    split_source = df.assign(question_id=df['question_id'].astype(str))

    # NOTE(review): rows are shuffled individually, and the same abstract sentence occurs
    # once per adaptation, so identical source sentences can end up in both train and
    # test. Consider a grouped split (e.g. by question_id) to avoid this leakage.
    train_val, test = train_test_split(split_source, test_size=test_size, random_state=42)
    # The validation share is rescaled because it is taken from the remaining 85%.
    train, val = train_test_split(train_val, test_size=val_size/(val_size+train_size), random_state=42)

    dfs = {'train': train, 'val': val, 'test': test}

    # Distinct loop names so the full DataFrame `df` is not overwritten by the last split.
    for split_name, split_frame in dfs.items():
        split_frame.to_csv(split_paths[split_name], index=False, encoding='utf-8-sig')
else:
    # Read question_id as text so both branches produce the same dtype.
    dfs = {
        name: pd.read_csv(split_paths[name], header=0, dtype={'question_id': str})
        for name in split_names
    }
    train, val, test = dfs['train'], dfs['val'], dfs['test']

# Show the distinct question ids only; one id per row would be thousands of values.
print("Train question IDs:", train['question_id'].unique().tolist())
print("Number of entries in test set:", len(test))
print("Number of entries in val set:", len(val))
print("Number of entries in train set:", len(train))

Train question IDs: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6

# Mistral Finetuning using unsloth (RUN ONLY IF YOU WANT TO FINETUNE)


In [None]:
## Prompt generation
# Template with two slots: the abstract sentence (input) and its plain-language
# adaptation (response).
prompt = """
### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every training example.
EOS_TOKEN = base_tokenizer.eos_token

def formatting_prompts_func(data):
    """Render a batch into training texts.

    `data` provides the parallel columns "abstracts" and "adaptations"; each pair
    is substituted into `prompt` and terminated with EOS_TOKEN (without it the
    generation would go on forever). Returns a dict with the single key "text".
    """
    sentence_pairs = zip(data["abstracts"], data["adaptations"])
    rendered = [
        prompt.format(abstract, adaptation) + EOS_TOKEN
        for abstract, adaptation in sentence_pairs
    ]
    return {"text": rendered}

from datasets import Dataset, concatenate_datasets

# The train and validation splits are concatenated into one training dataset,
# then every batch is rendered to prompt text.
train_part = Dataset.from_pandas(train)
val_part = Dataset.from_pandas(val)
dataset = concatenate_datasets([train_part, val_part])
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/7946 [00:00<?, ? examples/s]

In [None]:
# LoRA adapter settings, collected in one place before wrapping the base model.
lora_settings = {
    "r": 16,
    # Attention projections plus the MLP projections.
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    "lora_alpha": 16,
    "lora_dropout": 0,      # any value is supported, 0 is the optimized setting
    "bias": "none",         # any value is supported, "none" is the optimized setting
    "use_gradient_checkpointing": "unsloth",  # True or "unsloth" for very long context
    "random_state": 3407,
    "use_rslora": False,    # rank stabilized LoRA is available but not used
    "loftq_config": None,   # LoftQ is available but not used
}

# NOTE(review): the adapter is attached to `base_model`, which the inference section
# later uses as the "base" comparison -- confirm that object is still the plain
# base model after finetuning in the same session.
model = FastLanguageModel.get_peft_model(base_model, **lora_settings)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when it is supported, fp16 otherwise.
bf16_available = is_bfloat16_supported()

# Optimisation settings for a single full epoch
# (use max_steps = 60 instead of num_train_epochs for a short trial run).
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs = 1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,   # 2 x 4 = total batch size of 8
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    fp16 = not bf16_available,
    bf16 = bf16_available,
    logging_steps = 1,
)

# Supervised finetuning on the rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = base_tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,   # packing can make training 5x faster for short sequences
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/7946 [00:00<?, ? examples/s]

In [None]:
# Run the finetuning with the trainer built in the previous cell and keep the
# object returned by train() for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,946 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 993
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.472
2,2.6829
3,2.4071
4,1.9756
5,1.8166
6,1.4742
7,1.1539
8,1.1541
9,0.9877
10,0.9547


### Saving the model locally

In [None]:
# Save the finetuned model and its tokenizer into the directory that the
# inference section loads from (DATA_PATH/lora/lora_model_5_full).
lora_model_dir = os.path.join(DATA_PATH, "lora", "lora_model_5_full")

model.save_pretrained(lora_model_dir) # Local saving
# `base_tokenizer` is the tokenizer used for training; no variable named
# `tokenizer` exists yet at this point of a top-to-bottom run.
base_tokenizer.save_pretrained(lora_model_dir)

('/content/drive/MyDrive/BIO_NLP/lora/lora_model_5_full/tokenizer_config.json',
 '/content/drive/MyDrive/BIO_NLP/lora/lora_model_5_full/special_tokens_map.json',
 '/content/drive/MyDrive/BIO_NLP/lora/lora_model_5_full/tokenizer.model',
 '/content/drive/MyDrive/BIO_NLP/lora/lora_model_5_full/added_tokens.json')

## Inference

In [None]:
# Load the saved finetuned model. Be sure to have the lora/lora_model_5_full
# folder in DATA_PATH. Set LOAD_SAVED_ADAPTER to False to keep using the `model`
# produced by the finetuning section of the current session instead.
LOAD_SAVED_ADAPTER = True

if LOAD_SAVED_ADAPTER:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = os.path.join(DATA_PATH, "lora", "lora_model_5_full"), # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [None]:
def mistral_inference(model, tokenizer, input, max_new_tokens=64):
    """Generate a plain-language rewrite of one abstract sentence.

    Args:
        model: model to generate with (base or finetuned).
        tokenizer: tokenizer matching `model`.
        input: the abstract sentence to simplify.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded text that follows the "### Response:" marker, stripped of
        surrounding whitespace. If the marker is missing from the decoded
        output, the whole decoded text (stripped) is returned.
    """
    response_marker = "### Response:"

    # Kept identical to the template used in the finetuning section
    # (formatting_prompts_func): same wording, no indentation, and nothing after
    # the response slot, so generation starts where the training targets started.
    inference_prompt = (
        "\n"
        "### Instruction:\n"
        "You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.\n"
        "\n"
        "### Input:\n"
        "{}\n"
        "\n"
        "### Response:\n"
        "{}"
    )

    FastLanguageModel.for_inference(model)

    # The template has two slots; the response slot is left empty for the model
    # to complete. Formatting with a single argument raises IndexError.
    prompt_text = inference_prompt.format(input, "")
    inputs = tokenizer([prompt_text], return_tensors = "pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        use_cache = True,
        pad_token_id = tokenizer.eos_token_id,
    )

    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Extract the response part. partition() leaves `marker` empty when the
    # marker is absent, which avoids slicing from a bogus offset.
    _, marker, response = decoded_output.partition(response_marker)
    if not marker:
        return decoded_output.strip()
    return response.strip()


Desogestrel and cyproterone acetate had the highest risk estimates.


'Desogestrel and cyproterone acetate had the highest risk estimates.'

In [None]:
# Try the base model on a single abstract sentence; the returned string is shown
# as the cell output.
mistral_inference(base_model, base_tokenizer, "Desogestrel and cyproterone acetate had the highest risk estimates: 14·6 (9·7-21·9) and 32·6 (13·2-80·6) and 15·5 (9·7-24·9) and 44·4 (16·9-116·3) respectively.")

Desogestrel and cyproterone acetate had the highest risk estimates.


'Desogestrel and cyproterone acetate had the highest risk estimates.'

In [None]:
from tqdm import tqdm

# Generate one response per test abstract sentence, first with the base model
# and then with the finetuned model. Both lists follow the row order of
# dfs['test'], so they can be stored as columns afterwards.
test_abstracts = dfs['test']['abstracts']

base_model_finetune_responses = []
for abstract in tqdm(test_abstracts):
    base_model_finetune_responses.append(mistral_inference(base_model, base_tokenizer, abstract))

finetune_responses = []
for abstract in tqdm(test_abstracts):
    finetune_responses.append(mistral_inference(model, base_tokenizer, abstract))

In [None]:
# Attach the generated responses to the test split. The column names are the
# ones the evaluation section reads: 'base_model_finetune_responses' and
# 'finetuned_mistral_responses'.
dfs['test']['base_model_finetune_responses'] = base_model_finetune_responses
dfs['test']['finetuned_mistral_responses'] = finetune_responses

# Persist the enriched test split next to the other data files.
dfs['test'].to_csv(os.path.join(DATA_PATH, "test.csv"), index=False, encoding='utf-8-sig')

# Evaluation

In [None]:
# Use the in-memory test split when the earlier cells have run; otherwise read
# the copy that the inference section saved under DATA_PATH.
# `dfs` is a dict, so the split is looked up by key; only the two expected
# failures (no `dfs` at all, or no 'test' entry) trigger the fallback.
try:
    test = dfs['test']
except (NameError, KeyError):
    test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

# Rows with a missing model response cannot be scored; presumably these are empty
# generations that became NaN after a CSV round-trip. Only the response columns
# are checked, so rows are not dropped because of unrelated columns.
response_columns = [column for column in test.columns if column.endswith('_responses')]
if response_columns:
    test = test.dropna(subset=response_columns)

len(test)

1373


In [None]:
from evaluate import load
from tqdm import tqdm

# Load the SARI evaluation metric
sari = load("sari")

# Load the BLEU evaluation metric
bleu = load("bleu")

# Load the ROUGE evaluation metric
rouge = load('rouge')

tqdm.pandas()

# One entry per metric:
# (column prefix, printed label, metric object, key of the score in the
#  compute() result, whether compute() also takes the source sentence)
metric_specs = [
    ("sari", "SARI", sari, "sari", True),
    ("bleu", "BLEU", bleu, "bleu", False),
    ("rouge", "ROUGE", rouge, "rougeL", False),
]

# One entry per model: (column suffix, printed label, column holding its responses)
model_variants = [
    ("base", "Base", "base_model_finetune_responses"),
    ("finetuned", "Finetuned", "finetuned_mistral_responses"),
]


def score_rows(frame, metric, result_key, prediction_column, with_sources):
    """Score every row of `frame` separately and return the scores as a Series.

    Each row is scored on its own: the prediction is taken from
    `prediction_column`, the single reference from "adaptations", and, when
    `with_sources` is true, the source sentence from "abstracts".
    `result_key` selects the value to keep from the metric's result dict.
    """
    def score(row):
        compute_kwargs = {
            "references": [[row['adaptations']]],
            "predictions": [row[prediction_column]],
        }
        if with_sources:
            compute_kwargs["sources"] = [row['abstracts']]
        return metric.compute(**compute_kwargs)[result_key]

    return frame.progress_apply(score, axis=1)


# Adds the columns sari_base, sari_finetuned, bleu_base, bleu_finetuned,
# rouge_base and rouge_finetuned, in that order.
for column_prefix, _, metric, result_key, with_sources in metric_specs:
    for column_suffix, _, prediction_column in model_variants:
        test[f"{column_prefix}_{column_suffix}"] = score_rows(
            test, metric, result_key, prediction_column, with_sources
        )

# Separator between the three statistics of one summary line (a comma followed
# by seven spaces), matching the layout this cell has always printed.
summary_separator = ", " + " " * 6

for column_prefix, metric_label, _, _, _ in metric_specs:
    for column_suffix, variant_label, _ in model_variants:
        scores = test[f"{column_prefix}_{column_suffix}"]
        label = f"{metric_label} {variant_label}"
        print(
            f"Average {label}: {scores.mean()}"
            + summary_separator
            + f"Max {label}: {scores.max()}"
            + summary_separator
            + f"Min {label}: {scores.min()}"
        )

100%|██████████| 1345/1345 [05:20<00:00,  4.20it/s]
100%|██████████| 1345/1345 [06:30<00:00,  3.44it/s]

Average SARI Base: 45.95831370112784,       Max SARI Base: 94.74634298163708,       Min SARI Base: 3.03030303030303
Average SARI Finetuned: 50.50778739221534,       Max SARI Finetuned: 100.0,       Min SARI Finetuned: 9.37626331923324
Average BLEU Base: 0.18285801561409298,       Max BLEU Base: 0.9635749534339606,       Min BLEU Base: 0.0
Average BLEU Finetuned: 0.2809257388079612,       Max BLEU Finetuned: 1.0,       Min BLEU Finetuned: 0.0
Average ROUGE Base: 0.416878485870897,       Max ROUGE Base: 1.0,       Min ROUGE Base: 0.0
Average ROUGE Finetuned: 0.5490284189462421,       Max ROUGE Finetuned: 1.0,       Min ROUGE Finetuned: 0.0





In [None]:
# Export the test rows together with the per-row metric columns.
# NOTE(review): the relative path writes into the current working directory, not
# DATA_PATH where the other CSV files are stored -- confirm this is intended.
test.to_csv("eval.csv", index=False, encoding='utf-8-sig')

'She decided to leave the hospital on her own, but she will have to come back for a follow-up visit.  The input sentence is a medical term that means the patient decided to leave the hospital on their own, but they'