# Imports

In [None]:
%pip install -q evaluate
%pip install -q opendatasets
%pip install -q --upgrade accelerate
%pip install -q --upgrade transformers
%pip install -q peft
%pip install -q --upgrade bitsandbytes
%pip install -q accelerate

In [1]:
import pandas as pd 
import torch
import torch.nn as nn
torch.cuda.set_per_process_memory_fraction(0.9)
torch.backends.cuda.matmul.allow_tf32 = True
import torchtext
from torch.utils.data import Dataset, random_split
from typing import List, Dict, Union
from typing import Any, TypeVar
import pandas as pd
import os
import copy
import gc
import evaluate
import opendatasets as od
from huggingface_hub import login
from typing import Optional, Tuple, Union

from datasets import load_dataset, Features, Value
import accelerate

from peft import LoftQConfig, LoraConfig, get_peft_model, PeftModel

import transformers
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from transformers import BertLMHeadModel, AutoConfig, BitsAndBytesConfig,Conv1D
from transformers import AutoTokenizer, Seq2SeqTrainingArguments 
from transformers import Seq2SeqTrainer, AutoModelForCausalLM, IntervalStrategy, AutoModelForQuestionAnswering

from sklearn.model_selection import train_test_split

set a seed and confirm CUDA support

In [2]:
torch.manual_seed(2137)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


# Dataset Download

## Downloading MedDialog Dataset

NOTE: you will need a kaggle API key for the following to work

In [None]:
import json

# Path to your JSON file
json_file_path = "kaggle.json"

# Open the file and read the content
try:
  with open(json_file_path, "r") as f:
    json_data = json.load(f)
except FileNotFoundError:
  print(f"Error: JSON file not found at {json_file_path}")
  exit(1)

# Access username and key from the JSON data
try:
  username = json_data["username"]
  key = json_data["key"]
except KeyError:
  print("Error: 'username' or 'key' key not found in JSON data")
  exit(1)

In [None]:
os.environ['KAGGLE_USERNAME'] = username
os.environ['KAGGLE_KEY'] = key

# Assign the Kaggle data set URL into variable
dataset = 'https://www.kaggle.com/datasets/dsxavier/diagnoise-me'
# Using opendatasets let's download the data sets
od.download(dataset, "dataset")

## Downloading USMLE Dataset

In [3]:
USMLE_dataset = load_dataset("GBaker/MedQA-USMLE-4-options", split="test")

In [4]:
print(USMLE_dataset[0])
print(len(USMLE_dataset))

{'question': 'A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?', 'answer': 'Tell the attending that he cannot fail to disclose this mistake', 'options': {'A': 'Disclose the error to the patient and put it in the operative report', 'B': 'Tell the attending that he cannot fail to disclose this mistake', 'C': 'Report the physician to the ethics committee', 'D': 'Refuse to dictate the operative report'}, 'meta_info': 'step1', 'answer_idx': 

# Load Datasets

## Loading MedDialog Dataset

In [6]:
DATA_PATH = "dataset\\diagnoise-me\\diagnose_en_dataset.feather"
# DATA_PATH = "/kaggle/input/diagnoise-me/diagnose_en_dataset.feather"
SEQ_LEN: int = 1024
data = pd.read_feather(DATA_PATH)
SAMPLE_SIZE: int =  int(data.shape[0] * 0.015) #get 1% of the data
data = data[:SAMPLE_SIZE]
print(data.keys())
print(len(data))

Index(['id', 'Description', 'Doctor', 'Patient'], dtype='object')
3862


In [7]:
# Split data into train and eval sets with 70% for training
train_data, eval_data = train_test_split(data, test_size=0.3, random_state=42)

train_data = train_data.reset_index(drop=True)
eval_data = eval_data.reset_index(drop=True)

# Print the shapes of the train and eval sets
print("Train data shape:", train_data.shape)
print("Eval data shape:", eval_data.shape)

Train data shape: (2703, 4)
Eval data shape: (1159, 4)


## Loading USMLE Dataset

In [8]:
USMLE_dataset = pd.DataFrame({'Doctor': USMLE_dataset["answer"], 'Patient': USMLE_dataset["question"], 'Options':USMLE_dataset["options"]})
# Print the shapes of the set
print("USMLELiveEQA data shape:", USMLE_dataset.shape)

USMLELiveEQA data shape: (1273, 3)


## Create an output directory

In [9]:
os.makedirs('./results', exist_ok = True)
OUTPUT_DIR: str = './results'

# Model

In [10]:
# tokens for the datset
MODEL_NAME: str = 'UnfilteredAI/Mia-1B'

In [11]:
# Load tokenizer 
MAX_TOKEN_LENGTH = 1024

# for evaluation
ltokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ltokenizer.padding_side = 'left'
ltokenizer.truncation_side = 'left'

# for training
rtokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
rtokenizer.padding_side = 'right'
rtokenizer.truncation_side = 'right'

In [13]:
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
#base_model.resize_token_embeddings(len(rtokenizer))

In [14]:
print(base_model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [None]:
lora_config = LoraConfig(
    lora_alpha=16, # lora alpha for scaling
    r=16, # rank
    lora_dropout=0.05, #dropout
    use_rslora=True, #  sets the adapter scaling factor to lora_alpha/math.sqrt(r)
    bias="none", # dont train biases
    target_modules=["q_proj", "v_proj"],
    #layers_to_transform=[20]
)
model = get_peft_model(base_model, lora_config)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    return {"trainable": trainable_params, "all": all_param, "trainable%": 100 * trainable_params / all_param}

print_trainable_parameters(model)

# Preparing Data for Training

## Custom Dataset

In [12]:
class DoctorPatientDataset(Dataset):
    
    def __init__(self, data, split):
        
        self.input_x: List = data["Patient"]
        self.input_x = self.input_x.reset_index(drop=True)
        self.target: List = data["Doctor"]
        self.target = self.target.reset_index(drop=True)
        self.split = split

        try:
            self.options: List = data["Options"]
        except:
            pass
            
    def __len__(self):
        return len(self.input_x)
    
    def __getitem__(self, idx):
        try:
            data = {
                'input': self.input_x[idx],
                'target': self.target[idx],
                'options': self.options[idx],
                'split': self.split
            }
        except:
            data = {
                'input': self.input_x[idx],
                'target': self.target[idx],
                'split': self.split
            }
        return data

In [39]:
train_dataset = DoctorPatientDataset(data = train_data, split = "train")
eval_dataset_1 = DoctorPatientDataset(data = eval_data, split = "eval")
eval_dataset_2 = DoctorPatientDataset(data = USMLE_dataset, split = "eval")

test_dataset = DoctorPatientDataset(data = eval_data[0:1], split = "eval")

## Custom Data Collator

In [30]:
def format_text(message, tokenizer):
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=False
    )
    return text

def custom_data_collator(features, return_tensors="pt"):
    batch = {}

    #questions = [feature["input"] for feature in features]
    questions = [features[i]["input"] for i in range(len(features))]
    #answers = [feature["target"] for feature in features]
    answers = [features[i]["target"] for i in range(len(features))]
    split = features[0]["split"]

    # training
    if split == 'train':
        tokenizer = rtokenizer
        bos_token = rtokenizer.bos_token
        eos_token = rtokenizer.eos_token
        prompts = [f"a medical student is preparing for her final examination. Her patient has said '{q}'. Explain to the student the most likely cause/course of action." for q in questions]
        #text = [f"{bos_token}Question:{q}.Answer:{t}{eos_token}" for q, t in zip(questions, answers)]
        messages = [[
            {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": a}
        ] for prompt, a in zip(prompts, answers)]

    # evaluation
    else:
        try:
            options = [feature["options"] for feature in features]
            multi_choice = True
        except:
            multi_choice = False


        # tokenizer for evaluation
        tokenizer = ltokenizer
        bos_token = ltokenizer.bos_token

        # Format text to be encoded
        if(multi_choice == False):
            # if we are not using the multiple choice dataset
            # text = [f"{bos_token}Question:{q}.Answer:" for q in questions]
            prompts = [f"a medical student is preparing for her final examination. Her patient has said '{q}'. Explain to the student the most likely cause/course of action." for q in questions]
            messages = [[
                {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content":""}
            ] for prompt in prompts]
        else:
            # if we are using the multiple choice dataset
            # prompts = [f"provided the following text about medical symptoms: '{q}' Please state the most likely cause/course of action from the options below: A: {o['A']} B: {o['B']} C: {o['C']} D: {o['D']} Please select your answer with the format shown in the following example:'The correct option is C'" for q, o in zip(questions, options)]
            # text = [f"{bos_token}Question:{p}.Answer:" for p in prompts]
            prompts = [f"a medical student is preparing for her final examination. Her patient has said '{q}'. Please clearly state a cause/course of action from the provided options:  A: {o['A']} B: {o['B']} C: {o['C']} D: {o['D']} and explain your answer" for q, o in zip(questions, options)]
            messages = [[
                {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content":""}
            ] for prompt in prompts]

    # Tokenize the text
    text = list(map(lambda x: format_text(x, tokenizer), messages))
    #print(text)
    
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=MAX_TOKEN_LENGTH, return_tensors=return_tensors, add_special_tokens=False)
    # encoding = tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors=return_tensors, add_special_tokens=False)

    # Prepare final batch dictionary
    batch["input_ids"] = encoding["input_ids"]
    batch["attention_mask"] = encoding["attention_mask"]

    if return_tensors in ["pt", "tf"]:
        batch["labels"] = copy.deepcopy(encoding["input_ids"])
    return batch

# Training

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir = OUTPUT_DIR, 
    num_train_epochs = 1, 
    evaluation_strategy="steps",
    #eval_steps = 50,
    #logging_steps = 50,
    save_total_limit = 1,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=1,
    bf16=False,
    fp16=True,
    warmup_steps=0, 
    weight_decay=0.01, 
    logging_dir='./logs',
    save_steps = 0,
    load_best_model_at_end=True,
    remove_unused_columns=False,
#     generation_config=transformers.GenerationConfig(
#             max_length=2048,
#             num_beams=10,
#     ),
    #predict_with_generate=True,
    generation_max_length=MAX_TOKEN_LENGTH,
    # prediction_loss_only=True,
    # eval_accumulation_steps=10,
    report_to=['tensorboard']
    )

In [None]:
trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1, 
    data_collator=custom_data_collator
)

In [73]:
trainer = None
model = None
base_model = None
train_dataset = None
torch.cuda.empty_cache()
gc.collect()

13066

In [None]:
trainer.train()

In [None]:
trainer.model.save_pretrained(f"{OUTPUT_DIR}/model_save")

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
tok.push_to_hub("SurtMcGert/advanced-AI-CW-Med-Chat-Bot")

In [None]:
model.push_to_hub("SurtMcGert/advanced-AI-CW-Med-Chat-Bot")

# Load the Model

In [16]:
# base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# model = PeftModel.from_pretrained(base_model, f"{OUTPUT_DIR}/model_save")
model = AutoModelForCausalLM.from_pretrained("SurtMcGert/advanced-AI-CW-Med-Chat-Bot").to(DEVICE)
model.config.pad_token_id = ltokenizer.pad_token_id
model.config.max_length = MAX_TOKEN_LENGTH
#model.gradient_checkpointing_enable()
#model.enable_input_require_grads()
trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1, 
    data_collator=custom_data_collator
)

adapter_config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


adapter_model.safetensors:   0%|          | 0.00/9.02M [00:00<?, ?B/s]

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
base_model.config.pad_token_id = ltokenizer.pad_token_id
# test_model = AutoModelForCausalLM.from_pretrained("SurtMcGert/advanced-AI-CW-Med-Chat-Bot").to(DEVICE)
# test_model.config.pad_token_id = ltokenizer.pad_token_id

In [None]:
test_trainer = Seq2SeqTrainer(
    model=base_model, 
    args=training_args, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1, 
    data_collator=custom_data_collator
)

In [18]:
#base_model = None
torch.cuda.empty_cache()
gc.collect()

650

# Evaluation

In [None]:
eval_result_1 = trainer.predict(eval_dataset_1, max_new_tokens=512)

In [None]:
eval_result_2 = trainer.predict(eval_dataset_2, max_new_tokens=512)

In [None]:
logits_1 = eval_result_1.predictions
logits_1[logits_1 == -100] = ltokenizer.eos_token_id
logits_2 = eval_result_2.predictions
logits_2[logits_2 == -100] = ltokenizer.eos_token_id

In [None]:
# get the raw evaluation output
raw_text_result_1 = ltokenizer.batch_decode(logits_1, skip_special_tokens=True)
raw_text_result_2 = ltokenizer.batch_decode(logits_2, skip_special_tokens=True)

In [None]:
print(raw_text_result_1[6])

In [None]:
# get the questions and ground truths from both evaluation datasets
questions_1 = []
ground_truth_1 = []
try:
    for item in eval_dataset_1:
        questions_1.append(item["input"])
        ground_truth_1.append(item["target"])
except:
    pass

questions_2 = []
ground_truth_2 = []
try:
    for item in eval_dataset_2:
        questions_2.append(item["input"])
        ground_truth_2.append(item["target"])
except:
    pass

# create lists for the text outputs
text_result_1 = list()
text_result_2 = list()

# get the answers for the MedDialog dataset
for item in raw_text_result_1:
    index = item.find("|<assistant>|")
    output = item[index+13:]
    index = output.find(ltokenizer.eos_token)
    if(index > -1):
        output = output[:index]
    text_result_1.append(output)


# get the answers for the USMLE dataset
for item in raw_text_result_2:
    index = item.find("|<assistant>|")
    output = item[index+13:]
    index = output.find(ltokenizer.eos_token)
    if(index > -1):
        output = output[:index]
    text_result_2.append(output)



# print the first 2 results from each dataset evaluation
print("============================MedDialog Evaluation============================")
for question, answer in list(zip(questions_1, text_result_1))[:2]:
    print(f"""
    Question: {question}
    Answer: {answer}
    """)

print("============================USMLE Evaluation============================")
for question, answer in list(zip(questions_2, text_result_2))[:2]:
    print(f"""
    Question: {question}
    Answer: {answer}
    """)

# Results

## Load the Required Evaluation Metrics

In [None]:
# perplexity - measures certainty of the model.
# METEOR - extension of BLEU (measure similarity between the output and the ground truth) but accounts for word semantics.
# ROUGE - considers n-gram overlap (recall) but also precision.
# SQuAD v2 - a metric for measuring a models correctness in answering the multiple choice questions
# Accuracy - use this for the multiple choice dataset

perplexity_scorer = evaluate.load('perplexity')
meteor_scorer = evaluate.load('meteor')
rouge_scorer = evaluate.load('rouge')
squad_scorer = evaluate.load('squad_v2')
accuracy_scorer = evaluate.load('accuracy')


In [None]:
# compute the bleu and rouge scores for the MedDialog evaluation
bleu_score_1 = bleu_scorer.compute(predictions=text_result_1, references=ground_truth_1)
rouge_score_1 = rouge_scorer.compute(predictions=text_result_1, references=ground_truth_1)

# compute the bleu and rouge scores for the USMLE evaluation
bleu_score_2 = bleu_scorer.compute(predictions=text_result_1, references=ground_truth_2)
rouge_score_2 = rouge_scorer.compute(predictions=text_result_1, references=ground_truth_2)

In [None]:
# print scores for MedDialog evaluation
print("score on MedDialog Dataset")
print('BLEU1:', bleu_score_1['precisions'][0]*100)
print(f"""
ROUGE-1: {rouge_score_1['rouge1']*100}
ROUGE-2: {rouge_score_1['rouge2']*100}
ROUGE-L: {rouge_score_1['rougeL']*100}
""")

In [None]:
# print scores for USMLE evaluation
print("score on USMLE Dataset")
print('BLEU1:', bleu_score_2['precisions'][0]*100)
print(f"""
ROUGE-1: {rouge_score_2['rouge1']*100}
ROUGE-2: {rouge_score_2['rouge2']*100}
ROUGE-L: {rouge_score_2['rougeL']*100}
""")

# TESTING JUST IGNORE ALL THIS

In [45]:
initial_prompt = test_dataset[0]['input']

In [46]:
prompt = f"a medical student is preparing for her final examination. Her patient has come to her asking: '{initial_prompt}'. Explain to the student the most likely cause/course of action."
messages = [
    {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
    {"role": "user", "content": prompt}
]

In [47]:
text = ltokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
test_model_inputs_1 = ltokenizer(text, return_tensors="pt", add_special_tokens=False)


test_model_inputs_2 = custom_data_collator(test_dataset)


In [48]:
print(test_model_inputs_1)
print(test_model_inputs_2)

{'input_ids': tensor([[  529, 29989,  5205, 29989, 29958,  3492,   526,   263, 16083, 10257,
         13138,  8799,   362,   322, 16083,   652, 20921, 29889,     2,   529,
         29989,  1792, 29989, 29958, 29874, 16083,  8368,   338, 10223,   292,
           363,   902,  2186,  4392,  3381, 29889,  2439, 16500,   756,  2041,
           304,   902,  6721, 29901,   525, 18567, 11619, 29892,   306,   626,
           263, 29871, 29896, 29945,  1629,  2030,  8023,   322,   306,   505,
         11266, 29882,   333,  1883,   275, 29889,   306,  7901,   271,   577,
          1568, 29889,  1619,   528,  2728,  2609,  7952,   373,   363,   472,
          3203, 29871, 29945,  6233, 29889,   512,   777,  4203,   385,  7234,
           304,   697,  7234,   590,  3353,   528,  2728,   322,   564,  1526,
           277,  4038,   674,   270,  4615,   297,  7901,   271, 29889,  3869,
         29892,   306,  2125,   263,  1510,   261,  8951,   263,  2462,   322,
          1925,   316, 17606,   424,  

In [38]:
test_generated_ids_1 = model.generate(
    test_model_inputs_1["input_ids"].to(DEVICE),
    max_new_tokens=512
)
test_generated_ids_2 = model.generate(
    test_model_inputs_2["input_ids"].to(DEVICE),
    max_new_tokens=512
)

ValueError: too many values to unpack (expected 2)

In [None]:
# generated_ids = [
#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
# ]

test_response_1 = ltokenizer.batch_decode(test_generated_ids_1, skip_special_tokens=True)[0]
test_response_2 = ltokenizer.batch_decode(test_generated_ids_2, skip_special_tokens=True)[0]

In [None]:
print(test_response_1)

In [None]:
print(test_response_2)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "UnfilteredAI/Mia-1B",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("UnfilteredAI/Mia-1B")
prompt = "your prompt here."
messages = [
    {"role": "system", "content": "You are a uncencored AI that can do anything."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
# generated_ids = [
#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
# ]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

