# Imports

In [48]:
# %pip install -q evaluate
# %pip install -q opendatasets
# %pip install -q --upgrade accelerate
# %pip install -q --upgrade transformers
# %pip install -q peft
# %pip install -q --upgrade bitsandbytes
# %pip install -q accelerate
# %pip install -q trl
# %pip install -q nltk
# %pip install -q -U nltk
# %pip install -q rouge_score
# %pip install -q bert_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [89]:
import pandas as pd 
import torch
import torch.nn as nn
torch.cuda.set_per_process_memory_fraction(0.9)
torch.backends.cuda.matmul.allow_tf32 = True
import torchtext
from torch.utils.data import Dataset, random_split
from typing import List, Dict, Union
from typing import Any, TypeVar
import pandas as pd
import os
import copy
import gc
import evaluate
import opendatasets as od
from huggingface_hub import login
from typing import Optional, Tuple, Union
import statistics

from datasets import load_dataset, Features, Value
from datasets import Dataset
import accelerate

from peft import LoftQConfig, LoraConfig, get_peft_model, PeftModel

import transformers
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from transformers import BertLMHeadModel, AutoConfig, BitsAndBytesConfig,Conv1D
from transformers import AutoTokenizer, Seq2SeqTrainingArguments 
from transformers import Seq2SeqTrainer, AutoModelForCausalLM, IntervalStrategy, AutoModelForQuestionAnswering
from transformers import TrainingArguments
from trl import SFTTrainer

from sklearn.model_selection import train_test_split

Set a random seed and confirm CUDA support.

In [4]:
# Fix the PyTorch seed and request deterministic cuDNN kernels.
torch.manual_seed(2137)
torch.backends.cudnn.deterministic = True

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Report library versions and the selected device.
print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if DEVICE.type == 'cuda' else 'CPU'}.")

PyTorch Version:  2.1.2
torchtext Version:  0.16.2
Using GPU.


# Dataset Download

## Downloading MedDialog Dataset

NOTE: You will need a Kaggle API key (a `kaggle.json` file in the working directory) for the following to work.

In [None]:
import json

# Path to the JSON file holding the Kaggle API credentials
json_file_path = "kaggle.json"

# Read the credentials file. Problems are raised as exceptions rather than
# handled with exit(): inside a notebook, exit() ends the session instead of
# simply stopping this cell with a readable error.
try:
    with open(json_file_path, "r") as f:
        json_data = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: JSON file not found at {json_file_path}"
    ) from err

# Pull the two fields needed to authenticate against Kaggle
try:
    username = json_data["username"]
    key = json_data["key"]
except KeyError as err:
    raise KeyError("Error: 'username' or 'key' key not found in JSON data") from err

In [None]:
# Publish the Kaggle credentials as environment variables
# (presumably picked up by opendatasets during the download — verify).
os.environ.update({'KAGGLE_USERNAME': username, 'KAGGLE_KEY': key})

# URL of the Kaggle dataset to fetch
dataset = 'https://www.kaggle.com/datasets/dsxavier/diagnoise-me'
# Download it into the local "dataset" directory with opendatasets
od.download(dataset, "dataset")

## Downloading USMLE Dataset

In [5]:
USMLE_dataset = load_dataset("GBaker/MedQA-USMLE-4-options", split="test")

In [6]:
print(USMLE_dataset[0])
print(len(USMLE_dataset))

{'question': 'A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?', 'answer': 'Tell the attending that he cannot fail to disclose this mistake', 'options': {'A': 'Disclose the error to the patient and put it in the operative report', 'B': 'Tell the attending that he cannot fail to disclose this mistake', 'C': 'Report the physician to the ethics committee', 'D': 'Refuse to dictate the operative report'}, 'meta_info': 'step1', 'answer_idx': 

# Load Datasets

## Loading MedDialog Dataset

In [7]:
# Treat the presence of either Kaggle environment variable as "running on Kaggle".
is_kaggle = (
    "KAGGLE_CLOUD" in os.environ or "KAGGLE_KERNEL_RUN_TYPE" in os.environ
)
if is_kaggle:
    DATA_PATH = "/kaggle/input/diagnoise-me/diagnose_en_dataset.feather"
else:
    # Built with os.path.join so the local path is valid on every OS
    # (the previous hard-coded backslashes only worked on Windows).
    DATA_PATH = os.path.join("dataset", "diagnoise-me", "diagnose_en_dataset.feather")

SEQ_LEN: int = 1024
data = pd.read_feather(DATA_PATH)
# Keep only the first 1.5% of the rows to keep training/evaluation fast
SAMPLE_SIZE: int = int(data.shape[0] * 0.015)
data = data[:SAMPLE_SIZE]
print(data.keys())
print(len(data))

Index(['id', 'Description', 'Doctor', 'Patient'], dtype='object')
3862


In [8]:
# Hold out 30% of the sampled rows for evaluation (70% train) and give both
# parts a fresh 0..n-1 index.
train_data, eval_data = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.3, random_state=42)
)

# Report the size of each part
print("Train data shape:", train_data.shape)
print("Eval data shape:", eval_data.shape)

Train data shape: (2703, 4)
Eval data shape: (1159, 4)


## Loading USMLE Dataset

In [9]:
# Convert the USMLE split to the same column layout as MedDialog
# ('Patient' = question, 'Doctor' = answer) plus the multiple-choice 'Options'.
# The name USMLE_dataset is rebound to the DataFrame, so the conversion is
# guarded: re-running this cell must not look up "answer" on the DataFrame.
if not isinstance(USMLE_dataset, pd.DataFrame):
    USMLE_dataset = pd.DataFrame({'Doctor': USMLE_dataset["answer"], 'Patient': USMLE_dataset["question"], 'Options':USMLE_dataset["options"]})
# Print the shape of the set
print("USMLELiveEQA data shape:", USMLE_dataset.shape)

USMLELiveEQA data shape: (1273, 3)


## Create an output directory

In [10]:
# Directory used as the trainers' output location; created if it is missing.
OUTPUT_DIR: str = './results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Model

In [11]:
# Hub identifier of the pretrained model; the tokenizers and the base model are loaded from it
MODEL_NAME: str = 'UnfilteredAI/Mia-1B'

In [12]:
# Maximum number of tokens per sequence
MAX_TOKEN_LENGTH = 1024

# Evaluation tokenizer: pads and truncates on the left
ltokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ltokenizer.padding_side = ltokenizer.truncation_side = 'left'

# Training tokenizer: pads and truncates on the right
rtokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
rtokenizer.padding_side = rtokenizer.truncation_side = 'right'

In [18]:
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
#base_model.resize_token_embeddings(len(rtokenizer))

In [None]:
print(base_model)

In [None]:
# LoRA adapter configuration used for fine-tuning.
lora_config = LoraConfig(
    lora_alpha=16, # LoRA alpha, the numerator of the adapter scaling factor
    r=16, # rank of the low-rank update matrices
    lora_dropout=0.05, # dropout applied inside the LoRA layers
    use_rslora=True, # rank-stabilized LoRA: scaling factor becomes lora_alpha/math.sqrt(r)
    bias="none", # do not train bias terms
    target_modules=["q_proj", "v_proj"], # only modules with these names receive adapters
    task_type="CAUSAL_LM",
    #layers_to_transform=[20]
)
# Manual PEFT wrapping, currently disabled (no `model` variable is created here):
# model = get_peft_model(base_model, lora_config)
# model.gradient_checkpointing_enable()
# model.enable_input_require_grads()

In [None]:
# def print_trainable_parameters(model):
#     """
#     Prints the number of trainable parameters in the model.
#     """
#     trainable_params = 0
#     all_param = 0
#     for _, param in model.named_parameters():
#         all_param += param.numel()
#         if param.requires_grad:
#             trainable_params += param.numel()
#     print(
#         f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
#     )
#     return {"trainable": trainable_params, "all": all_param, "trainable%": 100 * trainable_params / all_param}

# print_trainable_parameters(model)

# Preparing Data for Training

## Custom Dataset

In [13]:
# class DoctorPatientDataset(Dataset):
    
#     def __init__(self, data, split):
        
#         self.input_x: List = data["Patient"]
#         self.input_x = self.input_x.reset_index(drop=True)
#         self.target: List = data["Doctor"]
#         self.target = self.target.reset_index(drop=True)
#         self.split = split

#         try:
#             self.options: List = data["Options"]
#         except:
#             pass
            
#     def __len__(self):
#         return len(self.input_x)
    
#     def __getitem__(self, idx):
#         try:
#             data = {
#                 'input': self.input_x[idx],
#                 'target': self.target[idx],
#                 'options': self.options[idx],
#                 'split': self.split
#             }
#         except:
#             data = {
#                 'input': self.input_x[idx],
#                 'target': self.target[idx],
#                 'split': self.split
#             }
#         return data

# class DoctorPatientDataset(Dataset):
    
#     def __init__(self, data, split):
        
#         self.input_x: List = data["Patient"]
#         self.input_x = self.input_x.reset_index(drop=True)
#         self.target: List = data["Doctor"]
#         self.target = self.target.reset_index(drop=True)
#         self.split = split

#         try:
#             self.options: List = data["Options"]
#         except:
#             pass
            
#     def __len__(self):
#         return len(self.input_x)
    
#     def __getitem__(self, idx):
#         try:
#             data = {
#                 'messages': [
#                     {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
#                     {"role": "user", "content": f"{self.input_x[idx]}, choose from A) {self.options[idx]['A']}, B) {self.options[idx]['B']}, C) {self.options[idx]['C']}, D)  {self.options[idx]['D']}"},
#                     {"role": "assistant", "content": self.target[idx]}
#                 ]
#             }
#         except:
#             data = {
#                 'messages': [
#                     {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
#                     {"role": "user", "content":{self.input_x[idx]}},
#                     {"role": "assistant", "content": self.target[idx]}
#                 ]
#             }
#         return data

def build_dataset(data):
    """Convert question/answer rows into a chat-formatted ``Dataset``.

    Args:
        data: Column-indexable table (e.g. a pandas DataFrame) with a
            "Patient" column (the question) and a "Doctor" column (the
            answer). An optional "Options" column holds, per row, a mapping
            with keys "A", "B", "C" and "D"; when present the choices are
            appended to the user message.

    Returns:
        A ``Dataset`` with a single "messages" column. Each entry is a list of
        three chat messages: system, user and assistant.

    Raises:
        KeyError: if "Patient" or "Doctor" is missing, or if an options entry
            lacks one of the keys "A"-"D".
    """
    system_prompt = "You are a medical professional providing consultation and medical diagnostics."
    patients = data["Patient"]
    doctors = data["Doctor"]

    # Only a missing "Options" column selects the plain-question format. Any
    # other problem (e.g. a malformed options entry) surfaces as an error
    # instead of silently dropping the choices for the whole dataset.
    try:
        options_column = data["Options"]
    except KeyError:
        options_column = None

    if options_column is None:
        user_prompts = list(patients)
    else:
        user_prompts = [
            f"{patient}, choose from A) {options['A']}, B) {options['B']}, C) {options['C']}, D)  {options['D']}"
            for patient, options in zip(patients, options_column)
        ]

    conversations = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": doctor},
        ]
        for user_prompt, doctor in zip(user_prompts, doctors)
    ]
    return Dataset.from_dict({"messages": conversations})
                

In [14]:
# train_dataset = DoctorPatientDataset(data = train_data, split = "train")
# eval_dataset_1 = DoctorPatientDataset(data = eval_data, split = "eval")
# eval_dataset_2 = DoctorPatientDataset(data = USMLE_dataset, split = "eval")

# test_dataset = DoctorPatientDataset(data = eval_data[1:2], split = "eval")

# test_data = [["what's the answer to life, the universe, and everything", "42"]]
# test_data = pd.DataFrame(test_data, columns=["Patient", "Doctor"])
# test_train_dataset = DoctorPatientDataset(data = test_data, split = "train")

# Fine-tuning set: every MedDialog training row.
train_dataset = build_dataset(train_data)
# Evaluation sets: only the first 10 rows of each source are used.
eval_dataset_1 = build_dataset(eval_data[0:10])
eval_dataset_2 = build_dataset(USMLE_dataset[0:10])

# Single-example dataset for the manual test cells.
# NOTE(review): this row comes from train_data, not eval_data — confirm that is intended.
test_dataset = build_dataset(train_data[0:1])

# Alternative toy example, currently disabled:
# test_data = [["what's the answer to life, the universe, and everything", "42"]]
# test_data = pd.DataFrame(test_data, columns=["Patient", "Doctor"])
# Built from the same first training row as test_dataset.
test_train_dataset = build_dataset(train_data[0:1])

In [None]:
print(test_train_dataset[0])

## Custom Data Collator

In [15]:
def format_text(message, tokenizer, add_generation_prompt):
    """Render a list of chat messages as one prompt string.

    Args:
        message: The chat messages to render.
        tokenizer: Object exposing ``apply_chat_template``.
        add_generation_prompt: Forwarded unchanged to the chat template.

    Returns:
        Whatever ``apply_chat_template`` returns with ``tokenize=False``.
    """
    return tokenizer.apply_chat_template(
        message, tokenize=False, add_generation_prompt=add_generation_prompt
    )

def custom_data_collator(features, return_tensors="pt"):
    """Collate chat examples into a padded batch of generation prompts.

    Only the first two messages of every example are kept, so the final
    (answer) message never reaches the model. The prompt is rendered with the
    generation prompt appended and tokenized with ``ltokenizer``.

    Args:
        features: Iterable of examples, each with a "messages" list.
        return_tensors: Tensor format forwarded to the tokenizer.

    Returns:
        dict with "input_ids" and "attention_mask". No "labels" are produced.
    """
    tokenizer = ltokenizer

    prompts = [
        format_text(feature['messages'][0:2], tokenizer, True)
        for feature in features
    ]

    # truncation=True is required for max_length to take effect; without it
    # over-long prompts were passed through at full length.
    encoding = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        max_length=MAX_TOKEN_LENGTH,
        return_tensors=return_tensors,
        add_special_tokens=True,
    )

    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
    }

# Training

In [None]:
# Hyper-parameters for the fine-tuning run (a single epoch, fp16).
training_args = TrainingArguments(
    output_dir = OUTPUT_DIR, 
    num_train_epochs = 1, 
    evaluation_strategy="steps",  # evaluate every N steps; eval_steps is left at its default
    #eval_steps = 50,
    #logging_steps = 50,
    save_total_limit = 1,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=1,
    bf16=False,
    fp16=True,
    warmup_steps=0, 
    weight_decay=0.01, 
    logging_dir='./logs',
    # NOTE(review): save_steps = 0 together with load_best_model_at_end=True looks
    # contradictory (no checkpoints to pick a best model from) — confirm intent.
    save_steps = 0,
    load_best_model_at_end=True,
    eval_accumulation_steps=10,
    report_to=['tensorboard']
    )

In [None]:
# def formatting_prompts_func(inputs):
#     for input in inputs:
#         try:
#                 data = {
#                     'messages': [
#                         {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
#                         {"role": "user", "content": f"{input['Patient']}, choose from A) {input['Options']['A']}, B) {input['Options']['B']}, C) {input['Options']['C']}, D)  {input['Options']['D']}"},
#                         {"role": "assistant", "content": input["Doctor"]}
#                     ]
#                 }
#         except:
#                 data = {
#                     'messages': [
#                         {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
#                         {"role": "user", "content":input["Patient"]},
#                         {"role": "assistant", "content": input["Doctor"]}
#                     ]
#                 }
#         return data

In [None]:
# Supervised fine-tuning with TRL's SFTTrainer. The LoRA configuration is
# passed as `peft_config`, so no manual PEFT wrapping is done in this notebook.
trainer = SFTTrainer(
    model=base_model,
    args=training_args,
    peft_config=lora_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1,
    # Reuse the notebook-wide token budget instead of repeating the literal 1024
    max_seq_length=MAX_TOKEN_LENGTH,
    #data_collator=custom_data_collator,
    #dataset_text_field="messages",
    packing=False
)
trainer.model.gradient_checkpointing_enable()
trainer.model.enable_input_require_grads()

In [None]:
# No `model` variable exists at this point on a fresh kernel (the manual PEFT
# wrapping is commented out), so configure the model held by the trainer.
trainer.model.config.pad_token_id = ltokenizer.pad_token_id

In [None]:
# Inspect the model held by the trainer; a bare `model` is not defined yet on a fresh kernel.
print(trainer.model)

In [None]:
# Manual memory reset. Set the flag to True only when you want to throw away
# the trainer and models to free memory. It must stay False for a top-to-bottom
# run: the cells below still call trainer.train() and reuse base_model.
RESET_TRAINING_STATE = False
if RESET_TRAINING_STATE:
    trainer = None
    model = None
    base_model = None
    # Collect the dropped objects first so their CUDA memory can be released
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
trainer.train()

In [None]:
trainer.model.save_pretrained(f"{OUTPUT_DIR}/model_save")

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
tok.push_to_hub("SurtMcGert/advanced-AI-CW-Med-Chat-Bot")

In [None]:
trainer.model.push_to_hub("SurtMcGert/advanced-AI-CW-Med-Chat-Bot")

# Load the Model

In [268]:
# Arguments for the generation-based evaluation run (predict_with_generate=True).
eval_args = Seq2SeqTrainingArguments(
    output_dir = OUTPUT_DIR, 
    num_train_epochs = 1, 
    evaluation_strategy="steps",
    save_total_limit = 1,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=1,
    bf16=False,
    fp16=True,
    warmup_steps=0, 
    weight_decay=0.01, 
    logging_dir='./logs',
    save_steps = 0,
    load_best_model_at_end=True,
    remove_unused_columns=False,  # keep the 'messages' column so the custom collator can read it
    generation_config=transformers.GenerationConfig(
            max_new_tokens=100, # the max number of new tokens to generate
            early_stopping=True, # stop beam search as soon as num_beams complete candidates exist
            repetition_penalty = 1.5, # adds a penalty for repetition (1.0 means no penalty)
            num_beams=25, # number of beams
            num_beam_groups=1, # number of beam groups (1 = no group beam search)
            do_sample=True, # sample tokens; combined with num_beams > 1 this is beam-search sampling
            temperature=0.5, # modulates the next token probabilities
            diversity_penalty=0.0, # penalty for repeating tokens across beam groups; 0.0 disables it
            encoder_repetition_penalty=0.01, # exponential penalty on sequences that are not in the original input (1.0 means no penalty)
            no_repeat_ngram_size=5, # all ngrams of this size can only occur once
            # guidance_scale = 1, # Higher guidance scale encourages the model to generate samples that are more closely linked to the input prompt, usually at the expense of poorer quality.
            # length_penalty=-5 # promotes shorter token sequences
    ),
    predict_with_generate=True,
    eval_accumulation_steps=10,
    report_to=['none']
    )

In [269]:
# Load a fresh copy of the base model before attaching the published adapter.
# The `base_model` created earlier was handed to SFTTrainer together with a
# PEFT config (so it may already carry LoRA layers) and the reset cell can set
# it to None; neither is a safe starting point here.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Alternative: load the locally saved adapter instead of the Hub revision.
# model = PeftModel.from_pretrained(base_model, f"{OUTPUT_DIR}/model_save")
model = PeftModel.from_pretrained(base_model, "SurtMcGert/advanced-AI-CW-Med-Chat-Bot", revision='0752cf1a926ea913cc2061679627cc779d68c20f', force_download=True)
model.config.pad_token_id = ltokenizer.pad_token_id
model.config.max_length = MAX_TOKEN_LENGTH
# The Seq2SeqTrainer is used only as a batched prediction harness:
# custom_data_collator supplies prompts without labels.
evaluator = Seq2SeqTrainer(
    model=model,
    args=eval_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1,
    data_collator=custom_data_collator
)

In [270]:
# model = None
# evaluator = None
torch.cuda.empty_cache()
gc.collect()

81

# Evaluation

In [271]:
eval_result_1 = evaluator.predict(eval_dataset_1)
#eval_result_1 = evaluator.predict(test_dataset)



In [113]:
eval_result_2 = evaluator.predict(eval_dataset_2)
#eval_result_2 = evaluator.predict(test_dataset)

In [272]:
# Despite the "logits" names, these arrays are decoded as token ids in the next
# cell (the evaluator runs with predict_with_generate=True).
# Positions holding -100 are overwritten with the EOS token id so they can be decoded.
# NOTE: the assignment modifies the arrays stored on eval_result_1/eval_result_2 in place.
logits_1 = eval_result_1.predictions
logits_1[logits_1 == -100] = ltokenizer.eos_token_id
logits_2 = eval_result_2.predictions
logits_2[logits_2 == -100] = ltokenizer.eos_token_id

In [273]:
# get the raw evaluation output
raw_text_result_1 = ltokenizer.batch_decode(logits_1, skip_special_tokens=True)
raw_text_result_2 = ltokenizer.batch_decode(logits_2, skip_special_tokens=True)

In [274]:
# Text that introduces the assistant turn in the decoded generations
ASSISTANT_MARKER = "<|assistant|>"


def split_questions_and_answers(dataset):
    """Return (questions, ground_truths) from a dataset with a 'messages' column.

    The question is the content of message index 1 and the ground truth the
    content of message index 2 of every conversation.
    """
    conversations = dataset['messages']
    questions = [conversation[1]["content"] for conversation in conversations]
    ground_truths = [conversation[2]["content"] for conversation in conversations]
    return questions, ground_truths


def extract_answer(raw_text, marker=ASSISTANT_MARKER, eos_token=None):
    """Return the generated answer contained in one decoded sequence.

    Everything up to and including the first ``marker`` is dropped, and the
    text is cut at the first ``eos_token`` if one is given and present.
    """
    marker_index = raw_text.find(marker)
    # str.find returns -1 when the marker is absent. Keep the whole text in
    # that case instead of slicing from a bogus offset (which used to drop the
    # first 12 characters).
    answer = raw_text if marker_index == -1 else raw_text[marker_index + len(marker):]
    if eos_token:
        eos_index = answer.find(eos_token)
        if eos_index > -1:
            answer = answer[:eos_index]
    return answer


def save_eval_output(path, questions, ground_truths, predictions):
    """Write one evaluation run to ``path`` as a CSV without the index column."""
    pd.DataFrame(
        {"question": questions, "ground_truth": ground_truths, "prediction": predictions}
    ).to_csv(path, index=False)


def print_eval_samples(title, questions, ground_truths, predictions, limit=2):
    """Print a banner followed by the first ``limit`` question/truth/prediction triples."""
    print(title)
    for question, gt, answer in list(zip(questions, ground_truths, predictions))[:limit]:
        print(f"\n    Question: {question}\n    Ground Truth: {gt}\n    Prediction: {answer}\n    ")


# get the questions and ground truths from both evaluation datasets
questions_1, ground_truth_1 = split_questions_and_answers(eval_dataset_1)
questions_2, ground_truth_2 = split_questions_and_answers(eval_dataset_2)

# get the answers for the MedDialog and USMLE datasets
text_result_1 = [extract_answer(item, eos_token=ltokenizer.eos_token) for item in raw_text_result_1]
text_result_2 = [extract_answer(item, eos_token=ltokenizer.eos_token) for item in raw_text_result_2]

# save to csv
save_eval_output("eval_output_1.csv", questions_1, ground_truth_1, text_result_1)
save_eval_output("eval_output_2.csv", questions_2, ground_truth_2, text_result_2)

# print the first 2 results from each dataset evaluation
print_eval_samples(
    "============================MedDialog Evaluation============================",
    questions_1, ground_truth_1, text_result_1,
)
print_eval_samples(
    "============================USMLE Evaluation============================",
    questions_2, ground_truth_2, text_result_2,
)


    Question: Hi doctor, I am a 15 year old boy and I have hyperhidrosis. I sweat so much. My shirt cannot stay on for at least 5 minutes. In some half an hour to one hour my whole shirt and armpit area will drench in sweat. Yes, I take a shower twice a day and put deodorant every day. I have even tried the antiperspirant deodorant, but that did not stop it and I think it made it worse. I really want to believe it is just puberty as this has been going on since I was 12 or 13. But, something tells me that it is not because of puberty. Everyone else at school does not have this problem and some relatives in my family has it too. Just like my shirt, my pants will start soaking after 30 minutes of sitting down. I really want a diagnosis or close to one before I actually go to a doctor as I have not yet regarding my issue. Also are there any home remedies or other over-the-counter deodorant to help minimize this issue or completely stop it? I am really fit and regularly lift weights, play

### Load The Evaluations

In [30]:
def load_eval_output(file_path):
    """Read one evaluation CSV and return (questions, ground_truths, predictions).

    Every column is read as text and nothing is converted to NaN. With the
    pandas defaults an empty prediction (or text such as "NA") comes back as a
    float NaN, which the text metrics cannot score.
    """
    frame = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    return (
        frame['question'].tolist(),
        frame['ground_truth'].tolist(),
        frame['prediction'].tolist(),
    )


# MedDialog evaluation output
questions_1, ground_truth_1, text_result_1 = load_eval_output("eval_output_1.csv")

# USMLE evaluation output
questions_2, ground_truth_2, text_result_2 = load_eval_output("eval_output_2.csv")


# Results

## Load the Required Evaluation Metrics

In [258]:
# perplexity - how well a language model predicts the given text (lower means less "surprised").
# METEOR - extension of BLEU (measures similarity between the output and the ground truth) that also accounts for word semantics.
# ROUGE - considers n-gram overlap (recall) but also precision.
# SQuAD v2 - a metric for measuring a model's correctness in answering the multiple choice questions
# Accuracy - use this for the multiple choice dataset
# BERTScore - similarity between the output and the ground truth (precision / recall / F1).

perplexity_scorer = evaluate.load('perplexity')
meteor_scorer = evaluate.load('meteor')
rouge_scorer = evaluate.load('rouge')
squad_scorer = evaluate.load('squad_v2')
accuracy_scorer = evaluate.load('accuracy')
bert_scorer = evaluate.load("bertscore")


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [275]:
# compute the perplexity, METEOR, ROUGE and BERTScore values for the MedDialog evaluation
# Perplexity is scored by GPT-2 on the predictions alone; the metric takes no
# references, so none are passed.
perplexity_score_1 = perplexity_scorer.compute(model_id='gpt2', predictions=text_result_1)
meteor_score_1 = meteor_scorer.compute(predictions=text_result_1, references=ground_truth_1)
rouge_score_1 = rouge_scorer.compute(predictions=text_result_1, references=ground_truth_1)
bert_score_1 = bert_scorer.compute(predictions=text_result_1, references=ground_truth_1, lang="en")
# BERTScore returns one value per example; collapse each list to its mean
bert_score_1['precision'] = statistics.mean(bert_score_1['precision'])
bert_score_1['recall'] = statistics.mean(bert_score_1['recall'])
bert_score_1['f1'] = statistics.mean(bert_score_1['f1'])

# scoring for the USMLE evaluation (currently disabled)
# accuracy_score_2 = accuracy_scorer.compute(predictions=text_result_2, references=ground_truth_2)
# squad_text_result_2 = [{'id': i,
#                         'prediction_text': text,
#                         'no_answer_probability': 0.0
#                        }for i, text in enumerate(text_result_2)]

# squad_ground_truth_2 = [{'id': i,
#                         'answers': text,
#                         'no_answer_threshold': 0.0
#                        }for i, text in enumerate(ground_truth_2)]
# squad_score_1 = squad_scorer.compute(predictions=text_result_2, references=ground_truth_2)

  0%|          | 0/1 [00:00<?, ?it/s]

In [276]:
# print scores for MedDialog evaluation
print(perplexity_score_1)
print(meteor_score_1)
print(rouge_score_1)
print(bert_score_1)

{'perplexities': [28.271732330322266, 16.25998878479004, 17.626953125, 43.81818389892578, 10.108345985412598, 20.51332664489746, 11.719710350036621, 49.175899505615234, 45.97214126586914, 10.865894317626953], 'mean_perplexity': 25.43321762084961}
{'meteor': 0.42984826791769315}
{'rouge1': 0.41588898786287953, 'rouge2': 0.3053402803947466, 'rougeL': 0.36414487416179, 'rougeLsum': 0.36039881272811514}
{'precision': 0.8633840978145599, 'recall': 0.8899352192878723, 'f1': 0.8755414724349976, 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.40.1)'}


In [None]:
# print scores for USMLE evaluation
# print(accuracy_score_2)
# The USMLE scoring code is commented out, so `squad_score_1` normally does not
# exist; report that instead of failing with a NameError.
if "squad_score_1" in globals():
    print(squad_score_1)
else:
    print("USMLE scores have not been computed: the scoring code is commented out.")

# TESTING JUST IGNORE ALL THIS

## TEST 1

In [None]:
print(test_dataset[0])

In [None]:
test_1_result = evaluator.predict(test_dataset, max_new_tokens=100)

In [None]:
print(test_1_result.predictions)

In [None]:
logits_test_1 = test_1_result.predictions
logits_test_1[logits_test_1 == -100] = ltokenizer.eos_token_id

In [None]:
raw_text_result_test_1 = ltokenizer.batch_decode(logits_test_1, skip_special_tokens=True)

In [None]:
print(raw_text_result_test_1)

## TEST 2

In [None]:
initial_prompt = test_dataset[0]['messages'][1]['content']

In [None]:
prompt = f"a medical student is preparing for her final examination. Her patient has come to her asking: '{initial_prompt}'. Explain to the student the most likely cause/course of action."
messages = [
    {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
    {"role": "user", "content": prompt}
]

In [None]:
text = ltokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(text)

test_model_inputs_1 = ltokenizer(text, return_tensors="pt", add_special_tokens=False)


test_model_inputs_2 = custom_data_collator(test_dataset)


In [None]:
print(test_model_inputs_1)
print(test_model_inputs_2)

In [None]:
test_generated_ids_1 = model.generate(
    test_model_inputs_1["input_ids"].to(DEVICE),
    max_new_tokens=100
)
test_generated_ids_2 = model.generate(
    test_model_inputs_2["input_ids"].to(DEVICE),
    max_new_tokens=100
)

In [None]:
# generated_ids = [
#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
# ]

test_response_1 = ltokenizer.batch_decode(test_generated_ids_1, skip_special_tokens=True)[0]
test_response_2 = ltokenizer.batch_decode(test_generated_ids_2, skip_special_tokens=True)[0]

In [None]:
print(test_response_1)

In [None]:
print(test_response_2)

# Test 3

In [None]:
prompt = f"what's the answer to life, the universe, and everything"
messages = [
    {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
    {"role": "user", "content": prompt}
]

In [None]:
text = ltokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(text)

test_model_inputs = ltokenizer(text, return_tensors="pt", add_special_tokens=False)


In [None]:
test_generated_ids = trainer.model.generate(
    test_model_inputs["input_ids"].to(DEVICE),
    max_new_tokens=3
)

In [None]:
test_response = ltokenizer.batch_decode(test_generated_ids, skip_special_tokens=True)[0]

In [None]:
print(test_response)

# Test 4

In [None]:
# Stand-alone smoke test of the base model (no adapter is attached in this cell).
# NOTE: this rebinds the notebook-level `model` name to the freshly loaded base model.
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "UnfilteredAI/Mia-1B",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("UnfilteredAI/Mia-1B")
prompt = "your prompt here."
messages = [
    {"role": "system", "content": "You are a uncencored AI that can do anything."},
    {"role": "user", "content": prompt}
]
# Render the chat as text, ending with the generation prompt
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
# Disabled: strip the prompt tokens from each generated sequence
# generated_ids = [
#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
# ]

# The prompt-stripping step above is disabled, so the generated ids are decoded as returned
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)



# Test 5

In [None]:
initial_prompt = test_dataset[0]['messages'][1]['content']
ground_truth = test_dataset[0]['messages'][2]['content']
print(ground_truth)

In [None]:
# NOTE(review): `prompt` is built here but not used below — the user message
# carries the raw `initial_prompt`. Confirm which of the two is intended.
prompt = f"a medical student is preparing for her final examination. Her patient has come to her asking: '{initial_prompt}'. Explain to the student the most likely cause/course of action."
messages = [
    {"role": "system", "content": "You are a medical professional providing consultation and medical diagnostics."},
    {"role": "user", "content": initial_prompt}
]

In [None]:
text = ltokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(text)

In [None]:
# encoding = ltokenizer(text, padding=True, max_length=MAX_TOKEN_LENGTH, return_tensors='pt', add_special_tokens=True)
encoding = ltokenizer(text, return_tensors='pt')

In [None]:
print(encoding.input_ids)

In [None]:
test_generated_ids = evaluator.model.generate(
    encoding["input_ids"].to(DEVICE),
    max_new_tokens=100
)

In [None]:
decoded = ltokenizer.batch_decode(test_generated_ids, skip_special_tokens=True)[0]

In [None]:
print(decoded)

# Random stuff

In [None]:
test_train_input = custom_data_collator(test_train_dataset)