# Imports

In [63]:
import copy
import gc
import json
import os
from typing import List, Dict, Union
from typing import Any, TypeVar

import evaluate
import opendatasets as od
import pandas as pd
import torch
import torchtext
from torch.utils.data import Dataset, random_split

from datasets import load_dataset, Features, Value

from transformers import AutoTokenizer, TrainingArguments
from transformers import Seq2SeqTrainer, AutoModelForCausalLM, IntervalStrategy
from transformers import Seq2SeqTrainingArguments

from sklearn.model_selection import train_test_split

Set a random seed and confirm CUDA support.

In [34]:
# Fix PyTorch's RNG seed and request deterministic cuDNN kernels.
torch.manual_seed(2137)
torch.backends.cudnn.deterministic = True

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Report the library versions and the device that was picked.
for version_label, library in (("PyTorch Version: ", torch), ("torchtext Version: ", torchtext)):
    print(version_label, library.__version__)
device_kind = 'GPU' if DEVICE.type == 'cuda' else 'CPU'
print(f"Using {device_kind}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


# Dataset Download

## Downloading MedDialog Dataset

NOTE: You will need a Kaggle API key (a `kaggle.json` file placed next to this notebook) for the following cells to work.

In [4]:
# Path to the Kaggle API token file (expected next to this notebook).
json_file_path = "kaggle.json"

# Read the token file. Failures are raised as exceptions instead of calling
# exit(): inside a notebook, exit() asks the kernel to shut down rather than
# reporting an ordinary error, and all kernel state is lost.
try:
  with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)
except FileNotFoundError as err:
  raise FileNotFoundError(f"Error: JSON file not found at {json_file_path}") from err

# Pull the two fields needed for authentication; both must be present.
try:
  username = json_data["username"]
  key = json_data["key"]
except KeyError as err:
  raise KeyError("Error: 'username' or 'key' key not found in JSON data") from err

In [6]:
# Publish the Kaggle credentials as environment variables
# (presumably picked up by the download call below - TODO confirm).
os.environ.update({'KAGGLE_USERNAME': username, 'KAGGLE_KEY': key})

# Kaggle URL of the MedDialog ("diagnoise-me") data set
dataset = 'https://www.kaggle.com/datasets/dsxavier/diagnoise-me'
# Download it with opendatasets into the local "dataset" directory
od.download(dataset, "dataset")

Downloading diagnoise-me.zip to dataset\diagnoise-me


100%|██████████| 191M/191M [00:41<00:00, 4.82MB/s] 





## Downloading LiveEQA Dataset

In [53]:
# Fetch the "train" split of the truehealth/liveqa dataset.
liveEQA_dataset = load_dataset(
    "truehealth/liveqa",
    split="train",
)

In [46]:
# Peek at the first record to see which fields the dataset exposes.
print(liveEQA_dataset[0])

{'questionid': 'Q1', 'subject': None, 'message': 'Literature on Cardiac amyloidosis.  Please let me know where I can get literature on Cardiac amyloidosis.  My uncle died yesterday from this disorder.  Since this is such a rare disorder, and to honor his memory, I would like to distribute literature at his funeral service.  I am a retired NIH employee, so I am familiar with the campus in case you have literature at NIH that I can come and pick up.  Thank you ', 'focus': 'cardiac amyloidosis', 'type': 'information', 'answerid': 'Q1-S1-A1', 'pairid': '1', 'answer': 'Cardiac amyloidosis is a disorder caused by deposits of an abnormal protein (amyloid) in the heart tissue. These deposits make it hard for the heart to work properly.'}


# Load Datasets

## Loading MedDialog Dataset

In [49]:
# Location of the MedDialog feather file inside the download directory.
# Built with os.path.join so the notebook is not tied to Windows separators.
DATA_PATH = os.path.join("dataset", "diagnoise-me", "diagnose_en_dataset.feather")
# Not referenced by any other visible cell; kept in case other code relies on it.
SEQ_LEN: int = 1024
data = pd.read_feather(DATA_PATH)
# Show the available columns.
print(data.keys())

Index(['id', 'Description', 'Doctor', 'Patient'], dtype='object')


In [50]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42)

# Report how many rows/columns ended up in each split.
for split_label, split_frame in (("Train data shape:", train_data), ("Eval data shape:", eval_data)):
    print(split_label, split_frame.shape)

Train data shape: (205975, 4)
Eval data shape: (51494, 4)


## Loading LiveEQA Dataset

In [54]:
# Re-shape the LiveEQA records into the same Doctor/Patient column layout used
# by the MedDialog frame. This cell rebinds `liveEQA_dataset`, so the guard
# keeps it idempotent: running it a second time would otherwise fail because
# the DataFrame no longer has "answer"/"message" columns.
if not isinstance(liveEQA_dataset, pd.DataFrame):
    liveEQA_dataset = pd.DataFrame({'Doctor': liveEQA_dataset["answer"], 'Patient': liveEQA_dataset["message"]})
# Print the shapes of the set
print("LiveEQA data shape:", liveEQA_dataset.shape)

LiveEQA data shape: (635, 2)


## Create an output directory

In [28]:
# Directory that receives checkpoints and the saved model; created on demand.
OUTPUT_DIR: str = './results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Model

In [25]:
# Checkpoint name passed to from_pretrained() for both model and tokenizers.
MODEL_NAME: str = "EleutherAI/gpt-neo-125M"

# Special tokens for the dataset: sequence start, sequence end and padding.
BOS_TOKEN: str = "<|startoftext|>"
EOS_TOKEN: str = "<|endoftext|>"
PAD_TOKEN: str = "<|pad|>"

In [56]:
# Maximum number of tokens kept per encoded sequence.
MAX_TOKEN_LENGTH = 1024


def _build_tokenizer(side):
    """Load the tokenizer for MODEL_NAME, padding and truncating on `side`.

    `side` is 'left' or 'right' and is applied to both `padding_side` and
    `truncation_side` of the returned tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN)
    tokenizer.padding_side = side
    tokenizer.truncation_side = side
    return tokenizer


# for evaluation
ltokenizer = _build_tokenizer('left')

# for training
rtokenizer = _build_tokenizer('right')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Load the pretrained causal LM and move it to the device chosen in the setup
# cell. Using DEVICE instead of a hard-coded .cuda() keeps the CPU fallback
# working on machines without a GPU.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
# The tokenizers were given extra special tokens, so the embedding matrix has
# to be resized to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(rtokenizer))

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Embedding(50259, 768)

# Preparing Data for Training

## Custom Dataset

In [57]:
class DoctorPatientDataset(Dataset):
    """Map-style dataset pairing a patient message with the doctor's reply.

    Args:
        data: Any mapping-like object with "Patient" and "Doctor" entries of
            equal length (for example a pandas DataFrame).
        split: Tag copied into every item (e.g. "train" or "eval"); the data
            collator reads it to pick the prompt format.

    Each item is a dict with the keys 'input', 'target' and 'split'.
    """

    def __init__(self, data, split):
        # Materialise the columns as plain lists so that __getitem__ indexes by
        # POSITION. Keeping pandas Series here breaks after a shuffled split:
        # the Series keeps its original index labels, so `series[idx]` becomes
        # a label lookup that raises KeyError or returns a different row.
        self.input_x: List = list(data["Patient"])
        self.target: List = list(data["Doctor"])
        self.split = split

    def __len__(self):
        """Number of (patient, doctor) pairs."""
        return len(self.input_x)

    def __getitem__(self, idx):
        """Return the idx-th pair; raises IndexError when idx is out of range."""
        return {
            'input': self.input_x[idx],
            'target': self.target[idx],
            'split': self.split
        }

In [58]:
# Wrap each frame in a DoctorPatientDataset, tagged with the split name that
# the data collator uses to choose its prompt format.
train_dataset = DoctorPatientDataset(train_data, "train")
eval_dataset_1 = DoctorPatientDataset(eval_data, "eval")          # MedDialog hold-out
eval_dataset_2 = DoctorPatientDataset(liveEQA_dataset, "eval")    # LiveEQA

## Custom Data Collator

In [60]:
def custom_data_collator(features, return_tensors="pt"):
    """Turn a list of dataset items into a tokenized model batch.

    Args:
        features: Non-empty list of dicts with the keys 'input', 'target' and
            'split'. The split of the first item decides the format for the
            whole batch.
        return_tensors: Tensor format forwarded to the tokenizer.

    Returns:
        Dict with 'input_ids' and 'attention_mask', plus 'labels' when
        `return_tensors` is "pt" or "tf".

    Training batches contain the full "Question:...Answer:...<eos>" text,
    encoded with the right-padding tokenizer. Evaluation batches contain only
    the prompt up to "Answer:", encoded with the left-padding tokenizer so
    that generated tokens directly follow the prompt.
    """
    questions = [feature["input"] for feature in features]
    answers = [feature["target"] for feature in features]
    split = features[0]["split"]

    # training
    if split == 'train':
        tokenizer = rtokenizer
        bos_token = rtokenizer.bos_token
        eos_token = rtokenizer.eos_token
        text = [f"{bos_token}Question:{q}.Answer:{t}{eos_token}" for q, t in zip(questions, answers)]

    # evaluation
    else:
        tokenizer = ltokenizer
        bos_token = ltokenizer.bos_token
        # Same prompt wording as in training: the model is only ever shown
        # "Question:/Answer:", and the post-processing looks for "Answer:".
        text = [f"{bos_token}Question:{q}.Answer:" for q in questions]

    # Tokenize the text
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=MAX_TOKEN_LENGTH, return_tensors=return_tensors, add_special_tokens=False)

    # Prepare final batch dictionary
    batch = {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
    }

    if return_tensors in ["pt", "tf"]:
        labels = copy.deepcopy(encoding["input_ids"])
        if return_tensors == "pt":
            # -100 is the index ignored by the loss; without this, most of the
            # loss would come from predicting padding tokens.
            labels[encoding["attention_mask"] == 0] = -100
        batch["labels"] = labels

    # The original fell off the end here and handed None to the trainer.
    return batch

# Training

In [29]:
# Seq2SeqTrainer reads generation settings (e.g. predict_with_generate) from
# its arguments, so the Seq2Seq flavour of TrainingArguments is required.
training_args = Seq2SeqTrainingArguments(
    output_dir = OUTPUT_DIR,
    num_train_epochs = 2,
    evaluation_strategy="steps",
    eval_steps = 5000,
    logging_steps = 5000,
    # load_best_model_at_end needs checkpoints saved on the same schedule as
    # the evaluations; save_strategy="no" is rejected in combination with it.
    save_strategy="steps",
    save_steps = 5000,
    save_total_limit = 1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    # The dataset yields raw 'input'/'target'/'split' fields that only the
    # custom collator understands; keep the trainer from stripping them.
    remove_unused_columns=False,
    # Make trainer.predict() return generated token ids, which is what the
    # evaluation cells decode.
    predict_with_generate=True,
    report_to=['tensorboard']
    )

In [None]:
# Wire the model, hyper-parameters, datasets and collator into one trainer.
# The MedDialog hold-out split is the evaluation set used during training.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1,
)

In [None]:
# Collect unreachable Python objects first, then release PyTorch's cached GPU
# blocks, so memory freed by the collection can be returned as well.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Persist the trained model under OUTPUT_DIR/model_save, the same path the
# "Load the Model" cell reads from.
trainer.model.save_pretrained(f"{OUTPUT_DIR}/model_save")

# Load the Model

In [None]:
# Reload the weights written by the save step and rebuild the trainer around
# them, so evaluation can run without repeating the training cells.
saved_model_dir = f"{OUTPUT_DIR}/model_save"
model = AutoModelForCausalLM.from_pretrained(saved_model_dir)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_1,
)

# Evaluation

In [None]:
# Run prediction on both evaluation sets: MedDialog hold-out, then LiveEQA.
eval_result_1 = trainer.predict(eval_dataset_1, max_new_tokens=MAX_TOKEN_LENGTH)
eval_result_2 = trainer.predict(eval_dataset_2, max_new_tokens=MAX_TOKEN_LENGTH)

logits_1, logits_2 = eval_result_1.predictions, eval_result_2.predictions
# Entries equal to -100 are swapped (in place) for the EOS id so that every
# value can be decoded by the tokenizer.
for predicted_ids in (logits_1, logits_2):
    predicted_ids[predicted_ids == -100] = ltokenizer.eos_token_id

In [None]:
# get the raw evaluation output
raw_text_result_1 = ltokenizer.batch_decode(logits_1, skip_special_tokens=True)
raw_text_result_2 = ltokenizer.batch_decode(logits_2, skip_special_tokens=True)

# get the questions and ground truths from both evaluation datasets.
# Read straight from the dataset columns in positional order instead of
# iterating the datasets inside a bare `except: pass`, which silently stopped
# at the first failed lookup and left these lists too short.
questions_1 = list(eval_dataset_1.input_x)
ground_truth_1 = list(eval_dataset_1.target)
questions_2 = list(eval_dataset_2.input_x)
ground_truth_2 = list(eval_dataset_2.target)


def extract_answer(decoded, marker="Answer:"):
    """Return the text after the first `marker`, cut at the EOS token if present.

    When `marker` is missing the whole decoded text is returned; slicing from
    a failed find() (-1) would otherwise drop an arbitrary prefix.
    """
    start = decoded.find(marker)
    answer = decoded[start + len(marker):] if start > -1 else decoded
    end = answer.find(ltokenizer.eos_token)
    return answer[:end] if end > -1 else answer


# get the answers for the MedDialog and LiveEQA datasets
text_result_1 = [extract_answer(item) for item in raw_text_result_1]
text_result_2 = [extract_answer(item) for item in raw_text_result_2]

# print the first 2 results from each dataset evaluation
for banner, shown_questions, shown_answers in (
    ("============================MedDialog Evaluation============================", questions_1, text_result_1),
    ("============================LiveEQA Evaluation============================", questions_2, text_result_2),
):
    print(banner)
    for question, answer in list(zip(shown_questions, shown_answers))[:2]:
        print(f"""
    Question: {question}
    Answer: {answer}
    """)

# Results

In [64]:
# Load the BLEU and ROUGE metrics through the `evaluate` package.
bleu_scorer, rouge_scorer = evaluate.load('bleu'), evaluate.load('rouge')

In [None]:
# compute the bleu and rouge scores for the MedDialog evaluation
bleu_score_1 = bleu_scorer.compute(predictions=text_result_1, references=ground_truth_1)
rouge_score_1 = rouge_scorer.compute(predictions=text_result_1, references=ground_truth_1)

# compute the bleu and rouge scores for the LiveEQA evaluation.
# These must be scored on the LiveEQA predictions (text_result_2); the
# original passed the MedDialog predictions against LiveEQA references.
bleu_score_2 = bleu_scorer.compute(predictions=text_result_2, references=ground_truth_2)
rouge_score_2 = rouge_scorer.compute(predictions=text_result_2, references=ground_truth_2)

In [None]:
# print scores for MedDialog evaluation (every value is multiplied by 100)
meddialog_bleu1 = bleu_score_1['precisions'][0]*100
print("score on MedDialog Dataset")
print('BLEU1:', meddialog_bleu1)
print(f"""
ROUGE-1: {rouge_score_1['rouge1']*100}
ROUGE-2: {rouge_score_1['rouge2']*100}
ROUGE-L: {rouge_score_1['rougeL']*100}
""")

In [None]:
# print scores for LiveEQA evaluation (every value is multiplied by 100)
liveeqa_bleu1 = bleu_score_2['precisions'][0]*100
print("score on LiveEQA Dataset")
print('BLEU1:', liveeqa_bleu1)
print(f"""
ROUGE-1: {rouge_score_2['rouge1']*100}
ROUGE-2: {rouge_score_2['rouge2']*100}
ROUGE-L: {rouge_score_2['rougeL']*100}
""")