<a href="https://colab.research.google.com/github/Ramosh99/MedicineQandA/blob/main/medicine_student_Q_and_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install datasets accelerate -U transformers[torch]




In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments


In [3]:
# Load the dataset
dataset = load_dataset("Malikeh1375/medical-question-answering-datasets", "all-processed")

# Define a preprocessing function
def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['output']
    ins = examples['instruction']
    return {
        'input_text': inputs,
        'target_text': targets,
        'instruction': ins,
    }

# Preprocess the dataset
preprocessed_dataset = dataset.map(preprocess_function, remove_columns=['input', 'output'])

# Select a subset of examples (e.g., first 500 for training, 200 for evaluation)
train_dataset = preprocessed_dataset['train'].select(range(20000))
eval_dataset = train_dataset.select(range(500))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    inputs = [f"{ins} {inp}" for ins, inp in zip(examples['instruction'], examples['input_text'])]
    targets = examples['target_text']
    tokenized_inputs = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    tokenized_targets = tokenizer(
        targets,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': tokenized_targets['input_ids'],
    }

# Tokenize the datasets
tokenized_datasets = {
    'train': train_dataset.map(tokenize_function, batched=True),
    'validation': eval_dataset.map(tokenize_function, batched=True),
}





In [5]:

# Define training arguments with logging steps
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=4,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,  # Log training loss every 10 steps
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    output_dir='./results',
    overwrite_output_dir=True,
)

# Initialize the Seq2Seq model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Start training
trainer.train()

# Save the model
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

Epoch,Training Loss,Validation Loss
1,0.6701,0.628342
2,0.7857,0.60964


Epoch,Training Loss,Validation Loss
1,0.6701,0.628342
2,0.7857,0.60964
3,0.7001,0.605199


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/spiece.model',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [6]:
# Load the trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("./saved_model")
model = AutoModelForSeq2SeqLM.from_pretrained("./saved_model")


# Define the user instruction and input
instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
input_text = ("Hey Just wondering. I am a 39 year old female, pretty smallMy heart rate is around 97 to 106 at rest, and my BP is 140/90 and twice I get 175/118I did visit a doctor because I didnt feel well past month or twoThen the doctor gave me a heart medicine to take the pulse down and BP (its still in further examination.)But I wondering what it can be? Do I need the medicine really? Is that bad ?")

# Prepare the input for the model
input_sequence = f"{instruction} {input_text}"

# Truncate the input sequence to avoid token length issues
input_sequence = input_sequence[:512]

inputs = tokenizer(input_sequence, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

# Generate the output using the model
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=512,
    num_beams=4,
    early_stopping=True
)

# Decode the output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated output:")
print(output_text)


Generated output:
hi, welcome to chatbot. i have been asked to answer your query. i hope you have answered your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query. i will be happy to answer your query.
