In [2]:
import pandas as pd
from datasets import Dataset

In [3]:
# Read the training CSV from the Kaggle input directory and preview its first rows.
csv_path = "/kaggle/input/medibot-dataset/mimic_iv_training_data_fixed.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Symptoms,Diagnoses,Drugs,Dose_val_rx,Route
0,Patient reports vomiting with joint pain and o...,Pneumonia,"Atorvastatin, Spironolactone, Lisinopril",1 g,IV
1,Patient reports joint pain with vomiting and o...,Liver Cirrhosis,"Amoxicillin, Metformin, Albuterol",40 mg,IV
2,cough and joint pain present with nausea and c...,Diabetes Mellitus,"Amoxicillin, Atorvastatin",250 mg,IV
3,confusion and chest pain present with rapid he...,COPD,Albuterol,100 mg,IV
4,Experiencing diarrhea along with chills and fe...,COPD,"Prednisone, Furosemide",10 mg,IV


In [4]:
# Prompt fed to the model: the symptom description behind a fixed "Symptoms: " prefix.
df["text_input"] = "Symptoms: " + df["Symptoms"]

# Target string: diagnosis, drugs, dose and route joined with " | " separators,
# each field introduced by its own label.
df["text_output"] = (
    "Diagnosis: " + df["Diagnoses"]
    + " | Drugs: " + df["Drugs"]
    + " | Dose: " + df["Dose_val_rx"]
    + " | Route: " + df["Route"]
)

In [8]:
# Show the first five generated prompt strings.
df["text_input"].head(5)

0    Symptoms: Patient reports vomiting with joint ...
1    Symptoms: Patient reports joint pain with vomi...
2    Symptoms: cough and joint pain present with na...
3    Symptoms: confusion and chest pain present wit...
4    Symptoms: Experiencing diarrhea along with chi...
Name: text_input, dtype: object

In [9]:
# Show the first five generated target strings.
df["text_output"].head(5)

0    Diagnosis: Pneumonia | Drugs: Atorvastatin, Sp...
1    Diagnosis: Liver Cirrhosis | Drugs: Amoxicilli...
2    Diagnosis: Diabetes Mellitus | Drugs: Amoxicil...
3    Diagnosis: COPD | Drugs: Albuterol | Dose: 100...
4    Diagnosis: COPD | Drugs: Prednisone, Furosemid...
Name: text_output, dtype: object

In [5]:
# Wrap the pandas frame (including the derived text_input / text_output columns)
# in a Hugging Face `Dataset` so it can be split and mapped below.
dataset = Dataset.from_pandas(df)
print("converted data frame into Hugging face Dataset format")

converted data frame into Hugging face Dataset format


In [6]:
# Hold out 10% of the rows for evaluation. The seed is fixed so that the
# train/test partition (and therefore every downstream metric) is reproducible
# across kernel restarts instead of changing on every run.
# NOTE: this rebinds `dataset` from the single Dataset created above to the
# object returned by the split (indexed by "train" / "test"), so this cell
# should not be re-run without first re-running the conversion cell.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [7]:
# Show the training split's summary (column names and row count).
print(train_dataset)


Dataset({
    features: ['Symptoms', 'Diagnoses', 'Drugs', 'Dose_val_rx', 'Route', 'text_input', 'text_output'],
    num_rows: 18000
})


In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Checkpoint identifier; the same name is used to load both the tokenizer and
# the model so that their vocabularies match.
model_name = "google-t5/t5-base"  # pretrained checkpoint to fine-tune
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization function
def tokenize_function(example, max_input_length=512, max_target_length=128):
    """Tokenize a batch of prompts together with their target strings.

    The model is trained to generate ``text_output`` from ``text_input``, so the
    target text must be tokenized as well and handed to the model as ``labels``;
    without labels the model cannot compute a training loss.

    Args:
        example: Batch mapping supplied by ``map(..., batched=True)``; the
            ``"text_input"`` and ``"text_output"`` columns are read.
        max_input_length: Length the prompt token ids are padded/truncated to.
        max_target_length: Length the target token ids are padded/truncated to.

    Returns:
        The tokenizer output for the prompts with an additional ``"labels"``
        entry containing the target token ids, where padding positions are
        replaced by -100.
    """
    model_inputs = tokenizer(
        example["text_input"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    targets = tokenizer(
        text_target=example["text_output"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    # Padding tokens in the target must not contribute to the loss; -100 is the
    # index the loss function ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return model_inputs

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Expose the labels alongside the encoder inputs so the Trainer receives them.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Define training arguments (grouped by purpose; keyword order does not matter)
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases name the first argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Hub publishing
    push_to_hub=True,  # Upload to Hugging Face
    hub_model_id="Saurav-exe/medical-chatbot",
)

# Define Trainer: pick the two tokenized splits, then bind them to the model
# and the training configuration.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Train the model (the call's return value is displayed as the cell output)
trainer.train()