In [50]:
import pandas as pd
from datasets import Dataset

In [51]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [52]:
df=pd.read_csv("/kaggle/input/medibot-dataset/mimic_iv_training_data_fixed.csv")
df.head()

Unnamed: 0,Symptoms,Diagnoses,Drugs,Dose_val_rx,Route
0,Patient reports vomiting with joint pain and o...,Pneumonia,"Atorvastatin, Spironolactone, Lisinopril",1 g,IV
1,Patient reports joint pain with vomiting and o...,Liver Cirrhosis,"Amoxicillin, Metformin, Albuterol",40 mg,IV
2,cough and joint pain present with nausea and c...,Diabetes Mellitus,"Amoxicillin, Atorvastatin",250 mg,IV
3,confusion and chest pain present with rapid he...,COPD,Albuterol,100 mg,IV
4,Experiencing diarrhea along with chills and fe...,COPD,"Prednisone, Furosemide",10 mg,IV


In [53]:
df["text_input"] = "Symptoms: " + df["Symptoms"]
df["text_output"] = "Diagnosis: " + df["Diagnoses"] + " | Drugs: " + df["Drugs"] + " | Dose: " + df["Dose_val_rx"] + " | Route: " + df["Route"]

In [54]:
df["text_input"].head()

0    Symptoms: Patient reports vomiting with joint ...
1    Symptoms: Patient reports joint pain with vomi...
2    Symptoms: cough and joint pain present with na...
3    Symptoms: confusion and chest pain present wit...
4    Symptoms: Experiencing diarrhea along with chi...
Name: text_input, dtype: object

In [55]:
df["text_output"].head()

0    Diagnosis: Pneumonia | Drugs: Atorvastatin, Sp...
1    Diagnosis: Liver Cirrhosis | Drugs: Amoxicilli...
2    Diagnosis: Diabetes Mellitus | Drugs: Amoxicil...
3    Diagnosis: COPD | Drugs: Albuterol | Dose: 100...
4    Diagnosis: COPD | Drugs: Prednisone, Furosemid...
Name: text_output, dtype: object

In [56]:
dataset = Dataset.from_pandas(df)
print("converted data frame into Hugging face Dataset format")

converted data frame into Hugging face Dataset format


In [57]:
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [58]:
print(train_dataset)


Dataset({
    features: ['Symptoms', 'Diagnoses', 'Drugs', 'Dose_val_rx', 'Route', 'text_input', 'text_output'],
    num_rows: 18000
})


In [59]:
# Check if GPU is available

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


Using device: cuda


In [60]:
from huggingface_hub import notebook_login
from transformers import T5Tokenizer

# Authenticate to Hugging Face
notebook_login()

# Download model with fresh cache
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name, force_download=True, local_files_only=False)

print("Tokenizer loaded successfully!")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizer loaded successfully!


In [62]:

# Load tokenizer and model
model_name = "t5-base"  # Try "google/bio-t5-large" for medical applications
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)  # Move model to GPU

# Tokenization function
def tokenize_function(example):
    return tokenizer(example["text_input"], padding="max_length", truncation=True, max_length=512)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask"])

# Move dataset to GPU
tokenized_datasets = tokenized_datasets.map(lambda x: {"input_ids": x["input_ids"].to(device), "attention_mask": x["attention_mask"].to(device)})

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    push_to_hub=True,  # Upload to Hugging Face
    hub_model_id="Saurav-exe/Test-medical-chatbot",
    run_name="Medical_Chatbot_Training",
    fp16=True,  # Enable mixed precision for faster training on GPU
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model using GPU
trainer.train()

# Save model locally
model.save_pretrained("./medical_chatbot")
tokenizer.save_pretrained("./medical_chatbot")

print("Training complete! Model saved successfully.")


Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

KeyboardInterrupt: 