<a href="https://colab.research.google.com/github/Sanika712/Sanika712.github.io/blob/main/Medical_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install packages once at the start (run in a fresh Colab)
!pip install -q --upgrade transformers datasets accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m2.2/3.1 MB[0m [31m30.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m38.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Imports
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

In [None]:
# Load pretrained model and tokenizer
# The tokenizer and the model are both loaded from the same checkpoint id.
model_name = "Chromik/medical-chatbot-explainable-response-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Loaded through the seq2seq auto-class; `model` is what gets fine-tuned further down.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Upload kaggle.json and set up the Kaggle CLI credentials
from pathlib import Path

from google.colab import files

# `files.upload()` returns a dict {saved_filename: file_bytes}. When a file named
# "kaggle.json" already exists in the working directory, Colab saves the new upload
# under a different name (e.g. "kaggle (1).json"), so copying a fixed "kaggle.json"
# path would install the old token. Write the uploaded bytes directly instead.
uploaded = files.upload()  # Upload fresh kaggle.json
if not uploaded:
    raise RuntimeError("No file was uploaded; a kaggle.json API token is required.")
kaggle_token = next(iter(uploaded.values()))

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)
kaggle_json = kaggle_dir / "kaggle.json"
kaggle_json.write_bytes(kaggle_token)
kaggle_json.chmod(0o600)  # owner read/write only, same as the original `chmod 600`

Saving kaggle.json to kaggle (1).json


In [None]:
# Download and unzip datasets
# Medical transcriptions archive; `unzip -o` overwrites already-extracted files
# without prompting, so the cell can be re-run.
!kaggle datasets download -d tboyle10/medicaltranscriptions
!unzip -o medicaltranscriptions.zip

# Chatbot question/answer archive, handled the same way.
!kaggle datasets download -d saifulislamsarfaraz/medical-chatbot-dataset
!unzip -o medical-chatbot-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions
License(s): CC0-1.0
medicaltranscriptions.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  medicaltranscriptions.zip
  inflating: mtsamples.csv           
Dataset URL: https://www.kaggle.com/datasets/saifulislamsarfaraz/medical-chatbot-dataset
License(s): unknown
medical-chatbot-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  medical-chatbot-dataset.zip
  inflating: train_data_chatbot.csv  
  inflating: validation_data_chatbot.csv  


In [None]:
# Medical transcription samples: read the CSV and retain only the listed columns.
keep_cols = ["description", "medical_specialty", "sample_name", "transcription", "keywords"]
df1 = pd.read_csv("mtsamples.csv")[keep_cols]

# Chatbot question/answer pairs, prepared in a single chain:
#   1. keep the question, answer and tag columns,
#   2. rename question/answer to the "input"/"output" names the preprocessing
#      function reads,
#   3. discard rows where either of those two fields is missing.
keep_cols_2 = ["short_question", "short_answer", "tags"]
df2 = (
    pd.read_csv("train_data_chatbot.csv")[keep_cols_2]
    .rename(columns={"short_question": "input", "short_answer": "output"})
    .dropna(subset=["input", "output"])
)

In [None]:
# Create Hugging Face Dataset object from df2 (chatbot data).
# df2 had rows dropped, so its pandas index is no longer a plain 0..n-1 range;
# `preserve_index=False` stops that index from being carried into the Dataset
# as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df2, preserve_index=False)

In [None]:
def preprocess_function(examples, max_input_length=128, max_target_length=128):
    """Tokenize question/answer pairs into fixed-length seq2seq features.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with "input" and "output" entries, each either a single
            string or a list of strings (the batched form ``Dataset.map`` passes).
            ``None`` items inside a list are treated as empty strings.
        max_input_length: Length (in tokens) inputs are truncated/padded to.
        max_target_length: Length (in tokens) targets are truncated/padded to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry holding
        the target token ids where every padding id is replaced by -100.
    """
    inputs = examples["input"]
    targets = examples["output"]

    # Normalise the non-batched case so the tokenizer always receives lists
    if isinstance(inputs, str):
        inputs = [inputs]
    if isinstance(targets, str):
        targets = [targets]

    # Missing values cannot be tokenized; treat them as empty text on both sides
    inputs = ["" if text is None else text for text in inputs]
    targets = ["" if text is None else text for text in targets]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )

    # Tokenize targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, truncation=True, padding="max_length"
    )

    # Replace pad token ids with -100 so padded label positions are marked as ignorable
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Tokenize dataset
from datasets import DatasetDict

# Apply the preprocessing function batch-wise over every row
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 10% of the rows for evaluation. The fixed seed makes the train/test
# membership identical on every run, so evaluation numbers are comparable.
splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
tokenized_dataset = DatasetDict({
    "train": splits["train"],
    "test": splits["test"]
})

Map:   0%|          | 0/47603 [00:00<?, ? examples/s]



In [None]:
# Define training arguments.
# NOTE: package versions must be installed/pinned in the install cell at the top of
# the notebook. Re-installing transformers here has no effect on the library that
# is already imported in the running kernel, so the install line was removed.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate at the end of each epoch (formerly `evaluation_strategy`)
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50,
    # Disable experiment-tracker integrations so `trainer.train()` does not stop at
    # an interactive wandb API-key prompt. To log to wandb, set this to "wandb" and
    # provide the key through an environment variable or Colab secret.
    report_to="none",
)
# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:
import inspect

# Newer transformers releases accept the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword, while older releases only know `tokenizer`.
# Use whichever keyword the installed Seq2SeqTrainer declares.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Wire together the model, training configuration, train/eval splits and collator
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

  trainer = Seq2SeqTrainer(


In [None]:
# Train model
# Runs fine-tuning with the model, arguments and datasets configured on `trainer`.
# NOTE(review): the recorded output of this cell shows an interactive wandb login
# prompt. Provide any API key through an environment variable or secret store,
# never by pasting it into a cell whose output is saved with the notebook.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

API Key here: [REDACTED: this credential was exposed in saved output; revoke it and clear cell outputs before sharing]

In [None]:
# Save the fine-tuned model & tokenizer
# Both are written to the same directory, which is the path the reload step
# later passes to `from_pretrained`.
trainer.save_model("./medical-chatbot-finetuned")
tokenizer.save_pretrained("./medical-chatbot-finetuned")

print("Fine-tuning complete and model saved.")


In [None]:
# Re-import so this cell can also be run on its own for inference.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load fine-tuned model and tokenizer
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names to the
# objects loaded from the saved directory.
model_path = "./medical-chatbot-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
 # Define your input prompt: a free-text patient scenario plus the question for the model
prompt = "Patient reports sudden leg pain today and also has a visible rash. What could be the possible causes? Please provide a medically explainable answer."



In [None]:
# Encode the prompt as PyTorch tensors, cutting it off at 128 tokens
inputs = tokenizer(prompt, max_length=128, truncation=True, return_tensors="pt")

# Produce the answer with 4-way beam search, capped at 150 newly generated tokens
# (max_new_tokens / num_beams are the knobs to tweak)
outputs = model.generate(**inputs, num_beams=4, early_stopping=True, max_new_tokens=150)

# Convert the first returned sequence back to text, dropping special tokens
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model Response:", response)