**Using BERT (DistilBERT)**: Fine-tuning the model on a Q&A CSV dataset for use inside a chatbot for the clinical domain. Task: Question Answering.

In [None]:
####################  TRAINER for BERT / DISTILBERT / ROBERTA / ClinicalBERT / MedBERT   with CSV DATASET   ###############################

import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score

# Load and sample dataset
# The CSV's 'Answer' column is renamed to 'context': preprocess_examples()
# treats the whole answer text as both the context and the answer span.
df = pd.read_csv('/content/CancerQA.csv')
df = df.rename(columns={'Question': 'question', 'Answer': 'context'})
# Rows missing either field would be stringified to the literal text "nan"
# during preprocessing and trained on as if they were real Q&A pairs, so
# drop them here. The index is reset so Dataset.from_pandas() does not carry
# a gapped index along as an extra column.
df = df.dropna(subset=['question', 'context']).reset_index(drop=True)
#df = df.sample(n=700, random_state=42).reset_index(drop=True)

# Tokenizer and model checkpoint
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Maximum tokens per window (question + context + special tokens) and the
# number of context tokens shared between consecutive overflow windows.
max_length = 512
stride = 128

# Preprocessing function
def preprocess_examples(examples):
    """Tokenize a batch of question/context pairs and attach span labels.

    Uses the module-level ``tokenizer``, ``max_length`` and ``stride``.
    Contexts longer than ``max_length`` are split into overlapping windows
    (``return_overflowing_tokens=True``), so one input row can yield several
    output features.

    The answer is assumed to be the entire context (ignoring leading and
    trailing whitespace, which no token covers). For each window the labels
    are the indices of the context tokens that contain the first and last
    answer characters. A boundary that does not fall inside the window keeps
    the fallback label 0.

    Args:
        examples: batch mapping with 'question' and 'context' columns.

    Returns:
        The tokenizer output (minus the offset/overflow bookkeeping) with
        'start_positions' and 'end_positions' added, one entry per window.
    """
    questions = [str(q) for q in examples['question']]
    contexts = [str(c) for c in examples['context']]

    tokenized_inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Offsets index into the exact string handed to the tokenizer, so
        # measure that string rather than a lowercased copy.
        context = contexts[sample_mapping[i]]

        # Answer == whole context, trimmed of surrounding whitespace so both
        # boundaries sit on characters that some token actually covers.
        start_char = len(context) - len(context.lstrip())
        end_char = len(context.rstrip())

        # None for special/padding tokens, 0 for question, 1 for context.
        sequence_ids = tokenized_inputs.sequence_ids(i)

        start_pos = end_pos = 0  # default fallback
        for idx, (start, end) in enumerate(offsets):
            # Question tokens have their own character offsets starting at
            # 0. Comparing them against context positions would put labels
            # on the question, so only context tokens are considered.
            if sequence_ids[idx] != 1:
                continue
            if start <= start_char < end:
                start_pos = idx
            if start < end_char <= end:
                end_pos = idx

        start_positions.append(start_pos)
        end_positions.append(end_pos)

    tokenized_inputs.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    return tokenized_inputs

# Convert to Hugging Face Dataset
raw_dataset = Dataset.from_pandas(df)

# Split dataset
# Split whole Q&A rows BEFORE tokenizing. preprocess_examples() can turn one
# long row into several overlapping windows; splitting after that step would
# let windows of the same row land in both train and validation and inflate
# the validation scores. The seed makes the split reproducible across runs.
train_test_split = raw_dataset.train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

# Tokenize each split; the raw text columns are dropped so only model inputs
# and span labels remain.
dataset = dataset.map(preprocess_examples, batched=True, remove_columns=raw_dataset.column_names)

# Load model
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Define metrics
def compute_metrics(p):
    """Score predicted answer-span positions against the labelled positions.

    ``p.predictions`` holds (start_logits, end_logits) and ``p.label_ids``
    holds (start_positions, end_positions).

    Returns a dict with:
      * 'f1'          - macro F1 over start positions treated as class labels
      * 'exact_match' - fraction of rows where both start and end match
      * 'accuracy'    - fraction of rows where the start position matches
    """
    start_logits, end_logits = p.predictions[0], p.predictions[1]
    gold_start = torch.tensor(p.label_ids[0])
    gold_end = torch.tensor(p.label_ids[1])

    # The highest-scoring token index per row is the predicted boundary.
    guess_start = torch.tensor(start_logits).argmax(dim=1)
    guess_end = torch.tensor(end_logits).argmax(dim=1)

    # A row is an exact match only when both boundaries are correct.
    both_correct = (guess_start == gold_start) & (guess_end == gold_end)

    # NOTE(review): 'f1' and 'accuracy' look at the start position only;
    # the end position contributes to 'exact_match' alone.
    return {
        'f1': f1_score(gold_start.cpu(), guess_start.cpu(), average='macro'),
        'exact_match': both_correct.float().mean().item(),
        'accuracy': accuracy_score(gold_start.cpu(), guess_start.cpu()),
    }

# Training arguments
# Evaluation and checkpointing both happen once per epoch; the two strategies
# must match for load_best_model_at_end to work. report_to="none" turns off
# external experiment loggers.
training_args = TrainingArguments(
    output_dir='./results',
    run_name="CancerQA-DistilBERT-Run11",
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",
    logging_dir='./logs',
)

# Trainer
# `processing_class` replaces the deprecated `tokenizer` argument; it is still
# what makes the tokenizer get saved next to every checkpoint, so a checkpoint
# folder can be loaded on its own for inference.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()


Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Exact Match,Accuracy
1,No log,0.098644,0.906788,0.952381,0.952381
2,No log,0.101602,0.926949,0.957143,0.957143
3,0.360700,0.104184,0.926949,0.957143,0.957143


TrainOutput(global_step=708, training_loss=0.26330079197210104, metrics={'train_runtime': 19894.4927, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.036, 'total_flos': 740019158581248.0, 'train_loss': 0.26330079197210104, 'epoch': 3.0})

1. On a local machine (8 GB RAM, no GPU or CUDA support), training on a Medical Q&A dataset from Kaggle with 16K records took more than 4 hours to progress from 0.00 to 0.01 of the 3 epochs.
2. Using "google-bert/bert-base-uncased" on the same 16K rows, training ran for more than 4 hours with very slow progress.
3. Shifted to Google Colab for its GPU/TPU (faster computation) and larger RAM. Even with a GPU, I switched from BERT (110 million parameters) to DistilBERT (66 million parameters) for ~40% faster training time, because the Colab session would crash and restart, and after some time the GPU/TPU allocation would expire.
4. With only 1,000 records and DistilBERT, the model finally trained and was saved after 3 epochs.




In [None]:
#############  Script to Load Trained Model and Run Inference  ###########################
#############  Module with Basic Start-End-Token Handling   ###########################

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the fine-tuned model and its tokenizer from a saved Trainer checkpoint.
# Replace the path with the checkpoint folder you actually want to serve.
# NOTE(review): "checkpoint-708" matches the final global step of the training
# run logged in this notebook, and the logged validation loss was lowest after
# epoch 1 — confirm this is the checkpoint you intend to use.
model_path = "/content/drive/MyDrive/DistilBERT_Trained/checkpoint-708"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

def answer_question(question, context):
    """Extract an answer span for ``question`` from ``context``.

    Basic version: takes the highest-scoring start token and the
    highest-scoring end token independently and returns the text between
    them. Nothing enforces that the end comes after the start, and special
    tokens are not filtered out.
    """
    encoded = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        prediction = model(**encoded)

    span_start = torch.argmax(prediction.start_logits)
    # +1 turns the inclusive end token into an exclusive slice bound.
    span_stop = torch.argmax(prediction.end_logits) + 1

    span_tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0][span_start:span_stop])
    return tokenizer.convert_tokens_to_string(span_tokens)

# Example usage: ask one question against a short hand-written context and
# print the span the model extracts from it.
question = "what is tinnitus?"
context = "Tinnitus is the perception of ringing, buzzing, hissing, or other sounds in the ears or head when no external sound is present. Key Facts: It’s not a disease, but a symptom of an underlying condition. Commonly described as ringing in the ears, though some may hear clicking, roaring, or whooshing sounds. Can be intermittent or constant, and vary in loudness."
print("Answer:", answer_question(question, context))


Answer: tinnitus is the perception of ringing, buzzing, hissing, or other sounds in the ears or head when no external sound is present. key facts : it ’ s not a disease, but a symptom of an underlying condition. commonly described as ringing in the ears, though some may hear clicking, roaring, or whooshing sounds. can be intermittent or constant, and vary in loudness.


 The model detects the start token ID and the end token ID (+1) within the context in order to extract the answer span; it does not generate text. This works only after training on a SQuAD-formatted (JSON) dataset or an equivalently formatted CSV.

1.   However, it does not understand "NO" or "Invalid Question or Answer" as an output answer: even if the question and the context are unrelated, or even if a single token is passed as the context, it will still try to fetch something and return it as the answer.
2.   Fine-tuning a pre-trained BERT model together with GPT-2 would be great for a low-resource chatbot implementation.



In [None]:
#############  Script to Load Trained Model and Run Inference  ###########################
#############  Module with robust Start-End-Token Handling   ###########################

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the fine-tuned model and its tokenizer from a saved Trainer checkpoint.
# Replace the path with the checkpoint folder you actually want to serve.
model_path = "/content/drive/MyDrive/DistilBERT_Trained/checkpoint-708"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

def answer_question(question, context):
    """Extract an answer span for ``question`` from ``context``.

    Robust version: only context tokens are eligible as span boundaries, so
    the answer can never be taken from the question or from special tokens.
    The end of the span is never placed before its start.

    Args:
        question: the question text.
        context: the passage the answer is extracted from.

    Returns:
        The extracted answer with surrounding whitespace stripped, or an
        empty string when the encoded input contains no context tokens.
    """
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)
    input_ids = inputs['input_ids'][0]

    with torch.no_grad():
        outputs = model(**inputs)

    # sequence_ids: None for special tokens, 0 for question, 1 for context.
    context_mask = torch.tensor(
        [seq_id == 1 for seq_id in inputs.sequence_ids(0)],
        dtype=torch.bool,
        device=outputs.start_logits.device,
    )
    if not context_mask.any():
        # Empty or fully truncated context: there is nothing to extract.
        return ""

    # Give every non-context position a score of -inf so argmax skips it.
    start_scores = outputs.start_logits[0].masked_fill(~context_mask, float("-inf"))
    end_scores = outputs.end_logits[0].masked_fill(~context_mask, float("-inf"))

    # Most likely beginning and end of the answer, as plain Python ints.
    start_idx = torch.argmax(start_scores).item()
    end_idx = torch.argmax(end_scores).item()

    # Ensure the end is not before the start; fall back to a one-token span.
    if start_idx > end_idx:
        end_idx = start_idx

    answer_ids = input_ids[start_idx:end_idx + 1]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)

    return answer.strip()


# Example usage: ask one question against a short hand-written context and
# print the span the model extracts from it.
question = "What is Tinnitus?"
context = "Tinnitus is the perception of ringing, buzzing, hissing, or other sounds in the ears or head when no external sound is present. Key Facts: It’s not a disease, but a symptom of an underlying condition. Commonly described as ringing in the ears, though some may hear clicking, roaring, or whooshing sounds. Can be intermittent or constant, and vary in loudness."
print("Answer:", answer_question(question, context))


Answer: tinnitus is the perception of ringing, buzzing, hissing, or other sounds in the ears or head when no external sound is present. key facts : it ’ s not a disease, but a symptom of an underlying condition. commonly described as ringing in the ears, though some may hear clicking, roaring, or whooshing sounds. can be intermittent or constant, and vary in loudness.


In [None]:
from transformers import AutoModelForQuestionAnswering

# Load the fine-tuned checkpoint, then print its base-model prefix and its
# total parameter count.
# Other checkpoints noted for comparison: medicalai/ClinicalBERT,
# emilyalsentzer/Bio_ClinicalBERT
model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/MyDrive/DistilBERT_Trained/checkpoint-708")
print(model.base_model_prefix)
print(model.num_parameters())


distilbert
66364418


In [None]:
from transformers import AutoModelForQuestionAnswering

# Load the fine-tuned checkpoint
model_path = "/content/drive/MyDrive/DistilBERT_Trained/checkpoint-708"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

# Keep only the parameters that still receive gradients, then add up how many
# individual weights they hold.
trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))
total_params = sum(map(lambda p: p.numel(), trainable_params))

# Report whether anything is left to train.
if total_params != 0:
    print(f"⚠️ There are still trainable parameters. Total trainable: {total_params}")
else:
    print("✅ All parameters are frozen. No trainable parameters left.")


⚠️ There are still trainable parameters. Total trainable: 66364418


In [None]:
pip install gradio

In [None]:
##########################   UI for Q&A Testing  ##########################
import gradio as gr
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load model and tokenizer.
# Replace the path with your own checkpoint folder, or with a Hub model id
# such as emilyalsentzer/Bio_ClinicalBERT or medicalai/ClinicalBERT.
model_path = "/content/drive/MyDrive/DistilBERT_Trained/checkpoint-708"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

def get_answer(question, context):
    """Gradio callback: extract an answer span for ``question`` from ``context``.

    Args:
        question: text from the "Question" box.
        context: text from the "Context Paragraph" box.

    Returns:
        The extracted answer with special tokens removed and surrounding
        whitespace stripped, or a short prompt when either box is blank.
    """
    # The UI can submit blank boxes; running the model on them would only
    # return special tokens.
    if not (question and question.strip()) or not (context and context.strip()):
        return "Please enter both a question and context."

    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    start_idx = torch.argmax(outputs.start_logits).item()
    end_idx = torch.argmax(outputs.end_logits).item()

    # Start and end are predicted independently. If the end falls before the
    # start, fall back to a one-token span instead of an empty slice.
    if end_idx < start_idx:
        end_idx = start_idx

    answer_ids = inputs['input_ids'][0][start_idx:end_idx + 1]
    # skip_special_tokens keeps [CLS]/[SEP] out of the displayed answer.
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

# Build the demo UI and start serving it: two text boxes (question, context)
# feed get_answer(), and its return value is shown as plain text.
gr.Interface(
    fn=get_answer,
    inputs=[
        gr.Textbox(lines=2, label="Question"),
        gr.Textbox(lines=5, label="Context Paragraph")
    ],
    outputs="text",
    title="DistilBERT Medical QA",
    description="Ask a question based on the provided medical context."
).launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f35c45b83eb5bc34f2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Install Streamlit (for the app UI) and pyngrok (used to open a public
# tunnel to the app); -q keeps pip's output quiet.
!pip install streamlit -q
!pip install pyngrok -q

In [None]:
%%writefile app.py

import streamlit as st
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from pyngrok import ngrok

ngrok.kill()

NGROK_AUTH_TOKEN = "YOUR_AUTHTOKEN"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

public_url = ngrok.connect(8501)
print(f"Streamlit App URL: {public_url}")


model_path = "/content/drive/MyDrive/DistilBERT_Trained/checkpoint-708"

@st.cache_resource
def load_qa_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    return tokenizer, model

# Load the model
tokenizer, model = load_qa_model(model_path)

def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        start_idx = torch.argmax(outputs.start_logits)
        end_idx = torch.argmax(outputs.end_logits) + 1
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx])
        )
        return answer

# Streamlit UI
st.title("DistilBERT Medical QA")
st.write("Ask a question based on the provided medical context.")

question = st.text_input("Question:", "")
context = st.text_area("Context Paragraph:", "")

if st.button("Get Answer"):
    if question and context:
        answer = answer_question(question, context)
        st.subheader("Answer:")
        st.write(answer)
    else:
        st.warning("Please enter both a question and context.")


!streamlit run app.py &>/dev/null&
print(f"Streamlit App URL: {public_url}")

Overwriting app.py


Studying and researching BERT models through hands-on experimentation has profoundly deepened my knowledge of the powerful bi-directional encoder with attention that forms part of the Transformer architecture.
It has also taught me how to choose a hardware setup for training larger models on larger and more complex datasets.
Under the guidance of our NLP Professor, Abdul sir from DESPU Pune, this has been a great learning experience with new technologies.
Thank you.
-Siddhant Mutha (MSc. DS, SY)