In [1]:
import pandas as pd
import torch
import spacy
import ast
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
df = pd.read_csv("healthcare_intent_dataset.csv")

In [3]:
# Ensure all entity values are strings and handle missing values
df["entities"] = df["entities"].astype(str)  # Convert to string
df["entities"] = df["entities"].apply(lambda x: x if x.strip().startswith("{") else "{}")  # Replace invalid values

# Convert valid JSON-like strings to dictionaries
df["entities"] = df["entities"].apply(lambda x: ast.literal_eval(x) if x.strip() else {})
df.sample()

Unnamed: 0,query,intent,entities
376,Cancel my appointment for August 30th,cancel_test,{'DATE': 'August 30th'}


In [4]:
# Encode intent labels
label_encoder = LabelEncoder()
df["intent_label"] = label_encoder.fit_transform(df["intent"])

In [5]:
# Display intent mappings
intent_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Intent Mapping:", intent_mapping)

Intent Mapping: {'book_test': 0, 'cancel_test': 1, 'medical_query': 2, 'retrieve_document': 3, 'upload_document': 4}


In [6]:
# Load tokenizer (RoBERTa for better NLP handling)
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
# Tokenize queries
tokens = tokenizer(list(df["query"]), padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(df["intent_label"].values)

In [8]:
# Convert entities to tensor-friendly format
entity_labels = [str(e) for e in df["entities"]]

In [9]:
# Create Dataset class
class IntentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

dataset = IntentDataset(tokens, labels)

In [10]:
# Split dataset (80% train, 20% validation)
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [11]:
# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [12]:
from transformers import AdamW

# Load model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(intent_mapping))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop
EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch.values()]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

print("Training complete!")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


In [14]:
# Validation
from sklearn.metrics import accuracy_score

model.eval()
preds, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch.values()]

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Validation Accuracy: 100.00%


In [15]:
# Load spaCy model for entity extraction
nlp = spacy.load("en_core_web_sm")

In [16]:
# Function for extracting entities
def extract_entities(query, intent):
    entities = {}
    doc = nlp(query)

    if intent in ["book_test", "cancel_test"]:
        for ent in doc.ents:
            if ent.label_ in ["DATE", "TIME"]:
                entities["TIME"] = ent.text
            elif ent.label_ in ["EVENT", "ORG", "PRODUCT"]:
                entities["TEST_TYPE"] = ent.text

    elif intent in ["upload_record", "retrieve_record"]:
        for ent in doc.ents:
            if ent.label_ in ["PRODUCT", "WORK_OF_ART"]:
                entities["REPORT_TYPE"] = ent.text

    return entities

In [17]:
# Intent classification and entity extraction
def predict_intent_entities(query):
    model.eval()
    tokens = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        output = model(tokens["input_ids"], attention_mask=tokens["attention_mask"])
        pred_label = torch.argmax(output.logits, dim=1).cpu().item()

    intent = label_encoder.inverse_transform([pred_label])[0]
    entities = extract_entities(query, intent)

    return {"intent": intent, "entities": entities}

In [23]:
# Test Cases
queries = [
    "Schedule a blood test for next week.",
    "Cancel my MRI appointment for tomorrow.",
    "I need to upload my X-ray report.",
    "Fetch my ultrasound records from last month.",
    "I want to book an ECG appointment for next Friday.",
    "I'm confused about my availability for my blood test scheduled in tomorrow.",
    "Can i get my X-ray report from last week?",
    "Can i get my X-ray done in next week?",
    "Fetch my ultrasound report from last week.",
]

for query in queries:
    result = predict_intent_entities(query)
    print(f"Query: {query}\nPrediction: {result}\n")

Query: Schedule a blood test for next week.
Prediction: {'intent': 'book_test', 'entities': {'TIME': 'next week'}}

Query: Cancel my MRI appointment for tomorrow.
Prediction: {'intent': 'cancel_test', 'entities': {'TEST_TYPE': 'Cancel', 'TIME': 'tomorrow'}}

Query: I need to upload my X-ray report.
Prediction: {'intent': 'upload_record', 'entities': {}}

Query: Fetch my ultrasound records from last month.
Prediction: {'intent': 'retrieve_record', 'entities': {}}

Query: I want to book an ECG appointment for next Friday.
Prediction: {'intent': 'book_test', 'entities': {'TEST_TYPE': 'ECG', 'TIME': 'next Friday'}}

Query: I'm confused about my availability for my blood test scheduled in tomorrow.
Prediction: {'intent': 'cancel_test', 'entities': {'TIME': 'tomorrow'}}

Query: Can i get my X-ray report from last week?
Prediction: {'intent': 'retrieve_record', 'entities': {}}

Query: Can i get my X-ray done in next week?
Prediction: {'intent': 'book_test', 'entities': {'TIME': 'next week'}}
