In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Load dataset
df = pd.read_csv("intent_classification_dataset_v2.csv")

# Encode intent labels
label_encoder = LabelEncoder()
df["intent_label"] = label_encoder.fit_transform(df["intent"])

# Display intent mappings
intent_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Intent Mapping:", intent_mapping)


Intent Mapping: {'analyze_report': 0, 'book_test': 1, 'cancel_appointment': 2, 'retrieve_record': 3, 'upload_record': 4}


In [3]:
# Load tokenizer (RoBERTa for better NLP handling)
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize queries
tokens = tokenizer(list(df["query"]), padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(df["intent_label"].values)

# Create Dataset class
class IntentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

dataset = IntentDataset(tokens, labels)

# Split dataset (80% train, 20% validation)
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
from transformers import AdamW

# Load model (RoBERTa fine-tuned for classification)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(intent_mapping))

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch.values()]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

print("Training complete!")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 0.6584485589899123
Epoch 2, Loss: 0.008899581022560596
Epoch 3, Loss: 0.004442874589003622
Training complete!


In [6]:
from sklearn.metrics import accuracy_score

model.eval()
preds, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch.values()]

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Validation Accuracy: 100.00%


In [None]:
def predict_intent(query):
    model.eval()
    tokens = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(device)
    
    with torch.no_grad():
        output = model(tokens["input_ids"], attention_mask=tokens["attention_mask"])
        pred_label = torch.argmax(output.logits, dim=1).cpu().item()
    
    return label_encoder.inverse_transform([pred_label])[0]

# Test Cases
print(predict_intent("Schedule a blood test for next week."))
print(predict_intent("Cancel my MRI appointment for tomorrow."))  # cancel_appointment
print(predict_intent("Analyze my blood report and give insights."))  # analyze_report
print(predict_intent("I'm not sure about my availability for the scheduled appointment."))  # cancel_appointment (should be correct now)
print(predict_intent("Fetch my recent medical records."))  # retrieve_record
print(predict_intent("Upload my latest blood test report."))  # upload_record

book_test
cancel_appointment
analyze_report
cancel_appointment
retrieve_record
upload_record
cancel_appointment
