In [1]:
import pandas as pd
import torch
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [2]:
# Load dataset
df = pd.read_csv("intent_classification_dataset.csv")  # Ensure this file has 'Query' and 'Intent' columns

In [3]:
# Encode labels
label_encoder = LabelEncoder()
df["Intent"] = label_encoder.fit_transform(df["Intent"])  # Convert intent labels to numbers

In [4]:
# Save label encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
print("Label encoder saved as label_encoder.pkl")

Label encoder saved as label_encoder.pkl


In [5]:
# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Query"].tolist(), df["Intent"].tolist(), test_size=0.2, random_state=42
)

In [6]:
# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [7]:
class IntentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Convert data to dataset objects
train_dataset = IntentDataset(train_texts, train_labels)
val_dataset = IntentDataset(val_texts, val_labels)


In [8]:
# Load RoBERTa model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_encoder.classes_))

# Set training arguments
training_args = TrainingArguments(
    output_dir="./roberta_intent_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    push_to_hub=False
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0003,0.000118
2,0.0001,5.3e-05
3,0.0001,4e-05


TrainOutput(global_step=3000, training_loss=0.022972415339201687, metrics={'train_runtime': 786.067, 'train_samples_per_second': 30.532, 'train_steps_per_second': 3.816, 'total_flos': 1578737203200000.0, 'train_loss': 0.022972415339201687, 'epoch': 3.0})

In [9]:
model.save_pretrained("./roberta_intent_model")
tokenizer.save_pretrained("./roberta_intent_model")

('./roberta_intent_model\\tokenizer_config.json',
 './roberta_intent_model\\special_tokens_map.json',
 './roberta_intent_model\\vocab.json',
 './roberta_intent_model\\merges.txt',
 './roberta_intent_model\\added_tokens.json')

In [10]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict_intent(query):
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to the same device
    model.eval()
    
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()

    return label_encoder.inverse_transform([predicted_label])[0]

In [11]:
# Test predictions
test_queries = [
    "Schedule a blood test for next week.",
    "Cancel my MRI appointment for tomorrow.",
    "Do I need to fast before a blood test?",
    "I need to upload my X-ray report.",
    "Fetch my ultrasound records from last month.",
    "What are the side effects of chemotherapy?",
    "Hey, how are you?",
    "Im getting an error for checking blood report via your platform"
]

for query in test_queries:
    print(f"Query: {query}\nPredicted Intent: {predict_intent(query)}\n")

Query: Schedule a blood test for next week.
Predicted Intent: book_test

Query: Cancel my MRI appointment for tomorrow.
Predicted Intent: cancel_test

Query: Do I need to fast before a blood test?
Predicted Intent: medical_query

Query: I need to upload my X-ray report.
Predicted Intent: upload_document

Query: Fetch my ultrasound records from last month.
Predicted Intent: retrieve_document

Query: What are the side effects of chemotherapy?
Predicted Intent: medical_query

Query: Hey, how are you?
Predicted Intent: other

Query: Im getting an error for checking blood report via your platform
Predicted Intent: analyze_report

