In [1]:

# Symptom Classifier using DistilBERT

from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer
)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch
import evaluate

In [2]:
# Load and prepare data
df = pd.read_csv("urgentcare_symptoms_dataset.csv")
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Sample data:")
print(df.head())
print("\nLabel distribution:")
print(df['label'].value_counts())



Dataset shape: (250, 2)
Columns: ['text', 'label']
Sample data:
                                                text        label
0  Feeling like compound fracture for a couple of...           ER
1  I'm experiencing nausea and vomiting for 1 day...  Urgent Care
2  Started having nausea and vomiting for 1 day a...  Urgent Care
3    Noticed bruised knee from fall and it's painful  Urgent Care
4  I'm experiencing severe burn on arm and it's p...           ER

Label distribution:
label
ER             125
Urgent Care    125
Name: count, dtype: int64


In [3]:
print(df.columns)


Index(['text', 'label'], dtype='object')


In [4]:
unique_labels = sorted(df['label'].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {v: k for k, v in label2id.items()}

print(f"\nLabel mapping:")
for label, idx in label2id.items():
    print(f"  {label} -> {idx}")

# Apply label encoding
df['label_id'] = df['label'].map(label2id)

# Verify label encoding
print(f"\nLabel ID range: {df['label_id'].min()} to {df['label_id'].max()}")
print("Sample with encoded labels:")
print(df[['text', 'label', 'label_id']].head())


Label mapping:
  ER -> 0
  Urgent Care -> 1

Label ID range: 0 to 1
Sample with encoded labels:
                                                text        label  label_id
0  Feeling like compound fracture for a couple of...           ER         0
1  I'm experiencing nausea and vomiting for 1 day...  Urgent Care         1
2  Started having nausea and vomiting for 1 day a...  Urgent Care         1
3    Noticed bruised knee from fall and it's painful  Urgent Care         1
4  I'm experiencing severe burn on arm and it's p...           ER         0


In [5]:
# Split data with stratification
df_train, df_test = train_test_split(
    df, 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label']  # Ensure balanced splits
)

print(f"\nTrain set size: {len(df_train)}")
print(f"Test set size: {len(df_test)}")

# Create datasets with only necessary columns
train_data = df_train[['text', 'label_id']].copy()
test_data = df_test[['text', 'label_id']].copy()

# Rename label_id to labels for transformers
train_data = train_data.rename(columns={'label_id': 'labels'})
test_data = test_data.rename(columns={'label_id': 'labels'})

# Convert to HuggingFace datasets
dataset_train = Dataset.from_pandas(train_data)
dataset_test = Dataset.from_pandas(test_data)

print(f"\nDataset train features: {dataset_train.features}")
print(f"Dataset test features: {dataset_test.features}")



Train set size: 200
Test set size: 50

Dataset train features: {'text': Value('string'), 'labels': Value('int64'), '__index_level_0__': Value('int64')}
Dataset test features: {'text': Value('string'), 'labels': Value('int64'), '__index_level_0__': Value('int64')}


In [6]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [7]:
def tokenize_function(examples):
    """Tokenize the text with proper padding and truncation"""
    return tokenizer(
        examples["text"], 
        truncation=True, 
        padding=True, 
        max_length=512,
        return_tensors=None  # Don't convert to tensors here
    )

# Apply tokenization
print("\nTokenizing datasets...")
dataset_train = dataset_train.map(tokenize_function, batched=True)
dataset_test = dataset_test.map(tokenize_function, batched=True)

# Remove unnecessary columns and set format
dataset_train = dataset_train.remove_columns(['text'])
dataset_test = dataset_test.remove_columns(['text'])

print(f"Final train dataset features: {dataset_train.features}")
print(f"Final test dataset features: {dataset_test.features}")



Tokenizing datasets...


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Final train dataset features: {'labels': Value('int64'), '__index_level_0__': Value('int64'), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8'))}
Final test dataset features: {'labels': Value('int64'), '__index_level_0__': Value('int64'), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8'))}


In [8]:
# Load model
print(f"\nLoading model with {len(label2id)} classes...")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Loading model with 2 classes...


In [9]:
# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)


In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Reduced for faster training
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none",  # Disable wandb/tensorboard
    save_total_limit=2,  # Only keep 2 best models
    warmup_steps=100,
    learning_rate=2e-5,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,  # Use processing_class instead of deprecated tokenizer
)

print("\nStarting training...")
try:
    # Train the model
    train_result = trainer.train()
    
    print(f"\nTraining completed!")
    print(f"Training loss: {train_result.training_loss:.4f}")
    
    # Evaluate the model
    print("\nEvaluating model...")
    eval_result = trainer.evaluate()
    
    print(f"Evaluation results:")
    for key, value in eval_result.items():
        print(f"  {key}: {value:.4f}")
    
    # Save the model
    print("\nSaving model...")
    trainer.save_model("./symptom_classifier_model")
    tokenizer.save_pretrained("./symptom_classifier_model")
    
    # Save label mappings
    import json
    with open("./symptom_classifier_model/label_mappings.json", "w") as f:
        json.dump({
            "label2id": label2id,
            "id2label": id2label
        }, f, indent=2)
    
    print("Model saved successfully!")
    
    # Test prediction function
    def predict_symptom(text):
        """Test prediction on new text"""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Move inputs to GPU
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(predictions, dim=-1).item()
            confidence = predictions.max().item()
            
        return {
            "predicted_label": id2label[predicted_class],
            "confidence": confidence,
            "all_scores": {id2label[i]: score.item() for i, score in enumerate(predictions[0])}
        }
    
    # Test with sample texts
    print("\nTesting predictions:")
    test_texts = [
        "I have a severe headache and feel nauseous",
        "My chest hurts and I'm having trouble breathing",
        "I fell and my ankle is swollen and painful"
    ]
    
    for text in test_texts:
        result = predict_symptom(text)
        print(f"\nText: '{text}'")
        print(f"Predicted: {result['predicted_label']} (confidence: {result['confidence']:.3f})")
        print("Top 3 scores:")
        sorted_scores = sorted(result['all_scores'].items(), key=lambda x: x[1], reverse=True)[:3]
        for label, score in sorted_scores:
            print(f"  {label}: {score:.3f}")

except Exception as e:
    print(f"Error during training: {str(e)}")
    print(f"Error type: {type(e).__name__}")
    
    # Debug information
    print("\nDebug information:")
    print(f"Train dataset sample: {dataset_train[0]}")
    print(f"Labels type: {type(dataset_train[0]['labels'])}")
    print(f"Labels value: {dataset_train[0]['labels']}")


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6929,0.690459,0.56
2,0.6826,0.661152,0.78
3,0.6201,0.516841,0.88



Training completed!
Training loss: 0.6642

Evaluating model...


Evaluation results:
  eval_loss: 0.5168
  eval_accuracy: 0.8800
  eval_runtime: 0.0221
  eval_samples_per_second: 2265.7220
  eval_steps_per_second: 317.2010
  epoch: 3.0000

Saving model...
Model saved successfully!

Testing predictions:

Text: 'I have a severe headache and feel nauseous'
Predicted: ER (confidence: 0.589)
Top 3 scores:
  ER: 0.589
  Urgent Care: 0.411

Text: 'My chest hurts and I'm having trouble breathing'
Predicted: ER (confidence: 0.599)
Top 3 scores:
  ER: 0.599
  Urgent Care: 0.401

Text: 'I fell and my ankle is swollen and painful'
Predicted: Urgent Care (confidence: 0.523)
Top 3 scores:
  Urgent Care: 0.523
  ER: 0.477
